Python BeautifulSoup Parsing Script Tags












0















I am trying to parse the contents of a script tag to extract certain data. The following code uses a valid Xbox Live account.



from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re

# Placeholder credentials; substitute real account values before running.
email = 'email'
password = 'password'

driver = webdriver.Chrome()

# Sign in through the browser so its cookies can be copied into requests below.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
# find_element(By.ID, ...) replaces the find_element_by_xpath shortcuts, which
# were removed in Selenium 4.3; the same element ids are targeted.
driver.find_element(By.ID, 'i0116').send_keys(email)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.find_element(By.ID, 'i0118').send_keys(password)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)


headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

s = requests.Session()
s.headers.update(headers)

# Copy every browser cookie into the requests session. The loop body must be
# indented; without it the script does not parse.
for cookie in driver.get_cookies():
    c = {cookie['name'] : cookie['value']}
    s.cookies.update(c)

#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')


soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')

# Hard-coded position: the 14th <script> tag on the page.
text = str(soup.find_all('script')[13])

# This pattern matches only the literal text "DisplayName", so every element
# of the result is that same string.
value = re.findall(r'DisplayName', text)

print(value)


I am trying to access the data that comes after each "DisplayName", but I am failing to do so: I am just getting "DisplayName" instead of its value. For a better idea of the input, you can print the "text" variable and search for "DisplayName". Thanks in advance to everyone who replies.










share|improve this question























  • Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

    – Kamikaze_goldfish
    Nov 21 '18 at 3:25
















0















I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.



from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re

# Placeholder credentials; substitute real account values before running.
email = 'email'
password = 'password'

driver = webdriver.Chrome()

# Sign in through the browser so its cookies can be copied into requests below.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
# find_element(By.ID, ...) replaces the find_element_by_xpath shortcuts, which
# were removed in Selenium 4.3; the same element ids are targeted.
driver.find_element(By.ID, 'i0116').send_keys(email)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.find_element(By.ID, 'i0118').send_keys(password)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)


headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

s = requests.Session()
s.headers.update(headers)

# Copy every browser cookie into the requests session. The loop body must be
# indented; without it the script does not parse.
for cookie in driver.get_cookies():
    c = {cookie['name'] : cookie['value']}
    s.cookies.update(c)

#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')


soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')

# Hard-coded position: the 14th <script> tag on the page.
text = str(soup.find_all('script')[13])

# This pattern matches only the literal text "DisplayName", so every element
# of the result is that same string.
value = re.findall(r'DisplayName', text)

print(value)


I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.










share|improve this question























  • Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

    – Kamikaze_goldfish
    Nov 21 '18 at 3:25














0












0








0








I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.



from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re

# Placeholder credentials; substitute real account values before running.
email = 'email'
password = 'password'

driver = webdriver.Chrome()

# Sign in through the browser so its cookies can be copied into requests below.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
# find_element(By.ID, ...) replaces the find_element_by_xpath shortcuts, which
# were removed in Selenium 4.3; the same element ids are targeted.
driver.find_element(By.ID, 'i0116').send_keys(email)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.find_element(By.ID, 'i0118').send_keys(password)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)


headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

s = requests.Session()
s.headers.update(headers)

# Copy every browser cookie into the requests session. The loop body must be
# indented; without it the script does not parse.
for cookie in driver.get_cookies():
    c = {cookie['name'] : cookie['value']}
    s.cookies.update(c)

#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')


soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')

# Hard-coded position: the 14th <script> tag on the page.
text = str(soup.find_all('script')[13])

# This pattern matches only the literal text "DisplayName", so every element
# of the result is that same string.
value = re.findall(r'DisplayName', text)

print(value)


I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.










share|improve this question














I am trying to parse the contents within a script tag to extract certain data. The following code uses a valid xbox live account.



from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import requests
import time
from bs4 import BeautifulSoup
import json
import re

# Placeholder credentials; substitute real account values before running.
email = 'email'
password = 'password'

driver = webdriver.Chrome()

# Sign in through the browser so its cookies can be copied into requests below.
driver.get(r'https://login.live.com/login.srf?wa=wsignin1.0&rpsnv=13&rver=6.7.6643.0&wp=MBI_SSL&wreply=https:%2f%2faccount.xbox.com%2fen-us%2faccountcreation%3freturnUrl%3dhttps:%252f%252fwww.xbox.com:443%252fen-US%252f%26pcexp%3dtrue%26uictx%3dme%26rtc%3d1&lc=1033&id=292543&aadredir=1')
time.sleep(3)
# find_element(By.ID, ...) replaces the find_element_by_xpath shortcuts, which
# were removed in Selenium 4.3; the same element ids are targeted.
driver.find_element(By.ID, 'i0116').send_keys(email)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.find_element(By.ID, 'i0118').send_keys(password)
time.sleep(5)
driver.find_element(By.ID, 'idSIButton9').click()
time.sleep(5)
driver.get(r'https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')
print('Grabbing Cookies')
time.sleep(5)


headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'}

s = requests.Session()
s.headers.update(headers)

# Copy every browser cookie into the requests session. The loop body must be
# indented; without it the script does not parse.
for cookie in driver.get_cookies():
    c = {cookie['name'] : cookie['value']}
    s.cookies.update(c)

#s.get('https://account.xbox.com/en-us/Friends?xr=mebarnav&rtc=1')


soup = BeautifulSoup(s.get('https://account.xbox.com/en-us/Profile?xr=mebarnav&activetab=tertiary:friendsTab&rtc=1').content, 'html.parser')

# Hard-coded position: the 14th <script> tag on the page.
text = str(soup.find_all('script')[13])

# This pattern matches only the literal text "DisplayName", so every element
# of the result is that same string.
value = re.findall(r'DisplayName', text)

print(value)


I am trying to access the certain data that comes after each "DisplayName" but I am failing to do so as I am just getting "DisplayName" instead of its value. If you need a better idea, you can print the "text" variable and search for "DisplayName". Thanks to all of those who reply in advance.







python regex selenium beautifulsoup python-requests






share|improve this question













share|improve this question











share|improve this question




share|improve this question










asked Nov 21 '18 at 1:08









otterdogotterdog

5428




5428













  • Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

    – Kamikaze_goldfish
    Nov 21 '18 at 3:25



















  • Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

    – Kamikaze_goldfish
    Nov 21 '18 at 3:25

















Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25





Copy the display name code and paste it here. You're getting the display name with re but nothing behind it. Send the code and I'll help you fix that.

– Kamikaze_goldfish
Nov 21 '18 at 3:25












1 Answer
1






active

oldest

votes


















1














The reason you're only getting "DisplayName" back is that you're telling re to search for that exact phrase; you're not telling it to match any further characters or where to stop. In the example below I am using single quotes, but the code could be adjusted for double quotes. I have re find DisplayName and then use .* to match the characters after it, up to the single quote '. After that, it's just a matter of replacing the parts you don't want.



import re

url = "DisplayName='PoppaBear4'"

# Capture only the text between the quotes. [^']* cannot run past the closing
# quote, so several DisplayName entries on one line are matched separately
# (a greedy .*' would swallow everything up to the last quote on the line).
info = re.findall(r"DisplayName='([^']*)'", url)
# findall returns the captured names directly, so no replace() clean-up of the
# list's string form is needed; print one name per line.
print('\n'.join(info))





share|improve this answer























    Your Answer






    // Page bootstrap for the answer editor. Relies on the page-global
    // StackExchange object and initTagRenderer, both defined outside this block.
    StackExchange.ifUsing("editor", function () {
        StackExchange.using("externalEditor", function () {
            StackExchange.using("snippets", function () {
                StackExchange.snippets.init();
            });
        });
    }, "code-snippets");

    StackExchange.ready(function () {
        var channelOptions = {
            tags: "".split(" "),
            id: "1"
        };
        initTagRenderer("".split(" "), "".split(" "), channelOptions);

        StackExchange.using("externalEditor", function () {
            // Have to fire editor after snippets, if snippets enabled
            if (StackExchange.settings.snippets.snippetsEnabled) {
                StackExchange.using("snippets", function () {
                    createEditor();
                });
            }
            else {
                createEditor();
            }
        });

        function createEditor() {
            StackExchange.prepareEditor({
                heartbeatType: 'answer',
                autoActivateHeartbeat: false,
                convertImagesToLinks: true,
                noModals: true,
                showLowRepImageUploadWarning: true,
                reputationToPostImages: 10,
                bindNavPrevention: true,
                postfix: "",
                imageUploader: {
                    // The HTML strings need their backslashes: \u003c / \u003e
                    // encode < and >, and the inner double quotes must be
                    // escaped, otherwise the literals end early and the script
                    // does not parse.
                    brandingHtml: "Powered by \u003ca class=\"icon-imgur-white\" href=\"https://imgur.com/\"\u003e\u003c/a\u003e",
                    contentPolicyHtml: "User contributions licensed under \u003ca href=\"https://creativecommons.org/licenses/by-sa/3.0/\"\u003ecc by-sa 3.0 with attribution required\u003c/a\u003e \u003ca href=\"https://stackoverflow.com/legal/content-policy\"\u003e(content policy)\u003c/a\u003e",
                    allowUrls: true
                },
                onDemand: true,
                discardSelector: ".discard-answer",
                immediatelyShowMarkdownHelp: true
            });
        }
    });














    draft saved

    draft discarded


















    // Register a ready callback that calls openid.initPostLogin with a CSS
    // selector, the URL-encoded link to this question's #new-answer anchor,
    // and the page type string.
    StackExchange.ready(function () {
        var selector = '.new-post-login';
        var encodedUrl = 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53403912%2fpython-beautifulsoup-parsing-script-tags%23new-answer';
        var pageType = 'question_page';
        StackExchange.openid.initPostLogin(selector, encodedUrl, pageType);
    });

    Post as a guest















    Required, but never shown

























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes









    1














    So the reason you're not getting anything is because you're telling re to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re find the DisplayName but the .* find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.



    import re

    url = "DisplayName='PoppaBear4'"

    # Capture only the text between the quotes. [^']* cannot run past the closing
    # quote, so several DisplayName entries on one line are matched separately
    # (a greedy .*' would swallow everything up to the last quote on the line).
    info = re.findall(r"DisplayName='([^']*)'", url)
    # findall returns the captured names directly, so no replace() clean-up of the
    # list's string form is needed; print one name per line.
    print('\n'.join(info))





    share|improve this answer




























      1














      So the reason you're not getting anything is because you're telling re to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re find the DisplayName but the .* find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.



      import re

      url = "DisplayName='PoppaBear4'"

      # Capture only the text between the quotes. [^']* cannot run past the closing
      # quote, so several DisplayName entries on one line are matched separately
      # (a greedy .*' would swallow everything up to the last quote on the line).
      info = re.findall(r"DisplayName='([^']*)'", url)
      # findall returns the captured names directly, so no replace() clean-up of the
      # list's string form is needed; print one name per line.
      print('\n'.join(info))





      share|improve this answer


























        1












        1








        1







        So the reason you're not getting anything is because you're telling re to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re find the DisplayName but the .* find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.



        import re

        url = "DisplayName='PoppaBear4'"

        # Capture only the text between the quotes. [^']* cannot run past the closing
        # quote, so several DisplayName entries on one line are matched separately
        # (a greedy .*' would swallow everything up to the last quote on the line).
        info = re.findall(r"DisplayName='([^']*)'", url)
        # findall returns the captured names directly, so no replace() clean-up of the
        # list's string form is needed; print one name per line.
        print('\n'.join(info))





        share|improve this answer













        So the reason you're not getting anything is because you're telling re to search for the exact phrase. You're not telling it to get any more characters and where to stop. In the example below I am using single quotes but the code could be adjusted for double quotes. I then have re find the DisplayName but the .* find the characters behind it but stop at the single quote '. Then after that it's just replacing the stuff you don't want.



        import re

        url = "DisplayName='PoppaBear4'"

        # Capture only the text between the quotes. [^']* cannot run past the closing
        # quote, so several DisplayName entries on one line are matched separately
        # (a greedy .*' would swallow everything up to the last quote on the line).
        info = re.findall(r"DisplayName='([^']*)'", url)
        # findall returns the captured names directly, so no replace() clean-up of the
        # list's string form is needed; print one name per line.
        print('\n'.join(info))






        share|improve this answer












        share|improve this answer



        share|improve this answer










        answered Nov 21 '18 at 3:56









        Kamikaze_goldfishKamikaze_goldfish

        493311




        493311
































            draft saved

            draft discarded




















































            Thanks for contributing an answer to Stack Overflow!


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            To learn more, see our tips on writing great answers.




            draft saved


            draft discarded














            // Register a ready callback that calls openid.initPostLogin with a CSS
            // selector, the URL-encoded link to this question's #new-answer anchor,
            // and the page type string.
            StackExchange.ready(function () {
                var selector = '.new-post-login';
                var encodedUrl = 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53403912%2fpython-beautifulsoup-parsing-script-tags%23new-answer';
                var pageType = 'question_page';
                StackExchange.openid.initPostLogin(selector, encodedUrl, pageType);
            });

            Post as a guest















            Required, but never shown





















































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown

































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown







            這個網誌中的熱門文章

            Xamarin.form Move up view when keyboard appear

            Post-Redirect-Get with Spring WebFlux and Thymeleaf

            Anylogic : not able to use stopDelay()