Page Request not loading fully
There is a problem with my request. When I fetch the page, I do not get one particular piece of data that I need.
from bs4 import BeautifulSoup
import requests

def getHTML(url):
    # Send a desktop Firefox User-Agent so the site serves the normal page.
    headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0'}
    r = requests.get(url, headers=headers, allow_redirects=True, timeout=None)
    # print(r.headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

def main():
    source = getHTML('http://connected2.me')
    for link in source.find_all('ul'):
        print(link)

if __name__ == '__main__':
    main()
The data I need from the page source is inside this list:
<ul class="usersOnlineList clear"></ul>
but the result always looks like this:
<ul class="usersOnlineList clear">
</ul>
<ul>
<li><input class="inptIcn icnUser" data-validate="validate(required, username, minlength(3), maxlength(25))" id="fos_user_registration_form_username" maxlength="255" name="fos_user_registration_form[username]" pattern=".{3,255}" placeholder="username" required="required" type="text" value=""/></li>
<li><input class="inptIcn icnPass" data-validate="validate(required, minlength(6))" id="fos_user_registration_form_plainPassword" name="fos_user_registration_form[plainPassword]" placeholder="password" required="required" type="password" value=""/></li>
<li><input class="inptIcn icnEmail" data-validate="validate(required, email)" id="fos_user_registration_form_email" maxlength="255" name="fos_user_registration_form[email]" pattern=".{2,255}" placeholder="email" required="required" type="email" value=""/></li>
<li class="formActions"><input class="btn btnGreen" id="signup-btn" name="signup-btn" type="submit" value="Sign Up!"/></li>
</ul>
The list is empty. Why can't I get the list items from just that list?
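A quick way to see what is going on is to look at what requests actually returns before doing anything else; here is a minimal sketch that reuses the URL and User-Agent from the question:

from bs4 import BeautifulSoup
import requests

# Fetch the page the same way the question's script does.
headers = {'user-agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/21.0'}
r = requests.get('http://connected2.me', headers=headers, timeout=10)

soup = BeautifulSoup(r.content, 'html.parser')
online_list = soup.find('ul', class_='usersOnlineList')

# The element is present in the static HTML, but it has no <li> children yet.
print(online_list)
print(online_list.find_all('li'))

This should print the empty element and an empty list of <li> items, matching the output shown above: the static HTML simply never contains the user entries.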
That is because the contents of <ul class="usersOnlineList clear"></ul> are injected by JavaScript after the page loads. You have to wait for that insertion to happen, and requests cannot do that. selenium can solve this:
from bs4 import BeautifulSoup
from selenium import webdriver

driver = webdriver.Firefox()
driver.set_page_load_timeout(5)

def main():
    driver.get("http://connected2.me")
    # Selenium runs the page's JavaScript, so page_source includes the injected list items.
    source = BeautifulSoup(driver.page_source, 'html.parser')
    for link in source.find_all('ul', {'class': 'usersOnlineList'}):
        print(link)
    driver.close()

if __name__ == '__main__':
    main()
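If the list still comes back empty sometimes, you can wait explicitly for the list items to appear instead of relying only on the page-load timeout. A minimal sketch using Selenium's WebDriverWait; the 10-second timeout and the CSS selector are assumptions, not taken from the original answer:

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()

def main():
    driver.get("http://connected2.me")
    # Block until at least one <li> has been injected into the usersOnlineList
    # element (assumed 10-second limit) before reading page_source.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, "ul.usersOnlineList li"))
    )
    source = BeautifulSoup(driver.page_source, 'html.parser')
    for link in source.find_all('ul', {'class': 'usersOnlineList'}):
        print(link)
    driver.quit()

if __name__ == '__main__':
    main()

Note that the wait raises a TimeoutException if no list item shows up within the limit, for example when nobody is online at that moment.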