使用 requests 时 XPath 返回空列表,但在 selenium 中可以正常工作
xpath returns empty list while using requests but works with selenium
import requests
from lxml import html
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Fetch the college page through a session with retrying adapters, save the
# raw HTML to disk for inspection, and query it for the
# "Total Student Enrollment" value.
session = requests.Session()
# Retry policy: connect=3 with a backoff factor of 0.5 between attempts.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# Use the same retrying adapter for both schemes.
session.mount('http://', adapter)
session.mount('https://', adapter)
url = "https://www.careers360.com/colleges/act-college-of-engineering-and-technology-kancheepuram"
page = session.get(url)
# Write the response body inside a context manager so the file handle is
# always closed, and pin the encoding so the dump does not depend on the
# platform's default codec.
with open("source_code.txt", "w+", encoding="utf-8") as source_file:
    source_file.write(page.text)
content = html.fromstring(page.text)
# NOTE: the //tbody step is kept exactly as reported; according to the
# accompanying text this query yields an empty list when the HTML comes from
# requests.
intake = content.xpath("//div[@class='col-sm-12']//table//tbody//tr//td[contains(text(),'Total Student Enrollment')]//strong")
print(intake)
使用此脚本时,我得到一个空列表作为输出。
期望的输出是 100。
相同的 XPath 适用于 selenium 但不适用于此脚本。
如果您对此有任何解决方案,请分享。
问题出在 HTML 标签 tbody 上:浏览器(selenium)渲染页面时通常会自动为表格补上 tbody,而 requests 获取到的原始 HTML 中很可能没有这个标签,所以把 XPath 中的 //tbody 去掉即可:
import requests
from lxml import html
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Fetch the college page through a session with retrying adapters, save the
# raw HTML to disk, and print every <strong> element found in the
# "Total Student Enrollment" table cell.
session = requests.Session()
# Retry policy: connect=3 with a backoff factor of 0.5 between attempts.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# Use the same retrying adapter for both schemes.
session.mount('http://', adapter)
session.mount('https://', adapter)
url = "https://www.careers360.com/colleges/act-college-of-engineering-and-technology-kancheepuram"
page = session.get(url)
# Write the response body inside a context manager so the file handle is
# always closed, and pin the encoding so the dump does not depend on the
# platform's default codec.
with open("source_code.txt", "w+", encoding="utf-8") as source_file:
    source_file.write(page.text)
content = html.fromstring(page.text)
# The XPath deliberately has no //tbody step, so it does not depend on a
# <tbody> element being present in the fetched markup.
intake = content.xpath("//div[@class='col-sm-12']//table//tr//td[contains(text(),'Total Student Enrollment')]//strong")
for item in intake:
    # html.tostring returns the serialized element as bytes.
    print(html.tostring(item))
结果:
b'<strong> 100</strong>'
为什么不像下面这样直接试试呢?您无需创建循环来解析所需的结果。
import requests
from lxml.html import fromstring
# Fetch the college page and print the "Total Student Enrollment" value
# directly, without looping over the XPath matches.
url = "https://www.careers360.com/colleges/act-college-of-engineering-and-technology-kancheepuram"
# The session is only needed for the request itself; the context manager
# closes it as soon as the response has been received.
with requests.Session() as s:
    page = s.get(url)
content = fromstring(page.text)
# Take the first matching <strong> and strip surrounding whitespace from its
# text. NOTE: indexing with [0] raises IndexError when the XPath matches
# nothing.
intake = content.xpath("//table[@class='baseUl']//td[contains(.,'Total Student Enrollment')]/strong")[0].text.strip()
print(intake)
import requests
from lxml import html
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Fetch the college page through a session with retrying adapters, save the
# raw HTML to disk for inspection, and query it for the
# "Total Student Enrollment" value.
session = requests.Session()
# Retry policy: connect=3 with a backoff factor of 0.5 between attempts.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# Use the same retrying adapter for both schemes.
session.mount('http://', adapter)
session.mount('https://', adapter)
url = "https://www.careers360.com/colleges/act-college-of-engineering-and-technology-kancheepuram"
page = session.get(url)
# Write the response body inside a context manager so the file handle is
# always closed, and pin the encoding so the dump does not depend on the
# platform's default codec.
with open("source_code.txt", "w+", encoding="utf-8") as source_file:
    source_file.write(page.text)
content = html.fromstring(page.text)
# NOTE: the //tbody step is kept exactly as reported; according to the
# accompanying text this query yields an empty list when the HTML comes from
# requests.
intake = content.xpath("//div[@class='col-sm-12']//table//tbody//tr//td[contains(text(),'Total Student Enrollment')]//strong")
print(intake)
使用此脚本时,我得到一个空列表作为输出。 期望的输出是 100。 相同的 XPath 适用于 selenium 但不适用于此脚本。 如果您对此有任何解决方案,请分享。
问题出在 HTML 标签 tbody 上:浏览器(selenium)渲染页面时通常会自动为表格补上 tbody,而 requests 获取到的原始 HTML 中很可能没有这个标签,所以把 XPath 中的 //tbody 去掉即可:
import requests
from lxml import html
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
# Fetch the college page through a session with retrying adapters, save the
# raw HTML to disk, and print every <strong> element found in the
# "Total Student Enrollment" table cell.
session = requests.Session()
# Retry policy: connect=3 with a backoff factor of 0.5 between attempts.
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
# Use the same retrying adapter for both schemes.
session.mount('http://', adapter)
session.mount('https://', adapter)
url = "https://www.careers360.com/colleges/act-college-of-engineering-and-technology-kancheepuram"
page = session.get(url)
# Write the response body inside a context manager so the file handle is
# always closed, and pin the encoding so the dump does not depend on the
# platform's default codec.
with open("source_code.txt", "w+", encoding="utf-8") as source_file:
    source_file.write(page.text)
content = html.fromstring(page.text)
# The XPath deliberately has no //tbody step, so it does not depend on a
# <tbody> element being present in the fetched markup.
intake = content.xpath("//div[@class='col-sm-12']//table//tr//td[contains(text(),'Total Student Enrollment')]//strong")
for item in intake:
    # html.tostring returns the serialized element as bytes.
    print(html.tostring(item))
结果: b'<strong> 100</strong>'
为什么不像下面这样直接试试呢?您无需创建循环来解析所需的结果。
import requests
from lxml.html import fromstring
# Fetch the college page and print the "Total Student Enrollment" value
# directly, without looping over the XPath matches.
url = "https://www.careers360.com/colleges/act-college-of-engineering-and-technology-kancheepuram"
# The session is only needed for the request itself; the context manager
# closes it as soon as the response has been received.
with requests.Session() as s:
    page = s.get(url)
content = fromstring(page.text)
# Take the first matching <strong> and strip surrounding whitespace from its
# text. NOTE: indexing with [0] raises IndexError when the XPath matches
# nothing.
intake = content.xpath("//table[@class='baseUl']//td[contains(.,'Total Student Enrollment')]/strong")[0].text.strip()
print(intake)