javascript __doPostBack 的网络抓取在 td 中包含一个 href
Web scraping a page whose javascript __doPostBack link is an href inside a td
我想抓取一个网站,即 https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27=
使用 selenium 但我只能抓取一个页面,不能抓取其他页面。
这里我用的是selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Question script, reproduced with its indentation restored so that it parses.
# Runtime behaviour is intentionally unchanged: this is the code that raises
# the TimeoutException reported below.

# Start Chrome with the 'useAutomationExtension' experimental option disabled.
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path='C:/Users/ptiwar34/Documents/chromedriver.exe', chrome_options=chromeOptions, desired_capabilities=chromeOptions.to_capabilities())
driver.get('https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27=')

# NOTE(review): staleness_of() waits for the located element to be detached
# from the DOM. Nothing between the lookup and this wait reloads the page, so
# this is presumably where the 20 s timeout fires - confirm against the trace.
WebDriverWait(driver, 20).until(EC.staleness_of(driver.find_element_by_xpath("//td/a[text()='2']")))
driver.find_element_by_xpath("//td/a[text()='2']").click()

# Counts the visible anchors whose text is exactly '2', not the number of pages.
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//td/a[text()='2']"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
    # NOTE(review): the pager markup shown in the question has no <span>
    # inside the <a>, so this XPath likely matches nothing - verify.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//td/a[text()='2']/span//following::span[1]"))).click()
driver.quit()
这里是html内容
<td><span>1</span></td>
<td><a
href="javascript:__doPostBack
('dnn$ctr1535$UNSPSCSearch$gvDetailsSearchView','Page')"
style="color:#333333;">2</a>
</td>
这会引发错误:
raise TimeoutException(message, screen, stacktrace)
TimeoutException
要查找/点击页码,您可以使用:
# Pager links are the anchors whose href posts back to the
# 'UNSPSCSearch$gvDetailsSearchView' grid (see the HTML in the question).
for x in driver.find_elements_by_xpath("//a[contains(@href,'UNSPSCSearch$gvDetailsSearchView')]"):
    # Keep only the links whose visible text consists solely of digits.
    if x.text.isdigit():
        print(x.text)
        #x.click()
        #...
输出:
2
3
4
...
根据您的评论,您可以使用:
# Visit pages 2..max_pages by clicking the pager link whose text equals the
# wanted page number.
max_pages = 10
for page_number in range(2, max_pages+1):
    # Look the pager links up again for every page: the click below triggers
    # a postback, so the previously found elements are not reused.
    for x in driver.find_elements_by_xpath("//a[contains(@href,'UNSPSCSearch$gvDetailsSearchView')]"):
        if x.text.isdigit():
            # Compare the link's text; a WebElement itself has no strip().
            if int(x.text.strip()) == page_number:
                x.click()
                #parse results here
                break
要抓取网站 https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27= ,您可以使用以下方案:
代码块:
from selenium import webdriver
# TimeoutException is what WebDriverWait.until() raises on timeout; it must be
# imported for the except clause below to work.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("start-maximized")
driver = webdriver.Chrome(options=chrome_options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27=%27")

# Keep clicking the first <a> that follows the <span> in the pager row until
# no such link becomes clickable within 20 seconds.
while True:
    try:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//table[contains(@id, 'UNSPSCSearch_gvDetailsSearchView')]//tr[last()]//table//span//following::a[1]"))).click()
        print("Clicked for next page")
    except TimeoutException:
        print("No more pages")
        break
driver.quit()
控制台输出:
Clicked for next page
Clicked for next page
Clicked for next page
.
.
.
说明:如果您观察 HTML DOM,页码位于一个 <table> 内,该表格的动态 id 属性包含文本 UNSPSCSearch_gvDetailsSearchView。此外,页码位于最后一个 <tr> 中,该行含有一个子 <table>。在子表格中,当前页码位于 <span> 中,这正是定位的关键。因此,要对下一页执行 click(),您只需定位其后索引为 [1] 的 <a> 标签。最后,由于该元素带有 javascript:__doPostBack(),您必须为所需的 element_to_be_clickable() 引入 WebDriverWait。
You can find a detailed discussion in
我想抓取一个网站,即 https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27=
使用 selenium 但我只能抓取一个页面,不能抓取其他页面。
这里我用的是selenium
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Question script, reproduced with its indentation restored so that it parses.
# Runtime behaviour is intentionally unchanged: this is the code that raises
# the TimeoutException reported below.

# Start Chrome with the 'useAutomationExtension' experimental option disabled.
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option('useAutomationExtension', False)
driver = webdriver.Chrome(executable_path='C:/Users/ptiwar34/Documents/chromedriver.exe', chrome_options=chromeOptions, desired_capabilities=chromeOptions.to_capabilities())
driver.get('https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27=')

# NOTE(review): staleness_of() waits for the located element to be detached
# from the DOM. Nothing between the lookup and this wait reloads the page, so
# this is presumably where the 20 s timeout fires - confirm against the trace.
WebDriverWait(driver, 20).until(EC.staleness_of(driver.find_element_by_xpath("//td/a[text()='2']")))
driver.find_element_by_xpath("//td/a[text()='2']").click()

# Counts the visible anchors whose text is exactly '2', not the number of pages.
numLinks = len(WebDriverWait(driver, 20).until(EC.visibility_of_all_elements_located((By.XPATH, "//td/a[text()='2']"))))
print(numLinks)
for i in range(numLinks):
    print("Perform your scraping here on page {}".format(str(i+1)))
    # NOTE(review): the pager markup shown in the question has no <span>
    # inside the <a>, so this XPath likely matches nothing - verify.
    WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//td/a[text()='2']/span//following::span[1]"))).click()
driver.quit()
这里是html内容
<td><span>1</span></td>
<td><a
href="javascript:__doPostBack
('dnn$ctr1535$UNSPSCSearch$gvDetailsSearchView','Page')"
style="color:#333333;">2</a>
</td>
这会引发错误:
raise TimeoutException(message, screen, stacktrace)
TimeoutException
要查找/点击页码,您可以使用:
# Pager links are the anchors whose href posts back to the
# 'UNSPSCSearch$gvDetailsSearchView' grid (see the HTML in the question).
for x in driver.find_elements_by_xpath("//a[contains(@href,'UNSPSCSearch$gvDetailsSearchView')]"):
    # Keep only the links whose visible text consists solely of digits.
    if x.text.isdigit():
        print(x.text)
        #x.click()
        #...
输出:
2
3
4
...
根据您的评论,您可以使用:
# Visit pages 2..max_pages by clicking the pager link whose text equals the
# wanted page number.
max_pages = 10
for page_number in range(2, max_pages+1):
    # Look the pager links up again for every page: the click below triggers
    # a postback, so the previously found elements are not reused.
    for x in driver.find_elements_by_xpath("//a[contains(@href,'UNSPSCSearch$gvDetailsSearchView')]"):
        if x.text.isdigit():
            # Compare the link's text; a WebElement itself has no strip().
            if int(x.text.strip()) == page_number:
                x.click()
                #parse results here
                break
要抓取网站 https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27= ,您可以使用以下方案:
代码块:
from selenium import webdriver
# TimeoutException is what WebDriverWait.until() raises on timeout; it must be
# imported for the except clause below to work.
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("start-maximized")
driver = webdriver.Chrome(options=chrome_options, executable_path=r'C:\WebDrivers\chromedriver.exe')
driver.get("https://www.unspsc.org/search-code/default.aspx?CSS=51%&Type=desc&SS%27=%27")

# Keep clicking the first <a> that follows the <span> in the pager row until
# no such link becomes clickable within 20 seconds.
while True:
    try:
        WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.XPATH, "//table[contains(@id, 'UNSPSCSearch_gvDetailsSearchView')]//tr[last()]//table//span//following::a[1]"))).click()
        print("Clicked for next page")
    except TimeoutException:
        print("No more pages")
        break
driver.quit()
控制台输出:
Clicked for next page
Clicked for next page
Clicked for next page
.
.
.
说明:如果您观察 HTML DOM,页码位于一个 <table> 内,该表格的动态 id 属性包含文本 UNSPSCSearch_gvDetailsSearchView。此外,页码位于最后一个 <tr> 中,该行含有一个子 <table>。在子表格中,当前页码位于 <span> 中,这正是定位的关键。因此,要对下一页执行 click(),您只需定位其后索引为 [1] 的 <a> 标签。最后,由于该元素带有 javascript:__doPostBack(),您必须为所需的 element_to_be_clickable() 引入 WebDriverWait。
You can find a detailed discussion in