Clicking an ASP.NET link - not a form - using Selenium
I am trying to scrape the contents of this site:
http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName=A&FirstName=&City=&FilerID=
where LastName runs through the letters A-Z, to collect lobbyist information. This is the State of Georgia's open records site.
I have used a combination of mechanize and Selenium (really either/or) to get the basic information I need, stepping through each letter in a basic for loop (code below).
Where I run into trouble is when either Selenium or mechanize tries to click the 'View Lobbyist' link associated with each lobbyist.
With Selenium, it clicks the first link and then fails with a "selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"id","selector":"ctl00_ContentPlaceHolder1_Results_ctl03_lnkView"}" message.
With mechanize, since each 'View Lobbyist' link is an href rather than a form, any br.submit() fails.
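For context on why there is no per-link form to submit: each 'View Lobbyist' anchor is an ASP.NET postback link, i.e. its href is a javascript:__doPostBack('...','') call, and clicking it POSTs the page's single aspnetForm with the target control name in a hidden field. The classic way to drive such a link without a browser is to fill in those hidden fields and post the form yourself. Below is a minimal mechanize sketch of that idea, assuming the page renders the usual hidden __EVENTTARGET/__EVENTARGUMENT inputs; the control name is illustrative, derived from the element id in the error above with underscores swapped for dollar signs:

import mechanize

br = mechanize.Browser()
br.open("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx"
        "?&Year=2016&LastName=A&FirstName=&City=&FilerID=")
br.select_form("aspnetForm")
br.form.set_all_readonly(False)  # the hidden ASP.NET fields are read-only by default
# Target control name (illustrative): the link id with '_' replaced by '$'
br["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$Results$ctl03$lnkView"
br["__EVENTARGUMENT"] = ""
response = br.submit()  # POSTs the whole form, as __doPostBack would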
Here is a shortened version of the Selenium code:
def __init__(self):
    self.url = "http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_ByName.aspx"
    self.br = mechanize.Browser()
    self.br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

def scrape_lobbyists(self, letter):
    urlstr = "http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?Year=2016&LastName="+letter+"&FirstName=&City=&FilerID="
    driver.get(urlstr)
    soup = BS(driver.page_source)
    table = soup.find("table", {"id": "ctl00_ContentPlaceHolder1_Results"})  # Need to add error check here...
    if table is None:  # No lobbyist with last name starting with 'X' :-)
        return
    records = table.find_all('tr')  # List of all results for this letter
    for row in records:
        rec_print = ""
        span = row.find_all('span', 'lblentry', 'value')
        for sname in span:
            stext = sname.get_text()
            if ',' in stext:
                continue
            rec_print = rec_print + stext + ","  # Create comma-delimited output
        print(rec_print[:-1])  # Strip final comma
        lnks = row.find_all('a', 'lblentrylink')
        for lnk in lnks:
            if lnk is None:  # For some reason, first record is blank.
                continue
            newlnk = lnk['id']  # OK, this is the new URL
            newstr = lnk['href']
            newctl = newstr[25:-5]  # Matching placeholder (strip javascript....)
            print("Lnk: ", lnk)
            print("NewLnk: ", newlnk)  # Just look at various elements
            print("LnkStr: ", newstr)
            print("LnkCtl: ", newctl)
            driver.find_element_by_id(newlnk).click()  # newlnk seems to be the right one...
And here is the mechanize code:
br.open("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName="+letter+"&FirstName=&City=&FilerID=")
soup = BS(br.response().read())
table = soup.find("table", { "id" : "ctl00_ContentPlaceHolder1_Results" }) # Need to add error check here...
if table is None: # No lobbyist with last name starting with 'X' :-)
continue
records = table.find_all('tr') # List of all results for this letter
for form in br.forms():
print "Form name:", form.name
print form
for row in records:
rec_print = ""
span = row.find_all('span', 'lblentry', 'value')
for sname in span:
if ',' in sname.get_text(): # They actually have a field named 'comma'!!
continue
rec_print = rec_print + sname.get_text() + "," # Create comma-delimited output
print(rec_print[:-1]) # Strip final comma
lnk = row.find('a', 'lblentrylink')
if lnk is None: # For some reason, first record is blank.
continue
print("Lnk: ", lnk)
newlnk = lnk['id']
print("NEWLNK: ", newlnk)
newstr = lnk['href']
newctl = newstr[+25:-5] # Matching placeholder (strip javascript....)
br.select_form('aspnetForm') # Tried (nr=0) also...
print("NEWCTL: ", newctl)
br[__EVENTTARGET] = newctl
response = br.submit(name=newlnk).read()
In any case, I'm a bit stumped, so any guidance is appreciated!
The problem is that after clicking the "View Lobbyist" link, you are redirected to a different URL in the same browser window. If you want to click the second "View Lobbyist" link, you need to get back to the list of lobbyists.
Here is the implementation: collect the lobbyist names, follow the "profile" links to grab the filer IDs, then go back and repeat the process:
from pprint import pprint

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Firefox()
driver.get("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx?&Year=2016&LastName=A&FirstName=&City=&FilerID=")

wait = WebDriverWait(driver, 15)
results = []

# iterate over the results skipping the header row
for index in range(1, len(driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr"))):
    # get the current row
    rows = driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr")
    lobbyist = rows[index]

    # extract some data and follow the link
    name = lobbyist.find_element_by_css_selector("[id$=lblFName]").text
    profile_link = lobbyist.find_element_by_css_selector("[id$=lnkView]")
    profile_link.click()

    # wait for the page to load
    filer_id = wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "[id$=lblFilerID]"))).text

    # go back
    driver.back()
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "table#ctl00_ContentPlaceHolder1_Results tr")))

    results.append({"name": name,
                    "filer_id": filer_id})

driver.close()
pprint(results)
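Note the design choice here: the table rows are re-located on every pass through the loop. driver.back() triggers a full page reload, so any element references held from before profile_link.click() would go stale; re-querying by index is what lets the loop click the second and subsequent "View Lobbyist" links, which is exactly where the original code threw NoSuchElementException.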
This prints:
[{'filer_id': 'L20160009', 'name': 'ASHLEY'},
{'filer_id': 'L20120018', 'name': 'ANNA'},
{'filer_id': 'L20050601', 'name': 'BILLY'},
{'filer_id': 'L20090142', 'name': 'CHANDON'},
{'filer_id': 'L20130009', 'name': 'CHARLES'},
{'filer_id': 'L20140179', 'name': 'MARY PAIGE'},
{'filer_id': 'L20050237', 'name': 'NORMER'},
{'filer_id': 'L20060195', 'name': 'PAMELA'},
{'filer_id': 'L20090281', 'name': 'SHAUN'},
{'filer_id': 'L20150090', 'name': 'TYLER'},
{'filer_id': 'L20160162', 'name': 'SARKIS'},
{'filer_id': 'L20150045', 'name': 'SAMUEL'},
{'filer_id': 'L20160098', 'name': 'JOSHUA'},
{'filer_id': 'L20130110', 'name': 'TIMOTHY'},
{'filer_id': 'L20060300', 'name': 'JENNIFER'},
{'filer_id': 'L20080329', 'name': 'BRAD'},
{'filer_id': 'L20130177', 'name': 'ELIZABETH'},
{'filer_id': 'L20120102', 'name': 'C.'},
{'filer_id': 'L20050996', 'name': 'STEVE'},
{'filer_id': 'L20110128', 'name': 'TRACY'},
{'filer_id': 'L20100284', 'name': 'JASON'},
{'filer_id': 'L20150052', 'name': 'MOLLY'},
{'filer_id': 'L20050253', 'name': 'ELIZABETH'},
{'filer_id': 'L20150016', 'name': 'BLAKE'}]
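To cover all of the letters A-Z as in the original question, the same logic can be wrapped in an outer loop. A minimal sketch, assuming the row-scraping loop above is factored into a hypothetical scrape_letter(driver, wait) helper that returns the list of result dicts for the currently loaded page:

import string

all_results = []
for letter in string.ascii_uppercase:
    driver.get("http://media.ethics.ga.gov/search/Lobbyist/Lobbyist_results.aspx"
               "?&Year=2016&LastName=" + letter + "&FirstName=&City=&FilerID=")
    # Some letters (the question mentions 'X') return no results table at all.
    rows = driver.find_elements_by_css_selector("table#ctl00_ContentPlaceHolder1_Results tr")
    if not rows:
        continue
    all_results.extend(scrape_letter(driver, wait))  # hypothetical helper wrapping the row loop above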