从未隐藏的 href 元素获取超链接,Python Selenium
Get hyperlink from unhidden href element, Python Selenium
这个问题已经被问过很多次了,但我已经尝试了所有我能找到的解决方案,但都没有成功。简而言之,我正在抓取 table 的成员,并且可以成功收集除最后一列之外的所有列,其中包含一个带有指向成员电子邮件地址的超链接的按钮。超链接似乎没有被隐藏,因为当光标悬停在按钮上时可以看到电子邮件但是我不能 select 按钮元素并打印出超链接。
以下是 table(第 5 列)
的第一个电子邮件地址的 XPATH
/html/body/div[5]/div[1]/main/div/div[5]/div/div/div/table/tbody/tr[1]/td[5]/a
下面是 table
的第一个电子邮件地址的元素
<a href="mailto:mmabbott@mac.com"><span id="ember2071" class="ember-view aia-icon"><svg class="icon" version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 40 40" style="enable-background:new 0 0 40 40;" xml:space="preserve">
<path class="st0" d="M5.5,8.3v23.5h30.8V8.3H5.5z M8.6,26.4V13.6l6.3,6.4L8.6,26.4z M21.5,21.1c-0.2,0.3-0.9,0.3-1.2,0l-9.6-9.7
h20.4L21.5,21.1z M18.1,23.3c0.7,0.7,1.7,1.1,2.8,1.1c1.1,0,2.1-0.4,2.8-1.1l1-1.1l6.3,6.4H10.7l6.3-6.5L18.1,23.3z M26.9,20
l6.2-6.3v12.7L26.9,20z"></path>
</svg>
</span></a>
下面是用于提取电子邮件地址的脚本代码。最后,我希望脚本将电子邮件地址输出到 CSV 中,与其他列分开,但这是单独讨论的。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# open chrome
# driver = Webdriver.chrome("C:\Python Tools\chromedriver.exe")
s = Service("C:\Python Tools\chromedriver.exe")
driver = webdriver.Chrome(service=s)
# navigate to site and sign-in
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
driver.implicitly_wait(10)
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
username = driver.find_element(By.ID, "mat-input-0")
password = driver.find_element(By.ID, "mat-input-1")
username.send_keys("juzek2022@gmail.com")
password.send_keys("Test1234!")
driver.find_element(By.CLASS_NAME, "mat-button-wrapper").click()
driver.implicitly_wait(10)
# close cookies box
driver.find_element(By.XPATH, '//*[@id="truste-consent-button"]').click()
# navigate go member directory
driver.implicitly_wait(10)
driver.get("https://www.aia.org/member-directory?page%5Bnumber%5D=1")
driver.implicitly_wait(10)
# extract email addresses: list of tried and failed find element queries
# v1 = driver.find_elements(By.XPATH, "//button[contains(text(),'mailto')]")
# v1 = driver.find_elements(By.XPATH,'//a[contains(@href,".com")]')
# v1 = driver.find_elements(By.PARTIAL_LINK_TEXT, ".com")
# v1 = driver.find_elements(By.XPATH, '//a[contains(@href,"href")]')
# v1 = driver.find_elements(By.XPATH, '//a[@href="'+url+'"]')
# v1 = driver.find_elements(By.XPATH, "//a[contains(text(),'Verify Email')]").getAttribute('href')
# v1 = driver.find_elements(By.CLASS_NAME, "ember-view aia-icon").get_attribute("href")
# v1 = driver.find_elements(By.TAG_NAME, "a").getAttribute("href")
# v1 = driver.find_elements(By.XPATH,("//input[contains(td[5])]")).getAttribute("href")
# v1 = driver.find_elements(By.cssSelector("mailto").getAttribute("href")
# v1 = driver.find_elements(By.CLASS_NAME, "data-table").getAttribute("href")
# v1 = driver.find_elements(By.XPATH, "//div[@id='testId']/a").getAttribute("href")
# v1 = driver.find_elements(By.cssSelector("mailto")
# v1 = driver.find_elements(By.TAG_NAME, "td[5]")
# v1 = driver.find_elements(By.XPATH,("//input[contains(td[5])]"))
# v1 = driver.find_elements(By.TAG_NAME, "a")
# v1 = driver.find_elements(By.CLASS_NAME, "ember-view aia-icon")
print(v1)
# export email addresses to CSV
import csv
with open('AIAMemberSearch.csv', 'w', newline='') as file:
writer = csv.writer(file, quoting=csv.QUOTE_ALL,delimiter=';')
writer.writerows(v1)
其次,我想从 table 的所有五个列中收集数据并导出为 CSV,运行 循环遍历会员目录的所有页面。我的代码草稿如下
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# open chrome
# driver = Webdriver.chrome("C:\Python Tools\chromedriver.exe")
s = Service("C:\Python Tools\chromedriver.exe")
driver = webdriver.Chrome(service=s)
# navigate to site
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
driver.implicitly_wait(10)
# Enter login
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
username = driver.find_element(By.ID, "mat-input-0")
password = driver.find_element(By.ID, "mat-input-1")
username.send_keys("juzek2022@gmail.com")
password.send_keys("Test1234!")
driver.find_element(By.CLASS_NAME, "mat-button-wrapper").click()
driver.implicitly_wait(10)
# close cookies box
# old way driver.find_element_by_xpath('//*[@id="truste-consent-button"]').click()
driver.find_element(By.XPATH, '//*[@id="truste-consent-button"]').click()
driver.implicitly_wait(10)
# for holding the resultant list
element_list = []
for page in range(1, 3, 1):
page_url = "https://www.aia.org/member-directory?page%5Bnumber%5D=" + str(page)
driver.get(page_url)
driver.implicitly_wait(10)
# collect name, chapter, firm and location columns (not working, needs a loop)
v1 = driver.find_elements(By.CLASS_NAME, "data-table")
# collect email addresses (working)
v2 = driver.find_elements(By.XPATH, '//a [contains(@href,"mailto")][@href]')
for i in v2:
email = i.get_attribute("href")
# loop across pages of directory
for i in range(len(v1)):
element_list.append([v1[i].text, v2[i].text])
# export to csv
import csv
with open('AIAMemberSearch.csv', 'w', newline='') as file:
writer = csv.writer(file, quoting=csv.QUOTE_ALL,delimiter=';')
writer.writerows(element_list)
你应该使用 .get_attribute('href') 而不是 .getAttribute()
因此您可以获得所有这样的电子邮件:
for item in driver.find_element(By.CLASS_NAME, 'data-table').find_elements(By.TAG_NAME, 'tr')[1:]:
try:
v1 = item.find_element(By.TAG_NAME, 'a').get_attribute("href")
except:
continue # element doesn't have an email
print(v1)
要获取剩余的 table 信息,您可以这样做:
data_table = []
for item in driver.find_element(By.CLASS_NAME, 'data-table').find_elements(By.TAG_NAME, 'tr')[1:]:
elements = item.find_elements(By.TAG_NAME, 'td')
name = elements[0].text
aia_branch = elements[1].text
company = elements[2].text
location = elements[3].text
data_table.append([name, aia_branch, company, location])
try:
v1 = item.find_element(By.TAG_NAME, 'a').get_attribute("href")
except:
continue # element doesn't have an email
data_table[-1].append(v1)
print(name, aia_branch, company, location, v1)
现在要将所有这些数据保存到 csv,您可以这样做:
with open('AIAMemberSearch.csv', 'w', newline='') as file:
writer = csv.writer(file, delimiter=',')
writer.writerows(data_table)
第 1 根本不要分享您的凭据
第 2 次在不共享凭据的情况下尽可能多地共享 HTML
这应该行得通我测试过
v1 = driver.find_elements(By.XPATH, '//a [contains(@href,"mailto")][@href]')
for i in v1:
email = i.get_attribute("href")
print (email)
这个问题已经被问过很多次了,但我已经尝试了所有我能找到的解决方案,但都没有成功。简而言之,我正在抓取 table 的成员,并且可以成功收集除最后一列之外的所有列,其中包含一个带有指向成员电子邮件地址的超链接的按钮。超链接似乎没有被隐藏,因为当光标悬停在按钮上时可以看到电子邮件但是我不能 select 按钮元素并打印出超链接。 以下是 table(第 5 列)
的第一个电子邮件地址的 XPATH/html/body/div[5]/div[1]/main/div/div[5]/div/div/div/table/tbody/tr[1]/td[5]/a
下面是 table
的第一个电子邮件地址的元素<a href="mailto:mmabbott@mac.com"><span id="ember2071" class="ember-view aia-icon"><svg class="icon" version="1.1" id="Layer_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 40 40" style="enable-background:new 0 0 40 40;" xml:space="preserve">
<path class="st0" d="M5.5,8.3v23.5h30.8V8.3H5.5z M8.6,26.4V13.6l6.3,6.4L8.6,26.4z M21.5,21.1c-0.2,0.3-0.9,0.3-1.2,0l-9.6-9.7
h20.4L21.5,21.1z M18.1,23.3c0.7,0.7,1.7,1.1,2.8,1.1c1.1,0,2.1-0.4,2.8-1.1l1-1.1l6.3,6.4H10.7l6.3-6.5L18.1,23.3z M26.9,20
l6.2-6.3v12.7L26.9,20z"></path>
</svg>
</span></a>
下面是用于提取电子邮件地址的脚本代码。最后,我希望脚本将电子邮件地址输出到 CSV 中,与其他列分开,但这是单独讨论的。
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# open chrome
# driver = Webdriver.chrome("C:\Python Tools\chromedriver.exe")
s = Service("C:\Python Tools\chromedriver.exe")
driver = webdriver.Chrome(service=s)
# navigate to site and sign-in
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
driver.implicitly_wait(10)
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
username = driver.find_element(By.ID, "mat-input-0")
password = driver.find_element(By.ID, "mat-input-1")
username.send_keys("juzek2022@gmail.com")
password.send_keys("Test1234!")
driver.find_element(By.CLASS_NAME, "mat-button-wrapper").click()
driver.implicitly_wait(10)
# close cookies box
driver.find_element(By.XPATH, '//*[@id="truste-consent-button"]').click()
# navigate go member directory
driver.implicitly_wait(10)
driver.get("https://www.aia.org/member-directory?page%5Bnumber%5D=1")
driver.implicitly_wait(10)
# extract email addresses: list of tried and failed find element queries
# v1 = driver.find_elements(By.XPATH, "//button[contains(text(),'mailto')]")
# v1 = driver.find_elements(By.XPATH,'//a[contains(@href,".com")]')
# v1 = driver.find_elements(By.PARTIAL_LINK_TEXT, ".com")
# v1 = driver.find_elements(By.XPATH, '//a[contains(@href,"href")]')
# v1 = driver.find_elements(By.XPATH, '//a[@href="'+url+'"]')
# v1 = driver.find_elements(By.XPATH, "//a[contains(text(),'Verify Email')]").getAttribute('href')
# v1 = driver.find_elements(By.CLASS_NAME, "ember-view aia-icon").get_attribute("href")
# v1 = driver.find_elements(By.TAG_NAME, "a").getAttribute("href")
# v1 = driver.find_elements(By.XPATH,("//input[contains(td[5])]")).getAttribute("href")
# v1 = driver.find_elements(By.cssSelector("mailto").getAttribute("href")
# v1 = driver.find_elements(By.CLASS_NAME, "data-table").getAttribute("href")
# v1 = driver.find_elements(By.XPATH, "//div[@id='testId']/a").getAttribute("href")
# v1 = driver.find_elements(By.cssSelector("mailto")
# v1 = driver.find_elements(By.TAG_NAME, "td[5]")
# v1 = driver.find_elements(By.XPATH,("//input[contains(td[5])]"))
# v1 = driver.find_elements(By.TAG_NAME, "a")
# v1 = driver.find_elements(By.CLASS_NAME, "ember-view aia-icon")
print(v1)
# export email addresses to CSV
import csv
with open('AIAMemberSearch.csv', 'w', newline='') as file:
writer = csv.writer(file, quoting=csv.QUOTE_ALL,delimiter=';')
writer.writerows(v1)
其次,我想从 table 的所有五个列中收集数据并导出为 CSV,运行 循环遍历会员目录的所有页面。我的代码草稿如下
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# open chrome
# driver = Webdriver.chrome("C:\Python Tools\chromedriver.exe")
s = Service("C:\Python Tools\chromedriver.exe")
driver = webdriver.Chrome(service=s)
# navigate to site
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
driver.implicitly_wait(10)
# Enter login
driver.get("https://account.aia.org/signin?redirectUrl=https:%2F%2Fwww.aia.org%2F")
username = driver.find_element(By.ID, "mat-input-0")
password = driver.find_element(By.ID, "mat-input-1")
username.send_keys("juzek2022@gmail.com")
password.send_keys("Test1234!")
driver.find_element(By.CLASS_NAME, "mat-button-wrapper").click()
driver.implicitly_wait(10)
# close cookies box
# old way driver.find_element_by_xpath('//*[@id="truste-consent-button"]').click()
driver.find_element(By.XPATH, '//*[@id="truste-consent-button"]').click()
driver.implicitly_wait(10)
# for holding the resultant list
element_list = []
for page in range(1, 3, 1):
page_url = "https://www.aia.org/member-directory?page%5Bnumber%5D=" + str(page)
driver.get(page_url)
driver.implicitly_wait(10)
# collect name, chapter, firm and location columns (not working, needs a loop)
v1 = driver.find_elements(By.CLASS_NAME, "data-table")
# collect email addresses (working)
v2 = driver.find_elements(By.XPATH, '//a [contains(@href,"mailto")][@href]')
for i in v2:
email = i.get_attribute("href")
# loop across pages of directory
for i in range(len(v1)):
element_list.append([v1[i].text, v2[i].text])
# export to csv
import csv
with open('AIAMemberSearch.csv', 'w', newline='') as file:
writer = csv.writer(file, quoting=csv.QUOTE_ALL,delimiter=';')
writer.writerows(element_list)
你应该使用 .get_attribute('href') 而不是 .getAttribute()
因此您可以获得所有这样的电子邮件:
for item in driver.find_element(By.CLASS_NAME, 'data-table').find_elements(By.TAG_NAME, 'tr')[1:]:
try:
v1 = item.find_element(By.TAG_NAME, 'a').get_attribute("href")
except:
continue # element doesn't have an email
print(v1)
要获取剩余的 table 信息,您可以这样做:
data_table = []
for item in driver.find_element(By.CLASS_NAME, 'data-table').find_elements(By.TAG_NAME, 'tr')[1:]:
elements = item.find_elements(By.TAG_NAME, 'td')
name = elements[0].text
aia_branch = elements[1].text
company = elements[2].text
location = elements[3].text
data_table.append([name, aia_branch, company, location])
try:
v1 = item.find_element(By.TAG_NAME, 'a').get_attribute("href")
except:
continue # element doesn't have an email
data_table[-1].append(v1)
print(name, aia_branch, company, location, v1)
现在要将所有这些数据保存到 csv,您可以这样做:
with open('AIAMemberSearch.csv', 'w', newline='') as file:
writer = csv.writer(file, delimiter=',')
writer.writerows(data_table)
第 1 根本不要分享您的凭据
第 2 次在不共享凭据的情况下尽可能多地共享 HTML
这应该行得通我测试过
v1 = driver.find_elements(By.XPATH, '//a [contains(@href,"mailto")][@href]')
for i in v1:
email = i.get_attribute("href")
print (email)