如何使用 get_attribute 将属性值抓取为字符串而不是单个字母?
How can I scrape an attribute value as string as opposed to individual letters using get_attribute?
我正在使用 selenium(可能还有 BS4)来抓取过去 4/5 年比赛结果页面 (https://cuetracker.net/tournaments/gibraltar-open/2020/3542) 的不同部分,我已经抓取了这些比赛的链接。
我正在尝试编写一些健壮的代码,以便通用地抓取这些比赛结果中给出的各项数据。最初我尝试使用部分 XPath 来抓取每位获胜选手(LHS,即左侧)的国籍,但是当我尝试获取属性值时,它返回的是一个由单个字母组成的列表,而不是表示国籍的字符串。
我认为 BS4 可能更适合这个,因为 html 的格式可能会随着裁判数据的增加而改变,但据我所知,使用部分 Xpath 似乎没问题。
如何让 get_attribute 将值作为字符串而不是单个字母提供给我?
与 Selenium 相比,使用 BS4 完成此抓取是否更容易?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd
def wait_for_page_load():
    """Block until the page in the module-level ``browser`` has finished loading.

    Every 0.5 seconds the page's ``document.readyState`` is read through
    ``browser.execute_script``; the function returns once it equals
    ``'complete'``.

    Raises:
        Exception: with the message ``'Timeout :('`` when, after a poll, more
            than 15 seconds have passed since the function was entered.
    """
    max_wait_seconds = 15
    began_at = time.time()
    while True:
        time.sleep(0.5)
        ready_state = browser.execute_script('return document.readyState;')
        # The deadline is evaluated after every poll, before the state is
        # inspected, so an overdue poll raises even if the page is ready.
        if time.time() - began_at > max_wait_seconds:
            raise Exception('Timeout :(')
        if ready_state == 'complete':
            return
# Question's original script (the indentation was lost in this paste): open the
# cuetracker season index, follow a slice of its links, collect further links
# from each of those pages, then read the "alt" attribute of <img> elements on
# every collected page into Player_1_Nationality.
chrome_path = r"C:\Users\George\Desktop\chromedriver.exe"
browser = webdriver.Chrome(chrome_path)
# Read before any page has been requested; not used again in the visible script.
page_source = browser.page_source
browser.get("https://cuetracker.net/seasons")
links = browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs=[]
for link in links:
hrefs.append(link.get_attribute("href"))
# Keep only the entries at indexes 1 through 4 of the collected links.
hrefs = hrefs[1:5]
hrefs2 = []
for href in hrefs:
browser.get(href)
wait_for_page_load()
# NOTE(review): find_element_by_xpath (singular) is used here, yet the result
# is iterated on the next line; the plural find_elements_by_xpath is
# presumably what was intended -- confirm.
links2 = browser.find_element_by_xpath('.//tr/td[2]/a')
for link in links2:
hrefs2.append((link.get_attribute("href")))
Player_1_Nationality = []
for href in hrefs2:
browser.get(href)
wait_for_page_load()
# NOTE(review): get_attribute("alt") is applied to the result of
# find_elements_by_xpath as a whole instead of to each element. If
# list_1_Nationality ends up holding a single string, the loop below walks it
# character by character, which would match the letter-by-letter output the
# question reports -- confirm against the code that was actually run.
list_1_Nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img').get_attribute("alt")
for lis in list_1_Nationality:
Player_1_Nationality.append(lis)
['E',
'n',
'g',
'l',
'a',
'n',
'd',
'E',
'n',
'g',
'l',
'a',
'n',
'd',
'E',
'n',
'g',
'l',
'a',
'n',
'd',
'E',
'n',
'g',
'l',
'a',
'n',
'd',
'A',
'u',
's',
't',
'r',
'a',
'l',
'i',
'a',
'E',
'n',
'g',
'l',
'a',
...
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd
def wait_for_page_load():
    """Block until the page in the module-level ``browser`` has finished loading.

    Every 0.5 seconds the page's ``document.readyState`` is read through
    ``browser.execute_script``; the function returns once it equals
    ``'complete'``.

    Raises:
        Exception: with the message ``'Timeout :('`` when, after a poll, more
            than 15 seconds have passed since the function was entered.
    """
    max_wait_seconds = 15
    began_at = time.time()
    while True:
        time.sleep(0.5)
        ready_state = browser.execute_script('return document.readyState;')
        # The deadline is evaluated after every poll, before the state is
        # inspected, so an overdue poll raises even if the page is ready.
        if time.time() - began_at > max_wait_seconds:
            raise Exception('Timeout :(')
        if ready_state == 'complete':
            return
# Start Chrome through the local chromedriver binary.
chrome_path = r"C:/chromedriver_win32/chromedriver.exe"
browser = webdriver.Chrome(chrome_path)
page_source = browser.page_source

# Season index: read the href of every link in the striped table, then keep
# only the entries at indexes 1 through 4.
browser.get("https://cuetracker.net/seasons")
links = browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs = [link.get_attribute("href") for link in links][1:5]

# For each kept page, gather the href of every link found in the second cell
# of a table row.
hrefs2 = []
for href in hrefs:
    browser.get(href)
    wait_for_page_load()
    links2 = browser.find_elements_by_xpath('.//tr/td[2]/a')
    hrefs2.extend(link.get_attribute("href") for link in links2)

# For each gathered page, record the "alt" text of every matching <img>.
Player_1_Nationality = []
for href in hrefs2:
    browser.get(href)
    wait_for_page_load()
    list_1_Nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for lis in list_1_Nationality:
        # get_attribute is called on each element, so one whole string is
        # appended per image.
        Player_1_Nationality.append(lis.get_attribute("alt"))
find_elements_by_xpath() 返回的是元素列表(list of elements),而不是单个元素。迭代该列表时,只需对每个元素调用 lis.get_attribute("alt"):
# Visit every collected page and record the "alt" text of each matching <img>.
for href in hrefs2:
    browser.get(href)
    wait_for_page_load()
    # find_elements_by_xpath yields a list of elements; the attribute is read
    # from each element inside the loop.
    list_1_Nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for lis in list_1_Nationality:
        Player_1_Nationality.append(lis.get_attribute("alt"))
我正在使用 selenium(可能还有 BS4)来抓取过去 4/5 年比赛结果页面 (https://cuetracker.net/tournaments/gibraltar-open/2020/3542) 的不同部分,我已经抓取了这些比赛的链接。
我正在尝试编写一些健壮的代码,以便通用地抓取这些比赛结果中给出的各项数据。最初我尝试使用部分 XPath 来抓取每位获胜选手(LHS,即左侧)的国籍,但是当我尝试获取属性值时,它返回的是一个由单个字母组成的列表,而不是表示国籍的字符串。
我认为 BS4 可能更适合这个,因为 html 的格式可能会随着裁判数据的增加而改变,但据我所知,使用部分 Xpath 似乎没问题。
如何让 get_attribute 将值作为字符串而不是单个字母提供给我?
与 Selenium 相比,使用 BS4 完成此抓取是否更容易?
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd
def wait_for_page_load():
    """Block until the page in the module-level ``browser`` has finished loading.

    Every 0.5 seconds the page's ``document.readyState`` is read through
    ``browser.execute_script``; the function returns once it equals
    ``'complete'``.

    Raises:
        Exception: with the message ``'Timeout :('`` when, after a poll, more
            than 15 seconds have passed since the function was entered.
    """
    max_wait_seconds = 15
    began_at = time.time()
    while True:
        time.sleep(0.5)
        ready_state = browser.execute_script('return document.readyState;')
        # The deadline is evaluated after every poll, before the state is
        # inspected, so an overdue poll raises even if the page is ready.
        if time.time() - began_at > max_wait_seconds:
            raise Exception('Timeout :(')
        if ready_state == 'complete':
            return
# Question's original script (the indentation was lost in this paste): open the
# cuetracker season index, follow a slice of its links, collect further links
# from each of those pages, then read the "alt" attribute of <img> elements on
# every collected page into Player_1_Nationality.
chrome_path = r"C:\Users\George\Desktop\chromedriver.exe"
browser = webdriver.Chrome(chrome_path)
# Read before any page has been requested; not used again in the visible script.
page_source = browser.page_source
browser.get("https://cuetracker.net/seasons")
links = browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs=[]
for link in links:
hrefs.append(link.get_attribute("href"))
# Keep only the entries at indexes 1 through 4 of the collected links.
hrefs = hrefs[1:5]
hrefs2 = []
for href in hrefs:
browser.get(href)
wait_for_page_load()
# NOTE(review): find_element_by_xpath (singular) is used here, yet the result
# is iterated on the next line; the plural find_elements_by_xpath is
# presumably what was intended -- confirm.
links2 = browser.find_element_by_xpath('.//tr/td[2]/a')
for link in links2:
hrefs2.append((link.get_attribute("href")))
Player_1_Nationality = []
for href in hrefs2:
browser.get(href)
wait_for_page_load()
# NOTE(review): get_attribute("alt") is applied to the result of
# find_elements_by_xpath as a whole instead of to each element. If
# list_1_Nationality ends up holding a single string, the loop below walks it
# character by character, which would match the letter-by-letter output the
# question reports -- confirm against the code that was actually run.
list_1_Nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img').get_attribute("alt")
for lis in list_1_Nationality:
Player_1_Nationality.append(lis)
['E',
'n',
'g',
'l',
'a',
'n',
'd',
'E',
'n',
'g',
'l',
'a',
'n',
'd',
'E',
'n',
'g',
'l',
'a',
'n',
'd',
'E',
'n',
'g',
'l',
'a',
'n',
'd',
'A',
'u',
's',
't',
'r',
'a',
'l',
'i',
'a',
'E',
'n',
'g',
'l',
'a',
...
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd
def wait_for_page_load():
    """Block until the page in the module-level ``browser`` has finished loading.

    Every 0.5 seconds the page's ``document.readyState`` is read through
    ``browser.execute_script``; the function returns once it equals
    ``'complete'``.

    Raises:
        Exception: with the message ``'Timeout :('`` when, after a poll, more
            than 15 seconds have passed since the function was entered.
    """
    max_wait_seconds = 15
    began_at = time.time()
    while True:
        time.sleep(0.5)
        ready_state = browser.execute_script('return document.readyState;')
        # The deadline is evaluated after every poll, before the state is
        # inspected, so an overdue poll raises even if the page is ready.
        if time.time() - began_at > max_wait_seconds:
            raise Exception('Timeout :(')
        if ready_state == 'complete':
            return
# Start Chrome through the local chromedriver binary.
chrome_path = r"C:/chromedriver_win32/chromedriver.exe"
browser = webdriver.Chrome(chrome_path)
page_source = browser.page_source

# Season index: read the href of every link in the striped table, then keep
# only the entries at indexes 1 through 4.
browser.get("https://cuetracker.net/seasons")
links = browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs = [link.get_attribute("href") for link in links][1:5]

# For each kept page, gather the href of every link found in the second cell
# of a table row.
hrefs2 = []
for href in hrefs:
    browser.get(href)
    wait_for_page_load()
    links2 = browser.find_elements_by_xpath('.//tr/td[2]/a')
    hrefs2.extend(link.get_attribute("href") for link in links2)

# For each gathered page, record the "alt" text of every matching <img>.
Player_1_Nationality = []
for href in hrefs2:
    browser.get(href)
    wait_for_page_load()
    list_1_Nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for lis in list_1_Nationality:
        # get_attribute is called on each element, so one whole string is
        # appended per image.
        Player_1_Nationality.append(lis.get_attribute("alt"))
find_elements_by_xpath() 返回的是元素列表(list of elements),而不是单个元素。迭代该列表时,只需对每个元素调用 lis.get_attribute("alt"):
# Visit every collected page and record the "alt" text of each matching <img>.
for href in hrefs2:
    browser.get(href)
    wait_for_page_load()
    # find_elements_by_xpath yields a list of elements; the attribute is read
    # from each element inside the loop.
    list_1_Nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for lis in list_1_Nationality:
        Player_1_Nationality.append(lis.get_attribute("alt"))