How do I scrape Twitter usernames using Selenium properly?
So, I am trying to scrape Twitter followers, but the problem is that it also scrapes unnecessary links that are not profile pages (Twitter accounts).
What the code below does is open the page of the Twitter account whose followers you want to scrape, grab the profile-page links with an XPath locator, and gradually scroll down so that all currently loaded followers are picked up.
Here is my code:
def extract_followers_func():
    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    # Grab the profile links that are already rendered
    for twusernames in driver.find_elements(By.XPATH, '//div[@aria-label="Timeline: Followers"]//a[@role="link"]'):
        with open("scrapedlist.txt", "a") as file:
            file.write(twusernames.get_property('href'))
            file.write("\n")
    sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(5)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        # Collect the links that the latest scroll brought into the DOM
        for twusernames in driver.find_elements(By.XPATH, '//div[@aria-label="Timeline: Followers"]//a[@role="link"]'):
            with open("scrapedlist.txt", "a") as file:
                file.write(twusernames.get_property('href'))
                file.write("\n")
What would be a more efficient way to do this? I only want the usernames, not all the unnecessary links.
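(Editorial note: since each href the locator returns is a full profile URL, the username is just the last path segment. A minimal sketch of stripping it out, assuming hrefs of the form https://twitter.com/SomeUser; the helper name is hypothetical:)
from urllib.parse import urlparse

def username_from_href(href: str) -> str:
    # Hypothetical helper: "https://twitter.com/SomeUser" -> "SomeUser"
    return urlparse(href).path.strip("/")

print(username_from_href("https://twitter.com/Username"))  # prints: Username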
Full code:
import tkinter as tk
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
import threading
import time
from time import sleep
import datetime
options = webdriver.ChromeOptions()
options.add_argument("--headless")
options.add_argument("start-maximized")
root = tk.Tk()
app_width = 300
app_height = 320
screen_width = root.winfo_screenwidth()
screen_height = root.winfo_screenheight()
x = (screen_width / 2) - (app_width / 2)
y = (screen_height / 2) - (app_height / 2)
root.geometry(f'{app_width}x{app_height}+{int(x)}+{int(y)}')
#
ser = Service(r"C:\Program Files (x86)\chromedriver.exe")  # raw string so the backslashes are not treated as escapes
driver = webdriver.Chrome(service=ser, options=options)
wait = WebDriverWait(driver, 50)
testbtn_txt = tk.StringVar()
testbtn = tk.Button(root, textvariable=testbtn_txt, command=lambda:extract_followers_func(), font="Arial", bg="#808080", fg="white", height=1, width=10)
testbtn_txt.set("Test")
testbtn.grid(row=10, column=0, columnspan=2, pady=5, padx=5)
def extract_followers_func():
    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    for twusernames in driver.find_elements(By.XPATH, '//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,"search")) and not(contains(@href,"Live")) and not(@rel)]'):
        with open("scrapedlist.txt", "a") as file:
            file.write(twusernames.get_property('href'))
            file.write("\n")
    sleep(5)
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        sleep(5)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
        for twusernames in driver.find_elements(By.XPATH, '//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,"search")) and not(contains(@href,"Live")) and not(@rel)]'):
            with open("scrapedlist.txt", "a") as file:
                file.write(twusernames.get_property('href'))
                file.write("\n")
root.mainloop()
You are almost there!
You just need to fine-tune the locator. So, instead of
'//div[@aria-label="Timeline: Followers"]//a[@role="link"]'
you should use
'//div[@aria-label="Timeline: Followers"]//a[@role="link" and not(@aria-hidden) and not(contains(@href,"search")) and not(contains(@href,"Live")) and not(@rel)]'
The extra predicates drop the search, Live, hidden, and rel-tagged links, leaving only the follower profile links. Note that because the Python string is delimited with single quotes, the nested XPath literals ("search", "Live") must use double quotes.
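For completeness, a minimal sketch of the collection loop using this refined locator, reusing the driver, By, and sleep from the code above. It gathers hrefs into a set on every scroll pass (which also guards against links dropping out of the DOM as the timeline unloads off-screen items) and writes the file once at the end, so duplicates are skipped:
FOLLOWER_LINKS_XPATH = ('//div[@aria-label="Timeline: Followers"]'
                        '//a[@role="link" and not(@aria-hidden)'
                        ' and not(contains(@href,"search"))'
                        ' and not(contains(@href,"Live")) and not(@rel)]')

def extract_followers_once():
    driver.get("https://twitter.com/Username/followers")
    sleep(5)
    hrefs = set()  # set deduplicates links seen on overlapping scroll passes
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        # Collect whatever is currently rendered, then scroll further down
        for link in driver.find_elements(By.XPATH, FOLLOWER_LINKS_XPATH):
            hrefs.add(link.get_property('href'))
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sleep(5)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height
    # Single write at the end instead of reopening the file per link
    with open("scrapedlist.txt", "a") as file:
        for href in sorted(hrefs):
            file.write(href + "\n")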