How to clear the searchbox when using selenium to scrape twitter with different keywords
I am trying to scrape Twitter based on different keywords. I want the script to take the words one by one and clear the searchbox each time so it can use the next one, but I am having a problem with this:
===========================================
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from time import sleep

# driver is assumed to be set up earlier, e.g. driver = webdriver.Chrome()

def get_tweet_data(info):
    UserName = info.find_element_by_xpath('.//span').text
    try:
        handle = info.find_element_by_xpath('.//span[contains(text(), "@")]').text
    except NoSuchElementException:
        return
    try:
        date = info.find_element_by_xpath('.//time').get_attribute('datetime')
    except NoSuchElementException:
        return
    try:
        image_element = info.find_elements_by_css_selector('div[data-testid="tweetPhoto"]')
        images = []
        for image_div in image_element:
            href = image_div.find_element_by_tag_name("img").get_attribute("src")
            images.append(href)
    except NoSuchElementException:
        return
    try:
        comment = info.find_element_by_xpath('.//div[1]/div[1]/div[1]/div[2]/div[2]/div[2]/div[1]').text
    except NoSuchElementException:
        return
    retweet_cnt = info.find_element_by_xpath('.//div[@data-testid="retweet"]').text
    like_cnt = info.find_element_by_xpath('.//div[@data-testid="like"]').text
    tweet = (comment, UserName, handle, date, images, retweet_cnt, like_cnt)
    return tweet

searchbox = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
keywords = ['Dog', 'Cat', 'Fox']
for keyword in keywords:
    searchbox.clear()
    searchbox.send_keys(keyword)
    searchbox.send_keys(Keys.RETURN)
    sleep(10)
    driver.find_element_by_link_text('Latest').click()
    sleep(5)
    data = []
    tweet_ids = set()
    Keywoed = keyword
    last_position = driver.execute_script("return window.pageYOffset;")
    scrolling = True
    while scrolling:
        page_info = driver.find_elements_by_xpath('//article[@data-testid="tweet"]')
        for info in page_info:
            tweet = get_tweet_data(info)
            if tweet:
                tweet_id = ','.join(map(str, tweet))
                if tweet_id not in tweet_ids:
                    tweet_ids.add(tweet_id)
                    data.append(tweet)
        scroll_attempt = 0
        while True:
            # check scroll position
            driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
            sleep(5)
            curr_position = driver.execute_script("return window.pageYOffset;")
            if last_position == curr_position:
                scroll_attempt += 1
                if scroll_attempt >= 3:
                    scrolling = False
                    break
                else:
                    sleep(5)  # attempt another scroll
            else:
                last_position = curr_position
                break
===========================================
Using searchbox.clear() doesn't help; it gives me this error:
for keyword in keywords:
----> searchbox.clear()
searchbox.send_keys(keyword)
def clear(self):
"""Clears the text if it's a text entry element."""
---> self._execute(Command.CLEAR_ELEMENT)
def get_property(self, name):
StaleElementReferenceException: Message: stale element reference: element is not attached to the page document (Session info: chrome=101.0.4951.54)
You are navigating away from the page, and that makes the searchbox element "stale". It means that at some point you left the page, or the searchbox element stopped being attached to it.
To fix this, you have to load the page that contains the searchbox element, re-run the code that finds the searchbox element, and then run the rest of your code.
I suggest doing something like this:
keywords = ['Dog', 'Cat', 'Fox']
for keyword in keywords:
    driver.get("page_with_searchbox_element")
    searchbox = driver.find_element_by_xpath('//input[@aria-label="Search query"]')
    searchbox.clear()
    searchbox.send_keys(keyword)
    # Continue the rest of the code here...
This reloads the page on every iteration, so you should no longer hit the stale element exception.
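A minimal sketch of the same idea with an explicit wait instead of fixed sleeps is shown below. It assumes the searchbox lives on Twitter's Explore page (https://twitter.com/explore) and keeps the aria-label locator from the question; the timeout value and driver setup are placeholders.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()  # assumed driver setup
wait = WebDriverWait(driver, 15)

keywords = ['Dog', 'Cat', 'Fox']
for keyword in keywords:
    driver.get("https://twitter.com/explore")  # assumed page that shows the searchbox
    # Re-find the searchbox on every iteration so the reference never goes stale.
    searchbox = wait.until(
        EC.element_to_be_clickable((By.XPATH, '//input[@aria-label="Search query"]'))
    )
    searchbox.clear()
    searchbox.send_keys(keyword)
    searchbox.send_keys(Keys.RETURN)
    # ... scrape the results for this keyword here ...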
Thanks @Jeremy.
Once I understood the problem and fixed it, it works perfectly:
keywords = ['Dog', 'Cat', 'Fox']
for keyword in keywords:
    driver.get("https://twitter.com/search?q=" + keyword + "&src=typed_query&f=live")