Why am I unable to scrape values from a Hidden tooltip of a Highchart using selenium python?
I have asked a couple of questions on this same topic before this one.
I am scraping values from https://www.similarweb.com/website/zalando.de/#overview
and I am trying to scrape content from the traffic chart on that page. Take a look at this Highchart.
I want to scrape its values, e.g. 27,100,000, from the hidden tooltip. Currently I can get the months as [Nov '20, ..., Apr '21], but I cannot get their corresponding values.
Here is my full code:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager

def website_monitoring():
    websites = ['https://www.similarweb.com/website/zalando.de/#overview']
    options = webdriver.ChromeOptions()
    options.add_argument('start-maximized')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    browser = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    for crawler in websites:
        browser.get(crawler)
        wait = WebDriverWait(browser, 10)
        website_names = browser.find_element_by_xpath('/html/body/div[1]/main/div/div/section[1]/div[1]/div/div[1]/a').get_attribute("href")
        total_visits = browser.find_element_by_xpath('/html/body/div[1]/main/div/div/div[2]/div[2]/div/div[3]/div/div/div/div[2]/div/span[2]/span[1]').text
        tooltip = wait.until(EC.presence_of_element_located((By.XPATH, "//*[local-name() = 'svg']/*[local-name()='g'][8]/*[local-name()='text']")))
        ActionChains(browser).move_to_element(tooltip).perform()
        month_value = wait.until(EC.presence_of_all_elements_located((By.XPATH, "//*[local-name() = 'svg']/*[local-name()='g' and @class='highcharts-tooltip']/*[local-name()='text']")))
        values = [elem.text for elem in month_value]
        print('VALUES-->', values)
        months = browser.find_elements(By.XPATH, "//*[local-name() = 'svg']/*[local-name()='g'][6]/*/*")
        for date in months:
            print(date.text)
        # printing all scraped data
        print('Website Names:', website_names)
        print('Total visits:', total_visits)

if __name__ == "__main__":
    website_monitoring()
The output I currently get:
VALUES--> ['']
Nov '20
Dec '20
Jan '21
Feb '21
Mar '21
Apr '21
The output I want:
VALUES--> ['27,100,000', .....]
Nov '20
Dec '20
Jan '21
Feb '21
Mar '21
Apr '21
I have been stuck on this for 2 days and nothing has worked so far. Please, please help!
Edit: I also tried to check whether the page exposes a csv file via the Network tab, since Highcharts charts often load their data from a csv file, but I assume the website blocks that. Can this be done using json or lxml instead?
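One way to check whether the chart data arrives as JSON at all would be to enable Chrome's performance log and list the JSON responses the page receives. This is only a rough sketch: it assumes a Selenium/ChromeDriver combination that accepts the goog:loggingPrefs capability, and it does not confirm that SimilarWeb actually exposes (or allows) such an endpoint:

import json
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from webdriver_manager.chrome import ChromeDriverManager

# turn on the Chrome performance log so network events are recorded
caps = DesiredCapabilities.CHROME.copy()
caps['goog:loggingPrefs'] = {'performance': 'ALL'}

browser = webdriver.Chrome(ChromeDriverManager().install(), desired_capabilities=caps)
browser.get('https://www.similarweb.com/website/zalando.de/#overview')

# print the URL of every response with a JSON mime type; any of these could then be
# fetched and parsed directly instead of hovering over the chart
for entry in browser.get_log('performance'):
    message = json.loads(entry['message'])['message']
    if message.get('method') == 'Network.responseReceived':
        response = message['params']['response']
        if 'json' in response.get('mimeType', ''):
            print(response['url'])

browser.quit()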
I have a working solution. I took the time to figure out a way to hover over each point, print its data, and move on to the next one. It could be cleaner, but it works. Here is my Python file:
from selenium import webdriver
import chromedriver_autoinstaller
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains

def website_monitoring():
    chromedriver_autoinstaller.install()
    websites = ['https://www.similarweb.com/website/zalando.de/#overview']
    options = webdriver.ChromeOptions()
    options.add_argument('start-maximized')
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    browser = webdriver.Chrome(options=options)

    def stringToPrint():
        # combine the tooltip header (the month) with the tspan that carries the value
        return browser.find_element_by_css_selector('g.highcharts-tooltip > text > tspan:nth-child(1)').text + ': ' + browser.find_element_by_css_selector('tspan:nth-child(3)').text

    for crawler in websites:
        browser.get(crawler)
        wait = WebDriverWait(browser, 10)
        website_names = browser.find_element_by_xpath('/html/body/div[1]/main/div/div/section[1]/div[1]/div/div[1]/a').get_attribute("href")
        total_visits = browser.find_element_by_xpath('/html/body/div[1]/main/div/div/div[2]/div[2]/div/div[3]/div/div/div/div[2]/div/span[2]/span[1]').text
        # hover the leftmost point of the series, then step to the right with fixed
        # pixel offsets, printing the tooltip after each move
        highchartElement = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'g:nth-child(8) > path:nth-child(3)')))
        ActionChains(browser).move_to_element(highchartElement).move_by_offset(-300, 0).perform()
        print(stringToPrint())
        ActionChains(browser).move_by_offset(120, -10).perform()
        print(stringToPrint())
        ActionChains(browser).move_by_offset(120, 0).perform()
        print(stringToPrint())
        ActionChains(browser).move_by_offset(120, -10).perform()
        print(stringToPrint())
        ActionChains(browser).move_by_offset(120, 10).perform()
        print(stringToPrint())
        ActionChains(browser).move_by_offset(120, 0).perform()
        print(stringToPrint())
        # printing all scraped data
        print('Website Names:', website_names)
        print('Total visits:', total_visits)

if __name__ == "__main__":
    website_monitoring()
The console output is as follows:
November, 2020: 29,900,000
December, 2020: 27,100,000
January, 2020: 26,900,000
February, 2021: 22,600,000
March, 2021: 24,700,000
April, 2021: 26,200,000
Website Names: http://zalando.de/
Total visits: 19.94M
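A possibly cleaner variant of the hovering step, sketched here untested, would be to hover each rendered point directly instead of relying on fixed pixel offsets. It assumes the chart draws its markers with the standard Highcharts highcharts-point class, and it reuses the browser, wait, and stringToPrint defined in the script above:

        # hover every marker of the series and print its tooltip;
        # 'path.highcharts-point' is the class Highcharts assigns to series markers
        points = wait.until(EC.presence_of_all_elements_located(
            (By.CSS_SELECTOR, 'g.highcharts-markers path.highcharts-point')))
        for point in points:
            ActionChains(browser).move_to_element(point).perform()
            print(stringToPrint())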