Retrieve all elements in Google Trends Data using selenium Python

I am trying to write a Python program that collects data from Google Trends (GT). Specifically, I want to automate opening the URL and reading the particular values shown in the titles. I have written the code and can scrape data successfully, but when I compare what the code returns with what the page shows, only a subset comes back. For example, in the image below the code returns only the first title, "Manchester United F.C. • Tottenham Hotspur F.C.", while the actual site shows four results: "Manchester United F.C. • Tottenham Hotspur F.C., International Champions Cup, Manchester".

[Google Trends screenshot]

[Screenshot of the code's output]

We have tried every element locator on the page that we could think of, but we still have not been able to come up with a fix. We would prefer not to use Scrapy or Beautiful Soup for this.

    import pandas as pd
    import requests
    import re
    from bs4 import BeautifulSoup
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    links = ["https://trends.google.com/trends/trendingsearches/realtime?geo=DE&category=s"]

    for link in links:
        Title_temp = []
        Title = ''
        seleniumDriver = r"C:/Users/Downloads/chromedriver_win32/chromedriver.exe"
        chrome_options = Options()
        brow = webdriver.Chrome(executable_path=seleniumDriver, chrome_options=chrome_options)
        try:
            brow.get(link)  # open the URL
            try:
                # grab every result-card header on the page
                content = brow.find_elements_by_class_name("details-top")
                for element in content:
                    Title_temp.append(element.text)
                Title = ' '.join(Title_temp)
            except:
                Title = ''
            brow.quit()

        except Exception as error:
            print(error)
            break

    Final_df = pd.DataFrame(
        {'Title': Title_temp
        })

Here is the code that prints all of the information.

url = "https://trends.google.com/trends/trendingsearches/realtime?geo=DE&category=s"
driver.get(url)
WebDriverWait(driver,30).until(EC.presence_of_element_located((By.CLASS_NAME,'details-top')))
Title_temp = []
try:
    content = driver.find_elements_by_class_name("details-top")
    for element in content:
        Title_temp.append(element.text)
    Title=' '.join(Title_temp)
except:
    Title=''
print(Title_temp)
driver.close()

Here is the output:

['Hertha BSC • Fenerbahçe S.K. • Bundesliga • Ante Čović • Berlin', 'Eintracht Frankfurt • UEFA Europa League • Tallinn • Estonia • Frankfurt', 'FC Augsburg • Galatasaray S.K. • Martin Schmidt • Bundesliga • Stefan Reuter', 'Austria national football team • FIFA • Austria • FIFA World Rankings', 'Lechia Gdańsk • Brøndby IF • 2019–20 UEFA Europa League • Gdańsk', 'Alexander Zverev • Hamburg', 'Julian Lenz • Association of Tennis Professionals • Alexander Zverev', 'UEFA Europa League • Diego • Nairo Quintana • Tour de France']
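
Each entry in `Title_temp` is the full card text with the related entities joined by ' • '. If you need them as individual values, one option is to split on that separator; a minimal sketch based on the output above:

    # Minimal sketch: split each card's text into its individual entity names.
    # Assumes the ' • ' separator shown in the output above.
    entities_per_story = [entry.split(' • ') for entry in Title_temp]
    for names in entities_per_story:
        print(names)
    # e.g. ['Hertha BSC', 'Fenerbahçe S.K.', 'Bundesliga', 'Ante Čović', 'Berlin']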


We were able to find a fix for this issue. We had to scrape the data from the inner HTML and then do some data cleaning to get the records we needed.

    import pandas as pd
    import requests
    import re
    from bs4 import BeautifulSoup
    from bs4.element import Comment
    import time
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options

    # html parser: keep only the visible text of a fragment
    def parse_html(content):
        soup = BeautifulSoup(content, 'html.parser')
        text_elements = soup.findAll(text=True)
        tag_blacklist = ['style', 'script', 'head', 'title', 'meta', '[document]', 'img']
        clean_text = []
        for element in text_elements:
            if element.parent.name in tag_blacklist or isinstance(element, Comment):
                continue
            else:
                text_ = element.strip()
                clean_text.append(text_)
        result_text = " ".join(clean_text)
        result_text = re.sub(r'[\r\n]+', ' ', result_text)      # drop line breaks
        tag_remove_pattern = re.compile(r'<[^>]+>')
        result_text = tag_remove_pattern.sub('', result_text)   # drop any leftover tags
        result_text = re.sub(r'\\', '', result_text)            # drop stray backslashes
        return result_text

    seleniumDriver = r"./chromedriver.exe"
    chrome_options = Options()
    brow = webdriver.Chrome(executable_path=seleniumDriver, chrome_options=chrome_options)
    links = ["https://trends.google.com/trends/trendingsearches/realtime?geo=DE&category=s"]
    title_temp = []
    for link in links:
        try:
            brow.get(link)
            try:
                elements = brow.find_elements_by_class_name('details-top')
                for element in elements:
                    # parse the card's inner HTML instead of relying on element.text
                    html_text = parse_html(element.get_attribute("innerHTML"))
                    title_temp.append(html_text.replace('share', '').strip())
            except Exception as error:
                print(error)
            time.sleep(1)
            brow.quit()
        except Exception as error:
            print(error)
            break
    Final_df = pd.DataFrame(
        {'Title': title_temp
        })

    print(Final_df)
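
To see what the cleaning step does on its own, here is a small, hypothetical usage of parse_html on a hand-written HTML fragment (the fragment is made up for illustration, not copied from the Trends page):

    # Hypothetical fragment, roughly shaped like a card header, for illustration only.
    sample_html = '<div><a>Manchester United F.C.</a> • <a>Tottenham Hotspur F.C.</a> share</div>'
    cleaned = parse_html(sample_html).replace('share', '').strip()
    print(cleaned)  # Manchester United F.C. • Tottenham Hotspur F.C.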

As far as I can tell, the data is retrieved from an API endpoint that you can call directly. I show how to call it and then extract only the titles (note that the API call returns more than just the titles). You can explore the breadth of what comes back (including article snippets, URLs, image links, etc.) here.

    import requests
    import json

    r = requests.get('https://trends.google.com/trends/api/realtimetrends?hl=en-GB&tz=-60&cat=s&fi=0&fs=0&geo=DE&ri=300&rs=20&sort=0')
    # the response is prefixed with ")]}'" plus a newline, so skip the first 5 characters before parsing
    data = json.loads(r.text[5:])
    titles = [story['title'] for story in data['storySummaries']['trendingStories']]
    print(titles)
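
If you want more than the titles, each item in trendingStories appears to carry additional fields. The field names below ('entityNames', 'articles', 'articleTitle', 'url') are assumptions from inspecting the response rather than a documented schema, so the sketch reads them defensively with .get():

    # Sketch (assumed field names, read defensively): print a few related
    # entities and article links for the first stories.
    for story in data['storySummaries']['trendingStories'][:3]:
        print(story.get('title'))
        print('  entities:', story.get('entityNames', []))
        for article in story.get('articles', [])[:2]:
            print('  article:', article.get('articleTitle'), '->', article.get('url'))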