How to scrape Google Hot Trends

I am trying to scrape Google Hot Trends. I tried running Chrome DevTools to capture the requests the page makes, but no requests seem to go in or out. So I tried Selenium instead, but I could not get the data for several reasons: the data is dynamic and keeps changing. This is the code I tried:

from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                print(titles)


HeadlessBrowserHttpRequest(url)

Your code looks correct. The only thing I see you are missing here is that you have to extract the text from the web elements you get. Also, I would rather print the texts one by one instead of printing the whole array at once. Like below:

from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
options.headless = True
options.add_argument("--headless")
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        driver.get(target)
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("tile"):
            titles = [title for title in soup.find("div", class_="tile")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)


HeadlessBrowserHttpRequest(url)
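
As a small illustration of what .text gives you when you iterate over a BeautifulSoup node (the markup below is made up for the example, not taken from the live page):

from bs4 import BeautifulSoup

snippet = '<div class="tile"><span>example trend</span><span>200K+ searches</span></div>'
node = BeautifulSoup(snippet, "html.parser").find("div", class_="tile")

# Iterating over the div yields its child elements; .text strips the tags
for child in node:
    print(child.text)
# example trend
# 200K+ searches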

I managed to solve the problem with the code below:

from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

options = options.Options()
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    driver.get(target)
    # Keep re-parsing the rendered page, since the cards change over time
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        if soup.find("div", {"class": "card current done-typing"}):
            # Iterate over the children of the current card and print their text
            titles = [title for title in soup.find("div", class_="card current done-typing")]
            if len(titles) > 0:
                for title in titles:
                    print(title.text)


HeadlessBrowserHttpRequest(url)
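
Since the loop re-reads the same card on every pass, the same titles will be printed over and over. If that becomes a problem, a variant along these lines might help; this is only a sketch, and the explicit wait, the seen set, the one-second pause, and the stripped_strings iteration are my additions, not part of the answer above:

import os
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome import options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

options = options.Options()
url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def scrape_unique_titles(target: str) -> None:
    driver = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    driver.get(target)
    # Wait until at least one fully rendered card is present before polling
    WebDriverWait(driver, 30).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "div.card.current.done-typing")
        )
    )
    seen = set()
    while True:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        card = soup.find("div", {"class": "card current done-typing"})
        if card:
            # stripped_strings yields each piece of visible text inside the card
            for title in card.stripped_strings:
                if title not in seen:
                    seen.add(title)
                    print(title)
        time.sleep(1)  # poll the rendered DOM once per second


scrape_unique_titles(url)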