如何抓取 Google 热门趋势
How to scrape Google hot trends
我正在尝试抓取 Google 热门趋势。我尝试运行 Chrome 开发者工具来捕获所有请求,但似乎没有请求进出。所以我尝试使用 selenium,但由于多种原因我无法获取数据,数据是可变的并且不断变化。这是我试过的代码:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Headless Chrome configuration.  NOTE(review): the `options` module name is
# shadowed by the Options instance on the next line, as in the original.
options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Poll `target` in a headless browser forever, printing tile children.

    NOTE(review): `find("tile")` searches for a literal <tile> tag, not an
    element with class "tile" — presumably why this version finds nothing.
    """
    browser = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        browser.get(target)
        page = BeautifulSoup(browser.page_source, "html.parser")
        if page.find("tile"):
            # Iterating a Tag yields its direct children.
            children = list(page.find("div", class_="tile"))
            if children:
                print(children)


HeadlessBrowserHttpRequest(url)
您的代码看起来是正确的。
我在这里看到你唯一遗漏的一点是:你必须从你在这里获得的网络元素中提取文本。
另外,我更喜欢一个一个地打印文本,而不是一次打印所有数组。
如下:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Headless Chrome configuration.  NOTE(review): the `options` module name is
# shadowed by the Options instance on the next line, as in the original.
options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Poll `target` in a headless browser forever, printing each child's text.

    Same lookup as the question's version (a literal <tile> tag gate), but
    prints the `.text` of every child one at a time instead of the raw list.
    """
    browser = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        browser.get(target)
        page = BeautifulSoup(browser.page_source, "html.parser")
        if page.find("tile"):
            # Iterating a Tag yields its direct children.
            children = list(page.find("div", class_="tile"))
            if children:
                for child in children:
                    print(child.text)


HeadlessBrowserHttpRequest(url)
我设法用下面的代码解决了这个问题:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# NOTE(review): the `options` module name is shadowed by the Options
# instance below; no headless flags here — the browser window is visible,
# presumably required for this page to render its animation.
options = options.Options()

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Load `target` once, then poll the rendered DOM forever.

    Prints the text of each child of the current "card" element whenever
    the page has finished its typing animation (class "done-typing").
    """
    browser = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    # Single navigation; the page mutates itself client-side afterwards.
    browser.get(target)
    while True:
        page = BeautifulSoup(browser.page_source, "html.parser")
        if page.find("div", {"class": "card current done-typing"}):
            # Iterating a Tag yields its direct children.
            children = list(page.find("div", class_="card current done-typing"))
            if children:
                for child in children:
                    print(child.text)


HeadlessBrowserHttpRequest(url)
我正在尝试抓取 Google 热门趋势。我尝试运行 Chrome 开发者工具来捕获所有请求,但似乎没有请求进出。所以我尝试使用 selenium,但由于多种原因我无法获取数据,数据是可变的并且不断变化。这是我试过的代码:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Headless Chrome configuration.  NOTE(review): the `options` module name is
# shadowed by the Options instance on the next line, as in the original.
options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Poll `target` in a headless browser forever, printing tile children.

    NOTE(review): `find("tile")` searches for a literal <tile> tag, not an
    element with class "tile" — presumably why this version finds nothing.
    """
    browser = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        browser.get(target)
        page = BeautifulSoup(browser.page_source, "html.parser")
        if page.find("tile"):
            # Iterating a Tag yields its direct children.
            children = list(page.find("div", class_="tile"))
            if children:
                print(children)


HeadlessBrowserHttpRequest(url)
您的代码看起来是正确的。
我在这里看到你唯一遗漏的一点是:你必须从你在这里获得的网络元素中提取文本。
另外,我更喜欢一个一个地打印文本,而不是一次打印所有数组。
如下:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# Headless Chrome configuration.  NOTE(review): the `options` module name is
# shadowed by the Options instance on the next line, as in the original.
options = options.Options()
options.headless = True
options.add_argument("--headless")

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Poll `target` in a headless browser forever, printing each child's text.

    Same lookup as the question's version (a literal <tile> tag gate), but
    prints the `.text` of every child one at a time instead of the raw list.
    """
    browser = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    while True:
        browser.get(target)
        page = BeautifulSoup(browser.page_source, "html.parser")
        if page.find("tile"):
            # Iterating a Tag yields its direct children.
            children = list(page.find("div", class_="tile"))
            if children:
                for child in children:
                    print(child.text)


HeadlessBrowserHttpRequest(url)
我设法用下面的代码解决了这个问题:
from selenium import webdriver
from selenium.webdriver.chrome import options
import os
from bs4 import BeautifulSoup

# NOTE(review): the `options` module name is shadowed by the Options
# instance below; no headless flags here — the browser window is visible,
# presumably required for this page to render its animation.
options = options.Options()

url = "https://trends.google.com/trends/hottrends/visualize?nrow=5&ncol=5&pn=p36"


def HeadlessBrowserHttpRequest(target: str) -> str:
    """Load `target` once, then poll the rendered DOM forever.

    Prints the text of each child of the current "card" element whenever
    the page has finished its typing animation (class "done-typing").
    """
    browser = webdriver.Chrome(
        options=options, executable_path=os.path.abspath("chromedriver")
    )
    # Single navigation; the page mutates itself client-side afterwards.
    browser.get(target)
    while True:
        page = BeautifulSoup(browser.page_source, "html.parser")
        if page.find("div", {"class": "card current done-typing"}):
            # Iterating a Tag yields its direct children.
            children = list(page.find("div", class_="card current done-typing"))
            if children:
                for child in children:
                    print(child.text)


HeadlessBrowserHttpRequest(url)