Python 从 tripadvisor 抓取 'things to do'

Python scraping 'things to do' from tripadvisor

我想从这个页面抓取列表 'Types of Things to Do in Miami'(您可以在页面末尾附近找到它)。这是我目前的代码:

import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent header with the request (added by the
# original author "to prevent errors").
user_agent = "Mozilla/44.0.2 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/9.0.2"

headers = {'User-Agent': user_agent}

new_url = "https://www.tripadvisor.com/Attractions-g34438-Activities-Miami_Florida.html"
# Get response from url
response = requests.get(new_url, headers=headers)
# Use the text requests has already decoded. Re-encoding it to bytes would
# only make BeautifulSoup detect the encoding a second time.
html = response.text
# Soupify response
soup = BeautifulSoup(html, "lxml")

# The search runs over the whole document, so every <a> carrying this class
# is returned regardless of which section of the page it sits in.
tag_elements = soup.find_all("a", {"class": "attractions-attraction-overview-main-Pill__pill--23S2Q"})

# Extract the text of each matched link. get_text() is used rather than
# .string because .string is None for a tag with more than one child.
tags_list = [tag.get_text() for tag in tag_elements]

问题是,我得到的是页面 'Commonly Searched For in Miami' 区域中的值,例如 'Good for Couples (201)'、'Good for Big Groups (130)'、'Good for Kids (100)',该区域位于页面的 "Types of Things..." 部分下方。同时,我也没有得到一些我需要的值,比如 "Traveler Resources (7)"、"Day Trips (7)" 等。"Things to do..." 和 "Commonly searched..." 这两个列表的 class 名称相同,而我在 soup.findAll() 中正是按 class 查找的,我猜这可能就是问题的原因。正确的做法是什么?我应该采取其他方法吗?

我认为您需要先单击"显示更多"才能看到所有可用的选项,所以需要使用 selenium 之类的工具。下面的代码包含显式等待,以确保所有元素都已出现并且下拉菜单可以点击。

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# Open the page in Chrome, expand the filter list, and print the text of
# every filter link.
d = webdriver.Chrome()
try:
    d.get("https://www.tripadvisor.com/Attractions-g34438-Activities-Miami_Florida.html")
    # Wait (up to 5 s) until a filter link is visible before touching the page.
    WebDriverWait(d, 5).until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".filter_list_0 div a")))
    # Click the caret-down icon once it is clickable.
    WebDriverWait(d, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#taplc_attraction_filters_clarity_0 span.ui_icon.caret-down"))).click()
    # Re-query after the click and wait until all matching links are visible.
    tag_elements = WebDriverWait(d, 5).until(EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".filter_list_0 div a")))
    tags_list = [i.text for i in tag_elements]
    print(tags_list)
finally:
    # Close the browser even when one of the waits above times out;
    # otherwise the Chrome process would be left running.
    d.quit()


如果不使用 selenium,我只能得到 15 个项目:

import requests
from bs4 import BeautifulSoup

# Plain requests + BeautifulSoup variant: fetch the page once with a
# browser-like User-Agent and read the links out of the static HTML.
new_url = "https://www.tripadvisor.com/Attractions-g34438-Activities-Miami_Florida.html"
user_agent = "Mozilla/44.0.2 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/9.0.2"
headers = {'User-Agent': user_agent}

response = requests.get(new_url, headers=headers)
soup = BeautifulSoup(response.content, "lxml")

# Positional selector: the 12th div under #component_3, then its first
# child div, then every <a> beneath it.
tag_elements = soup.select(
    '#component_3 > div > div > div:nth-of-type(12) > div:nth-of-type(1) > div > div a'
)

# Keep only the text of each matched link.
tags_list = [anchor.get_text() for anchor in tag_elements]
print(tags_list)

看来您需要使用 selenium。问题是下拉菜单在您单击之前不会显示剩余的选项。

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Open the page in Chrome, click the element that expands the list, then
# parse the resulting HTML with BeautifulSoup and print each link's text.
options = Options()
driver = webdriver.Chrome(options=options)
try:
    driver.get('https://www.tripadvisor.com/Attractions-g34438-Activities-Miami_Florida.html')

    # Positional XPath of the <span> to click; it is tied to the page layout.
    toggle_xpath = '//*[@id="component_3"]/div/div/div[12]/div[1]/div/div/div/div[1]/span'

    # until() returns the located element, so it is reused below instead of
    # being looked up again. The find_element_by_xpath helper is avoided
    # because Selenium 4 removed it.
    toggle = WebDriverWait(driver, 20).until(EC.presence_of_element_located((By.XPATH, toggle_xpath)))

    # Scroll to the element and click it through JavaScript.
    driver.execute_script("arguments[0].scrollIntoView();", toggle)
    driver.execute_script("arguments[0].click();", toggle)

    html = driver.page_source
finally:
    # Close the browser even if the wait or a script call fails.
    driver.quit()

soup = BeautifulSoup(html, 'lxml')

items = soup.find_all('a', {'class': 'attractions-attraction-overview-main-Pill__pill--23S2Q'})
# Each item is a full <a> tag, so item['href'] is available as well as the text.

for item in items:
    print(item.get_text())

这在浏览器中非常简单:

# JavaScript run inside the browser: collect the innerText of every <a>
# matched by '.filterName a' and return the values as a list.
collect_filter_names_js = "return [...document.querySelectorAll('.filterName a')].map(a => a.innerText)"
filters = driver.execute_script(collect_filter_names_js)

只获取 Types of Things to Do in Miami 标题下的内容有点棘手。为此,您需要像我在下面所做的那样,有条理地定义选择器。以下脚本会单击上述标题下的 See all 按钮;点击之后,脚本再解析您要查找的相关内容:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import ui
from bs4 import BeautifulSoup

# Open the page in Chrome, click the caret that expands the list, then parse
# the page source and print the text of the matching links.
driver = webdriver.Chrome()
wait = ui.WebDriverWait(driver, 10)
try:
    driver.get("https://www.tripadvisor.com/Attractions-g34438-Activities-Miami_Florida.html")

    # Poll (up to 10 s) until the caret exists. find_element(By.CSS_SELECTOR, ...)
    # is used because Selenium 4 removed find_element_by_css_selector.
    show_more = wait.until(
        lambda drv: drv.find_element(By.CSS_SELECTOR, "[class='ui_container'] div:nth-of-type(1) .caret-down")
    )
    # Click through JavaScript rather than a native click.
    driver.execute_script("arguments[0].click();", show_more)

    soup = BeautifulSoup(driver.page_source, "lxml")
    # Same container scope as the caret selector; keep only links whose href
    # starts with '/Attractions-'.
    items = [item.text for item in soup.select("[class='ui_container'] div:nth-of-type(1) a[href^='/Attractions-']")]
    print(items)
finally:
    # Close the browser even if the wait times out or parsing fails.
    driver.quit()

它产生的输出:

['Tours (277)', 'Outdoor Activities (255)', 'Boat Tours & Water Sports (184)', 'Shopping (126)', 'Nightlife (126)', 'Spas & Wellness (109)', 'Fun & Games (67)', 'Transportation (66)', 'Museums (61)', 'Sights & Landmarks (54)', 'Nature & Parks (54)', 'Food & Drink (27)', 'Concerts & Shows (25)', 'Classes & Workshops (22)', 'Zoos & Aquariums (7)', 'Traveler Resources (7)', 'Day Trips (7)', 'Water & Amusement Parks (5)', 'Casinos & Gambling (3)', 'Events (2)']