在 TripAdvisor 用户个人资料中抓取评论时点击 "Show more" 按钮
click on "Show more" button when scraping reviews in TripAdvisor user profiles
我正在尝试抓取 TripAdvisor 用户个人资料中发布的评论,用于我毕业设计(最后一学年)项目中的评论分析。为此,我使用了下面的 Python 代码和 Beautiful Soup 库。我的问题是:如何点击“显示更多”(Show more)按钮,以便抓取个人资料中的所有评论?下图显示了该按钮以及我需要处理的源代码:https://i.stack.imgur.com/OWUPt.png
这是我用来在 TripAdvisor 用户个人资料中抓取数据的代码
import requests
import csv
import re
from bs4 import BeautifulSoup
# Upper limit on the number of reviews saved per profile.
maxcount = 50
# Path of the CSV file currently being written; set for each profile by
# getallReviewsBymainUrl and read by writecsv.
filename = ""
def writecsv(c1,c2,c3,c4,c5):
    """Append one five-column row to the CSV file named by the global ``filename``.

    Every value is converted with ``str()`` before it is written.

    The file is opened in append mode with ``newline=''`` (as the ``csv``
    module requires) and explicitly as UTF-8, so that review text containing
    non-ASCII characters does not fail on platforms whose default encoding
    cannot represent it.
    """
    with open(filename, mode='a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([str(c1), str(c2), str(c3), str(c4), str(c5)])
def onereview(review):
    """Extract the fields of one review element and append them as a CSV row.

    ``review`` is the parsed element of a single review. Each field is looked
    up by its CSS class; a field whose element is missing is written as an
    empty string. Any failure (for example ``review`` being ``None``) is
    reported and swallowed so that one bad review does not stop the run.
    """
    def text_of(css_class):
        # Look the element up once instead of twice per field.
        node = review.find(class_=css_class)
        return node.get_text() if node is not None else ""

    try:
        name = text_of("_2fxQ4TOx")
        reviewTitle = text_of("_3IEJ3tAK _2K4zZcBv")
        reviewDetails = text_of("_133ThCYf")
        reviewDate = text_of("_3Coh9OJA")
        reviewFor = text_of("_2ys8zX0p ui_link")
        writecsv(name, reviewTitle, reviewDetails, reviewDate, reviewFor)
    except Exception as exc:
        # Exception rather than a bare except, so Ctrl-C and SystemExit still
        # propagate; the cause is printed instead of being hidden.
        print('error', exc)
def allreviews(URL,endcount):
    """Fetch the page at ``URL`` and save up to ``endcount`` reviews from it.

    Starting from the first ``div`` found inside the first ``div`` of
    ``div#content``, each following sibling ``div`` is passed to
    ``onereview``. The walk stops early, printing ``review not found``, as
    soon as there is no further sibling ``div``.

    An ``AttributeError`` is raised if the page has no ``<body>`` or no
    ``div#content`` container.
    """
    # A timeout keeps an unresponsive server from hanging the script forever.
    page = requests.get(URL, timeout=30)
    html = BeautifulSoup(page.content, 'html.parser')
    body = html.find('body')
    contents = body.find("div", {"id": "content"}).div
    review = contents.div
    for _ in range(endcount):
        # Advance once per step; test for the end explicitly instead of
        # relying on an exception raised from a None element.
        if review is not None:
            review = review.find_next_sibling('div')
        if review is None:
            print('review not found')
            break
        onereview(review)
def getallReviewsBymainUrl(URL):
    """Scrape the profile page at ``URL`` into ``<username>.csv``.

    Reads the review count and the user name from the page, sets the global
    ``filename`` to the user name (spaces replaced by underscores) plus
    ``.csv``, writes the header row, and then calls ``allreviews`` for at
    most ``maxcount`` reviews.

    An ``AttributeError`` is raised when an expected element is missing from
    the page, and a ``ValueError`` when the review count is not a number.
    """
    global filename
    # A timeout keeps an unresponsive server from hanging the script forever.
    page = requests.get(URL, timeout=30)
    html = BeautifulSoup(page.content, 'html.parser')
    body = html.find('body')
    count = body.find(class_="iX3IT_XP").get_text().replace(',','')
    username = body.find(class_="_2wpJPTNc _345JQp5A").get_text().replace(' ','_')
    filename = username+".csv"
    print('start to fill '+filename)
    # newline='' is what the csv module expects (and what writecsv uses);
    # without it the header is followed by a blank line on Windows.
    with open(filename, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['user name', 'reviewTitle', 'reviewDetails', 'reviewDate', 'reviewFor'])
    # Never ask for more reviews than the profile reports or the cap allows.
    endcount = min(int(maxcount), int(count))
    allreviews(URL,endcount)
    print('save reviews in page = ',str(endcount),' user = ',filename)
    print()
# Profile pages to scrape, processed one after another.
URLs = [
    'https://www.tripadvisor.com/Profile/KatieTuesday?fid=ba3cc2e7-5d8d-404e-88bd-f7f30',
    'https://www.tripadvisor.com/Profile/elmila?fid=6497e4e4-1314-487b-a883-a6b519bc7efb',
]
for url in URLs:
    try:
        getallReviewsBymainUrl(url)
    except Exception as exc:
        # Continue with the next profile, but let Ctrl-C / SystemExit through
        # (a bare except would swallow them) and show why this one failed.
        print('There is a mistake, check again '+url)
        print(repr(exc))
        print()
print('program is end, Thank you.')
我尝试用下面这几行代码来点击该按钮,但没有成功。我也无法在页面源代码中找到“显示更多”按钮对应的 URL。这是我用来抓取的 TripAdvisor 用户个人资料 URL。
# NOTE(review): according to the surrounding text this attempt does not work.
# `body` comes from parsing the downloaded HTML, so the result of find() is
# presumably a parsed tag rather than a live browser element -- confirm that
# it offers no usable click().
button = body.find("button", {"class": "_1JOGv2rJ _2oWqCEVy _3yBiBka1 _3fiJJkxX"})
button.click()
如果有人能帮助我解决这个问题,我将不胜感激。
您可以使用 Selenium 来点击这样的按钮:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Open a Chrome browser. A ChromeDriver must be installed for this, e.g. saved
# in the directory that holds this program. Download:
# https://chromedriver.chromium.org/downloads
driver = webdriver.Chrome()
# Load the profile page in the browser.
driver.get('https://www.tripadvisor.com/Profile/wwkalex-fernando?tab=reviews')
# Find the button and click it. Replace the placeholder with the button's real
# class name. By.CLASS_NAME accepts a single class only; for a button that
# carries several classes use By.CSS_SELECTOR or By.XPATH instead.
# find_element(By..., ...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
driver.find_element(By.CLASS_NAME, "button class name").click()
希望对您有所帮助!
我正在尝试抓取 TripAdvisor 用户个人资料中发布的评论,用于我毕业设计(最后一学年)项目中的评论分析。为此,我使用了下面的 Python 代码和 Beautiful Soup 库。我的问题是:如何点击“显示更多”(Show more)按钮,以便抓取个人资料中的所有评论?下图显示了该按钮以及我需要处理的源代码:https://i.stack.imgur.com/OWUPt.png
这是我用来在 TripAdvisor 用户个人资料中抓取数据的代码
import requests
import csv
import re
from bs4 import BeautifulSoup
# Upper limit on the number of reviews saved per profile.
maxcount = 50
# Path of the CSV file currently being written; set for each profile by
# getallReviewsBymainUrl and read by writecsv.
filename = ""
def writecsv(c1,c2,c3,c4,c5):
    """Append one five-column row to the CSV file named by the global ``filename``.

    Every value is converted with ``str()`` before it is written.

    The file is opened in append mode with ``newline=''`` (as the ``csv``
    module requires) and explicitly as UTF-8, so that review text containing
    non-ASCII characters does not fail on platforms whose default encoding
    cannot represent it.
    """
    with open(filename, mode='a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow([str(c1), str(c2), str(c3), str(c4), str(c5)])
def onereview(review):
    """Extract the fields of one review element and append them as a CSV row.

    ``review`` is the parsed element of a single review. Each field is looked
    up by its CSS class; a field whose element is missing is written as an
    empty string. Any failure (for example ``review`` being ``None``) is
    reported and swallowed so that one bad review does not stop the run.
    """
    def text_of(css_class):
        # Look the element up once instead of twice per field.
        node = review.find(class_=css_class)
        return node.get_text() if node is not None else ""

    try:
        name = text_of("_2fxQ4TOx")
        reviewTitle = text_of("_3IEJ3tAK _2K4zZcBv")
        reviewDetails = text_of("_133ThCYf")
        reviewDate = text_of("_3Coh9OJA")
        reviewFor = text_of("_2ys8zX0p ui_link")
        writecsv(name, reviewTitle, reviewDetails, reviewDate, reviewFor)
    except Exception as exc:
        # Exception rather than a bare except, so Ctrl-C and SystemExit still
        # propagate; the cause is printed instead of being hidden.
        print('error', exc)
def allreviews(URL,endcount):
    """Fetch the page at ``URL`` and save up to ``endcount`` reviews from it.

    Starting from the first ``div`` found inside the first ``div`` of
    ``div#content``, each following sibling ``div`` is passed to
    ``onereview``. The walk stops early, printing ``review not found``, as
    soon as there is no further sibling ``div``.

    An ``AttributeError`` is raised if the page has no ``<body>`` or no
    ``div#content`` container.
    """
    # A timeout keeps an unresponsive server from hanging the script forever.
    page = requests.get(URL, timeout=30)
    html = BeautifulSoup(page.content, 'html.parser')
    body = html.find('body')
    contents = body.find("div", {"id": "content"}).div
    review = contents.div
    for _ in range(endcount):
        # Advance once per step; test for the end explicitly instead of
        # relying on an exception raised from a None element.
        if review is not None:
            review = review.find_next_sibling('div')
        if review is None:
            print('review not found')
            break
        onereview(review)
def getallReviewsBymainUrl(URL):
    """Scrape the profile page at ``URL`` into ``<username>.csv``.

    Reads the review count and the user name from the page, sets the global
    ``filename`` to the user name (spaces replaced by underscores) plus
    ``.csv``, writes the header row, and then calls ``allreviews`` for at
    most ``maxcount`` reviews.

    An ``AttributeError`` is raised when an expected element is missing from
    the page, and a ``ValueError`` when the review count is not a number.
    """
    global filename
    # A timeout keeps an unresponsive server from hanging the script forever.
    page = requests.get(URL, timeout=30)
    html = BeautifulSoup(page.content, 'html.parser')
    body = html.find('body')
    count = body.find(class_="iX3IT_XP").get_text().replace(',','')
    username = body.find(class_="_2wpJPTNc _345JQp5A").get_text().replace(' ','_')
    filename = username+".csv"
    print('start to fill '+filename)
    # newline='' is what the csv module expects (and what writecsv uses);
    # without it the header is followed by a blank line on Windows.
    with open(filename, mode='w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['user name', 'reviewTitle', 'reviewDetails', 'reviewDate', 'reviewFor'])
    # Never ask for more reviews than the profile reports or the cap allows.
    endcount = min(int(maxcount), int(count))
    allreviews(URL,endcount)
    print('save reviews in page = ',str(endcount),' user = ',filename)
    print()
# Profile pages to scrape, processed one after another.
URLs = [
    'https://www.tripadvisor.com/Profile/KatieTuesday?fid=ba3cc2e7-5d8d-404e-88bd-f7f30',
    'https://www.tripadvisor.com/Profile/elmila?fid=6497e4e4-1314-487b-a883-a6b519bc7efb',
]
for url in URLs:
    try:
        getallReviewsBymainUrl(url)
    except Exception as exc:
        # Continue with the next profile, but let Ctrl-C / SystemExit through
        # (a bare except would swallow them) and show why this one failed.
        print('There is a mistake, check again '+url)
        print(repr(exc))
        print()
print('program is end, Thank you.')
我尝试用下面这几行代码来点击该按钮,但没有成功。我也无法在页面源代码中找到“显示更多”按钮对应的 URL。这是我用来抓取的 TripAdvisor 用户个人资料 URL。
# NOTE(review): according to the surrounding text this attempt does not work.
# `body` comes from parsing the downloaded HTML, so the result of find() is
# presumably a parsed tag rather than a live browser element -- confirm that
# it offers no usable click().
button = body.find("button", {"class": "_1JOGv2rJ _2oWqCEVy _3yBiBka1 _3fiJJkxX"})
button.click()
如果有人能帮助我解决这个问题,我将不胜感激。
您可以使用 Selenium 来点击这样的按钮:
from selenium import webdriver
from selenium.webdriver.common.by import By

# Open a Chrome browser. A ChromeDriver must be installed for this, e.g. saved
# in the directory that holds this program. Download:
# https://chromedriver.chromium.org/downloads
driver = webdriver.Chrome()
# Load the profile page in the browser.
driver.get('https://www.tripadvisor.com/Profile/wwkalex-fernando?tab=reviews')
# Find the button and click it. Replace the placeholder with the button's real
# class name. By.CLASS_NAME accepts a single class only; for a button that
# carries several classes use By.CSS_SELECTOR or By.XPATH instead.
# find_element(By..., ...) is used because the find_element_by_* helpers were
# removed in Selenium 4.3.
driver.find_element(By.CLASS_NAME, "button class name").click()
希望对您有所帮助!