如何在 rvest 的 html_session 中使用 post
How to use post within rvest's html_session
As @chinsoon12 mentioned in the comments, it's against TripAdvisor's terms of use to scrape this information. But I would just like to know how to use the POST method in rvest with this example. I've searched on Google and Stack Overflow, but the answers are not very helpful. Any general suggestions are also appreciated!
因此,我需要单击 "more" 按钮才能查看完整评论。否则它只会给出部分评论。
我已经成功地使用 Rselenium 模拟点击并获得完整评论,但我想知道如何使用 rvest 和 httr 来实现。
观察网络流量后,我发现点击"more"按钮后,我发送了两个POST请求如下:
我试过下面这段代码,但是正文是空的。
library(rvest)
library(httr)

# Review page that is opened first to establish the browsing session.
url <- "https://www.tripadvisor.com/Hotel_Review-g33657-d85704-Reviews-Hotel_Bristol-Steamboat_Springs_Colorado.html"
# AJAX endpoint that receives the POST request.
post_to_url <- "https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer="
# Already URL-encoded form payload; it is sent verbatim (encode = "raw").
post_body <- "reviews=556957481%2C511497076%2C556144452%2C554686822%2C548218482&contextChoice=DETAIL_HR&haveJses=earlyRequireDefine%2Camdearly%2Cglobal_error%2Clong_lived_global%2Capg-Hotel_Review%2Capg-Hotel_Review-in%2Cbootstrap%2Cdesktop-rooms-guests-dust-en_US%2Cresponsive-calendar-templates-dust-en_US%2Ctaevents&haveCsses=apg-Hotel_Review-in&Action=install"

# Pool of user-agent strings to choose from.
user_agent_table <- read.csv(
  "https://raw.githubusercontent.com/yusuzech/top-50-user-agents/master/user_agent.csv",
  stringsAsFactors = FALSE
)
user_agent_list <- user_agent_table$User.agent

# Return an httr user-agent config built from one randomly sampled string.
random_agent <- function() {
  user_agent(sample(user_agent_list, 1))
}

mysession <- html_session(url, random_agent())

# Send the POST through the session instead of a fresh connection:
# - a plain list() passed as `config` is not an httr config, so the Referer
#   has to be supplied with add_headers() to actually reach the request;
# - `handle = mysession$handle` reuses the session's handle, so the request
#   shares the cookies and connection of the initial page load.
# NOTE(review): the endpoint may require further headers (for example
# X-Requested-With) -- compare against the browser's network trace.
result <- POST(
  url = post_to_url,
  config = add_headers(Referer = mysession$url),
  user_agent(mysession$config$options$useragent),
  body = post_body,
  encode = "raw",
  handle = mysession$handle
)
result
> result
Response [https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=]
Date: 2018-05-10 01:49
Status: 200
Content-Type: text/html;charset=UTF-8
<EMPTY BODY>
我知道我需要使用POST方法,但我不知道如何设置正文和其他配置。我也不确定我是否必须同时发送两个 post 请求以及如何在 httr 和 rvest 中实现这一点。
感谢任何帮助!
我像你一样尝试了 rvest:::request_POST,但是失败了。
它收到 'Method Not Allowed (HTTP 405)' 错误消息。但事实证明我们不必那样使用 rvest:::request_POST:
完整的评论已经在页面源代码中,只是默认不显示。以下是用于抓取关于这家酒店的所有评论的代码:
library(rvest)
library(stringr)

# Page URLs ----
# Every page after the first carries an "-or<offset>" segment whose offset
# advances in steps of 5. The page count is fixed here; adjust `n_pages` if
# the hotel has a different number of review pages.
n_pages <- 42L
review_offsets <- 5L * (seq_len(n_pages) - 1L)
pages_url <- paste0(
  "https://www.tripadvisor.com/Hotel_Review-g33657-d85704-Reviews",
  ifelse(review_offsets == 0L, "", paste0("-or", review_offsets)),
  "-The_Bristol_by_Magnuson_Worldwide-Steamboat_Springs_Colorado.html#REVIEWS"
)

# Zero-row template that fixes the column names and order of the result.
empty_reviews <- data.frame(
  reviewers = character(),
  review_dates = character(),
  stars = integer(),
  contributions = integer(),
  helpful_votes = integer(),
  review_titles = character(),
  reviews = character(),
  stringsAsFactors = FALSE
)

# Helpers ----

# Read one reviewer statistic from a single review card.
#
# review_node: a length-one node set holding the review card.
# label:       text identifying the statistic ("contribution", "helpful vote").
#
# Returns 0 when the card does not show the statistic, NA when the statistic
# is shown but its value cannot be located, otherwise the numeric value.
member_stat <- function(review_node, label) {
  item_xpath <- paste0(
    "./descendant::span[contains(@class, 'social-member-MemberHeaderStats__stat_item') and contains(., '",
    label,
    "')]"
  )
  if (length(html_nodes(review_node, xpath = paste0(item_xpath, "/span"))) == 0) {
    return(0)
  }
  value_text <- review_node %>%
    html_nodes(xpath = paste0(
      item_xpath,
      "/descendant::span[contains(@class, 'social-member-MemberHeaderStats__bold')]"
    )) %>%
    html_text()
  if (length(value_text) == 0) {
    return(NA_real_)
  }
  # Remove thousands separators ("1,234") so the conversion does not yield NA.
  as.numeric(gsub(",", "", value_text[1], fixed = TRUE))
}

# Download one review page and return its reviews as a data frame with the
# same columns as `empty_reviews` (zero rows if the page has no review cards).
scrape_review_page <- function(page_url) {
  page <- read_html(page_url)
  review_nodes <- page %>%
    html_nodes(xpath = "//div[@data-test-target='reviews-tab']/div[@data-test-target='HR_CC_CARD']")
  # seq_along() is empty for a page without cards, unlike 1:length().
  card_index <- seq_along(review_nodes)

  member_xpath <- "./descendant::a[contains(@class, 'social-member-event-MemberEventOnObjectBlock__member')]"
  reviewers <- review_nodes %>%
    html_nodes(xpath = member_xpath) %>%
    html_text(trim = TRUE)
  review_dates <- review_nodes %>%
    html_nodes(xpath = paste0(member_xpath, "/following-sibling::text()[1]")) %>%
    html_text(trim = TRUE)
  # Keep the text from character 16 onwards, i.e. drop the fixed-width lead-in
  # that precedes the date.
  review_dates <- str_sub(review_dates, start = 16)

  # The rating is encoded in a class token "bubble_10" ... "bubble_50"; the
  # eighth character of that token is the star count. Cards without a
  # recognisable token get NA, so `stars` always has one entry per card.
  star_classes <- review_nodes %>%
    html_nodes(xpath = "./descendant::div[@data-test-target='review-rating']/span[contains(@class, 'ui_bubble_rating')]") %>%
    html_attr("class")
  bubble_token <- str_extract(star_classes[card_index], "bubble_[1-5]0")
  stars <- as.integer(str_sub(bubble_token, 8, 8))

  contributions <- vapply(
    card_index,
    function(j) member_stat(review_nodes[j], "contribution"),
    numeric(1)
  )
  helpful_votes <- vapply(
    card_index,
    function(j) member_stat(review_nodes[j], "helpful vote"),
    numeric(1)
  )

  review_titles <- review_nodes %>%
    html_nodes(xpath = "./descendant::div[@data-test-target='review-title']") %>%
    html_text(trim = TRUE)
  reviews <- review_nodes %>%
    html_nodes(xpath = "./descendant::q[contains(@class, 'location-review-review-list-parts-ExpandableReview__reviewText')]") %>%
    html_text(trim = TRUE)

  data.frame(
    reviewers, review_dates, stars, contributions, helpful_votes,
    review_titles, reviews,
    stringsAsFactors = FALSE
  )
}

# Scrape ----
# Collect one data frame per page and bind them once at the end instead of
# growing `reviews_df` inside the loop. A page that fails is reported with a
# warning and skipped so the remaining pages are still collected.
page_frames <- lapply(pages_url, function(page_url) {
  tryCatch(
    scrape_review_page(page_url),
    error = function(e) {
      warning("Skipping ", page_url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})
reviews_df <- do.call(
  rbind,
  c(list(empty_reviews), Filter(Negate(is.null), page_frames))
)
As @chinsoon12 mentioned in the comments, it's against TripAdvisor's terms of use to scrape this information. But I would just like to know how to use the POST method in rvest with this example. I've searched on Google and Stack Overflow, but the answers are not very helpful. Any general suggestions are also appreciated!
因此,我需要单击 "more" 按钮才能查看完整评论。否则它只会给出部分评论。
我已经成功地使用 Rselenium 模拟点击并获得完整评论,但我想知道如何使用 rvest 和 httr 来实现。
观察网络流量后,我发现点击"more"按钮后,我发送了两个POST请求如下:
我试过下面这段代码,但是正文是空的。
library(rvest)
library(httr)

# Review page that is opened first to establish the browsing session.
url <- "https://www.tripadvisor.com/Hotel_Review-g33657-d85704-Reviews-Hotel_Bristol-Steamboat_Springs_Colorado.html"
# AJAX endpoint that receives the POST request.
post_to_url <- "https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer="
# Already URL-encoded form payload; it is sent verbatim (encode = "raw").
post_body <- "reviews=556957481%2C511497076%2C556144452%2C554686822%2C548218482&contextChoice=DETAIL_HR&haveJses=earlyRequireDefine%2Camdearly%2Cglobal_error%2Clong_lived_global%2Capg-Hotel_Review%2Capg-Hotel_Review-in%2Cbootstrap%2Cdesktop-rooms-guests-dust-en_US%2Cresponsive-calendar-templates-dust-en_US%2Ctaevents&haveCsses=apg-Hotel_Review-in&Action=install"

# Pool of user-agent strings to choose from.
user_agent_table <- read.csv(
  "https://raw.githubusercontent.com/yusuzech/top-50-user-agents/master/user_agent.csv",
  stringsAsFactors = FALSE
)
user_agent_list <- user_agent_table$User.agent

# Return an httr user-agent config built from one randomly sampled string.
random_agent <- function() {
  user_agent(sample(user_agent_list, 1))
}

mysession <- html_session(url, random_agent())

# Send the POST through the session instead of a fresh connection:
# - a plain list() passed as `config` is not an httr config, so the Referer
#   has to be supplied with add_headers() to actually reach the request;
# - `handle = mysession$handle` reuses the session's handle, so the request
#   shares the cookies and connection of the initial page load.
# NOTE(review): the endpoint may require further headers (for example
# X-Requested-With) -- compare against the browser's network trace.
result <- POST(
  url = post_to_url,
  config = add_headers(Referer = mysession$url),
  user_agent(mysession$config$options$useragent),
  body = post_body,
  encode = "raw",
  handle = mysession$handle
)
result
> result
Response [https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=]
Date: 2018-05-10 01:49
Status: 200
Content-Type: text/html;charset=UTF-8
<EMPTY BODY>
我知道我需要使用POST方法,但我不知道如何设置正文和其他配置。我也不确定我是否必须同时发送两个 post 请求以及如何在 httr 和 rvest 中实现这一点。
感谢任何帮助!
我像你一样尝试了 rvest:::request_POST,但是失败了。
它收到 'Method Not Allowed (HTTP 405)' 错误消息。但事实证明我们不必那样使用 rvest:::request_POST:
完整的评论已经在页面源代码中,只是默认不显示。以下是用于抓取关于这家酒店的所有评论的代码:
library(rvest)
library(stringr)

# Page URLs ----
# Every page after the first carries an "-or<offset>" segment whose offset
# advances in steps of 5. The page count is fixed here; adjust `n_pages` if
# the hotel has a different number of review pages.
n_pages <- 42L
review_offsets <- 5L * (seq_len(n_pages) - 1L)
pages_url <- paste0(
  "https://www.tripadvisor.com/Hotel_Review-g33657-d85704-Reviews",
  ifelse(review_offsets == 0L, "", paste0("-or", review_offsets)),
  "-The_Bristol_by_Magnuson_Worldwide-Steamboat_Springs_Colorado.html#REVIEWS"
)

# Zero-row template that fixes the column names and order of the result.
empty_reviews <- data.frame(
  reviewers = character(),
  review_dates = character(),
  stars = integer(),
  contributions = integer(),
  helpful_votes = integer(),
  review_titles = character(),
  reviews = character(),
  stringsAsFactors = FALSE
)

# Helpers ----

# Read one reviewer statistic from a single review card.
#
# review_node: a length-one node set holding the review card.
# label:       text identifying the statistic ("contribution", "helpful vote").
#
# Returns 0 when the card does not show the statistic, NA when the statistic
# is shown but its value cannot be located, otherwise the numeric value.
member_stat <- function(review_node, label) {
  item_xpath <- paste0(
    "./descendant::span[contains(@class, 'social-member-MemberHeaderStats__stat_item') and contains(., '",
    label,
    "')]"
  )
  if (length(html_nodes(review_node, xpath = paste0(item_xpath, "/span"))) == 0) {
    return(0)
  }
  value_text <- review_node %>%
    html_nodes(xpath = paste0(
      item_xpath,
      "/descendant::span[contains(@class, 'social-member-MemberHeaderStats__bold')]"
    )) %>%
    html_text()
  if (length(value_text) == 0) {
    return(NA_real_)
  }
  # Remove thousands separators ("1,234") so the conversion does not yield NA.
  as.numeric(gsub(",", "", value_text[1], fixed = TRUE))
}

# Download one review page and return its reviews as a data frame with the
# same columns as `empty_reviews` (zero rows if the page has no review cards).
scrape_review_page <- function(page_url) {
  page <- read_html(page_url)
  review_nodes <- page %>%
    html_nodes(xpath = "//div[@data-test-target='reviews-tab']/div[@data-test-target='HR_CC_CARD']")
  # seq_along() is empty for a page without cards, unlike 1:length().
  card_index <- seq_along(review_nodes)

  member_xpath <- "./descendant::a[contains(@class, 'social-member-event-MemberEventOnObjectBlock__member')]"
  reviewers <- review_nodes %>%
    html_nodes(xpath = member_xpath) %>%
    html_text(trim = TRUE)
  review_dates <- review_nodes %>%
    html_nodes(xpath = paste0(member_xpath, "/following-sibling::text()[1]")) %>%
    html_text(trim = TRUE)
  # Keep the text from character 16 onwards, i.e. drop the fixed-width lead-in
  # that precedes the date.
  review_dates <- str_sub(review_dates, start = 16)

  # The rating is encoded in a class token "bubble_10" ... "bubble_50"; the
  # eighth character of that token is the star count. Cards without a
  # recognisable token get NA, so `stars` always has one entry per card.
  star_classes <- review_nodes %>%
    html_nodes(xpath = "./descendant::div[@data-test-target='review-rating']/span[contains(@class, 'ui_bubble_rating')]") %>%
    html_attr("class")
  bubble_token <- str_extract(star_classes[card_index], "bubble_[1-5]0")
  stars <- as.integer(str_sub(bubble_token, 8, 8))

  contributions <- vapply(
    card_index,
    function(j) member_stat(review_nodes[j], "contribution"),
    numeric(1)
  )
  helpful_votes <- vapply(
    card_index,
    function(j) member_stat(review_nodes[j], "helpful vote"),
    numeric(1)
  )

  review_titles <- review_nodes %>%
    html_nodes(xpath = "./descendant::div[@data-test-target='review-title']") %>%
    html_text(trim = TRUE)
  reviews <- review_nodes %>%
    html_nodes(xpath = "./descendant::q[contains(@class, 'location-review-review-list-parts-ExpandableReview__reviewText')]") %>%
    html_text(trim = TRUE)

  data.frame(
    reviewers, review_dates, stars, contributions, helpful_votes,
    review_titles, reviews,
    stringsAsFactors = FALSE
  )
}

# Scrape ----
# Collect one data frame per page and bind them once at the end instead of
# growing `reviews_df` inside the loop. A page that fails is reported with a
# warning and skipped so the remaining pages are still collected.
page_frames <- lapply(pages_url, function(page_url) {
  tryCatch(
    scrape_review_page(page_url),
    error = function(e) {
      warning("Skipping ", page_url, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})
reviews_df <- do.call(
  rbind,
  c(list(empty_reviews), Filter(Negate(is.null), page_frames))
)