使用 RStudio Chromote 获取页面生成的 XHR 请求的响应正文的正确方法
Correct way to get response body of XHR requests generated by a page with RStudio Chromote
我想使用 Chromote 来收集网站发出的 XHR 调用的响应主体,但我发现 API 掌握起来有点复杂,尤其是异步管道。
我想我需要先启用网络功能然后加载页面(这可以做到),但我需要:
- 列出所有 XHR 调用
- 通过识别请求中的模式来过滤它们 URL
- 访问所选来源的请求正文
有人可以在这方面提供任何指导或教程材料吗?
更新:
好的,我切换到 package crrri
并为此目的编写了一个通用函数。唯一缺少的部分是决定何时关闭连接并返回结果的逻辑:
#' Collect response bodies of resources fetched while loading a page.
#'
#' Opens a headless Chrome session via crrri, intercepts every request at the
#' "Response" stage, and accumulates the bodies of responses whose URL and
#' resource type match the given regular expressions.
#'
#' @param url URL to navigate to.
#' @param url_filter Regex matched against each request URL ('.*' = all).
#' @param type_filter Regex matched against the resource type ('.*' = all).
#' @return A named list (keyed by requestId) of url/response pairs — but see
#'   the note at the bottom: the async pipeline has not finished when this
#'   function returns.
get_website_resources <- function(url, url_filter = '.*', type_filter = '.*') {
  library(crrri)
  library(dplyr)
  library(stringr)
  library(jsonlite)
  library(magrittr)

  chrome <- Chrome$new()

  # Environment used as a mutable accumulator shared with the async callbacks.
  out <- new.env()
  out$l <- list()

  client <- chrome$connect(callback = ~ NULL)
  Fetch <- client$Fetch
  Page <- client$Page

  # Pause every request at the response stage so the body is available.
  # Note: urlPattern here is a Chrome wildcard pattern, not a regex.
  Fetch$enable(patterns = list(list(urlPattern = "*", requestStage = "Response"))) %...>% {
    Fetch$requestPaused(callback = function(params) {
      # `&&` (scalar, short-circuiting) is the correct operator inside `if`;
      # the defaults are '.*' because str_detect() takes a regex and a bare
      # '*' is a regex syntax error.
      if (str_detect(params$request$url, url_filter) &&
          str_detect(params$resourceType, type_filter)) {
        Fetch$getResponseBody(requestId = params$requestId) %...>% {
          resp <- .
          if (resp$body != '') {
            # Chrome may return the body base64-encoded; decode to text.
            if (resp$base64Encoded) {
              resp$body <- base64_dec(resp$body) %>% rawToChar()
            }
            body <- list(list(
              url = params$request$url,
              response = resp
            )) %>% set_names(params$requestId)
            str(body)
            out$l <- append(out$l, body)
          }
        }
      }
      # Always release the paused request, captured or not.
      Fetch$continueRequest(requestId = params$requestId)
    })
  } %...>% {
    Page$navigate(url)
  }

  # NOTE(review): this returns immediately — the promise chain above has not
  # resolved yet, so `out$l` is almost certainly still empty here. Some
  # synchronization (see the second version below) is required.
  out$l
}
破解了。这是最终的函数。它使用 crrri::perform_with_chrome
强制同步行为,并将其余过程封装在一个 promise
对象中;resolve 回调在 promise 本身之外定义,
当收集到指定数量的资源或经过一定时间后即被调用:
#' Collect response bodies of resources fetched while loading a page
#' (synchronous version).
#'
#' Wraps the async capture in crrri::perform_with_chrome() to force
#' synchronous behavior. The promise resolves either when `n_of_resources`
#' matching resources have been collected or after `wait_for` seconds.
#'
#' @param url URL to navigate to.
#' @param url_filter Regex matched against each request URL ('.*' = all).
#' @param type_filter Regex matched against the resource type ('.*' = all).
#' @param wait_for Seconds to wait before resolving with whatever was seen.
#' @param n_of_resources Optional early-exit count; NULL means wait full time.
#' @param interactive If TRUE, open the Chrome inspector for debugging.
#' @return A named list (keyed by requestId) of url/response pairs.
get_website_resources <- function(url, url_filter = '.*', type_filter = '.*',
                                  wait_for = 20, n_of_resources = NULL,
                                  interactive = FALSE) {
  library(crrri)
  library(promises)
  library(stringr)   # str_detect() — was not loaded in the original
  library(magrittr)  # %>% and set_names() — was not loaded in the original

  crrri::perform_with_chrome(function(client) {
    Fetch <- client$Fetch
    Page <- client$Page

    if (interactive) client$inspect()

    # Environment used as a mutable accumulator shared with the callbacks;
    # the resolve function is stored here so it can be called from outside
    # the promise constructor.
    out <- new.env()
    out$results <- list()
    out$resolve_function <- NULL

    out$pr <- promises::promise(function(resolve, reject) {
      out$resolve_function <- resolve

      # urlPattern is a Chrome wildcard pattern (not a regex); pause at the
      # response stage so the body is available.
      Fetch$enable(patterns = list(list(urlPattern = "*", requestStage = "Response"))) %...>% {
        Fetch$requestPaused(callback = function(params) {
          if (str_detect(params$request$url, url_filter) &&
              str_detect(params$resourceType, type_filter)) {
            Fetch$getResponseBody(requestId = params$requestId) %...>% {
              resp <- .
              if (resp$body != '') {
                # Chrome may return the body base64-encoded; decode to text.
                if (resp$base64Encoded) {
                  resp$body <- jsonlite::base64_dec(resp$body) %>% rawToChar()
                }
                body <- list(list(
                  url = params$request$url,
                  response = resp
                )) %>% set_names(params$requestId)
                out$results <- append(out$results, body)
                # `&&` is required: with the default n_of_resources = NULL,
                # `&` would produce logical(0) and `if` would error on every
                # captured resource.
                if (!is.null(n_of_resources) &&
                    length(out$results) >= n_of_resources) {
                  out$resolve_function(out$results)
                }
              }
            }
          }
          # Always release the paused request, captured or not.
          Fetch$continueRequest(requestId = params$requestId)
        })
      } %...>% {
        Page$navigate(url)
      } %>%
        crrri::wait(wait_for) %>%
        # Timeout path: resolve with whatever has been collected so far.
        then(~ out$resolve_function(out$results))
    })

    out$pr$then(function(x) x)
  }, timeouts = max(wait_for + 3, 30), cleaning_timeout = max(wait_for + 3, 30))
}
我想使用 Chromote 来收集网站发出的 XHR 调用的响应主体,但我发现 API 掌握起来有点复杂,尤其是异步管道。
我想我需要先启用网络功能然后加载页面(这可以做到),但我需要:
- 列出所有 XHR 调用
- 通过识别请求中的模式来过滤它们 URL
- 访问所选来源的请求正文
有人可以在这方面提供任何指导或教程材料吗?
更新:
好的,我切换到 package crrri
并为此目的编写了一个通用函数。唯一缺少的部分是决定何时关闭连接并返回结果的逻辑:
#' Collect response bodies of resources fetched while loading a page.
#'
#' Opens a headless Chrome session via crrri, intercepts every request at the
#' "Response" stage, and accumulates the bodies of responses whose URL and
#' resource type match the given regular expressions.
#'
#' @param url URL to navigate to.
#' @param url_filter Regex matched against each request URL ('.*' = all).
#' @param type_filter Regex matched against the resource type ('.*' = all).
#' @return A named list (keyed by requestId) of url/response pairs — but see
#'   the note at the bottom: the async pipeline has not finished when this
#'   function returns.
get_website_resources <- function(url, url_filter = '.*', type_filter = '.*') {
  library(crrri)
  library(dplyr)
  library(stringr)
  library(jsonlite)
  library(magrittr)

  chrome <- Chrome$new()

  # Environment used as a mutable accumulator shared with the async callbacks.
  out <- new.env()
  out$l <- list()

  client <- chrome$connect(callback = ~ NULL)
  Fetch <- client$Fetch
  Page <- client$Page

  # Pause every request at the response stage so the body is available.
  # Note: urlPattern here is a Chrome wildcard pattern, not a regex.
  Fetch$enable(patterns = list(list(urlPattern = "*", requestStage = "Response"))) %...>% {
    Fetch$requestPaused(callback = function(params) {
      # `&&` (scalar, short-circuiting) is the correct operator inside `if`;
      # the defaults are '.*' because str_detect() takes a regex and a bare
      # '*' is a regex syntax error.
      if (str_detect(params$request$url, url_filter) &&
          str_detect(params$resourceType, type_filter)) {
        Fetch$getResponseBody(requestId = params$requestId) %...>% {
          resp <- .
          if (resp$body != '') {
            # Chrome may return the body base64-encoded; decode to text.
            if (resp$base64Encoded) {
              resp$body <- base64_dec(resp$body) %>% rawToChar()
            }
            body <- list(list(
              url = params$request$url,
              response = resp
            )) %>% set_names(params$requestId)
            str(body)
            out$l <- append(out$l, body)
          }
        }
      }
      # Always release the paused request, captured or not.
      Fetch$continueRequest(requestId = params$requestId)
    })
  } %...>% {
    Page$navigate(url)
  }

  # NOTE(review): this returns immediately — the promise chain above has not
  # resolved yet, so `out$l` is almost certainly still empty here. Some
  # synchronization (see the second version below) is required.
  out$l
}
破解了。这是最终的函数。它使用 crrri::perform_with_chrome
强制同步行为,并将其余过程封装在一个 promise
对象中;resolve 回调在 promise 本身之外定义,
当收集到指定数量的资源或经过一定时间后即被调用:
#' Collect response bodies of resources fetched while loading a page
#' (synchronous version).
#'
#' Wraps the async capture in crrri::perform_with_chrome() to force
#' synchronous behavior. The promise resolves either when `n_of_resources`
#' matching resources have been collected or after `wait_for` seconds.
#'
#' @param url URL to navigate to.
#' @param url_filter Regex matched against each request URL ('.*' = all).
#' @param type_filter Regex matched against the resource type ('.*' = all).
#' @param wait_for Seconds to wait before resolving with whatever was seen.
#' @param n_of_resources Optional early-exit count; NULL means wait full time.
#' @param interactive If TRUE, open the Chrome inspector for debugging.
#' @return A named list (keyed by requestId) of url/response pairs.
get_website_resources <- function(url, url_filter = '.*', type_filter = '.*',
                                  wait_for = 20, n_of_resources = NULL,
                                  interactive = FALSE) {
  library(crrri)
  library(promises)
  library(stringr)   # str_detect() — was not loaded in the original
  library(magrittr)  # %>% and set_names() — was not loaded in the original

  crrri::perform_with_chrome(function(client) {
    Fetch <- client$Fetch
    Page <- client$Page

    if (interactive) client$inspect()

    # Environment used as a mutable accumulator shared with the callbacks;
    # the resolve function is stored here so it can be called from outside
    # the promise constructor.
    out <- new.env()
    out$results <- list()
    out$resolve_function <- NULL

    out$pr <- promises::promise(function(resolve, reject) {
      out$resolve_function <- resolve

      # urlPattern is a Chrome wildcard pattern (not a regex); pause at the
      # response stage so the body is available.
      Fetch$enable(patterns = list(list(urlPattern = "*", requestStage = "Response"))) %...>% {
        Fetch$requestPaused(callback = function(params) {
          if (str_detect(params$request$url, url_filter) &&
              str_detect(params$resourceType, type_filter)) {
            Fetch$getResponseBody(requestId = params$requestId) %...>% {
              resp <- .
              if (resp$body != '') {
                # Chrome may return the body base64-encoded; decode to text.
                if (resp$base64Encoded) {
                  resp$body <- jsonlite::base64_dec(resp$body) %>% rawToChar()
                }
                body <- list(list(
                  url = params$request$url,
                  response = resp
                )) %>% set_names(params$requestId)
                out$results <- append(out$results, body)
                # `&&` is required: with the default n_of_resources = NULL,
                # `&` would produce logical(0) and `if` would error on every
                # captured resource.
                if (!is.null(n_of_resources) &&
                    length(out$results) >= n_of_resources) {
                  out$resolve_function(out$results)
                }
              }
            }
          }
          # Always release the paused request, captured or not.
          Fetch$continueRequest(requestId = params$requestId)
        })
      } %...>% {
        Page$navigate(url)
      } %>%
        crrri::wait(wait_for) %>%
        # Timeout path: resolve with whatever has been collected so far.
        then(~ out$resolve_function(out$results))
    })

    out$pr$then(function(x) x)
  }, timeouts = max(wait_for + 3, 30), cleaning_timeout = max(wait_for + 3, 30))
}