使用 RSelenium 优化网页抓取
Optimize web scraping with RSelenium
我正在对一个动态网页进行网络抓取,由于速度非常慢,我想优化这个过程。该网页显示一系列带有信息的在售商品,向下滚动会加载更多商品(商品总数是有限的)。我的做法是增大浏览器窗口的尺寸,这样无需滚动就能加载几乎所有商品。但是,由于信息和图像很多,页面需要一段时间才能加载完成。我要提取的信息是价格、资产名称以及与资产关联的链接(即单击图像时跳转到的链接)。
我的目标是尽可能优化这个过程。一种方法是不加载图像,因为我并不需要它们,但我找不到在 Firefox 中实现这一点的方法。
如有任何改进建议,我将不胜感激。
library(RSelenium)
library(rvest)
# Marketplace page for the project (query string: sort=_id:-1, listings and
# offers).
url <- "https://cnft.io/marketplace?project=Boss%20Cat%20Rocket%20Club&sort=_id:-1&type=listing,offer"

# Firefox capabilities: run the browser headless (no visible window).
exCap <- list("moz:firefoxOptions" = list(args = list("--headless")))

# Start a Selenium server and a Firefox client on a random port in 4000-4700.
rD <- rsDriver(
  browser = "firefox",
  port = as.integer(sample(4000:4700, 1)),
  verbose = FALSE,
  extraCapabilities = exCap
)
remDr <- rD[["client"]]

# Use a very large window so that most sales are rendered without scrolling.
remDr$setWindowSize(30000, 30000)
remDr$navigate(url)

# Fixed 300-second wait for the dynamically loaded content.
Sys.sleep(300)
html <- remDr$getPageSource()[[1]]

# close() only ends the browser session; the Selenium server started by
# rsDriver() must be stopped as well, otherwise its process keeps running and
# keeps the port occupied.
remDr$close()
rD[["server"]]$stop()

# Parse the captured page source for extraction with rvest.
html <- read_html(html)
好吧,在对该网站做了一些研究之后,我找到了返回所有挂单的 API:https://api.cnft.io/market/listings。它接受 POST 请求,并返回分页的 JSON 字符串。我们可以使用 httr 来发送这样的请求。
下面是针对您的网页抓取任务的一个小脚本。
# Name of the project whose listings are requested.
project <- "Boss Cat Rocket Club"
# Endpoint that serves the marketplace listings (queried with POST).
api_link <- "https://api.cnft.io/market/listings"
#' Request one page of listings from the marketplace API.
#'
#' Sends a POST request with a JSON-encoded body and returns the parsed
#' response. Callers in this file read the `count` and `results` fields of the
#' returned object.
#'
#' @param page Page number to request.
#' @param url URL of the listings endpoint.
#' @param project Project name used to filter the listings.
#' @return The parsed response content (parsed with `simplifyVector = TRUE`).
#'   An error is raised if the server answers with an HTTP error status.
query <- function(page, url, project) {
  payload <- list(
    search = "",
    types = c("listing", "offer"),
    project = project,
    sort = list(`_id` = -1L),
    priceMin = NULL,
    priceMax = NULL,
    page = page,
    verified = TRUE,
    nsfw = FALSE,
    sold = FALSE,
    smartContract = FALSE
  )
  response <- httr::POST(url = url, body = payload, encode = "json")
  # Fail fast on 4xx/5xx responses instead of parsing an error page and
  # handing it to the caller as if it were listing data.
  httr::stop_for_status(response)
  httr::content(response, simplifyVector = TRUE)
}
#' Download every page of listings for a project.
#'
#' The `count` field of the first response is used as the upper bound on the
#' number of pages to request; paging stops early at the first page that has
#' no results. The first response is reused for page 1 rather than being
#' requested a second time.
#'
#' @param url URL of the listings endpoint.
#' @param project Project name used to filter the listings.
#' @return A list with one element per non-empty page, each holding that
#'   page's `results`.
query_all <- function(url, project) {
  first_page <- query(1L, url, project)
  n <- first_page[["count"]]
  if (!is.numeric(n) || length(n) != 1L || is.na(n)) {
    stop("API response does not contain a usable 'count' field.", call. = FALSE)
  }
  out <- vector("list", n)
  for (i in seq_len(n)) {
    page <- if (i == 1L) first_page else query(i, url, project)
    results <- page[["results"]]
    # Check before storing: assigning NULL with `[[<-` would delete the slot.
    if (length(results) < 1L) {
      return(out[seq_len(i - 1L)])
    }
    out[[i]] <- results
  }
  out
}
#' Reshape one page of results into a three-column tibble.
#'
#' @param results One page of results; must provide `asset` (containing
#'   `assetId`), `price` and `_id`.
#' @return A tibble with columns `asset_id`, `price` and `link`, where `link`
#'   is the token URL built from `_id`.
collect_data <- function(results) {
  asset_ids <- results[["asset"]][["assetId"]]
  prices <- results[["price"]]
  token_links <- paste0("https://cnft.io/token/", results[["_id"]])
  dplyr::tibble(asset_id = asset_ids, price = prices, link = token_links)
}
# Fetch all pages, convert each page to a tibble, and stack them into one
# table, timing the whole pipeline.
system.time({
  dt <- query_all(api_link, project) |>
    lapply(collect_data) |>
    dplyr::bind_rows()
})
# Print the combined table.
dt
输出(大约需要12秒完成)
> system.time(
+ dt <- query_all(api_link, project) |> lapply(collect_data) |> dplyr::bind_rows()
+ )
user system elapsed
0.78 0.00 12.33
> dt
# A tibble: 2,161 x 3
asset_id price link
<chr> <dbl> <chr>
1 BossCatRocketClub1373 222000000 https://cnft.io/token/61ce22eb4185f57d50190079
2 BossCatRocketClub4639 380000000 https://cnft.io/token/61ce229b9163f2db80db98fe
3 BossCatRocketClub5598 505000000 https://cnft.io/token/61ce22954185f57d5018e2ff
4 BossCatRocketClub2673 187000000 https://cnft.io/token/61ce2281ceed93ea12ae32ec
5 BossCatRocketClub1721 350000000 https://cnft.io/token/61ce2281398627cc52c5844c
6 BossCatRocketClub673 300000000 https://cnft.io/token/61ce22724185f57d5018d645
7 BossCatRocketClub5915 200000000000 https://cnft.io/token/61ce2241398627cc52c56eae
8 BossCatRocketClub5699 350000000 https://cnft.io/token/61ce21fa398627cc52c55644
9 BossCatRocketClub4570 350000000 https://cnft.io/token/61ce21ef4185f57d5018a9d4
10 BossCatRocketClub6125 250000000 https://cnft.io/token/61ce21e49163f2db80db58dd
# ... with 2,151 more rows
我正在对一个动态网页进行网络抓取,由于速度非常慢,我想优化这个过程。该网页显示一系列带有信息的在售商品,向下滚动会加载更多商品(商品总数是有限的)。我的做法是增大浏览器窗口的尺寸,这样无需滚动就能加载几乎所有商品。但是,由于信息和图像很多,页面需要一段时间才能加载完成。我要提取的信息是价格、资产名称以及与资产关联的链接(即单击图像时跳转到的链接)。
我的目标是尽可能优化这个过程。一种方法是不加载图像,因为我并不需要它们,但我找不到在 Firefox 中实现这一点的方法。
如有任何改进建议,我将不胜感激。
library(RSelenium)
library(rvest)
# Marketplace page for the project (query string: sort=_id:-1, listings and
# offers).
url <- "https://cnft.io/marketplace?project=Boss%20Cat%20Rocket%20Club&sort=_id:-1&type=listing,offer"

# Firefox capabilities: run the browser headless (no visible window).
exCap <- list("moz:firefoxOptions" = list(args = list("--headless")))

# Start a Selenium server and a Firefox client on a random port in 4000-4700.
rD <- rsDriver(
  browser = "firefox",
  port = as.integer(sample(4000:4700, 1)),
  verbose = FALSE,
  extraCapabilities = exCap
)
remDr <- rD[["client"]]

# Use a very large window so that most sales are rendered without scrolling.
remDr$setWindowSize(30000, 30000)
remDr$navigate(url)

# Fixed 300-second wait for the dynamically loaded content.
Sys.sleep(300)
html <- remDr$getPageSource()[[1]]

# close() only ends the browser session; the Selenium server started by
# rsDriver() must be stopped as well, otherwise its process keeps running and
# keeps the port occupied.
remDr$close()
rD[["server"]]$stop()

# Parse the captured page source for extraction with rvest.
html <- read_html(html)
好吧,在对该网站做了一些研究之后,我找到了返回所有挂单的 API:https://api.cnft.io/market/listings。它接受 POST 请求,并返回分页的 JSON 字符串。我们可以使用 httr 来发送这样的请求。
下面是针对您的网页抓取任务的一个小脚本。
# Name of the project whose listings are requested.
project <- "Boss Cat Rocket Club"
# Endpoint that serves the marketplace listings (queried with POST).
api_link <- "https://api.cnft.io/market/listings"
#' Request one page of listings from the marketplace API.
#'
#' Sends a POST request with a JSON-encoded body and returns the parsed
#' response. Callers in this file read the `count` and `results` fields of the
#' returned object.
#'
#' @param page Page number to request.
#' @param url URL of the listings endpoint.
#' @param project Project name used to filter the listings.
#' @return The parsed response content (parsed with `simplifyVector = TRUE`).
#'   An error is raised if the server answers with an HTTP error status.
query <- function(page, url, project) {
  payload <- list(
    search = "",
    types = c("listing", "offer"),
    project = project,
    sort = list(`_id` = -1L),
    priceMin = NULL,
    priceMax = NULL,
    page = page,
    verified = TRUE,
    nsfw = FALSE,
    sold = FALSE,
    smartContract = FALSE
  )
  response <- httr::POST(url = url, body = payload, encode = "json")
  # Fail fast on 4xx/5xx responses instead of parsing an error page and
  # handing it to the caller as if it were listing data.
  httr::stop_for_status(response)
  httr::content(response, simplifyVector = TRUE)
}
#' Download every page of listings for a project.
#'
#' The `count` field of the first response is used as the upper bound on the
#' number of pages to request; paging stops early at the first page that has
#' no results. The first response is reused for page 1 rather than being
#' requested a second time.
#'
#' @param url URL of the listings endpoint.
#' @param project Project name used to filter the listings.
#' @return A list with one element per non-empty page, each holding that
#'   page's `results`.
query_all <- function(url, project) {
  first_page <- query(1L, url, project)
  n <- first_page[["count"]]
  if (!is.numeric(n) || length(n) != 1L || is.na(n)) {
    stop("API response does not contain a usable 'count' field.", call. = FALSE)
  }
  out <- vector("list", n)
  for (i in seq_len(n)) {
    page <- if (i == 1L) first_page else query(i, url, project)
    results <- page[["results"]]
    # Check before storing: assigning NULL with `[[<-` would delete the slot.
    if (length(results) < 1L) {
      return(out[seq_len(i - 1L)])
    }
    out[[i]] <- results
  }
  out
}
#' Reshape one page of results into a three-column tibble.
#'
#' @param results One page of results; must provide `asset` (containing
#'   `assetId`), `price` and `_id`.
#' @return A tibble with columns `asset_id`, `price` and `link`, where `link`
#'   is the token URL built from `_id`.
collect_data <- function(results) {
  asset_ids <- results[["asset"]][["assetId"]]
  prices <- results[["price"]]
  token_links <- paste0("https://cnft.io/token/", results[["_id"]])
  dplyr::tibble(asset_id = asset_ids, price = prices, link = token_links)
}
# Fetch all pages, convert each page to a tibble, and stack them into one
# table, timing the whole pipeline.
system.time({
  dt <- query_all(api_link, project) |>
    lapply(collect_data) |>
    dplyr::bind_rows()
})
# Print the combined table.
dt
输出(大约需要12秒完成)
> system.time(
+ dt <- query_all(api_link, project) |> lapply(collect_data) |> dplyr::bind_rows()
+ )
user system elapsed
0.78 0.00 12.33
> dt
# A tibble: 2,161 x 3
asset_id price link
<chr> <dbl> <chr>
1 BossCatRocketClub1373 222000000 https://cnft.io/token/61ce22eb4185f57d50190079
2 BossCatRocketClub4639 380000000 https://cnft.io/token/61ce229b9163f2db80db98fe
3 BossCatRocketClub5598 505000000 https://cnft.io/token/61ce22954185f57d5018e2ff
4 BossCatRocketClub2673 187000000 https://cnft.io/token/61ce2281ceed93ea12ae32ec
5 BossCatRocketClub1721 350000000 https://cnft.io/token/61ce2281398627cc52c5844c
6 BossCatRocketClub673 300000000 https://cnft.io/token/61ce22724185f57d5018d645
7 BossCatRocketClub5915 200000000000 https://cnft.io/token/61ce2241398627cc52c56eae
8 BossCatRocketClub5699 350000000 https://cnft.io/token/61ce21fa398627cc52c55644
9 BossCatRocketClub4570 350000000 https://cnft.io/token/61ce21ef4185f57d5018a9d4
10 BossCatRocketClub6125 250000000 https://cnft.io/token/61ce21e49163f2db80db58dd
# ... with 2,151 more rows