使用 RSelenium 优化网页抓取

Optimize web scraping with RSelenium

我正在对一个动态网页进行网络抓取,并希望优化这个过程,因为它非常慢。该网页显示一系列带有信息的销售条目,向下滚动会加载更多条目,不过条目总数是有限的。我的做法是增大浏览器窗口的尺寸,这样无需滚动就能加载几乎所有的条目。但是,由于信息和图像很多,页面需要一段时间才能加载完。我要提取的信息是价格、资产名称以及与资产关联的链接(即单击图像时打开的链接)。

我的目标是尽可能优化这个过程。一种方法是不加载图像,因为我不需要它们,但我找不到使用 Firefox 的方法。

如有任何改进,我们将不胜感激。

library(RSelenium)
library(rvest)

# Marketplace page to scrape; the query string selects the project, the sort
# order and the entry types (listing, offer).
url <- "https://cnft.io/marketplace?project=Boss%20Cat%20Rocket%20Club&sort=_id:-1&type=listing,offer"

# Run Firefox headless so that no browser window is shown.
exCap <- list("moz:firefoxOptions" = list(args = list('--headless')))
# Start the Selenium server on a random port in 4000-4700.
rD <- rsDriver(browser = "firefox", port = as.integer(sample(4000:4700, 1)),
               verbose = FALSE, extraCapabilities = exCap)
remDr <- rD[["client"]]
# Use a very large window so that most entries load without scrolling.
remDr$setWindowSize(30000, 30000)
remDr$navigate(url)
# Fixed 300-second wait to give the page time to finish loading.
Sys.sleep(300)
html <- remDr$getPageSource()[[1]]
remDr$close()
# Closing the client only ends the browser session; also stop the Selenium
# server started by rsDriver(), otherwise its process keeps running and
# holds the port.
rD[["server"]]$stop()

# Parse the captured page source for extraction with rvest.
html <- read_html(html)

在对该网站做了一些挖掘之后,我找到了返回所有列表(listings)的 API:https://api.cnft.io/market/listings。它接受 POST 请求,并返回分页的 JSON 字符串。我们可以使用 httr 来发送这样的请求。下面是完成您这项网页抓取任务的一个小脚本。

# Endpoint that receives the POST requests for marketplace listings.
api_link <- "https://api.cnft.io/market/listings"
# Project name sent in the request body to restrict the listings returned.
project <- "Boss Cat Rocket Club"

#' Request one page of listings from the API.
#'
#' @param page Page number to request.
#' @param url Endpoint the POST request is sent to.
#' @param project Project name used to filter the listings.
#' @return The parsed response body (JSON parsed with `simplifyVector = TRUE`).
query <- function(page, url, project) {
  # Request payload; it is sent JSON-encoded.
  payload <- list(
    search = "",
    types = c("listing", "offer"),
    project = project,
    sort = list(`_id` = -1L),
    priceMin = NULL,
    priceMax = NULL,
    page = page,
    verified = TRUE,
    nsfw = FALSE,
    sold = FALSE,
    smartContract = FALSE
  )
  response <- httr::POST(url = url, body = payload, encode = "json")
  httr::content(response, simplifyVector = TRUE)
}

#' Fetch every page of listings for a project.
#'
#' Pages are requested in order, starting at 1, until a page comes back with
#' no results or until `count` pages have been read. The `count` field of the
#' first response is used only as an upper bound on the number of pages.
#'
#' @param url Endpoint passed through to `query()`.
#' @param project Project name passed through to `query()`.
#' @return A list with one element per non-empty page, each holding that
#'   page's `results`.
query_all <- function(url, project) {
  # Keep the first response so page 1 is not requested a second time.
  first <- query(1L, url, project)
  n <- first[["count"]]
  if (is.null(n) || length(n) != 1L || is.na(n)) {
    stop("response has no usable `count` field", call. = FALSE)
  }
  out <- vector("list", n)
  for (i in seq_len(n)) {
    page <- if (i == 1L) first else query(i, url, project)
    # Hold the results in a local first: assigning NULL with `out[[i]] <-`
    # would delete the slot and shift the remaining elements.
    res <- page[["results"]]
    if (length(res) < 1L) {
      return(out[seq_len(i - 1L)])
    }
    out[[i]] <- res
  }
  out
}

#' Flatten one page of API results into a tibble.
#'
#' @param results One page of results; it is read through its `asset`
#'   element (which holds `assetId`), its `price` element and its `_id`
#'   element.
#' @return A tibble with columns `asset_id`, `price` and `link`, where `link`
#'   is the token URL built from `_id`. An empty page gives a zero-row tibble
#'   with the same columns.
collect_data <- function(results) {
  # Guard against empty pages: paste0() would turn a zero-length `_id` into
  # the bare URL prefix and so produce a bogus one-row result.
  if (NROW(results) < 1L) {
    # Column types follow the example output (chr, dbl, chr).
    return(dplyr::tibble(
      asset_id = character(),
      price = numeric(),
      link = character()
    ))
  }
  dplyr::tibble(
    asset_id = results[["asset"]][["assetId"]],
    price = results[["price"]],
    link = paste0("https://cnft.io/token/", results[["_id"]])
  )
}

# Time the whole download: fetch every page, turn each page into a tibble,
# then stack the pages into a single table.
system.time(
  dt <- query_all(api_link, project) |>
    lapply(collect_data) |>
    dplyr::bind_rows()
)
# Print the combined table.
dt

输出(大约需要12秒完成)

> system.time(
+   dt <- query_all(api_link, project) |> lapply(collect_data) |> dplyr::bind_rows()  
+ )
   user  system elapsed 
   0.78    0.00   12.33 
> dt
# A tibble: 2,161 x 3
   asset_id                     price link                                          
   <chr>                        <dbl> <chr>                                         
 1 BossCatRocketClub1373    222000000 https://cnft.io/token/61ce22eb4185f57d50190079
 2 BossCatRocketClub4639    380000000 https://cnft.io/token/61ce229b9163f2db80db98fe
 3 BossCatRocketClub5598    505000000 https://cnft.io/token/61ce22954185f57d5018e2ff
 4 BossCatRocketClub2673    187000000 https://cnft.io/token/61ce2281ceed93ea12ae32ec
 5 BossCatRocketClub1721    350000000 https://cnft.io/token/61ce2281398627cc52c5844c
 6 BossCatRocketClub673     300000000 https://cnft.io/token/61ce22724185f57d5018d645
 7 BossCatRocketClub5915 200000000000 https://cnft.io/token/61ce2241398627cc52c56eae
 8 BossCatRocketClub5699    350000000 https://cnft.io/token/61ce21fa398627cc52c55644
 9 BossCatRocketClub4570    350000000 https://cnft.io/token/61ce21ef4185f57d5018a9d4
10 BossCatRocketClub6125    250000000 https://cnft.io/token/61ce21e49163f2db80db58dd
# ... with 2,151 more rows