使用 R 并行化抓取网页内容
Parallelisation to scrape web content with R
我正尝试使用这篇帖子中提到的异步方法从 Web 抓取数据。下面是我要抓取数据的网址。
我将 url 存储在 list.Rdata 文件中。可以从这里下载链接:https://www.dropbox.com/s/wl2per5npuq5h8y/list.Rdata?dl=1.
首先,我加载前 1000 个网址:
# Dependencies: RCurl (asynchronous download) and XML (HTML parsing) are the
# ones actually used below; rvest, httr, reshape2 and reshape are attached
# but not referenced in the code shown here.
library(RCurl)
library(rvest)
library(XML)
library(httr)
library(reshape2)
library(reshape)
# Restores an object named `list` saved in list.Rdata (a list of URLs).
# NOTE(review): this shadows base::list for the rest of the session —
# consider renaming the saved object.
load("list.Rdata")
# Keep only the first 1000 URLs.
list <- list[1:1000]
# Flatten into a plain character vector of URLs for the scraper.
un <- unlist(list)
然后我使用代码从该 url 中抓取内容:
get.asynch <- function(urls) {
  # Scrape company details from rekvizitai.vz.lt pages.
  #
  # urls: character vector of page URLs. When more than one URL is given,
  #       getURIAsynchronous() downloads them concurrently.
  # Returns a 1-row character matrix (one column per field) built with cbind().
  #
  # followlocation = TRUE is essential here: the server answers with a
  # redirect, and without this option getURIAsynchronous() returns an empty
  # body, which then makes htmlParse() fail with a confusing error.
  txt <- getURIAsynchronous(urls, .opts = curlOptions(followlocation = TRUE))
  doc <- htmlParse(txt, asText = TRUE, encoding = "UTF-8")
  base <- xpathSApply(doc, "//table//tr//td", xmlValue)

  # Value of the table cell immediately following the exactly-matching
  # label `key`, or "-" when the label is absent from the page.
  cell_after <- function(key) {
    idx <- match(key, base)
    if (is.na(idx)) "-" else base[idx + 1]
  }

  # Company name: page <title> minus the site suffix. The original regex
  # (with "." as a wildcard) is kept for identical behavior.
  title <- xpathSApply(doc, "//head//title", xmlValue)
  uab <- if (length(title) == 1) gsub(". Rekvizitai.lt", "", title) else "-"

  # Company code: approximate match (agrep) because the label spelling
  # varies. agrep() returns integer(0) when nothing matches, so test with
  # length() — the original is.na(agrep(...)) check yields logical(0) on a
  # miss and silently breaks ifelse().
  ik_idx <- agrep("Imones kodas", base)
  ik <- if (length(ik_idx) == 0) "-" else base[ik_idx[1] + 1]

  pk  <- cell_after("PVM kodas")  # VAT code
  vad <- cell_after("Vadovas")    # Director
  ad  <- cell_after("Adresas")    # Address

  # Phone numbers are rendered as images; build absolute image URLs from
  # the first two @src attributes found in the table.
  src <- xpathSApply(doc, "//table//tr//td//@src")
  tel <- if (is.na(match("Telefonas", base))) "-" else
    paste0("http://rekvizitai.vz.lt", src[1])
  mob <- if (is.na(match("Mobilusis", base))) "-" else
    paste0("http://rekvizitai.vz.lt", src[2])

  # Website: strip embedded tab characters from the cell text.
  url <- if (is.na(match("Tinklalapis", base))) "-" else
    gsub("\t", "", base[match("Tinklalapis", base) + 1])

  sk  <- cell_after("Skype")                    # Skype
  bnk <- cell_after("Bankas")                   # Bank
  ats <- cell_after("Atsiskaitomoji saskaita")  # Bank account
  dl  <- cell_after("Darbo laikas")             # Working hours

  # Employee count: keep digits only. NOTE: the original pattern "\D" is an
  # invalid escape in an R string literal (parse error); "\\D" is the
  # intended regex.
  drb_raw <- cell_after("Darbuotojai")
  drb <- if (identical(drb_raw, "-")) "-" else gsub("\\D", "", drb_raw)

  sd  <- cell_after("SD draudejo kodas")  # Social-insurance payer code
  apv <- cell_after("Apyvarta (be PVM)")  # Turnover (excl. VAT)
  trn <- cell_after("Transportas")        # Transport

  # Star rating and number of voters (either may be absent).
  iv_node <- xpathSApply(doc, "//span[@class='average']", xmlValue)
  iv <- if (length(iv_node) != 0) iv_node[1] else "-"
  vert_node <- xpathSApply(doc, "//span[@class='votes']", xmlValue)
  vert <- if (length(vert_node) != 0) vert_node[1] else "-"

  # First listed business activity (layout class varies between pages).
  veikl <- xpathSApply(
    doc,
    "//div[@class='floatLeft about']//a | //div[@class='floatLeft about half']//a | //div[@class='about floatLeft']//a",
    xmlValue
  )[1]

  # One row of results as a character matrix.
  cbind(uab, ik, pk, vad, ad, tel, mob, url, sk, bnk, ats, dl, drb, sd,
        apv, trn, iv, vert, veikl)
}
接下来,我使用我的函数来解析内容并得到错误。我很确定这个错误是对服务器的大量请求的结果。
> system.time(table <- do.call(rbind,lapply(un,get.asynch)))
Error in which(value == defs) :
argument "code" is missing, with no default Timing stopped at: 0.89 0.03 6.82
我正在寻找避免此类行为的解决方案。我尝试了 Sys.sleep() 函数,尽管结果是一样的。欢迎任何有关如何克服与服务器连接问题的帮助。
我搜索了几分钟,在这个问题的第二个回答中找到了解决方法:R getURL() returning empty string
你需要使用
txt <- getURIAsynchronous(un, .opts = curlOptions(followlocation = TRUE))
还有一个问题——你实际上并不是异步的。使用 lapply(un,get.asynch)
,您可以将 URL 一个一个地发送到 get.asynch
。要并行执行此操作,您需要 get.asynch(un)
之类的东西,但随后您必须重写其余代码。我会把它分成两部分:
抓取(curl)部分
txts <- getURIAsynchronous(un, .opts=curlOptions(followlocation = TRUE))
和解析
# Parse one downloaded HTML document (a single element of the vector
# returned by getURIAsynchronous) and extract the fields of interest.
# NOTE(review): this name shadows base::parse(); the `...` below is a
# placeholder for the remaining field-extraction code from the question,
# so this snippet is illustrative and not runnable as-is.
parse <- function(txt) {
doc <- htmlParse(txt,asText=TRUE,encoding = "UTF-8")
base <- xpathSApply(doc, "//table//tr//td",xmlValue)
...
}
table <- do.call(rbind, lapply(txts, parse))
curl 抓取部分对我来说效果很好,至少前 100 个链接是这样。不过我没有测试解析部分。
我正尝试使用这篇帖子中提到的异步方法从 Web 抓取数据。下面是我要抓取数据的网址。我将 url 存储在 list.Rdata 文件中。可以从这里下载链接:https://www.dropbox.com/s/wl2per5npuq5h8y/list.Rdata?dl=1.
首先,我加载前 1000 个网址:
# Dependencies: RCurl (asynchronous download) and XML (HTML parsing) are the
# ones actually used below; rvest, httr, reshape2 and reshape are attached
# but not referenced in the code shown here.
library(RCurl)
library(rvest)
library(XML)
library(httr)
library(reshape2)
library(reshape)
# Restores an object named `list` saved in list.Rdata (a list of URLs).
# NOTE(review): this shadows base::list for the rest of the session —
# consider renaming the saved object.
load("list.Rdata")
# Keep only the first 1000 URLs.
list <- list[1:1000]
# Flatten into a plain character vector of URLs for the scraper.
un <- unlist(list)
然后我使用代码从该 url 中抓取内容:
get.asynch <- function(urls) {
  # Scrape company details from rekvizitai.vz.lt pages.
  #
  # urls: character vector of page URLs. When more than one URL is given,
  #       getURIAsynchronous() downloads them concurrently.
  # Returns a 1-row character matrix (one column per field) built with cbind().
  #
  # followlocation = TRUE is essential here: the server answers with a
  # redirect, and without this option getURIAsynchronous() returns an empty
  # body, which then makes htmlParse() fail with a confusing error.
  txt <- getURIAsynchronous(urls, .opts = curlOptions(followlocation = TRUE))
  doc <- htmlParse(txt, asText = TRUE, encoding = "UTF-8")
  base <- xpathSApply(doc, "//table//tr//td", xmlValue)

  # Value of the table cell immediately following the exactly-matching
  # label `key`, or "-" when the label is absent from the page.
  cell_after <- function(key) {
    idx <- match(key, base)
    if (is.na(idx)) "-" else base[idx + 1]
  }

  # Company name: page <title> minus the site suffix. The original regex
  # (with "." as a wildcard) is kept for identical behavior.
  title <- xpathSApply(doc, "//head//title", xmlValue)
  uab <- if (length(title) == 1) gsub(". Rekvizitai.lt", "", title) else "-"

  # Company code: approximate match (agrep) because the label spelling
  # varies. agrep() returns integer(0) when nothing matches, so test with
  # length() — the original is.na(agrep(...)) check yields logical(0) on a
  # miss and silently breaks ifelse().
  ik_idx <- agrep("Imones kodas", base)
  ik <- if (length(ik_idx) == 0) "-" else base[ik_idx[1] + 1]

  pk  <- cell_after("PVM kodas")  # VAT code
  vad <- cell_after("Vadovas")    # Director
  ad  <- cell_after("Adresas")    # Address

  # Phone numbers are rendered as images; build absolute image URLs from
  # the first two @src attributes found in the table.
  src <- xpathSApply(doc, "//table//tr//td//@src")
  tel <- if (is.na(match("Telefonas", base))) "-" else
    paste0("http://rekvizitai.vz.lt", src[1])
  mob <- if (is.na(match("Mobilusis", base))) "-" else
    paste0("http://rekvizitai.vz.lt", src[2])

  # Website: strip embedded tab characters from the cell text.
  url <- if (is.na(match("Tinklalapis", base))) "-" else
    gsub("\t", "", base[match("Tinklalapis", base) + 1])

  sk  <- cell_after("Skype")                    # Skype
  bnk <- cell_after("Bankas")                   # Bank
  ats <- cell_after("Atsiskaitomoji saskaita")  # Bank account
  dl  <- cell_after("Darbo laikas")             # Working hours

  # Employee count: keep digits only. NOTE: the original pattern "\D" is an
  # invalid escape in an R string literal (parse error); "\\D" is the
  # intended regex.
  drb_raw <- cell_after("Darbuotojai")
  drb <- if (identical(drb_raw, "-")) "-" else gsub("\\D", "", drb_raw)

  sd  <- cell_after("SD draudejo kodas")  # Social-insurance payer code
  apv <- cell_after("Apyvarta (be PVM)")  # Turnover (excl. VAT)
  trn <- cell_after("Transportas")        # Transport

  # Star rating and number of voters (either may be absent).
  iv_node <- xpathSApply(doc, "//span[@class='average']", xmlValue)
  iv <- if (length(iv_node) != 0) iv_node[1] else "-"
  vert_node <- xpathSApply(doc, "//span[@class='votes']", xmlValue)
  vert <- if (length(vert_node) != 0) vert_node[1] else "-"

  # First listed business activity (layout class varies between pages).
  veikl <- xpathSApply(
    doc,
    "//div[@class='floatLeft about']//a | //div[@class='floatLeft about half']//a | //div[@class='about floatLeft']//a",
    xmlValue
  )[1]

  # One row of results as a character matrix.
  cbind(uab, ik, pk, vad, ad, tel, mob, url, sk, bnk, ats, dl, drb, sd,
        apv, trn, iv, vert, veikl)
}
接下来,我使用我的函数来解析内容并得到错误。我很确定这个错误是对服务器的大量请求的结果。
> system.time(table <- do.call(rbind,lapply(un,get.asynch)))
Error in which(value == defs) :
argument "code" is missing, with no default Timing stopped at: 0.89 0.03 6.82
我正在寻找避免此类行为的解决方案。我尝试了 Sys.sleep() 函数,尽管结果是一样的。欢迎任何有关如何克服与服务器连接问题的帮助。
我搜索了几分钟,在这个问题的第二个回答中找到了解决方法:R getURL() returning empty string
你需要使用
txt <- getURIAsynchronous(un, .opts = curlOptions(followlocation = TRUE))
还有一个问题——你实际上并不是异步的。使用 lapply(un,get.asynch)
,您可以将 URL 一个一个地发送到 get.asynch
。要并行执行此操作,您需要 get.asynch(un)
之类的东西,但随后您必须重写其余代码。我会把它分成两部分:
抓取(curl)部分
txts <- getURIAsynchronous(un, .opts=curlOptions(followlocation = TRUE))
和解析
# Parse one downloaded HTML document (a single element of the vector
# returned by getURIAsynchronous) and extract the fields of interest.
# NOTE(review): this name shadows base::parse(); the `...` below is a
# placeholder for the remaining field-extraction code from the question,
# so this snippet is illustrative and not runnable as-is.
parse <- function(txt) {
doc <- htmlParse(txt,asText=TRUE,encoding = "UTF-8")
base <- xpathSApply(doc, "//table//tr//td",xmlValue)
...
}
table <- do.call(rbind, lapply(txts, parse))
curl 抓取部分对我来说效果很好,至少前 100 个链接是这样。不过我没有测试解析部分。