网页抓取:网页抓取的对象与网站上的信息不匹配并导致 RStudio 崩溃
web-scraping: web-scraped object doesn't match information on the website and crashes RStudio
我收集了一系列类似于this one的URL。对于每个 URL,我正在使用 rvest
包来网络抓取与网页每个框中列出的每个从业者地址相关的信息。通过检查网页的 HTML 结构,我可以注意到我要检索的信息存在于名为 unit size1of2
的 HTML 部分中(通过将光标悬停,如 div.unit.size1of2
)。然后,我用下面的代码提取了我需要的信息:
library(rvest)
library(xml2) # the package is named "xml2"; "xlm2" does not exist

# Parse the page ("myURL" is a placeholder for the real address).
webpage <- read_html(x = "myURL")

# Select every <div> carrying both the "unit" and "size1of2" classes and
# extract its text with surrounding whitespace trimmed.
webpage_name <- webpage %>%
  html_nodes("div.unit.size1of2") %>%
  html_text(trim = TRUE)
但是,当我提取信息时,得到的结果非常混乱。首先,其中有些信息是我不想抓取的,有些甚至似乎根本没有出现在网站上。此外,每次我尝试输出结果时,RStudio IDE 都会冻结一段时间,之后任何命令都无法正常执行。总之,得到的结果并不是我想要的。
您认为这是因为网站上存在某种保护措施吗?
感谢您的帮助!
您可以先遍历可通过 div.search-result .line
选择到的各行,然后:
- 使用
div:first-child h3
获取名称
- 使用
div:first-child p
获取序数
- 通过迭代
div:nth-child(2) p
获取位置,因为可以有多个位置(一个在您的页面上有 5 个位置)并将它们存储在列表中
对于名称和序号,需要使用 gsub("[\t\n]", "", x)
删除其中的制表符和换行符。对于地址,您可以获取文本,删除重复的换行符以及开头和结尾的换行符,然后按换行符 \n
进行拆分,以获得如下列表:
[1] "CABINET VÉTÉRINAIRE DV FEYS JEAN-MARC"
[2] "Cabinet Veterinaire"
[3] "ZA de Kercadiou"
[4] "XXXXX"
[5] "LANVOLLON"
[6] "Tél : 0X.XX.XX.XX.XX"
以下代码还将向量列表转换为包含该页面上所有数据的数据框:
library(rvest)
library(plyr)
url = "https://www.veterinaire.fr/annuaires/trouver-un-veterinaire-pour-soigner-mon-animal.html?tx_siteveterinaire_general%5B__referrer%5D%5B%40extension%5D=SiteVeterinaire&tx_siteveterinaire_general%5B__referrer%5D%5B%40vendor%5D=SiteVeterinaire&tx_siteveterinaire_general%5B__referrer%5D%5B%40controller%5D=FrontendUser&tx_siteveterinaire_general%5B__referrer%5D%5B%40action%5D=search&tx_siteveterinaire_general%5B__referrer%5D%5Barguments%5D=YToxOntzOjY6InNlYXJjaCI7YTo1OntzOjM6Im5vbSI7czowOiIiO3M6NjoicmVnaW9uIjtzOjA6IiI7czoxMToiZGVwYXJ0ZW1lbnQiO3M6MDoiIjtzOjU6InZpbGxlIjtzOjA6IiI7czoxMjoiaXRlbXNQZXJQYWdlIjtzOjI6IjEwIjt9fQ%3D%3D21a1899f9a133814dfc1eb4e01b3b47913bd9925&tx_siteveterinaire_general%5B__referrer%5D%5B%40request%5D=a%3A4%3A%7Bs%3A10%3A%22%40extension%22%3Bs%3A15%3A%22SiteVeterinaire%22%3Bs%3A11%3A%22%40controller%22%3Bs%3A12%3A%22FrontendUser%22%3Bs%3A7%3A%22%40action%22%3Bs%3A6%3A%22search%22%3Bs%3A7%3A%22%40vendor%22%3Bs%3A15%3A%22SiteVeterinaire%22%3B%7D7cd75ca141359a98763248c24da8103293a53d08&tx_siteveterinaire_general%5B__trustedProperties%5D=a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A5%3A%7Bs%3A3%3A%22nom%22%3Bi%3A1%3Bs%3A6%3A%22region%22%3Bi%3A1%3Bs%3A11%3A%22departement%22%3Bi%3A1%3Bs%3A5%3A%22ville%22%3Bi%3A1%3Bs%3A12%3A%22itemsPerPage%22%3Bi%3A1%3B%7D%7D86c9510d17c093c44d053714ab20567929a45f9d&tx_siteveterinaire_general%5Bsearch%5D%5Bnom%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bregion%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bdepartement%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bville%5D=&tx_siteveterinaire_general%5Bsearch%5D%5BitemsPerPage%5D=100&tx_siteveterinaire_general%5B%40widget_0%5D%5BcurrentPage%5D=127&cHash=8d8dc78e004b4b9d0ecfdf9b884f54ca"
# Fetch the page at `url` (assigned just above) and select one node per
# search-result row.
rows <- read_html(url) %>%
  html_nodes("div.search-result .line")

# Remove every tab and newline from a string (used for name and ordinal).
strip <- function(x) gsub("[\t\n]", "", x)

# Split the raw text of one location paragraph into its individual lines.
# Returns the strsplit() result: a list of length one holding a character
# vector, which is the shape the data-frame conversion below relies on.
clean_address <- function(txt) {
  txt <- gsub("\t", "", txt, fixed = TRUE) # remove tabs
  # Collapse runs of newlines. The earlier pattern '([\n])\1+' did not work:
  # inside an R string "\1" is an octal escape (character U+0001), not a
  # regex backreference, so duplicate newlines were never matched.
  txt <- gsub("\n+", "\n", txt)
  txt <- gsub("^\n|\n$", "", txt) # remove leading and trailing \n
  strsplit(txt, split = "\n", fixed = TRUE) # split by \n
}

i <- 1
data <- list()
for (r in rows) {
  # A row can list several locations; collect each one as its own entry.
  addresses <- list()
  j <- 1
  locations <- r %>% html_nodes("div:nth-child(2) p")
  for (loc in locations) {
    addresses[[j]] <- clean_address(html_text(loc))
    j <- j + 1
  }
  # c() flattens `addresses` one level, producing elements named "addresses"
  # (single location) or "addresses1", "addresses2", ... (several locations).
  data[[i]] <- c(
    name = r %>% html_nodes("div:first-child h3") %>% html_text() %>% strip(.),
    ordinal = r %>% html_nodes("div:first-child p") %>% html_text() %>% strip(.),
    addresses = addresses
  )
  i <- i + 1
}

# One single-row data frame per scraped row; rbind.fill() pads the columns a
# given row does not have with NA.
df <- rbind.fill(lapply(data, function(y) {
  as.data.frame(t(y), stringsAsFactors = FALSE)
}))

# show data
print(df)

# Print up to the first three rows, limited to the columns that actually
# exist: which "addresses*" columns are present depends on how many locations
# the scraped rows had, and the page may yield fewer than three rows.
n_show <- if (is.null(df)) 0L else min(3L, nrow(df))
show_cols <- intersect(
  c("name", "ordinal", "addresses", "addresses1", "addresses2", "addresses3"),
  names(df)
)
for (i in seq_len(n_show)) {
  for (col in show_cols) {
    print(paste(col, df[i, col]))
  }
}
我收集了一系列类似于this one的URL。对于每个 URL,我正在使用 rvest
包来网络抓取与网页每个框中列出的每个从业者地址相关的信息。通过检查网页的 HTML 结构,我可以注意到我要检索的信息存在于名为 unit size1of2
的 HTML 部分中(通过将光标悬停,如 div.unit.size1of2
)。然后,我用下面的代码提取了我需要的信息:
library(rvest)
library(xml2) # the package is named "xml2"; "xlm2" does not exist

# Parse the page ("myURL" is a placeholder for the real address).
webpage <- read_html(x = "myURL")

# Select every <div> carrying both the "unit" and "size1of2" classes and
# extract its text with surrounding whitespace trimmed.
webpage_name <- webpage %>%
  html_nodes("div.unit.size1of2") %>%
  html_text(trim = TRUE)
但是,当我提取信息时,得到的结果非常混乱。首先,其中有些信息是我不想抓取的,有些甚至似乎根本没有出现在网站上。此外,每次我尝试输出结果时,RStudio IDE 都会冻结一段时间,之后任何命令都无法正常执行。总之,得到的结果并不是我想要的。
您认为这是因为网站上存在某种保护措施吗?
感谢您的帮助!
您可以先遍历可通过 div.search-result .line
选择到的各行,然后:
- 使用
div:first-child h3
获取名称
- 使用
div:first-child p
获取序数
- 通过迭代
div:nth-child(2) p
获取位置,因为可以有多个位置(一个在您的页面上有 5 个位置)并将它们存储在列表中
对于名称和序号,需要使用 gsub("[\t\n]", "", x)
删除其中的制表符和换行符。对于地址,您可以获取文本,删除重复的换行符以及开头和结尾的换行符,然后按换行符 \n
进行拆分,以获得如下列表:
[1] "CABINET VÉTÉRINAIRE DV FEYS JEAN-MARC"
[2] "Cabinet Veterinaire"
[3] "ZA de Kercadiou"
[4] "XXXXX"
[5] "LANVOLLON"
[6] "Tél : 0X.XX.XX.XX.XX"
以下代码还将向量列表转换为包含该页面上所有数据的数据框:
library(rvest)
library(plyr)
url = "https://www.veterinaire.fr/annuaires/trouver-un-veterinaire-pour-soigner-mon-animal.html?tx_siteveterinaire_general%5B__referrer%5D%5B%40extension%5D=SiteVeterinaire&tx_siteveterinaire_general%5B__referrer%5D%5B%40vendor%5D=SiteVeterinaire&tx_siteveterinaire_general%5B__referrer%5D%5B%40controller%5D=FrontendUser&tx_siteveterinaire_general%5B__referrer%5D%5B%40action%5D=search&tx_siteveterinaire_general%5B__referrer%5D%5Barguments%5D=YToxOntzOjY6InNlYXJjaCI7YTo1OntzOjM6Im5vbSI7czowOiIiO3M6NjoicmVnaW9uIjtzOjA6IiI7czoxMToiZGVwYXJ0ZW1lbnQiO3M6MDoiIjtzOjU6InZpbGxlIjtzOjA6IiI7czoxMjoiaXRlbXNQZXJQYWdlIjtzOjI6IjEwIjt9fQ%3D%3D21a1899f9a133814dfc1eb4e01b3b47913bd9925&tx_siteveterinaire_general%5B__referrer%5D%5B%40request%5D=a%3A4%3A%7Bs%3A10%3A%22%40extension%22%3Bs%3A15%3A%22SiteVeterinaire%22%3Bs%3A11%3A%22%40controller%22%3Bs%3A12%3A%22FrontendUser%22%3Bs%3A7%3A%22%40action%22%3Bs%3A6%3A%22search%22%3Bs%3A7%3A%22%40vendor%22%3Bs%3A15%3A%22SiteVeterinaire%22%3B%7D7cd75ca141359a98763248c24da8103293a53d08&tx_siteveterinaire_general%5B__trustedProperties%5D=a%3A1%3A%7Bs%3A6%3A%22search%22%3Ba%3A5%3A%7Bs%3A3%3A%22nom%22%3Bi%3A1%3Bs%3A6%3A%22region%22%3Bi%3A1%3Bs%3A11%3A%22departement%22%3Bi%3A1%3Bs%3A5%3A%22ville%22%3Bi%3A1%3Bs%3A12%3A%22itemsPerPage%22%3Bi%3A1%3B%7D%7D86c9510d17c093c44d053714ab20567929a45f9d&tx_siteveterinaire_general%5Bsearch%5D%5Bnom%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bregion%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bdepartement%5D=&tx_siteveterinaire_general%5Bsearch%5D%5Bville%5D=&tx_siteveterinaire_general%5Bsearch%5D%5BitemsPerPage%5D=100&tx_siteveterinaire_general%5B%40widget_0%5D%5BcurrentPage%5D=127&cHash=8d8dc78e004b4b9d0ecfdf9b884f54ca"
# Fetch the page at `url` (assigned just above) and select one node per
# search-result row.
rows <- read_html(url) %>%
  html_nodes("div.search-result .line")

# Remove every tab and newline from a string (used for name and ordinal).
strip <- function(x) gsub("[\t\n]", "", x)

# Split the raw text of one location paragraph into its individual lines.
# Returns the strsplit() result: a list of length one holding a character
# vector, which is the shape the data-frame conversion below relies on.
clean_address <- function(txt) {
  txt <- gsub("\t", "", txt, fixed = TRUE) # remove tabs
  # Collapse runs of newlines. The earlier pattern '([\n])\1+' did not work:
  # inside an R string "\1" is an octal escape (character U+0001), not a
  # regex backreference, so duplicate newlines were never matched.
  txt <- gsub("\n+", "\n", txt)
  txt <- gsub("^\n|\n$", "", txt) # remove leading and trailing \n
  strsplit(txt, split = "\n", fixed = TRUE) # split by \n
}

i <- 1
data <- list()
for (r in rows) {
  # A row can list several locations; collect each one as its own entry.
  addresses <- list()
  j <- 1
  locations <- r %>% html_nodes("div:nth-child(2) p")
  for (loc in locations) {
    addresses[[j]] <- clean_address(html_text(loc))
    j <- j + 1
  }
  # c() flattens `addresses` one level, producing elements named "addresses"
  # (single location) or "addresses1", "addresses2", ... (several locations).
  data[[i]] <- c(
    name = r %>% html_nodes("div:first-child h3") %>% html_text() %>% strip(.),
    ordinal = r %>% html_nodes("div:first-child p") %>% html_text() %>% strip(.),
    addresses = addresses
  )
  i <- i + 1
}

# One single-row data frame per scraped row; rbind.fill() pads the columns a
# given row does not have with NA.
df <- rbind.fill(lapply(data, function(y) {
  as.data.frame(t(y), stringsAsFactors = FALSE)
}))

# show data
print(df)

# Print up to the first three rows, limited to the columns that actually
# exist: which "addresses*" columns are present depends on how many locations
# the scraped rows had, and the page may yield fewer than three rows.
n_show <- if (is.null(df)) 0L else min(3L, nrow(df))
show_cols <- intersect(
  c("name", "ordinal", "addresses", "addresses1", "addresses2", "addresses3"),
  names(df)
)
for (i in seq_len(n_show)) {
  for (col in show_cols) {
    print(paste(col, df[i, col]))
  }
}