按行应用 rvest html_nodes() 并将输出存储在新列中
Apply rvest html_nodes() row-wise and store the output in a new column
我有一些要抓取的网址。我最终得到 3 个数据帧(例如):
# A tibble: 255 × 7
id class tabindex role `aria-controls` style `data-testid`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 App NA NA NA NA NA NA
2 NA re-AdTop1Container NA NA NA NA NA
3 NA re-AdTop1Container-block NA NA NA NA NA
4 NA re-AdvertisingDominanceCrossdevice-x65 NA NA NA NA NA
5 PubX65Detail_wrapper adit-XandrBanner adit-XandrBanner--notAvailable NA NA NA NA NA
6 PubX65Detail NA NA NA NA NA NA
7 NA re-AdvertisingDominanceCrossdevice-top1 NA NA NA NA NA
8 PubTop1_wrapper adit-XandrBanner adit-XandrBanner--notAvailable NA NA NA NA NA
9 PubTop1 NA NA NA NA NA NA
10 NA react-MoleculeDrawer-content react-MoleculeDrawer-content--placement-left react-MoleculeDrawer-content--size-auto react-Molecul… NA NA NA NA NA
我想遍历 class
列中的每一行并将收集的数据存储在新列中。
即我可以使用以下方式手动收集数据:
# Manual lookup: select the nodes of html3 matching one hard-coded class
html_nodes(html3, ".re-DetailHeader-propertyTitleContainer")
但我想保留 rvest 收集数据的“结构”。我想创建一个新列,使用 class 列中的类名逐行调用 html_nodes(),并把所有返回的结果保存在这个新列中。
代码:
url1 <- "https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/terraza-piscina/163103410/d"
url2 <- "https://www.fotocasa.es/es/comprar/vivienda/elche---elx/calefaccion-terraza-ascensor-parking-internet-no-amueblado/162434119/d"
url3 <- "https://www.fotocasa.es/es/comprar/vivienda/almoradi/terraza-trastero-ascensor-amueblado/163000099/d"

# Gather the attributes of every <div> of a parsed page.
#
# page: a parsed HTML document, as returned by read_html().
# Returns the per-<div> attribute vectors bound into one tibble (one column
# per attribute name), with empty strings replaced by NA.
collect_div_attrs <- function(page) {
  page %>%
    html_nodes("div") %>%
    html_attrs() %>%
    bind_rows() %>%
    # across() is the current replacement for the superseded mutate_all()
    mutate(across(everything(), ~ na_if(.x, "")))
}

# Process url 1 ----
html1 <- read_html(url1)
classAttrs_1 <- collect_div_attrs(html1)

# Process url 2 ----
html2 <- read_html(url2)
classAttrs_2 <- collect_div_attrs(html2)

# Process url 3 ----
html3 <- read_html(url3)
classAttrs_3 <- collect_div_attrs(html3)
每个 URL 抓取到的不同 class 的数量可能不一样,例如:
> length(unique(classAttrs_1$class))
[1] 113
> length(unique(classAttrs_2$class))
[1] 114
> length(unique(classAttrs_3$class))
[1] 115
所以我考虑单独处理每个数据框。
我们可以使用 rowwise,检查 'class' 中的值是否为非 NA:若非 NA 则应用代码,并把结果存入一个 list 列;否则返回 NA。
library(rvest)
library(dplyr)
library(stringr)
# Add a list-column `new` to classAttrs_3: for each row, the nodes of html3
# selected by that row's class attribute, or NA when the row has no class.
classAttrs_3_new <- classAttrs_3 %>%
  rowwise() %>%
  mutate(
    new = list(
      if (is.na(class)) {
        NA
      } else {
        # A class attribute can hold several space-separated classes ("a b").
        # Prefixing only a "." would give ".a b", which CSS reads as "an
        # element named b inside .a" and which matches nothing. Joining the
        # classes with "." gives ".a.b", i.e. elements carrying both classes.
        html_nodes(
          html3,
          str_c(".", str_replace_all(str_squish(class), " ", "."))
        )
      }
    )
  ) %>%
  ungroup()
-输出
> head(classAttrs_3_new$new)
[[1]]
[1] NA
[[2]]
{xml_nodeset (1)}
[1] <div class="re-AdTop1Container"><div class="re-AdTop1Container-block">\n<div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrB ...
[[3]]
{xml_nodeset (1)}
[1] <div class="re-AdTop1Container-block">\n<div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrBanner adit-XandrBanner--notAvail ...
[[4]]
{xml_nodeset (1)}
[1] <div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrBanner adit-XandrBanner--notAvailable"><div id="PubX65Detail"></div></div ...
[[5]]
{xml_nodeset (0)}
[[6]]
[1] NA
或者另一个选项是 map
library(purrr)
# Fallible node lookup: possibly() turns any error raised by html_nodes()
# into NA (presumably this is what covers rows whose class is NA, since the
# selector built for them is NA as well -- confirm).
pfun_node <- possibly(
  function(html_obj, node_val) {
    html_nodes(html_obj, node_val)
  },
  otherwise = NA
)

# One selector per row. Space-separated classes ("a b") are joined with "."
# so the selector is ".a.b" (element carrying both classes) rather than
# ".a b", a descendant selector that matches nothing.
classAttrs_3$new <- map(
  str_c(".", str_replace_all(str_squish(classAttrs_3$class), " ", ".")),
  ~ pfun_node(html3, .x)
)
我有一些要抓取的网址。我最终得到 3 个数据帧(例如):
# A tibble: 255 × 7
id class tabindex role `aria-controls` style `data-testid`
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 App NA NA NA NA NA NA
2 NA re-AdTop1Container NA NA NA NA NA
3 NA re-AdTop1Container-block NA NA NA NA NA
4 NA re-AdvertisingDominanceCrossdevice-x65 NA NA NA NA NA
5 PubX65Detail_wrapper adit-XandrBanner adit-XandrBanner--notAvailable NA NA NA NA NA
6 PubX65Detail NA NA NA NA NA NA
7 NA re-AdvertisingDominanceCrossdevice-top1 NA NA NA NA NA
8 PubTop1_wrapper adit-XandrBanner adit-XandrBanner--notAvailable NA NA NA NA NA
9 PubTop1 NA NA NA NA NA NA
10 NA react-MoleculeDrawer-content react-MoleculeDrawer-content--placement-left react-MoleculeDrawer-content--size-auto react-Molecul… NA NA NA NA NA
我想遍历 class
列中的每一行并将收集的数据存储在新列中。
即我可以使用以下方式手动收集数据:
# Manual lookup: select the nodes of html3 matching one hard-coded class
html_nodes(html3, ".re-DetailHeader-propertyTitleContainer")
但我想保留 rvest 收集数据的“结构”。我想创建一个新列,使用 class 列中的类名逐行调用 html_nodes(),并把所有返回的结果保存在这个新列中。
代码:
url1 <- "https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/terraza-piscina/163103410/d"
url2 <- "https://www.fotocasa.es/es/comprar/vivienda/elche---elx/calefaccion-terraza-ascensor-parking-internet-no-amueblado/162434119/d"
url3 <- "https://www.fotocasa.es/es/comprar/vivienda/almoradi/terraza-trastero-ascensor-amueblado/163000099/d"

# Gather the attributes of every <div> of a parsed page.
#
# page: a parsed HTML document, as returned by read_html().
# Returns the per-<div> attribute vectors bound into one tibble (one column
# per attribute name), with empty strings replaced by NA.
collect_div_attrs <- function(page) {
  page %>%
    html_nodes("div") %>%
    html_attrs() %>%
    bind_rows() %>%
    # across() is the current replacement for the superseded mutate_all()
    mutate(across(everything(), ~ na_if(.x, "")))
}

# Process url 1 ----
html1 <- read_html(url1)
classAttrs_1 <- collect_div_attrs(html1)

# Process url 2 ----
html2 <- read_html(url2)
classAttrs_2 <- collect_div_attrs(html2)

# Process url 3 ----
html3 <- read_html(url3)
classAttrs_3 <- collect_div_attrs(html3)
每个 URL 抓取到的不同 class 的数量可能不一样,例如:
> length(unique(classAttrs_1$class))
[1] 113
> length(unique(classAttrs_2$class))
[1] 114
> length(unique(classAttrs_3$class))
[1] 115
所以我考虑单独处理每个数据框。
我们可以使用 rowwise,检查 'class' 中的值是否为非 NA:若非 NA 则应用代码,并把结果存入一个 list 列;否则返回 NA。
library(rvest)
library(dplyr)
library(stringr)
# Add a list-column `new` to classAttrs_3: for each row, the nodes of html3
# selected by that row's class attribute, or NA when the row has no class.
classAttrs_3_new <- classAttrs_3 %>%
  rowwise() %>%
  mutate(
    new = list(
      if (is.na(class)) {
        NA
      } else {
        # A class attribute can hold several space-separated classes ("a b").
        # Prefixing only a "." would give ".a b", which CSS reads as "an
        # element named b inside .a" and which matches nothing. Joining the
        # classes with "." gives ".a.b", i.e. elements carrying both classes.
        html_nodes(
          html3,
          str_c(".", str_replace_all(str_squish(class), " ", "."))
        )
      }
    )
  ) %>%
  ungroup()
-输出
> head(classAttrs_3_new$new)
[[1]]
[1] NA
[[2]]
{xml_nodeset (1)}
[1] <div class="re-AdTop1Container"><div class="re-AdTop1Container-block">\n<div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrB ...
[[3]]
{xml_nodeset (1)}
[1] <div class="re-AdTop1Container-block">\n<div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrBanner adit-XandrBanner--notAvail ...
[[4]]
{xml_nodeset (1)}
[1] <div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrBanner adit-XandrBanner--notAvailable"><div id="PubX65Detail"></div></div ...
[[5]]
{xml_nodeset (0)}
[[6]]
[1] NA
或者另一个选项是 map
library(purrr)
# Fallible node lookup: possibly() turns any error raised by html_nodes()
# into NA (presumably this is what covers rows whose class is NA, since the
# selector built for them is NA as well -- confirm).
pfun_node <- possibly(
  function(html_obj, node_val) {
    html_nodes(html_obj, node_val)
  },
  otherwise = NA
)

# One selector per row. Space-separated classes ("a b") are joined with "."
# so the selector is ".a.b" (element carrying both classes) rather than
# ".a b", a descendant selector that matches nothing.
classAttrs_3$new <- map(
  str_c(".", str_replace_all(str_squish(classAttrs_3$class), " ", ".")),
  ~ pfun_node(html3, .x)
)