按行应用 rvest html_nodes() 并将输出存储在新列中

Apply rvest html_nodes() row-wise and store the output in a new column

我有一些要抓取的网址。我最终得到 3 个数据帧(例如):

# A tibble: 255 × 7
   id                   class                                                                                                                            tabindex role  `aria-controls` style `data-testid`
   <chr>                <chr>                                                                                                                            <chr>    <chr> <chr>           <chr> <chr>        
 1 App                  NA                                                                                                                               NA       NA    NA              NA    NA           
 2 NA                   re-AdTop1Container                                                                                                               NA       NA    NA              NA    NA           
 3 NA                   re-AdTop1Container-block                                                                                                         NA       NA    NA              NA    NA           
 4 NA                   re-AdvertisingDominanceCrossdevice-x65                                                                                           NA       NA    NA              NA    NA           
 5 PubX65Detail_wrapper adit-XandrBanner adit-XandrBanner--notAvailable                                                                                  NA       NA    NA              NA    NA           
 6 PubX65Detail         NA                                                                                                                               NA       NA    NA              NA    NA           
 7 NA                   re-AdvertisingDominanceCrossdevice-top1                                                                                          NA       NA    NA              NA    NA           
 8 PubTop1_wrapper      adit-XandrBanner adit-XandrBanner--notAvailable                                                                                  NA       NA    NA              NA    NA           
 9 PubTop1              NA                                                                                                                               NA       NA    NA              NA    NA           
10 NA                   react-MoleculeDrawer-content react-MoleculeDrawer-content--placement-left react-MoleculeDrawer-content--size-auto react-Molecul… NA       NA    NA              NA    NA

我想遍历 class 列中的每一行并将收集的数据存储在新列中。

即我可以使用以下方式手动收集数据:

# Manually look up the nodes in the parsed page `html3` that carry one
# specific class; the resulting nodeset is printed at the console.
html_nodes(html3, ".re-DetailHeader-propertyTitleContainer")

但我想保留 rvest 收集数据的“结构”。我想创建一个新列,用 class 列中的各个类名调用 html_nodes(),并把返回的所有结果保存在这一列中。

代码:

# Listing pages to scrape (one property detail page per URL).
url1 <- "https://www.fotocasa.es/es/comprar/vivienda/madrid-capital/terraza-piscina/163103410/d"
url2 <- "https://www.fotocasa.es/es/comprar/vivienda/elche---elx/calefaccion-terraza-ascensor-parking-internet-no-amueblado/162434119/d"
url3 <- "https://www.fotocasa.es/es/comprar/vivienda/almoradi/terraza-trastero-ascensor-amueblado/163000099/d"



# NOTE: this section calls read_html(), html_nodes() and html_attrs() (rvest)
# and bind_rows(), mutate(), across(), everything() and na_if() (dplyr), so
# both packages must be attached before it runs.

# Gather the attributes of the <div> nodes of a parsed page into one data
# frame (one column per attribute name) and turn empty-string values into NA.
#
# page: a parsed HTML document as returned by read_html().
# Returns the data frame produced by bind_rows() on the html_attrs() output.
collect_div_attrs <- function(page) {
  page %>%
    html_nodes("div") %>%
    html_attrs() %>%
    bind_rows() %>%
    # across() replaces the superseded mutate_all(na_if, "").
    mutate(across(everything(), ~ na_if(.x, "")))
}

##### process url 1 #####
html1 <- read_html(url1)
classAttrs_1 <- collect_div_attrs(html1)

########################

##### process url 2 #####
html2 <- read_html(url2)
classAttrs_2 <- collect_div_attrs(html2)

########################

##### process url 3 #####
html3 <- read_html(url3)
classAttrs_3 <- collect_div_attrs(html3)

########################

每个 URL 收集到的类的数量可能不同,即:

> length(unique(classAttrs_1$class))
[1] 113
> length(unique(classAttrs_2$class))
[1] 114
> length(unique(classAttrs_3$class))
[1] 115

所以我考虑单独处理每个数据框。

我们可以使用 rowwise:检查 'class' 列中的值是否不是 NA,如果不是就应用上述代码,并把结果存入一个 list 列(否则返回 NA)

library(rvest)
library(dplyr)
library(stringr)
# For every row of `classAttrs_3`, look up the nodes in `html3` that match the
# row's `class` value and keep the resulting nodeset in the list column `new`
# (NA when the row has no class).
#
# A class attribute may hold several space-separated names, e.g.
# "adit-XandrBanner adit-XandrBanner--notAvailable". Prefixing only the first
# name with "." gives the selector ".a b", which asks for a <b> element nested
# inside ".a" and therefore matches nothing. The whitespace between names is
# replaced by "." so the compound selector ".a.b" is used instead.
classAttrs_3_new <- classAttrs_3 %>%
  rowwise() %>%
  mutate(
    new = list(
      if (is.na(class)) {
        NA
      } else {
        html_nodes(
          html3,
          str_c(".", str_replace_all(str_squish(class), " ", "."))
        )
      }
    )
  ) %>%
  ungroup()

-输出

> head(classAttrs_3_new$new)
[[1]]
[1] NA

[[2]]
{xml_nodeset (1)}
[1] <div class="re-AdTop1Container"><div class="re-AdTop1Container-block">\n<div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrB ...

[[3]]
{xml_nodeset (1)}
[1] <div class="re-AdTop1Container-block">\n<div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrBanner adit-XandrBanner--notAvail ...

[[4]]
{xml_nodeset (1)}
[1] <div class="re-AdvertisingDominanceCrossdevice-x65"><div id="PubX65Detail_wrapper" class="adit-XandrBanner adit-XandrBanner--notAvailable"><div id="PubX65Detail"></div></div ...

[[5]]
{xml_nodeset (0)}

[[6]]
[1] NA

或者另一个选项是 map

library(purrr)
# Node lookup wrapped in possibly(): when html_nodes() raises an error for a
# selector, NA is returned for that element instead of aborting the whole map.
pfun_node <- possibly(
  function(html_obj, node_val) html_nodes(html_obj, node_val),
  otherwise = NA
)

# Build one selector per row and store the matching nodesets in the list
# column `new`. Space-separated class names are joined with "." so that a
# value such as "a b" becomes the compound selector ".a.b" rather than the
# descendant selector ".a b", which would match nothing.
classAttrs_3$new <- map(
  str_c(".", str_replace_all(str_squish(classAttrs_3$class), " ", ".")),
  ~ pfun_node(html3, .x)
)