如何在 R 中组合两组数据并在解析后将它们分别添加到单个列中?
How in R can I combine two groups of data and add them each into single columns after parsing?
library(rvest)
# Question's original attempt, kept verbatim. The NOTE(review) comments mark
# the spots that keep it from producing one data frame per listing.
link1 <- "https://somon.tj/adv/7866644_5-komn-kvartira-3-etazh-79-m2-a-sino/"
link2 <- "https://somon.tj/adv/7985721_2-komn-dom-grandzavod/"
house_link <- c(link1, link2)
# NOTE(review): `houselink` is not defined in this script; the vector created
# above is named `house_link`.
house_features = lapply(houselink, function(link) {
page_data <-
tryCatch({
read_html(link)
# NOTE(review): the document returned by read_html() above is not stored, and
# `page_data` is referenced here while its own assignment is still being
# evaluated. The value of this braced expression is the `pricing` text.
pricing = page_data %>% html_nodes("h1") %>% html_text(trim = T)},
# Both handlers hand back the condition object as the value of tryCatch().
error = function(e) e,
warning = function(w) w)
# Only objects inheriting from "error" are routed to the NULL branch.
if(!inherits(page_data, "error")) {
# NOTE(review): this data frame is not the last expression of the branch, so
# its value is discarded.
data.frame(
link = link,
parameters = page_data %>% html_nodes(".label") %>% html_text(trim = TRUE),
values = page_data %>% html_nodes(".info") %>% html_text(trim = TRUE)
)
# The list below is the last expression, so it is what this branch returns.
list(
pricing = page_data %>% html_nodes("h1") %>% html_text(trim = T)
)
} else {
NULL
}
})
但是当我使用 do.call(rbind) 时,它会产生错误:
# Stack the per-listing frames, give repeated parameter names within a listing
# a numeric suffix, then spread the parameters into one column per name.
do.call(what = rbind, args = house_features) %>%
  group_by(link, parameters) %>%
  mutate(
    # The first row of each (link, parameters) group keeps the plain name;
    # later rows get their position appended, e.g. "Floor 2".
    parameters = if_else(
      row_number() == 1,
      parameters,
      paste(parameters, row_number())
    )
  ) %>%
  pivot_wider(
    id_cols = link,
    names_from = parameters,
    values_from = values
  )
其中一个链接有 19 个变量,而第二个链接只有 5 个变量,差异一目了然。如何把所有变量分别放入各自的列中?如果某个链接在某个变量上没有值(比如另外那 14 个变量),我希望把该变量的值填为 NA。各位,我应该怎么做?
试试这个方法:
- 在列表中收集房屋特征
# Scrape the parameter/value pairs of every listing in `house_link`.
#
# Result: a list with one element per link. Each element is a long-format
# data frame with columns `link`, `parameters` and `values`, or NULL when the
# page could not be read.
house_features <- lapply(house_link, function(link) {
  # Map both errors and warnings to NULL. A warning object does not inherit
  # from "error", so returning it would let it reach html_nodes() below.
  page_data <- tryCatch(
    read_html(link),
    error = function(e) NULL,
    warning = function(w) NULL
  )
  if (is.null(page_data)) {
    return(NULL)
  }

  parameters <- page_data %>% html_nodes(".label") %>% html_text(trim = TRUE)
  values <- page_data %>% html_nodes(".info") %>% html_text(trim = TRUE)

  data.frame(
    # Repeat the link explicitly so a page without any ".label" nodes yields
    # a zero-row frame instead of a "differing number of rows" error.
    link = rep(link, length(parameters)),
    parameters = parameters,
    values = values
  )
})
- 使用 do.call 对它们执行 rbind,并确保参数名称唯一(原始数据中并不唯一,例如 link1 有两个名为 Floor 的参数),然后使用 pivot_wider:
# Stack the per-listing frames, give repeated parameter names within a listing
# a numeric suffix, then spread the parameters into one column per name.
do.call(what = rbind, args = house_features) %>%
  group_by(link, parameters) %>%
  mutate(
    # The first row of each (link, parameters) group keeps the plain name;
    # later rows get their position appended, e.g. "Floor 2".
    parameters = if_else(
      row_number() == 1,
      parameters,
      paste(parameters, row_number())
    )
  ) %>%
  pivot_wider(
    id_cols = link,
    names_from = parameters,
    values_from = values
  )
输出:
link `Type of offer` Category House Floor Area Condition Internet Toilet Gas `Front door` Parking Furniture `Floor 2` `Ceiling height` Security Other `Possibility of…
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 https… from owner elite monol… 9 fl… 107 … european… optics 2 bat… trunk armored parking fully fu… laminate 3 m. bars on… plas… no
2 https… from agent NA panel… NA 255 … NA NA NA NA NA NA NA NA NA NA NA NA
# … with 4 more variables: Possibility of getting a mortgage <chr>, Possibility of exchange <chr>, Number of floors <chr>, Heating <chr>
# Widen the long table into one row per listing, keeping `pricing` next to
# `link` as an identifier column.
# NOTE(review): assumes the frames in `house_features` carry a `pricing`
# column; the collection step that adds it is not visible here - confirm.
house_data <- do.call(rbind, house_features) %>%
  group_by(link, parameters) %>%
  mutate(
    # Suffix repeated parameter names within a listing (e.g. "Floor 2") so
    # every name maps to exactly one output column.
    parameters = if_else(
      row_number() > 1,
      paste(parameters, row_number()),
      parameters
    )
  ) %>%
  pivot_wider(
    # No trailing comma inside c(): an empty argument is not a valid selection.
    id_cols = c(link, pricing),
    names_from = parameters,
    values_from = values
  ) %>%
  # Drop any grouping left over from group_by() so later verbs applied to
  # `house_data` operate on the whole table.
  ungroup()
我发现了什么?
尽管如您所见,变量 pricing 可能会在数据框中造成重复和冗余,但令人惊讶的是,与传统的 for 循环相比,lapply 函数的运行速度依然非常快!
我是说,你的回答把所有问题都解决了。谢谢 @langtang :)
library(rvest)
# Question's original attempt, kept verbatim. The NOTE(review) comments mark
# the spots that keep it from producing one data frame per listing.
link1 <- "https://somon.tj/adv/7866644_5-komn-kvartira-3-etazh-79-m2-a-sino/"
link2 <- "https://somon.tj/adv/7985721_2-komn-dom-grandzavod/"
house_link <- c(link1, link2)
# NOTE(review): `houselink` is not defined in this script; the vector created
# above is named `house_link`.
house_features = lapply(houselink, function(link) {
page_data <-
tryCatch({
read_html(link)
# NOTE(review): the document returned by read_html() above is not stored, and
# `page_data` is referenced here while its own assignment is still being
# evaluated. The value of this braced expression is the `pricing` text.
pricing = page_data %>% html_nodes("h1") %>% html_text(trim = T)},
# Both handlers hand back the condition object as the value of tryCatch().
error = function(e) e,
warning = function(w) w)
# Only objects inheriting from "error" are routed to the NULL branch.
if(!inherits(page_data, "error")) {
# NOTE(review): this data frame is not the last expression of the branch, so
# its value is discarded.
data.frame(
link = link,
parameters = page_data %>% html_nodes(".label") %>% html_text(trim = TRUE),
values = page_data %>% html_nodes(".info") %>% html_text(trim = TRUE)
)
# The list below is the last expression, so it is what this branch returns.
list(
pricing = page_data %>% html_nodes("h1") %>% html_text(trim = T)
)
} else {
NULL
}
})
但是当我使用 do.call(rbind) 时,它会产生错误:
# Stack the per-listing frames, give repeated parameter names within a listing
# a numeric suffix, then spread the parameters into one column per name.
do.call(what = rbind, args = house_features) %>%
  group_by(link, parameters) %>%
  mutate(
    # The first row of each (link, parameters) group keeps the plain name;
    # later rows get their position appended, e.g. "Floor 2".
    parameters = if_else(
      row_number() == 1,
      parameters,
      paste(parameters, row_number())
    )
  ) %>%
  pivot_wider(
    id_cols = link,
    names_from = parameters,
    values_from = values
  )
其中一个链接有 19 个变量,而第二个链接只有 5 个变量,差异一目了然。如何把所有变量分别放入各自的列中?如果某个链接在某个变量上没有值(比如另外那 14 个变量),我希望把该变量的值填为 NA。各位,我应该怎么做?
试试这个方法:
- 在列表中收集房屋特征
# Scrape the parameter/value pairs of every listing in `house_link`.
#
# Result: a list with one element per link. Each element is a long-format
# data frame with columns `link`, `parameters` and `values`, or NULL when the
# page could not be read.
house_features <- lapply(house_link, function(link) {
  # Map both errors and warnings to NULL. A warning object does not inherit
  # from "error", so returning it would let it reach html_nodes() below.
  page_data <- tryCatch(
    read_html(link),
    error = function(e) NULL,
    warning = function(w) NULL
  )
  if (is.null(page_data)) {
    return(NULL)
  }

  parameters <- page_data %>% html_nodes(".label") %>% html_text(trim = TRUE)
  values <- page_data %>% html_nodes(".info") %>% html_text(trim = TRUE)

  data.frame(
    # Repeat the link explicitly so a page without any ".label" nodes yields
    # a zero-row frame instead of a "differing number of rows" error.
    link = rep(link, length(parameters)),
    parameters = parameters,
    values = values
  )
})
- 使用 do.call 对它们执行 rbind,并确保参数名称唯一(原始数据中并不唯一,例如 link1 有两个名为 Floor 的参数),然后使用 pivot_wider:
# Stack the per-listing frames, give repeated parameter names within a listing
# a numeric suffix, then spread the parameters into one column per name.
do.call(what = rbind, args = house_features) %>%
  group_by(link, parameters) %>%
  mutate(
    # The first row of each (link, parameters) group keeps the plain name;
    # later rows get their position appended, e.g. "Floor 2".
    parameters = if_else(
      row_number() == 1,
      parameters,
      paste(parameters, row_number())
    )
  ) %>%
  pivot_wider(
    id_cols = link,
    names_from = parameters,
    values_from = values
  )
输出:
link `Type of offer` Category House Floor Area Condition Internet Toilet Gas `Front door` Parking Furniture `Floor 2` `Ceiling height` Security Other `Possibility of…
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 https… from owner elite monol… 9 fl… 107 … european… optics 2 bat… trunk armored parking fully fu… laminate 3 m. bars on… plas… no
2 https… from agent NA panel… NA 255 … NA NA NA NA NA NA NA NA NA NA NA NA
# … with 4 more variables: Possibility of getting a mortgage <chr>, Possibility of exchange <chr>, Number of floors <chr>, Heating <chr>
# Widen the long table into one row per listing, keeping `pricing` next to
# `link` as an identifier column.
# NOTE(review): assumes the frames in `house_features` carry a `pricing`
# column; the collection step that adds it is not visible here - confirm.
house_data <- do.call(rbind, house_features) %>%
  group_by(link, parameters) %>%
  mutate(
    # Suffix repeated parameter names within a listing (e.g. "Floor 2") so
    # every name maps to exactly one output column.
    parameters = if_else(
      row_number() > 1,
      paste(parameters, row_number()),
      parameters
    )
  ) %>%
  pivot_wider(
    # No trailing comma inside c(): an empty argument is not a valid selection.
    id_cols = c(link, pricing),
    names_from = parameters,
    values_from = values
  ) %>%
  # Drop any grouping left over from group_by() so later verbs applied to
  # `house_data` operate on the whole table.
  ungroup()
我发现了什么?
尽管如您所见,变量 pricing 可能会在数据框中造成重复和冗余,但令人惊讶的是,与传统的 for 循环相比,lapply 函数的运行速度依然非常快!
我是说,你的回答把所有问题都解决了。谢谢 @langtang :)