将具有缺失值的嵌套列表转换为R中的数据框
Converting nested list with missing values to data frame in R
我有一组来自 googleway 包的地理编码输出(ggmap 的地理编码不支持 API 密钥),它们存储在一个列表中,每个元素包含两个子列表。然而,对于没有找到结果的地址,列表的结构有所不同,这使我无法顺利地将列表转换为数据框。
"non-missing" 结果(使用 dput() 创建)的结构如下(忽略乱码,RStudio 无法在控制台中正确显示西里尔字母):
structure(list(results = structure(list(address_components = list(
structure(list(long_name = c("11À", "óëèöà Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã",
"Çåëåíîãðàä", "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", "124575"), short_name = c("11À",
"óë. Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã", "Çåëåíîãðàä",
"Ìîñêâà", "Ìîñêâà", "RU", "124575"), types = list("street_number",
"route", c("political", "sublocality", "sublocality_level_1"
), c("locality", "political"), c("administrative_area_level_2",
"political"), c("administrative_area_level_1", "political"
), c("country", "political"), "postal_code")), .Names = c("long_name",
"short_name", "types"), class = "data.frame", row.names = c(NA,
8L))), formatted_address = "óë. Ãîãîëÿ, 11À, Çåëåíîãðàä, Ìîñêâà, Ðîññèÿ, 124575",
geometry = structure(list(location = structure(list(lat = 55.987567,
lng = 37.17152), .Names = c("lat", "lng"), class = "data.frame", row.names = 1L),
location_type = "ROOFTOP", viewport = structure(list(
northeast = structure(list(lat = 55.9889159802915,
lng = 37.1728689802915), .Names = c("lat", "lng"
), class = "data.frame", row.names = 1L), southwest = structure(list(
lat = 55.9862180197085, lng = 37.1701710197085), .Names = c("lat",
"lng"), class = "data.frame", row.names = 1L)), .Names = c("northeast",
"southwest"), class = "data.frame", row.names = 1L)), .Names = c("location",
"location_type", "viewport"), class = "data.frame", row.names = 1L),
place_id = "ChIJzXSgUeQUtUYREIohzQOG--A", types = list("street_address")), .Names = c("address_components",
"formatted_address", "geometry", "place_id", "types"), class = "data.frame", row.names = 1L),
status = "OK"), .Names = c("results", "status"))
一个"missing"结果的结构如下:
structure(list(results = list(), status = "ZERO_RESULTS"), .Names = c("results",
"status"))
基本上,问题似乎是当函数没有从 Google API 获得结果时,它会创建一个空列表,而不是创建一个结构与 "non-missing" 列表相同、以 NA 作为值的列表。当把这些列表传递给 data.frame() 时,这会产生错误,因为它无法从无到有地创建数据框。
我在将结果子列表提取到它们自己的列表中后尝试了这里的解决方案:Converting nested list (unequal length) to data frame。它应该填充 NAs 并创建等长列表,从而能够转换为数据框:
# Attempt from the question: pull the `results` element out of each of the
# first 100 geocode responses into its own list.
first100geocode.results.l <- vector("list", 100)
for(i in 1:length(first100geocode.results.l)){
first100geocode.results.l[[i]] <- first100geocode[[i]]$results
}
# Length of every extracted `results` object.
indx <- sapply(first100geocode.results.l, length)
# Extend every element to the maximum length, then row-bind the padded
# elements into one data frame.
# NOTE(review): this is the statement reported to fail with
# "invalid list argument: all variables should have the same length".
res <- as.data.frame(do.call(rbind,lapply(first100geocode.results.l,
`length<-`, max(indx))))
# Column names are taken from an element of maximal length.
colnames(res) <- names(first100geocode.results.l[[which.max(indx)]])
但是,创建 "res" 对象的那一行抛出了错误:Error in rbind(deparse.level, ...) : invalid list argument: all variables should have the same length。
是否有其他方法可以为缺失的结果填充 NA,以便我可以将其转换为数据框?
(注意:我不能简单地删除缺失的结果,因为我需要将结果绑定回原始地址列表)。
我们会让 `jsonlite::flatten` 完成大部分工作:
将您的两个示例结果放在一个列表中(希望这符合您的实际数据结构):
# Example input: a list of two geocode responses. Each response is a list with
# the elements `results` and `status` (see the .Names attributes below).
# NOTE(review): the non-ASCII strings are kept exactly as displayed; they look
# like Cyrillic text shown in the wrong encoding -- confirm before reuse.
first100geocode <- list(
# Response 1: status "OK"; `results` is a one-row data frame with the columns
# address_components, formatted_address, geometry, place_id and types.
structure(list(results = structure(list(address_components = list(
structure(list(long_name = c(
"11À", "óëèöà Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã",
"Çåëåíîãðàä", "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", "124575"), short_name = c(
"11À", "óë. Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã", "Çåëåíîãðàä",
"Ìîñêâà", "Ìîñêâà", "RU", "124575"), types = list(
"street_number", "route", c("political", "sublocality", "sublocality_level_1"
), c("locality", "political"), c(
"administrative_area_level_2",
"political"), c("administrative_area_level_1", "political"
), c("country", "political"), "postal_code")), .Names = c(
"long_name",
"short_name", "types"), class = "data.frame", row.names = c(NA, 8L))),
formatted_address = "óë. Ãîãîëÿ, 11À, Çåëåíîãðàä, Ìîñêâà, Ðîññèÿ, 124575",
# `geometry` is itself a data frame that nests further data frames
# (location, viewport$northeast, viewport$southwest).
geometry = structure(list(location = structure(
list(lat = 55.987567, lng = 37.17152),
.Names = c("lat", "lng"), class = "data.frame", row.names = 1L),
location_type = "ROOFTOP", viewport = structure(list(
northeast = structure(list(
lat = 55.9889159802915, lng = 37.1728689802915), .Names = c("lat", "lng"
), class = "data.frame", row.names = 1L), southwest = structure(list(
lat = 55.9862180197085, lng = 37.1701710197085), .Names = c("lat", "lng"),
class = "data.frame", row.names = 1L)), .Names = c("northeast", "southwest"),
class = "data.frame", row.names = 1L)),
.Names = c("location", "location_type", "viewport"),
class = "data.frame", row.names = 1L),
place_id = "ChIJzXSgUeQUtUYREIohzQOG--A", types = list("street_address")),
.Names = c("address_components",
"formatted_address", "geometry", "place_id", "types"),
class = "data.frame", row.names = 1L),
status = "OK"), .Names = c("results", "status")),
# Response 2: status "ZERO_RESULTS"; `results` is an empty list.
structure(list(results = list(), status = "ZERO_RESULTS"),
.Names = c("results", "status"))
)
进行实际的扁平化(并过滤掉 `address_components` 和 `types`,它们处理起来有点棘手,而且您并不需要):
#' Flatten one googleway `results` data frame
#'
#' Expands nested data-frame columns with `jsonlite::flatten()` and removes the
#' `address_components` and `types` columns.
#'
#' @param df A data frame to flatten; it is passed to `jsonlite::flatten()`.
#' @return A data frame holding every flattened column except
#'   `address_components` and `types`. The result stays a data frame even when
#'   only a single column remains.
flatten_googleway <- function(df) {
  res <- jsonlite::flatten(df)
  keep <- !names(res) %in% c("address_components", "types")
  # drop = FALSE: without it a single remaining column collapses to a bare
  # vector, which breaks later row indexing and rbind().
  res[, keep, drop = FALSE]
}
准备一个用于 "missing" 结果的模板数据框,并将其应用于这些结果:
# Build the zero-row template from the first response that actually has
# results, so the code does not rely on element 1 being a successful lookup.
first_hit <- Find(function(x) length(x$results) > 0, first100geocode)
if (is.null(first_hit)) {
  stop("no geocode response contains results; cannot build a template",
       call. = FALSE)
}
template_res <- flatten_googleway(first_hit$results)[FALSE, ]
# Indexing row 1 of the zero-row template yields a single all-NA row, so a
# response without results still contributes a row to the combined data frame.
do.call(rbind, lapply(first100geocode, function(x) {
  if (length(x$results) == 0) template_res[1, ] else flatten_googleway(x$results)
}))
# formatted_address place_id
# 1 óë. Ãîãîëÿ, 11À, Çåëåíîãðàä, Ìîñêâà, Ðîññèÿ, 124575 ChIJzXSgUeQUtUYREIohzQOG--A
# NA <NA> <NA>
# geometry.location_type geometry.location.lat geometry.location.lng
# 1 ROOFTOP 55.98757 37.17152
# NA <NA> NA NA
# geometry.viewport.northeast.lat geometry.viewport.northeast.lng
# 1 55.98892 37.17287
# NA NA NA
# geometry.viewport.southwest.lat geometry.viewport.southwest.lng
# 1 55.98622 37.17017
# NA NA NA
我有一组来自 googleway 包的地理编码输出(ggmap 的地理编码不支持 API 密钥),它们存储在一个列表中,每个元素包含两个子列表。然而,对于没有找到结果的地址,列表的结构有所不同,这使我无法顺利地将列表转换为数据框。
"non-missing" 结果(使用 dput() 创建)的结构如下(忽略乱码,RStudio 无法在控制台中正确显示西里尔字母):
structure(list(results = structure(list(address_components = list(
structure(list(long_name = c("11À", "óëèöà Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã",
"Çåëåíîãðàä", "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", "124575"), short_name = c("11À",
"óë. Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã", "Çåëåíîãðàä",
"Ìîñêâà", "Ìîñêâà", "RU", "124575"), types = list("street_number",
"route", c("political", "sublocality", "sublocality_level_1"
), c("locality", "political"), c("administrative_area_level_2",
"political"), c("administrative_area_level_1", "political"
), c("country", "political"), "postal_code")), .Names = c("long_name",
"short_name", "types"), class = "data.frame", row.names = c(NA,
8L))), formatted_address = "óë. Ãîãîëÿ, 11À, Çåëåíîãðàä, Ìîñêâà, Ðîññèÿ, 124575",
geometry = structure(list(location = structure(list(lat = 55.987567,
lng = 37.17152), .Names = c("lat", "lng"), class = "data.frame", row.names = 1L),
location_type = "ROOFTOP", viewport = structure(list(
northeast = structure(list(lat = 55.9889159802915,
lng = 37.1728689802915), .Names = c("lat", "lng"
), class = "data.frame", row.names = 1L), southwest = structure(list(
lat = 55.9862180197085, lng = 37.1701710197085), .Names = c("lat",
"lng"), class = "data.frame", row.names = 1L)), .Names = c("northeast",
"southwest"), class = "data.frame", row.names = 1L)), .Names = c("location",
"location_type", "viewport"), class = "data.frame", row.names = 1L),
place_id = "ChIJzXSgUeQUtUYREIohzQOG--A", types = list("street_address")), .Names = c("address_components",
"formatted_address", "geometry", "place_id", "types"), class = "data.frame", row.names = 1L),
status = "OK"), .Names = c("results", "status"))
一个"missing"结果的结构如下:
structure(list(results = list(), status = "ZERO_RESULTS"), .Names = c("results",
"status"))
基本上,问题似乎是当函数没有从 Google API 获得结果时,它会创建一个空列表,而不是创建一个结构与 "non-missing" 列表相同、以 NA 作为值的列表。当把这些列表传递给 data.frame() 时,这会产生错误,因为它无法从无到有地创建数据框。
我在将结果子列表提取到它们自己的列表中后尝试了这里的解决方案:Converting nested list (unequal length) to data frame。它应该填充 NAs 并创建等长列表,从而能够转换为数据框:
# Attempt from the question: pull the `results` element out of each of the
# first 100 geocode responses into its own list.
first100geocode.results.l <- vector("list", 100)
for(i in 1:length(first100geocode.results.l)){
first100geocode.results.l[[i]] <- first100geocode[[i]]$results
}
# Length of every extracted `results` object.
indx <- sapply(first100geocode.results.l, length)
# Extend every element to the maximum length, then row-bind the padded
# elements into one data frame.
# NOTE(review): this is the statement reported to fail with
# "invalid list argument: all variables should have the same length".
res <- as.data.frame(do.call(rbind,lapply(first100geocode.results.l,
`length<-`, max(indx))))
# Column names are taken from an element of maximal length.
colnames(res) <- names(first100geocode.results.l[[which.max(indx)]])
但是,创建 "res" 对象的那一行抛出了错误:Error in rbind(deparse.level, ...) : invalid list argument: all variables should have the same length。
是否有其他方法可以为缺失的结果填充 NA,以便我可以将其转换为数据框?
(注意:我不能简单地删除缺失的结果,因为我需要将结果绑定回原始地址列表)。
我们会让 `jsonlite::flatten` 完成大部分工作:
将您的两个示例结果放在一个列表中(希望这符合您的实际数据结构):
# Example input: a list of two geocode responses. Each response is a list with
# the elements `results` and `status` (see the .Names attributes below).
# NOTE(review): the non-ASCII strings are kept exactly as displayed; they look
# like Cyrillic text shown in the wrong encoding -- confirm before reuse.
first100geocode <- list(
# Response 1: status "OK"; `results` is a one-row data frame with the columns
# address_components, formatted_address, geometry, place_id and types.
structure(list(results = structure(list(address_components = list(
structure(list(long_name = c(
"11À", "óëèöà Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã",
"Çåëåíîãðàä", "Ìîñêâà", "Ìîñêâà", "Ðîññèÿ", "124575"), short_name = c(
"11À", "óë. Ãîãîëÿ", "Çåëåíîãðàäñêèé àäìèíèñòðàòèâíûé îêðóã", "Çåëåíîãðàä",
"Ìîñêâà", "Ìîñêâà", "RU", "124575"), types = list(
"street_number", "route", c("political", "sublocality", "sublocality_level_1"
), c("locality", "political"), c(
"administrative_area_level_2",
"political"), c("administrative_area_level_1", "political"
), c("country", "political"), "postal_code")), .Names = c(
"long_name",
"short_name", "types"), class = "data.frame", row.names = c(NA, 8L))),
formatted_address = "óë. Ãîãîëÿ, 11À, Çåëåíîãðàä, Ìîñêâà, Ðîññèÿ, 124575",
# `geometry` is itself a data frame that nests further data frames
# (location, viewport$northeast, viewport$southwest).
geometry = structure(list(location = structure(
list(lat = 55.987567, lng = 37.17152),
.Names = c("lat", "lng"), class = "data.frame", row.names = 1L),
location_type = "ROOFTOP", viewport = structure(list(
northeast = structure(list(
lat = 55.9889159802915, lng = 37.1728689802915), .Names = c("lat", "lng"
), class = "data.frame", row.names = 1L), southwest = structure(list(
lat = 55.9862180197085, lng = 37.1701710197085), .Names = c("lat", "lng"),
class = "data.frame", row.names = 1L)), .Names = c("northeast", "southwest"),
class = "data.frame", row.names = 1L)),
.Names = c("location", "location_type", "viewport"),
class = "data.frame", row.names = 1L),
place_id = "ChIJzXSgUeQUtUYREIohzQOG--A", types = list("street_address")),
.Names = c("address_components",
"formatted_address", "geometry", "place_id", "types"),
class = "data.frame", row.names = 1L),
status = "OK"), .Names = c("results", "status")),
# Response 2: status "ZERO_RESULTS"; `results` is an empty list.
structure(list(results = list(), status = "ZERO_RESULTS"),
.Names = c("results", "status"))
)
进行实际的扁平化(并过滤掉 `address_components` 和 `types`,它们处理起来有点棘手,而且您并不需要):
#' Flatten one googleway `results` data frame
#'
#' Expands nested data-frame columns with `jsonlite::flatten()` and removes the
#' `address_components` and `types` columns.
#'
#' @param df A data frame to flatten; it is passed to `jsonlite::flatten()`.
#' @return A data frame holding every flattened column except
#'   `address_components` and `types`. The result stays a data frame even when
#'   only a single column remains.
flatten_googleway <- function(df) {
  res <- jsonlite::flatten(df)
  keep <- !names(res) %in% c("address_components", "types")
  # drop = FALSE: without it a single remaining column collapses to a bare
  # vector, which breaks later row indexing and rbind().
  res[, keep, drop = FALSE]
}
准备一个用于 "missing" 结果的模板数据框,并将其应用于这些结果:
# Build the zero-row template from the first response that actually has
# results, so the code does not rely on element 1 being a successful lookup.
first_hit <- Find(function(x) length(x$results) > 0, first100geocode)
if (is.null(first_hit)) {
  stop("no geocode response contains results; cannot build a template",
       call. = FALSE)
}
template_res <- flatten_googleway(first_hit$results)[FALSE, ]
# Indexing row 1 of the zero-row template yields a single all-NA row, so a
# response without results still contributes a row to the combined data frame.
do.call(rbind, lapply(first100geocode, function(x) {
  if (length(x$results) == 0) template_res[1, ] else flatten_googleway(x$results)
}))
# formatted_address place_id
# 1 óë. Ãîãîëÿ, 11À, Çåëåíîãðàä, Ìîñêâà, Ðîññèÿ, 124575 ChIJzXSgUeQUtUYREIohzQOG--A
# NA <NA> <NA>
# geometry.location_type geometry.location.lat geometry.location.lng
# 1 ROOFTOP 55.98757 37.17152
# NA <NA> NA NA
# geometry.viewport.northeast.lat geometry.viewport.northeast.lng
# 1 55.98892 37.17287
# NA NA NA
# geometry.viewport.southwest.lat geometry.viewport.southwest.lng
# 1 55.98622 37.17017
# NA NA NA