从R中的嵌套列表中提取数据
extracting data from nested list in R
我通过 google_reverse_geocode API,为一组带有纬度和经度信息的地点下载了地址列表。但因为我是 R 的新手,不知道如何从中提取有用的信息。下载数据所用的全部代码都附在问题末尾。
列表的结构大体是这样的。
`$ 60 :List of 1
..$ results:'data.frame': 1 obs. of 5 variables:
.. ..$ address_components:List of 1
.. .. ..$ :'data.frame': 8 obs. of 3 variables:
.. .. .. ..$ long_name : chr [1:8] "119" "Avenida Diego Díaz de Berlanga"
"Jardines de Anahuac 2do Sector" "San Nicolás de los Garza" ...
.. .. .. ..$ short_name: chr [1:8] "119" "Avenida Diego Díaz de Berlanga"
"Jardines de Anahuac 2do Sector" "San Nicolás de los Garza" ...
.. .. .. ..$ types :List of 8
.. .. .. .. ..$ : chr "street_number"
.. .. .. .. ..$ : chr "route"
.. .. .. .. ..$ : chr [1:3] "political" "sublocality" "sublocality_level_1"
.. .. .. .. ..$ : chr [1:2] "locality" "political"
.. .. .. .. ..$ : chr [1:2] "administrative_area_level_2" "political"
.. .. .. .. ..$ : chr [1:2] "administrative_area_level_1" "political"
.. .. .. .. ..$ : chr [1:2] "country" "political"
.. .. .. .. ..$ : chr "postal_code"
.. ..$ formatted_address : chr "Avenida Diego Díaz de Berlanga 119, Jardines
de Anahuac 2do Sector, 66444 San Nicolás de los Garza, N.L., Mexico"
.. ..$ geometry :'data.frame': 1 obs. of 3 variables:
.. .. ..$ location :'data.frame': 1 obs. of 2 variables:
.. .. .. ..$ lat: num 25.7
.. .. .. ..$ lng: num -100
.. .. ..$ location_type: chr "ROOFTOP"
.. .. ..$ viewport :'data.frame': 1 obs. of 2 variables:
.. .. .. ..$ northeast:'data.frame': 1 obs. of 2 variables:
.. .. .. .. ..$ lat: num 25.7
.. .. .. .. ..$ lng: num -100
.. .. .. ..$ southwest:'data.frame': 1 obs. of 2 variables:
.. .. .. .. ..$ lat: num 25.7
.. .. .. .. ..$ lng: num -100
.. ..$ place_id : chr "ChIJRY_wPdqUYoYRTJetT6AJETA"
.. ..$ types :List of 1
.. .. ..$ : chr "street_address"
我需要将信息作为数据框来执行我的分析。信息具体是c(latitude, longitude, formatted_address, place_id)
我写的代码是这样的:
# Flatten each geocoding response into one long vector.
# unlist() defaults to recursive = TRUE, so nested data frames and their
# list columns are all collapsed.
prueba <- sapply(direccion1, function(x){
uno <- unlist(x[[1]])
})
# Fails with "differing number of rows" when the flattened vectors do not
# all have the same length.
pureba2 <- data.frame(prueba)
我收到以下错误:Error in (function (..., row.names = NULL,
check.rows = FALSE, check.names = TRUE, :
arguments imply differing number of rows: 40, 32, 37, 44, 36, 0, 41, 28, 39,
47, 43, 35, 48
这只是我尝试过但不起作用的代码之一。
下载经纬度数据的代码如下
# CRE FILES
library(easypackages)
my_packages <- c("ggmap","maps","mapdata","rlist","readr", "tidyverse",
"lubridate", "stringr", "rebus", "stringi", "purrr", "geosphere", "XML",
"RCurl", "xml2")
libraries(my_packages)
# Download the fuel price feed and flatten it to one row per child (price)
# node of every <place> node.
link1 <- "https://publicacionexterna.azurewebsites.net/publicaciones/prices"
# Get data from webpage
data_prices <- getURL(link1)
# Parse XML data
xmlfile <- xmlParse(data_prices)
# Get place nodes
places <- getNodeSet(xmlfile, "//place")
# Build one character matrix per place: one row per child node
values <- lapply(places, function(x) {
  # Attributes of the <place> node (the place id)
  p_id <- xmlAttrs(x)
  newrows <- lapply(xmlChildren(x), function(y) {
    # Text value of the child node, labelled so the column gets a name
    price <- xmlValue(y)
    names(price) <- "price"
    # Place attributes, then the child's own attributes, then its value
    c(p_id, xmlAttrs(y), price)
  })
  do.call(rbind, newrows)
})
# Combine all values into a single dataframe
datosDePrecios <- as.data.frame(do.call(rbind, values),
                                stringsAsFactors = FALSE)
# Re-set row names. seq_len() instead of 1:nrow(): 1:0 would be c(1, 0) and
# fail when the feed returned no rows.
row.names(datosDePrecios) <- seq_len(nrow(datosDePrecios))
# Download the places feed and build one row per <place> node.
link2 <- "https://publicacionexterna.azurewebsites.net/publicaciones/places"
data_places <- read_xml(link2)
place_nodes <- xml_find_all(data_places, "//place")
# Text of the first descendant <tag> of every place. xml_find_first() returns
# one (possibly missing) node per place, so a place lacking the tag yields NA
# instead of shortening the column and misaligning or breaking the data frame.
# xml_text()'s second argument is `trim`, not a tag name, so none is passed.
place_field <- function(tag) {
  xml_text(xml_find_first(place_nodes, paste0(".//", tag)))
}
datosDeLugares <- data.frame(
  place_id = xml_attr(place_nodes, "place_id"),
  name = place_field("name"),
  brand = place_field("brand"),
  cre_id = place_field("cre_id"),
  category = place_field("category"),
  adress_street = place_field("address_street"),
  Latitude = place_field("y"),
  Longitude = place_field("x"),
  # Keep ids and coordinates as character so later as.numeric() conversions
  # act on the text, not on factor level codes (the default before R 4.0).
  stringsAsFactors = FALSE
)
# Drop intermediates that are no longer needed
rm(data_prices, places, values, xmlfile, data_places, place_nodes, place_field)
# `results` / `results2` are not created by this script; remove them only if
# they exist, which avoids "object not found" warnings.
rm(list = intersect(c("results", "results2"), ls()))
获取地址信息的代码如下
# Join places with prices and reverse geocode the first 100 coordinates.
# `key` must hold a Google Maps API key before this section runs.
#
# Piping a data frame into data.frame(<same frame>) duplicated every column,
# so that step is dropped. as.character() guards against factor ids, whose
# as.numeric() would be level codes rather than the ids themselves.
datosDePrecios <- datosDePrecios %>%
  mutate(place_id = as.numeric(as.character(place_id)))
datosDeLugares <- datosDeLugares %>%
  mutate(place_id = as.numeric(as.character(place_id)))
baseGeneral <- inner_join(datosDeLugares, datosDePrecios, by = "place_id") %>%
  select(Latitude, Longitude, place_id) %>%
  mutate(
    Latitude = as.numeric(as.character(Latitude)),
    Longitude = as.numeric(as.character(Longitude))
  )
# head() instead of [1:100, ]: no all-NA padding rows when fewer than 100 exist
baseGeneral <- head(baseGeneral, 100)
# One API response per row; all three columns are numeric, so apply()'s matrix
# coercion loses nothing. googleway is referenced explicitly because it is not
# among the packages loaded by this script.
# NOTE(review): the join yields one row per price entry, so the same place can
# be geocoded more than once — consider de-duplicating first.
baseGeneral <- apply(baseGeneral, 1, function(x) {
  googleway::google_reverse_geocode(
    location = c(x["Latitude"], x["Longitude"]),
    key = key,
    result_type = "street_address"
  )
})
感谢您的帮助。 :)
我认为您在使用 unlist()
时遇到的问题是默认 recursive=TRUE
。因此它不仅会把您的数据框展开(unlist),还会把数据框中的列表列一并展开,结果容易让人困惑。
所以您可以尝试 unlist(... recursive=FALSE)
,但如果我理解正确,那么 bind_rows()
就可以解决问题,只需从您的列表中提取 $results 即可。我猜您的大列表中的每个元素都包含一个名为 results 的数据框。
# Build one small data frame per geocoding response, then stack them.
results <- lapply(YOUR_BIG_FAT_LIST, function(x) {
  df <- x$results
  # A lookup without matches carries no usable `results` data frame; return
  # NULL so bind_rows() simply skips it.
  if (!is.data.frame(df) || nrow(df) == 0) {
    return(NULL)
  }
  # data.frame(), not as.data.frame(): the columns are built from named
  # vectors rather than by converting one existing object.
  data.frame(
    address = df$formatted_address,
    id = df$place_id,
    lat = df$geometry$location$lat,
    lng = df$geometry$location$lng,
    stringsAsFactors = FALSE
  )
})
information <- bind_rows(results)
您可以使用 [[
符号或 $
从列表中提取信息
如果我拿?google_reverse_geocode
中给出的例子得到结果
library(googleway)
# Reverse geocode one (lat, lon) pair, requesting result_type
# "street_address" and location_type "rooftop"; `key` is assumed to hold a
# valid API key defined elsewhere.
res <- google_reverse_geocode(location = c(-37.81659, 144.9841),
result_type = c("street_address"),
location_type = "rooftop",
key = key)
lat/lon信息在res$results$geometry$location
格式化地址在res$results$formatted_address
而 place_id 在 res$results$place_id
所以您可以从这些元素
创建您的data.frame
# Collect the four fields of interest from the response into one data frame
data.frame(
  lat = res[["results"]][["geometry"]][["location"]][["lat"]],
  lon = res[["results"]][["geometry"]][["location"]][["lng"]],
  formatted_address = res[["results"]][["formatted_address"]],
  place_id = res[["results"]][["place_id"]]
)
如果您有多个结果列表,则过程类似,但您需要将其包装在一个 *apply
函数(或您喜欢的任何循环机制)中
## coordinates (lat, lon) to look up
locations <- list(
  c(-37.81659, 144.9841),
  c(-37.81827, 144.9671)
)
## reverse geocode every location: one API response per list element
lst_res <- lapply(locations, google_reverse_geocode, key = key)
这里,lst_res
是地理编码函数的所有结果列表,所以你可以迭代它来提取相关部分
## pull the fields of interest out of each response
lst_df <- lapply(lst_res, function(x) {
  hits <- x[["results"]]
  coords <- hits[["geometry"]][["location"]]
  data.frame(
    lat = coords[["lat"]],
    lon = coords[["lng"]],
    formatted_address = hits[["formatted_address"]],
    place_id = hits[["place_id"]]
  )
})
这里,lst_df
是data.frames的列表。如果你想将它们合二为一 data.frame 你可以
## stack the per-location data frames row-wise into a single data frame
df <- do.call(rbind, lst_df)
## et voila!
head(df)
# lat lon
# 1 -37.81647 144.9841
# 2 -37.81659 144.9841
# 3 -37.81300 144.9850
# 4 -37.81363 144.9631
# 5 -37.81614 144.9805
# 6 -37.81005 144.9281
# formatted_address
# 1 Jolimont Station, 175 Wellington Parade, East Melbourne VIC 3002, Austalia
# 2 Jolimont Station, Wellington Cres, East Melbourne VIC 3002, Australia
# 3 East Melbourne VIC 3002, Australia
# 4 Melbourne VIC, Australia
# 5 East Melbourne VIC 3002, Australia
# 6 Melbourne, VIC, Australia
# place_id
# 1 ChIJSxAubOpC1moRqhRUnMoZV4M
# 2 ChIJIdtrbupC1moRMPT0CXZWBB0
# 3 ChIJz25SvMFC1moRAOiMIXVWBAU
# 4 ChIJ90260rVG1moRkM2MIXVWBAQ
# 5 ChIJG74w4Upd1moRsDQuRnhWBBw
# 6 ChIJv_FYgkNd1moRpxLuRXZURFs
我通过 google_reverse_geocode API,为一组带有纬度和经度信息的地点下载了地址列表。但因为我是 R 的新手,不知道如何从中提取有用的信息。下载数据所用的全部代码都附在问题末尾。
列表的结构大体是这样的。
`$ 60 :List of 1
..$ results:'data.frame': 1 obs. of 5 variables:
.. ..$ address_components:List of 1
.. .. ..$ :'data.frame': 8 obs. of 3 variables:
.. .. .. ..$ long_name : chr [1:8] "119" "Avenida Diego Díaz de Berlanga"
"Jardines de Anahuac 2do Sector" "San Nicolás de los Garza" ...
.. .. .. ..$ short_name: chr [1:8] "119" "Avenida Diego Díaz de Berlanga"
"Jardines de Anahuac 2do Sector" "San Nicolás de los Garza" ...
.. .. .. ..$ types :List of 8
.. .. .. .. ..$ : chr "street_number"
.. .. .. .. ..$ : chr "route"
.. .. .. .. ..$ : chr [1:3] "political" "sublocality" "sublocality_level_1"
.. .. .. .. ..$ : chr [1:2] "locality" "political"
.. .. .. .. ..$ : chr [1:2] "administrative_area_level_2" "political"
.. .. .. .. ..$ : chr [1:2] "administrative_area_level_1" "political"
.. .. .. .. ..$ : chr [1:2] "country" "political"
.. .. .. .. ..$ : chr "postal_code"
.. ..$ formatted_address : chr "Avenida Diego Díaz de Berlanga 119, Jardines
de Anahuac 2do Sector, 66444 San Nicolás de los Garza, N.L., Mexico"
.. ..$ geometry :'data.frame': 1 obs. of 3 variables:
.. .. ..$ location :'data.frame': 1 obs. of 2 variables:
.. .. .. ..$ lat: num 25.7
.. .. .. ..$ lng: num -100
.. .. ..$ location_type: chr "ROOFTOP"
.. .. ..$ viewport :'data.frame': 1 obs. of 2 variables:
.. .. .. ..$ northeast:'data.frame': 1 obs. of 2 variables:
.. .. .. .. ..$ lat: num 25.7
.. .. .. .. ..$ lng: num -100
.. .. .. ..$ southwest:'data.frame': 1 obs. of 2 variables:
.. .. .. .. ..$ lat: num 25.7
.. .. .. .. ..$ lng: num -100
.. ..$ place_id : chr "ChIJRY_wPdqUYoYRTJetT6AJETA"
.. ..$ types :List of 1
.. .. ..$ : chr "street_address"
我需要将信息作为数据框来执行我的分析。信息具体是c(latitude, longitude, formatted_address, place_id)
我写的代码是这样的:
# Flatten each geocoding response into one long vector.
# unlist() defaults to recursive = TRUE, so nested data frames and their
# list columns are all collapsed.
prueba <- sapply(direccion1, function(x){
uno <- unlist(x[[1]])
})
# Fails with "differing number of rows" when the flattened vectors do not
# all have the same length.
pureba2 <- data.frame(prueba)
我收到以下错误:Error in (function (..., row.names = NULL,
check.rows = FALSE, check.names = TRUE, :
arguments imply differing number of rows: 40, 32, 37, 44, 36, 0, 41, 28, 39,
47, 43, 35, 48
这只是我尝试过但不起作用的代码之一。
下载经纬度数据的代码如下
# CRE FILES
library(easypackages)
my_packages <- c("ggmap","maps","mapdata","rlist","readr", "tidyverse",
"lubridate", "stringr", "rebus", "stringi", "purrr", "geosphere", "XML",
"RCurl", "xml2")
libraries(my_packages)
# Download the fuel price feed and flatten it to one row per child (price)
# node of every <place> node.
link1 <- "https://publicacionexterna.azurewebsites.net/publicaciones/prices"
# Get data from webpage
data_prices <- getURL(link1)
# Parse XML data
xmlfile <- xmlParse(data_prices)
# Get place nodes
places <- getNodeSet(xmlfile, "//place")
# Build one character matrix per place: one row per child node
values <- lapply(places, function(x) {
  # Attributes of the <place> node (the place id)
  p_id <- xmlAttrs(x)
  newrows <- lapply(xmlChildren(x), function(y) {
    # Text value of the child node, labelled so the column gets a name
    price <- xmlValue(y)
    names(price) <- "price"
    # Place attributes, then the child's own attributes, then its value
    c(p_id, xmlAttrs(y), price)
  })
  do.call(rbind, newrows)
})
# Combine all values into a single dataframe
datosDePrecios <- as.data.frame(do.call(rbind, values),
                                stringsAsFactors = FALSE)
# Re-set row names. seq_len() instead of 1:nrow(): 1:0 would be c(1, 0) and
# fail when the feed returned no rows.
row.names(datosDePrecios) <- seq_len(nrow(datosDePrecios))
# Download the places feed and build one row per <place> node.
link2 <- "https://publicacionexterna.azurewebsites.net/publicaciones/places"
data_places <- read_xml(link2)
place_nodes <- xml_find_all(data_places, "//place")
# Text of the first descendant <tag> of every place. xml_find_first() returns
# one (possibly missing) node per place, so a place lacking the tag yields NA
# instead of shortening the column and misaligning or breaking the data frame.
# xml_text()'s second argument is `trim`, not a tag name, so none is passed.
place_field <- function(tag) {
  xml_text(xml_find_first(place_nodes, paste0(".//", tag)))
}
datosDeLugares <- data.frame(
  place_id = xml_attr(place_nodes, "place_id"),
  name = place_field("name"),
  brand = place_field("brand"),
  cre_id = place_field("cre_id"),
  category = place_field("category"),
  adress_street = place_field("address_street"),
  Latitude = place_field("y"),
  Longitude = place_field("x"),
  # Keep ids and coordinates as character so later as.numeric() conversions
  # act on the text, not on factor level codes (the default before R 4.0).
  stringsAsFactors = FALSE
)
# Drop intermediates that are no longer needed
rm(data_prices, places, values, xmlfile, data_places, place_nodes, place_field)
# `results` / `results2` are not created by this script; remove them only if
# they exist, which avoids "object not found" warnings.
rm(list = intersect(c("results", "results2"), ls()))
获取地址信息的代码如下
# Join places with prices and reverse geocode the first 100 coordinates.
# `key` must hold a Google Maps API key before this section runs.
#
# Piping a data frame into data.frame(<same frame>) duplicated every column,
# so that step is dropped. as.character() guards against factor ids, whose
# as.numeric() would be level codes rather than the ids themselves.
datosDePrecios <- datosDePrecios %>%
  mutate(place_id = as.numeric(as.character(place_id)))
datosDeLugares <- datosDeLugares %>%
  mutate(place_id = as.numeric(as.character(place_id)))
baseGeneral <- inner_join(datosDeLugares, datosDePrecios, by = "place_id") %>%
  select(Latitude, Longitude, place_id) %>%
  mutate(
    Latitude = as.numeric(as.character(Latitude)),
    Longitude = as.numeric(as.character(Longitude))
  )
# head() instead of [1:100, ]: no all-NA padding rows when fewer than 100 exist
baseGeneral <- head(baseGeneral, 100)
# One API response per row; all three columns are numeric, so apply()'s matrix
# coercion loses nothing. googleway is referenced explicitly because it is not
# among the packages loaded by this script.
# NOTE(review): the join yields one row per price entry, so the same place can
# be geocoded more than once — consider de-duplicating first.
baseGeneral <- apply(baseGeneral, 1, function(x) {
  googleway::google_reverse_geocode(
    location = c(x["Latitude"], x["Longitude"]),
    key = key,
    result_type = "street_address"
  )
})
感谢您的帮助。 :)
我认为您在使用 unlist()
时遇到的问题是默认 recursive=TRUE
。因此它不仅会把您的数据框展开(unlist),还会把数据框中的列表列一并展开,结果容易让人困惑。
所以您可以尝试 unlist(... recursive=FALSE)
,但如果我理解正确,那么 bind_rows()
就可以解决问题,只需从您的列表中提取 $results 即可。我猜您的大列表中的每个元素都包含一个名为 results 的数据框。
# Build one small data frame per geocoding response, then stack them.
results <- lapply(YOUR_BIG_FAT_LIST, function(x) {
  df <- x$results
  # A lookup without matches carries no usable `results` data frame; return
  # NULL so bind_rows() simply skips it.
  if (!is.data.frame(df) || nrow(df) == 0) {
    return(NULL)
  }
  # data.frame(), not as.data.frame(): the columns are built from named
  # vectors rather than by converting one existing object.
  data.frame(
    address = df$formatted_address,
    id = df$place_id,
    lat = df$geometry$location$lat,
    lng = df$geometry$location$lng,
    stringsAsFactors = FALSE
  )
})
information <- bind_rows(results)
您可以使用 [[
符号或 $
从列表中提取信息
如果我拿?google_reverse_geocode
中给出的例子得到结果
library(googleway)
# Reverse geocode one (lat, lon) pair, requesting result_type
# "street_address" and location_type "rooftop"; `key` is assumed to hold a
# valid API key defined elsewhere.
res <- google_reverse_geocode(location = c(-37.81659, 144.9841),
result_type = c("street_address"),
location_type = "rooftop",
key = key)
lat/lon信息在res$results$geometry$location
格式化地址在res$results$formatted_address
而 place_id 在 res$results$place_id
所以您可以从这些元素
创建您的data.frame
# Collect the four fields of interest from the response into one data frame
data.frame(
  lat = res[["results"]][["geometry"]][["location"]][["lat"]],
  lon = res[["results"]][["geometry"]][["location"]][["lng"]],
  formatted_address = res[["results"]][["formatted_address"]],
  place_id = res[["results"]][["place_id"]]
)
如果您有多个结果列表,则过程类似,但您需要将其包装在一个 *apply
函数(或您喜欢的任何循环机制)中
## coordinates (lat, lon) to look up
locations <- list(
  c(-37.81659, 144.9841),
  c(-37.81827, 144.9671)
)
## reverse geocode every location: one API response per list element
lst_res <- lapply(locations, google_reverse_geocode, key = key)
这里,lst_res
是地理编码函数的所有结果列表,所以你可以迭代它来提取相关部分
## pull the fields of interest out of each response
lst_df <- lapply(lst_res, function(x) {
  hits <- x[["results"]]
  coords <- hits[["geometry"]][["location"]]
  data.frame(
    lat = coords[["lat"]],
    lon = coords[["lng"]],
    formatted_address = hits[["formatted_address"]],
    place_id = hits[["place_id"]]
  )
})
这里,lst_df
是data.frames的列表。如果你想将它们合二为一 data.frame 你可以
## stack the per-location data frames row-wise into a single data frame
df <- do.call(rbind, lst_df)
## et voila!
head(df)
# lat lon
# 1 -37.81647 144.9841
# 2 -37.81659 144.9841
# 3 -37.81300 144.9850
# 4 -37.81363 144.9631
# 5 -37.81614 144.9805
# 6 -37.81005 144.9281
# formatted_address
# 1 Jolimont Station, 175 Wellington Parade, East Melbourne VIC 3002, Austalia
# 2 Jolimont Station, Wellington Cres, East Melbourne VIC 3002, Australia
# 3 East Melbourne VIC 3002, Australia
# 4 Melbourne VIC, Australia
# 5 East Melbourne VIC 3002, Australia
# 6 Melbourne, VIC, Australia
# place_id
# 1 ChIJSxAubOpC1moRqhRUnMoZV4M
# 2 ChIJIdtrbupC1moRMPT0CXZWBB0
# 3 ChIJz25SvMFC1moRAOiMIXVWBAU
# 4 ChIJ90260rVG1moRkM2MIXVWBAQ
# 5 ChIJG74w4Upd1moRsDQuRnhWBBw
# 6 ChIJv_FYgkNd1moRpxLuRXZURFs