将 XML 提取到 R 中的数据框
Extract XML to dataframe in R
通过 httr 包发送 POST 请求后,我得到了如下格式的 XML:
<ReportDelivery responsecode="0" responsetext="descriptive text">
<Terminal isn="DCC000000001" imo="111111111" name="MV Vessel A">
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
<Report>
..
</Report>
</Terminal>
<Terminal isn="DCC000000002" imo="222222222" name="MV Vessel B">
..
</Terminal>
</ReportDelivery>
我可以使用此处提供的两个函数,把 "Report" 部分提取到数据框中:
#Using functions from https://rud.is/rpubs/xml2power/
# Return the whitespace-trimmed text of every node in `doc` that matches the
# XPath expression `target`.
xtrct <- function(doc, target) {
  matched_nodes <- xml_find_all(doc, target)
  trimws(xml_text(matched_nodes))
}
# Build a data frame from all `top` elements found in `doc`.
#
# The child element names of the first `top` node define the columns; each
# column holds the trimmed text of every `top/<child>` node in the document.
# Column names are lower-cased and column types are guessed by
# readr::type_convert().
xtrct_df <- function(doc, top) {
  first_node <- xml_find_first(doc, sprintf(".//%s", top))
  field_names <- xml_name(xml_children(first_node))

  # One single-element named list per field, e.g. list(lat = c(...)).
  columns <- map(field_names, function(field) {
    values <- xtrct(doc, sprintf(".//%s/%s", top, field))
    set_names(list(values), tolower(field))
  })

  readr::type_convert(flatten_df(columns))
}
# Extract every Report element of `doc` into a data frame, one column per
# child element of the first Report.
x <- xtrct_df(doc, "Report")
在每个 Terminal 节点内,有多个与特定船舶相关的报告,该船舶的属性在 Terminal 节点中给出。
目前,x 中的列是:
names(x)
[1] "datetime" "lat" "lon" "cog" "sog" "voltage" "status"
如何将船名作为一列添加到此数据框中?我可以使用以下代码提取 name 属性:
# Collect the `name` attribute of every Terminal element via XPath.
# NOTE(review): `z` is presumably the parsed XML document and xpathSApply
# presumably comes from the XML package - neither is shown here; confirm.
xattrs <- xpathSApply(z, "//*/Terminal/@name")
但我不知道如何将其作为变量包含在数据框中。请提供一些帮助。
采用与 @hrbrmstr 略有不同的路线,我们可以对每个 Report 元素使用 map_df,同时找到其父元素并提取相应的属性(attr):
library(xml2)
library(purrr)
# Parse the XML string once and reuse the document for both the column names
# and the per-report extraction.
report_doc <- read_xml(x)

# Column names are the child element names of the first <Report>; every
# report is assumed to carry the same children in the same order.
col_names <- report_doc %>%
  xml_find_first(".//Report") %>%
  xml_children() %>%
  xml_name()

# Build one single-row data frame per <Report> and row-bind them.
report_doc %>%
  xml_find_all(".//Report") %>%
  map_df(function(report) {
    # The vessel name is an attribute of the enclosing <Terminal> element.
    parent_name <- report %>%
      xml_parent() %>%
      xml_attr("name")

    # xml_text() yields "" for an empty child element, so the row always has
    # one value per child.
    report_row <- report %>%
      xml_children() %>%
      xml_text() %>%
      as.list() %>%
      data.frame(stringsAsFactors = FALSE) %>%
      set_names(col_names)

    # Assign the column directly: cbind() routes through data.frame()
    # defaults and would turn the name into a factor on R < 4.0.
    report_row$VesselName <- parent_name
    report_row
  })
#> DateTime Lat Lon Cog Sog Voltage
#> 1 01/10/2014 15:30:45 99.9999999 999.9999999 999 999 99
#> 2 01/10/2014 15:30:45 99.9999999 999.9999999 999 999 99
#> 3 01/10/2014 15:30:45 99.9999999 999.9999999 999 999 99
#> Status VesselName
#> 1 Description of status MV Vessel A
#> 2 Description of status MV Vessel A
#> 3 Description of status MV Vessel B
由 reprex package (v0.2.0) 创建于 2018-05-07。
数据:
# Sample XML payload as a string: a ReportDelivery root with two Terminal
# elements; the first holds two Report children, the second holds one.
x <- '<ReportDelivery responsecode="0" responsetext="descriptive text">
<Terminal isn="DCC000000001" imo="111111111" name="MV Vessel A">
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
</Terminal>
<Terminal isn="DCC000000002" imo="222222222" name="MV Vessel B">
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
</Terminal>
</ReportDelivery>'
通过 httr 包发送 POST 请求后,我得到了如下格式的 XML:
<ReportDelivery responsecode="0" responsetext="descriptive text">
<Terminal isn="DCC000000001" imo="111111111" name="MV Vessel A">
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
<Report>
..
</Report>
</Terminal>
<Terminal isn="DCC000000002" imo="222222222" name="MV Vessel B">
..
</Terminal>
</ReportDelivery>
我可以使用此处提供的两个函数,把 "Report" 部分提取到数据框中:
#Using functions from https://rud.is/rpubs/xml2power/
# Return the whitespace-trimmed text of every node in `doc` that matches the
# XPath expression `target`.
xtrct <- function(doc, target) {
  matched_nodes <- xml_find_all(doc, target)
  trimws(xml_text(matched_nodes))
}
# Build a data frame from all `top` elements found in `doc`.
#
# The child element names of the first `top` node define the columns; each
# column holds the trimmed text of every `top/<child>` node in the document.
# Column names are lower-cased and column types are guessed by
# readr::type_convert().
xtrct_df <- function(doc, top) {
  first_node <- xml_find_first(doc, sprintf(".//%s", top))
  field_names <- xml_name(xml_children(first_node))

  # One single-element named list per field, e.g. list(lat = c(...)).
  columns <- map(field_names, function(field) {
    values <- xtrct(doc, sprintf(".//%s/%s", top, field))
    set_names(list(values), tolower(field))
  })

  readr::type_convert(flatten_df(columns))
}
# Extract every Report element of `doc` into a data frame, one column per
# child element of the first Report.
x <- xtrct_df(doc, "Report")
在每个 Terminal 节点内,有多个与特定船舶相关的报告,该船舶的属性在 Terminal 节点中给出。
目前,x 中的列是:
names(x)
[1] "datetime" "lat" "lon" "cog" "sog" "voltage" "status"
如何将船名作为一列添加到此数据框中?我可以使用以下代码提取 name 属性:
xattrs <- xpathSApply(z, "//*/Terminal/@name")
但我不知道如何将其作为变量包含在数据框中。请提供一些帮助。
采用与 @hrbrmstr 略有不同的路线,我们可以对每个 Report 元素使用 map_df,同时找到其父元素并提取相应的属性(attr):
library(xml2)
library(purrr)
# Parse the XML string once and reuse the document for both the column names
# and the per-report extraction.
report_doc <- read_xml(x)

# Column names are the child element names of the first <Report>; every
# report is assumed to carry the same children in the same order.
col_names <- report_doc %>%
  xml_find_first(".//Report") %>%
  xml_children() %>%
  xml_name()

# Build one single-row data frame per <Report> and row-bind them.
report_doc %>%
  xml_find_all(".//Report") %>%
  map_df(function(report) {
    # The vessel name is an attribute of the enclosing <Terminal> element.
    parent_name <- report %>%
      xml_parent() %>%
      xml_attr("name")

    # xml_text() yields "" for an empty child element, so the row always has
    # one value per child.
    report_row <- report %>%
      xml_children() %>%
      xml_text() %>%
      as.list() %>%
      data.frame(stringsAsFactors = FALSE) %>%
      set_names(col_names)

    # Assign the column directly: cbind() routes through data.frame()
    # defaults and would turn the name into a factor on R < 4.0.
    report_row$VesselName <- parent_name
    report_row
  })
#> DateTime Lat Lon Cog Sog Voltage
#> 1 01/10/2014 15:30:45 99.9999999 999.9999999 999 999 99
#> 2 01/10/2014 15:30:45 99.9999999 999.9999999 999 999 99
#> 3 01/10/2014 15:30:45 99.9999999 999.9999999 999 999 99
#> Status VesselName
#> 1 Description of status MV Vessel A
#> 2 Description of status MV Vessel A
#> 3 Description of status MV Vessel B
由 reprex package (v0.2.0) 创建于 2018-05-07。
数据:
# Sample XML payload as a string: a ReportDelivery root with two Terminal
# elements; the first holds two Report children, the second holds one.
x <- '<ReportDelivery responsecode="0" responsetext="descriptive text">
<Terminal isn="DCC000000001" imo="111111111" name="MV Vessel A">
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
</Terminal>
<Terminal isn="DCC000000002" imo="222222222" name="MV Vessel B">
<Report>
<DateTime>01/10/2014 15:30:45</DateTime>
<Lat>99.9999999</Lat>
<Lon>999.9999999</Lon>
<Cog>999</Cog>
<Sog>999</Sog>
<Voltage>99</Voltage>
<Status>Description of status</Status>
</Report>
</Terminal>
</ReportDelivery>'