从路径/节点名称混乱的 XML 文件中提取数据
Extracting data from XML files with messy path/node names
我正在尝试使用 R 从机构 XML 文件中提取值。这是此类文件的示例:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<missions xmlns="http://sensored">
<mission missiontype="2" year="2012" platform="4135" missionnumber="1" missiontypename="Referanseflåten-Hav" callsignal="LJBD" platformname="Nybo">
<startdate>30/01/2012</startdate>
<stopdate>28/11/2012</stopdate>
<purpose></purpose>
<fishstation serialno="86001">
<nation>58</nation>
<platform>4135</platform>
<station>1</station>
<startdate>30/01/2012</startdate>
<starttime>18:56:00</starttime>
<stopdate>30/01/2012</stopdate>
<stoptime>23:19:00</stoptime>
<latitudestart>65.13833</latitudestart>
<longitudestart>-11.04833</longitudestart>
<system>2</system>
<area>59</area>
<location>07</location>
<bottomdepthstart>653.0</bottomdepthstart>
<bottomdepthstop>653.0</bottomdepthstop>
<fishingdepthmax>85.0</fishingdepthmax>
<fishingdepthmin>40.0</fishingdepthmin>
<gear>3711</gear>
<gearcount>1</gearcount>
<gearcondition>4</gearcondition>
<trawlquality>7</trawlquality>
<dataquality>2</dataquality>
<catchsample species="161722.G03" samplenumber="1" noname="sild'G03" aphia="126417">
<sampletype>20</sampletype>
<conservation>1</conservation>
<producttype>1</producttype>
<weight>10.0</weight>
<count>46</count>
<sampleproducttype>1</sampleproducttype>
<lengthmeasurement>E</lengthmeasurement>
<lengthsampleweight>0.863</lengthsampleweight>
<lengthsamplecount>4</lengthsamplecount>
<specimensamplecount>4</specimensamplecount>
<individual specimenno="1">
<producttype>1</producttype>
<weight>0.222</weight>
<lengthunit>2</lengthunit>
<length>0.28</length>
</individual>
<individual specimenno="2">
<producttype>1</producttype>
<weight>0.127</weight>
<lengthunit>2</lengthunit>
<length>0.245</length>
</individual>
<individual specimenno="3">
<producttype>1</producttype>
<weight>0.172</weight>
<lengthunit>2</lengthunit>
<length>0.26</length>
</individual>
<individual specimenno="4">
<producttype>1</producttype>
<weight>0.342</weight>
<lengthunit>2</lengthunit>
<length>0.325</length>
</individual>
</catchsample>
</fishstation>
</mission>
</missions>
我设法毫无问题地解析文件。虽然名称在解析列表中显示正确,但我无法正确解析它们以使用 XPath 语言:
library(xml2)
library(xmltools)
# Parse the sample file into an xml2 document.
doc <- xml2::read_xml("test.xml")
# Show the element hierarchy of the parsed document (called directly rather
# than through a pipe).
xmltools::xml_view_trees(doc)
#└── mission
# ├── startdate
# ├── stopdate
# ├── purpose
# └── fishstation
# ├── nation
# ├── platform
# ├── station
# ├── startdate
# ├── starttime
# ├── stopdate
# ├── stoptime
# ├── latitudestart
# ├── longitudestart
# ├── system
# ├── area
# ├── location
# ├── bottomdepthstart
# ├── bottomdepthstop
# ├── fishingdepthmax
# ├── fishingdepthmin
# ├── gear
# ├── gearcount
# ├── gearcondition
# ├── trawlquality
# ├── dataquality
# └── catchsample
# ├── sampletype
# ├── conservation
# ├── producttype
# ├── weight
# ├── count
# ├── sampleproducttype
# ├── lengthmeasurement
# ├── lengthsampleweight
# ├── lengthsamplecount
# ├── specimensamplecount
# ├── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# ├── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# ├── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# └── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# List the XPath of every node in the parsed document.
xmltools::xml_get_paths(doc)
#[[1]]
# [1] "/*/*" "/*/*/*" "/*/*/*" "/*/*/*" #"/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*"
# [12] "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*"
# [23] "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*" "/*/*/*/*/*"
# [34] "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" #"/*/*/*/*/*" "/*/*/*/*/*/*"
# [45] "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*/*" "/*/*/*/*/*/*"
# [56] "/*/*/*/*/*/*" "/*/*/*/*/*/*"
假设我想提取每个 fishstation 的经度和纬度,以及每条被采样的鱼的物种代码和个体重量。
我可以先把 XML 文档转换为列表,再用标准的 R 代码来做到这一点,但这种方式的内存效率不高:
library(XML)
doc <- xmlInternalTreeParse("test.xml")
# Name-based XPath query; reported by the asker as not working on this file.
getNodeSet(doc, "//fishstation") # Does not work
# Purely positional XPath query; reported by the asker as working.
getNodeSet(doc, "/*/*/*/*") # Works
# Convert the whole document to a nested list, take its first top-level
# element and keep only the entries named "fishstation".
tmp <- xmlToList(doc)[[1]]
tmp <- tmp[names(tmp) == "fishstation"]
# Build one data frame per fishstation, with one row per individual.
lapply(tmp, function(station) {
  catch <- station$catchsample
  catch_attrs <- catch$.attrs
  # The individual entries sit next to the other catchsample fields.
  fish <- catch[names(catch) == "individual"]
  fish_weights <- sapply(fish, function(one_fish) one_fish$weight)
  data.frame(
    lon = station$longitudestart,
    lat = station$latitudestart,
    sp = unname(catch_attrs[names(catch_attrs) == "species"]),
    weight = unname(fish_weights)
  )
})
# $fishstation
# lon lat sp weight
# 1 -11.04833 65.13833 161722.G03 0.222
# 2 -11.04833 65.13833 161722.G03 0.127
# 3 -11.04833 65.13833 161722.G03 0.172
# 4 -11.04833 65.13833 161722.G03 0.342
但是,如果能够使用 XPath 语言来完成这项工作会方便得多,因为这种方法会让处理过程更快(我的文件可能有几个 GB)。到目前为止,我还没能成功调整提取过程,使其可以用 XPath 处理这些文件。我很清楚这很可能只是基本的使用错误:我以前没有使用过 XML 文件。
1) 如何将示例文件读取到 R 以保留节点名称?
2) 如何以更节省内存的方式提取上述示例中的值?
这个文件的棘手之处在于 XML 文本中的命名空间(namespace)。
xml2 包提供了一个可以去掉命名空间的函数,这会让整个过程容易得多。
去掉命名空间之后,先找到所有个体(individual)节点,然后沿树向上找到它们的父节点,最后提取所需的信息。
library(xml2)
# Read the file in as an XML document.
page <- read_xml("fish.xml")
# Strip the default namespace so plain element names can be used in XPath.
page <- xml_ns_strip(page)
# Find every individual (one node per sampled fish).
individuals <- xml_find_all(page, ".//individual")
# Step up to the catchsample that contains each individual. A bare parent::
# step is enough; prefixing it with ".//" would make the XPath engine visit
# every descendant of the context node before taking the parent step.
catchnodes <- xml_find_first(individuals, "parent::catchsample")
# Step up once more to the fishstation that contains each catchsample.
fishstations <- xml_find_first(catchnodes, "parent::fishstation")
# xml_find_first() returns one result per input node, so individuals,
# catchnodes and fishstations all have the same length.
# Extract the desired information from each set of nodes.
weights <- xml_text(xml_find_first(individuals, ".//weight"))
lat <- xml_text(xml_find_first(fishstations, ".//latitudestart"))
long <- xml_text(xml_find_first(fishstations, ".//longitudestart"))
species <- xml_attr(catchnodes, "species") # species is stored as an attribute
# Package everything together, one row per individual.
data.frame(lat, long, species, weights)
# lat long species weights
# 1 65.13833 -11.04833 161722.G03 0.222
# 2 65.13833 -11.04833 161722.G03 0.127
# 3 65.13833 -11.04833 161722.G03 0.172
# 4 65.13833 -11.04833 161722.G03 0.342
下面是相同的代码,但没有去掉命名空间:
page <- read_xml("fish.xml")
# Keep the namespace and collect the document's namespace map; the default
# namespace is addressed through the d1 prefix in the queries below.
ns <- xml_ns(page)
# Find every individual (one node per sampled fish).
individuals <- xml_find_all(page, ".//d1:individual", ns)
# Step up with a bare parent:: axis; a leading ".//" would make the XPath
# engine visit every descendant of the context node before stepping up.
catchnodes <- xml_find_first(individuals, "parent::d1:catchsample", ns)
fishstations <- xml_find_first(catchnodes, "parent::d1:fishstation", ns)
# Extract the values; each vector has one entry per individual.
weights <- xml_text(xml_find_first(individuals, ".//d1:weight", ns))
lat <- xml_text(xml_find_first(fishstations, ".//d1:latitudestart", ns))
long <- xml_text(xml_find_first(fishstations, ".//d1:longitudestart", ns))
species <- xml_attr(catchnodes, "species") # species is stored as an attribute
我正在尝试使用 R 从机构 XML 文件中提取值。这是此类文件的示例:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<missions xmlns="http://sensored">
<mission missiontype="2" year="2012" platform="4135" missionnumber="1" missiontypename="Referanseflåten-Hav" callsignal="LJBD" platformname="Nybo">
<startdate>30/01/2012</startdate>
<stopdate>28/11/2012</stopdate>
<purpose></purpose>
<fishstation serialno="86001">
<nation>58</nation>
<platform>4135</platform>
<station>1</station>
<startdate>30/01/2012</startdate>
<starttime>18:56:00</starttime>
<stopdate>30/01/2012</stopdate>
<stoptime>23:19:00</stoptime>
<latitudestart>65.13833</latitudestart>
<longitudestart>-11.04833</longitudestart>
<system>2</system>
<area>59</area>
<location>07</location>
<bottomdepthstart>653.0</bottomdepthstart>
<bottomdepthstop>653.0</bottomdepthstop>
<fishingdepthmax>85.0</fishingdepthmax>
<fishingdepthmin>40.0</fishingdepthmin>
<gear>3711</gear>
<gearcount>1</gearcount>
<gearcondition>4</gearcondition>
<trawlquality>7</trawlquality>
<dataquality>2</dataquality>
<catchsample species="161722.G03" samplenumber="1" noname="sild'G03" aphia="126417">
<sampletype>20</sampletype>
<conservation>1</conservation>
<producttype>1</producttype>
<weight>10.0</weight>
<count>46</count>
<sampleproducttype>1</sampleproducttype>
<lengthmeasurement>E</lengthmeasurement>
<lengthsampleweight>0.863</lengthsampleweight>
<lengthsamplecount>4</lengthsamplecount>
<specimensamplecount>4</specimensamplecount>
<individual specimenno="1">
<producttype>1</producttype>
<weight>0.222</weight>
<lengthunit>2</lengthunit>
<length>0.28</length>
</individual>
<individual specimenno="2">
<producttype>1</producttype>
<weight>0.127</weight>
<lengthunit>2</lengthunit>
<length>0.245</length>
</individual>
<individual specimenno="3">
<producttype>1</producttype>
<weight>0.172</weight>
<lengthunit>2</lengthunit>
<length>0.26</length>
</individual>
<individual specimenno="4">
<producttype>1</producttype>
<weight>0.342</weight>
<lengthunit>2</lengthunit>
<length>0.325</length>
</individual>
</catchsample>
</fishstation>
</mission>
</missions>
我设法毫无问题地解析文件。虽然名称在解析列表中显示正确,但我无法正确解析它们以使用 XPath 语言:
library(xml2)
library(xmltools)
# Parse the sample file into an xml2 document.
doc <- xml2::read_xml("test.xml")
# Show the element hierarchy of the parsed document (called directly rather
# than through a pipe).
xmltools::xml_view_trees(doc)
#└── mission
# ├── startdate
# ├── stopdate
# ├── purpose
# └── fishstation
# ├── nation
# ├── platform
# ├── station
# ├── startdate
# ├── starttime
# ├── stopdate
# ├── stoptime
# ├── latitudestart
# ├── longitudestart
# ├── system
# ├── area
# ├── location
# ├── bottomdepthstart
# ├── bottomdepthstop
# ├── fishingdepthmax
# ├── fishingdepthmin
# ├── gear
# ├── gearcount
# ├── gearcondition
# ├── trawlquality
# ├── dataquality
# └── catchsample
# ├── sampletype
# ├── conservation
# ├── producttype
# ├── weight
# ├── count
# ├── sampleproducttype
# ├── lengthmeasurement
# ├── lengthsampleweight
# ├── lengthsamplecount
# ├── specimensamplecount
# ├── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# ├── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# ├── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# └── individual
# ├── producttype
# ├── weight
# ├── lengthunit
# └── length
# List the XPath of every node in the parsed document.
xmltools::xml_get_paths(doc)
#[[1]]
# [1] "/*/*" "/*/*/*" "/*/*/*" "/*/*/*" #"/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*"
# [12] "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*"
# [23] "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" "/*/*/*/*" #"/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*" "/*/*/*/*/*"
# [34] "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" #"/*/*/*/*/*" "/*/*/*/*/*/*"
# [45] "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*/*" "/*/*/*/*/*" #"/*/*/*/*/*/*" "/*/*/*/*/*/*"
# [56] "/*/*/*/*/*/*" "/*/*/*/*/*/*"
假设我想提取每个 fishstation 的经度和纬度,以及每条被采样的鱼的物种代码和个体重量。
我可以先把 XML 文档转换为列表,再用标准的 R 代码来做到这一点,但这种方式的内存效率不高:
library(XML)
doc <- xmlInternalTreeParse("test.xml")
# Name-based XPath query; reported by the asker as not working on this file.
getNodeSet(doc, "//fishstation") # Does not work
# Purely positional XPath query; reported by the asker as working.
getNodeSet(doc, "/*/*/*/*") # Works
# Convert the whole document to a nested list, take its first top-level
# element and keep only the entries named "fishstation".
tmp <- xmlToList(doc)[[1]]
tmp <- tmp[names(tmp) == "fishstation"]
# Build one data frame per fishstation, with one row per individual.
lapply(tmp, function(station) {
  catch <- station$catchsample
  catch_attrs <- catch$.attrs
  # The individual entries sit next to the other catchsample fields.
  fish <- catch[names(catch) == "individual"]
  fish_weights <- sapply(fish, function(one_fish) one_fish$weight)
  data.frame(
    lon = station$longitudestart,
    lat = station$latitudestart,
    sp = unname(catch_attrs[names(catch_attrs) == "species"]),
    weight = unname(fish_weights)
  )
})
# $fishstation
# lon lat sp weight
# 1 -11.04833 65.13833 161722.G03 0.222
# 2 -11.04833 65.13833 161722.G03 0.127
# 3 -11.04833 65.13833 161722.G03 0.172
# 4 -11.04833 65.13833 161722.G03 0.342
但是,如果能够使用 XPath 语言来完成这项工作会方便得多,因为这种方法会让处理过程更快(我的文件可能有几个 GB)。到目前为止,我还没能成功调整提取过程,使其可以用 XPath 处理这些文件。我很清楚这很可能只是基本的使用错误:我以前没有使用过 XML 文件。
1) 如何将示例文件读取到 R 以保留节点名称?
2) 如何以更节省内存的方式提取上述示例中的值?
这个文件的棘手之处在于 XML 文本中的命名空间(namespace)。
xml2 包提供了一个可以去掉命名空间的函数,这会让整个过程容易得多。
去掉命名空间之后,先找到所有个体(individual)节点,然后沿树向上找到它们的父节点,最后提取所需的信息。
library(xml2)
# Read the file in as an XML document.
page <- read_xml("fish.xml")
# Strip the default namespace so plain element names can be used in XPath.
page <- xml_ns_strip(page)
# Find every individual (one node per sampled fish).
individuals <- xml_find_all(page, ".//individual")
# Step up to the catchsample that contains each individual. A bare parent::
# step is enough; prefixing it with ".//" would make the XPath engine visit
# every descendant of the context node before taking the parent step.
catchnodes <- xml_find_first(individuals, "parent::catchsample")
# Step up once more to the fishstation that contains each catchsample.
fishstations <- xml_find_first(catchnodes, "parent::fishstation")
# xml_find_first() returns one result per input node, so individuals,
# catchnodes and fishstations all have the same length.
# Extract the desired information from each set of nodes.
weights <- xml_text(xml_find_first(individuals, ".//weight"))
lat <- xml_text(xml_find_first(fishstations, ".//latitudestart"))
long <- xml_text(xml_find_first(fishstations, ".//longitudestart"))
species <- xml_attr(catchnodes, "species") # species is stored as an attribute
# Package everything together, one row per individual.
data.frame(lat, long, species, weights)
# lat long species weights
# 1 65.13833 -11.04833 161722.G03 0.222
# 2 65.13833 -11.04833 161722.G03 0.127
# 3 65.13833 -11.04833 161722.G03 0.172
# 4 65.13833 -11.04833 161722.G03 0.342
下面是相同的代码,但没有去掉命名空间:
page <- read_xml("fish.xml")
# Keep the namespace and collect the document's namespace map; the default
# namespace is addressed through the d1 prefix in the queries below.
ns <- xml_ns(page)
# Find every individual (one node per sampled fish).
individuals <- xml_find_all(page, ".//d1:individual", ns)
# Step up with a bare parent:: axis; a leading ".//" would make the XPath
# engine visit every descendant of the context node before stepping up.
catchnodes <- xml_find_first(individuals, "parent::d1:catchsample", ns)
fishstations <- xml_find_first(catchnodes, "parent::d1:fishstation", ns)
# Extract the values; each vector has one entry per individual.
weights <- xml_text(xml_find_first(individuals, ".//d1:weight", ns))
lat <- xml_text(xml_find_first(fishstations, ".//d1:latitudestart", ns))
long <- xml_text(xml_find_first(fishstations, ".//d1:longitudestart", ns))
species <- xml_attr(catchnodes, "species") # species is stored as an attribute