将 XML 节点转换为 R 数据框,并把所有上层节点的属性作为列

XML nodes to R data frame with all higher-level node attributes as columns

我有一个 XML 文件,格式如下:

<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
    <fishmeter>
      <mission cruise="2019114" station="344" platform="4174">
        <fishstation serialno="7">
          <platform>4174</platform>
          <nation>58</nation>
          <latitudestart>60.746062433333336</latitudestart>
          <longitudestart>2.6755209333333334</longitudestart>
          <latitudeend>60.75632006666667</latitudeend>
          <longitudeend>2.64776135</longitudeend>
          <catchsample species="172414" samplenumber="1" noname="makrell" aphia="127023">
            <conservation>1</conservation>
            <producttype>1</producttype>
            <weight>10.195</weight>
            <count>0</count>
            <lengthsampleweight>0</lengthsampleweight>
            <sampleproducttype>1</sampleproducttype>
            <lengthmeasurement>E</lengthmeasurement>
            <specimensamplecount>36</specimensamplecount>
            <individual specimenno="1">
              <lengthunit>2</lengthunit>
              <length>0.36</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="2">
              <lengthunit>2</lengthunit>
              <length>0.36</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="3">
              <lengthunit>2</lengthunit>
              <length>0.315</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="4">
              <lengthunit>2</lengthunit>
              <length>0.315</length>
              <individualproducttype>1</individualproducttype>
            </individual>
          </catchsample>
          <catchsample species="167044" samplenumber="1" noname="knurr" aphia="150637">
            <conservation>1</conservation>
            <producttype>1</producttype>
            <weight>2.52</weight>
            <count>0</count>
            <lengthsampleweight>0</lengthsampleweight>
            <sampleproducttype>1</sampleproducttype>
            <lengthmeasurement>E</lengthmeasurement>
            <specimensamplecount>10</specimensamplecount>
            <individual specimenno="1">
              <lengthunit>2</lengthunit>
              <length>0.28</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="2">
              <lengthunit>2</lengthunit>
              <length>0.285</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="3">
              <lengthunit>2</lengthunit>
              <length>0.37</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="4">
              <lengthunit>2</lengthunit>
              <length>0.315</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="5">
              <lengthunit>2</lengthunit>
              <length>0.32</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="6">
              <lengthunit>2</lengthunit>
              <length>0.38</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="7">
              <lengthunit>2</lengthunit>
              <length>0.39</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="8">
              <lengthunit>2</lengthunit>
              <length>0.305</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="9">
              <lengthunit>2</lengthunit>
              <length>0.24</length>
              <individualproducttype>1</individualproducttype>
            </individual>
            <individual specimenno="10">
              <lengthunit>2</lengthunit>
              <length>0.36</length>
              <individualproducttype>1</individualproducttype>
            </individual>
          </catchsample>
        </fishstation>
      </mission>
    </fishmeter>

我想把每个 individual 节点提取为数据框中的一行,同时把父节点 catchsample 和祖父节点 fishstation 中的信息保留为附加列,使生成的数据框包含以下列:

cruise, station, platform, serialno, platform, nation, latitudestart, longitudestart, latitudeend, longitudeend, species, samplenumber, noname, aphia, conservation, producttype, weight, count, lengthsampleweight, sampleproducttype, lengthmeasurement, specimensamplecount, specimenno, lengthunit, length, individualproducttype

参考 R XML - combining parent and child nodes into data frame 这个问题的回答,我已经能把 individual 节点的数据提取到数据框中,但还无法取得来自上层节点的相关信息。

  # Read the file, select every <individual> element in the document and build
  # one row per element from that element's own attributes plus the text of
  # each of its children (named after the child's tag). Nothing is read from
  # the ancestors, which is why the result lacks the higher-level columns.
  fish<- read_xml('test.xml') %>% 
      xml_find_all('//individual') %>% 
      map_dfr(~flatten(c(xml_attrs(.x), 
                         map(xml_children(.x), 
                             ~set_names(as.list(xml_text(.x)), xml_name(.x)))))) %>%
      type_convert()

# A tibble: 14 x 4
   specimenno lengthunit length individualproducttype
        <dbl>      <dbl>  <dbl>                 <dbl>
 1          1          2  0.36                      1
 2          2          2  0.36                      1
 3          3          2  0.315                     1
 4          4          2  0.315                     1
 5          1          2  0.28                      1
 6          2          2  0.285                     1
 7          3          2  0.37                      1
 8          4          2  0.315                     1
 9          5          2  0.32                      1
10          6          2  0.38                      1
11          7          2  0.39                      1
12          8          2  0.305                     1
13          9          2  0.24                      1
14         10          2  0.36                      1

你可以这样做:

library(xml2)
library(purrr)
library(readr)
library(rvest)
library(tibble)

# Parse the XML document and collect every <individual> element, wherever it
# sits in the tree.
individuals <- xml_find_all(read_xml('test.xml'), '//individual')

# Return the text of the nodes matched by `xpath`, named by their tag names.
#
# individual: the node the XPath expression is evaluated against.
# xpath:      an XPath expression, relative to `individual`.
# Returns a character vector with one element per matched node; each element
# is the node's text and its name is the node's tag name.
to_add <- function(individual, xpath) {
  matched <- html_nodes(individual, xpath = xpath)
  setNames(object = html_text(matched), nm = html_name(matched))
}

# Build one flat, named character vector describing a single <individual> node.
#
# individual: a single node, e.g. one element of the node set returned by
#   xml_find_all('//individual').
# Returns a named character vector holding, in this order: the text of the
#   leaf children of the grandparent node, of the leaf children of the parent
#   node and of the children of the node itself, followed by the attributes of
#   the node, its parent, its grandparent and its great-grandparent.
get_data <- function(individual) {
  # Attributes of the node itself and of its three nearest ancestors.
  out <- c(
    individual %>% html_attrs(),
    individual %>% html_nodes(xpath = "..") %>% html_attrs() %>% unlist,
    individual %>% html_nodes(xpath = "../..") %>% html_attrs() %>% unlist,
    individual %>% html_nodes(xpath = "../../..") %>% html_attrs() %>% unlist
  )

  # "[not(descendant::*)]" keeps only elements without element descendants, so
  # container siblings are skipped and only plain value elements are picked up.
  xpathes <- c("../../*[not(descendant::*)]", "../*[not(descendant::*)]", "*")

  # lapply() always returns a list. sapply() would simplify to a matrix when
  # every xpath matches the same number (more than one) of nodes, and c()
  # would then drop the element names.
  c(lapply(xpathes, to_add, individual = individual) %>% unlist, out)
}

然后:

# Assemble the per-individual vectors into one table, one row per individual.
# rbind() stacks each named vector returned by get_data() as a row, so the
# vector names become the column names; cbind() would instead produce one
# column per individual.
# as_tibble() replaces the deprecated as.tibble(). .name_repair = "unique" is
# requested because, in the sample document, "platform" occurs twice (as a
# <mission> attribute and as a <fishstation> child element).
lapply(individuals, get_data) %>%
  do.call(what = rbind) %>%
  as_tibble(.name_repair = "unique")