如何将此 XML 文件转换为 "tibble"?

How to transform this XML file into a "tibble"?

我想将此 XML 文件 https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml 转换为具有以下列名的 "tibble"(或 data.frame):

Type Annee CodDpt CodMinDpt LibDpt CodSubCom LibSubCom TypCom PopSubCom ModeScrutin NbSAP EPCI NbSapEpci NbCandidatsMaj NumListe CodNuaListe NomListe LibLisExt NumOrdCand NomPsn PrePsn CivilitePsn TeteListe CandidatEPCI

我试过了:

library(tidyverse)
library(xml2)
library(rvest)
# Download and parse the candidate file.
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")

# Build one row per <Commune>: its attributes plus one field per direct child,
# named after the child element and holding that child's full text content.
commune_nodes <- xml_find_all(x, "//Commune")
tmp <- map_df(commune_nodes, function(commune) {
  child_fields <- map(xml_children(commune), function(child) {
    set_names(as.list(xml_text(child)), xml_name(child))
  })
  flatten(c(xml_attrs(commune), child_fields))
})

# Let readr guess the column types of the resulting character columns.
tmp <- type_convert(tmp)

但它并没有产生我所期望的结果...

由于各元素之间以及两组不同的候选人元素(CandidatMaj 和 Candidat)之间存在相当多的细微差别,请考虑使用 XSLT——一种专门用于转换 XML 文件的语言——来扁平化原始的嵌套 XML。使用这种方法,您可以避免在 R 端完成所有这些处理工作。然后,将生成的扁平 XML 绑定到所需的数据框:

XSLT (另存为.xsl文件,一个特殊的.xml文件)

<!--
  Flattens the nested election document: every CandidatMaj and Candidat
  element becomes one flat <Candidat> element under the root, carrying copies
  of the fields found on its ancestors followed by its own child elements.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <!-- Emit XML without an XML declaration, indented for readability. -->
    <xsl:output method="xml" omit-xml-declaration="yes" indent="yes"/>
    <!-- Discard whitespace-only text nodes of every source element. -->
    <xsl:strip-space elements="*"/>

    <!-- Root: keep the Election element and process all candidate
         descendants of both kinds. -->
    <xsl:template match="/Election">
     <xsl:copy>
       <xsl:apply-templates select="descendant::CandidatMaj|descendant::Candidat"/>
     </xsl:copy>
    </xsl:template>

    <!-- CandidatMaj: output as <Candidat>. The copies below are emitted in
         this order: Scrutin children, Departement children (except Communes),
         Commune children (except CandidatsMaj), CandidatsMaj children (except
         ListeCandidatsMaj), then the candidate's own children. -->
    <xsl:template match="CandidatMaj">
     <Candidat>
       <xsl:copy-of select="ancestor::Election/Scrutin/*"/>
       <xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
       <xsl:copy-of select="ancestor::Commune/*[name() != 'CandidatsMaj']"/>
       <xsl:copy-of select="ancestor::CandidatsMaj/*[name() != 'ListeCandidatsMaj']"/>
       <xsl:copy-of select="*"/>
     </Candidat>
    </xsl:template>

    <!-- Candidat: same flattening, with the ancestor fields taken from
         Scrutin, Departement (except Communes), Commune (except Listes) and
         Liste (except CandidatsListe), then the candidate's own children. -->
    <xsl:template match="Candidat">
     <Candidat>
       <xsl:copy-of select="ancestor::Election/Scrutin/*"/>
       <xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
       <xsl:copy-of select="ancestor::Commune/*[name() != 'Listes']"/>
       <xsl:copy-of select="ancestor::Liste/*[name() != 'CandidatsListe']"/>
       <xsl:copy-of select="*"/>
     </Candidat>
    </xsl:template>

</xsl:stylesheet>

R

library(xml2)
library(xslt)
library(dplyr)

# PARSE XML AND XSLT
# The source document is fetched over HTTP; the stylesheet is read from the
# local .xsl file (replace the placeholder path with the saved XSLT script).
doc <- read_xml('https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml')
# read_xml() has no `package` argument (that one belongs to system.file()),
# so only the stylesheet path is passed.
style <- read_xml('/path/to/Script.xsl')

# TRANSFORM INPUT INTO OUTPUT
new_xml <- xslt::xml_xslt(doc, style)

# BUILD DATA FRAME LIST
# One single-row data frame per flattened <Candidat> node: the child element
# names become the column names and the child text becomes the values.
df_list <- lapply(xml_find_all(new_xml, 'Candidat'), function(x) {
   vals <- xml_children(x)
   setNames(data.frame(t(xml_text(vals)), stringsAsFactors = FALSE), xml_name(vals))
})

# ROW BIND ALL DF
# bind_rows() matches columns by name and fills columns a row lacks with NA,
# so both candidate kinds end up in one table.
final_df <- bind_rows(df_list)

我确实设法创建了这个丑陋的代码

library(tidyverse)
library(xml2)
library(rvest)

# Download the candidate file, keep only the <Commune> nodes and convert them
# to nested R lists (one list per commune).
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml") %>%
  xml_find_all("//Commune") %>%
  as_list()

# Wrap the nested lists in a one-column tibble so tidyr can rectangle them.
tbl <- tibble(Communes = x)

# Rectangle the nested commune lists into one row per candidate.
#
# Every unnest_longer() call uses keep_empty = TRUE (requires tidyr >= 1.2.0).
# Since tidyr 1.2.0 the default drops any row whose list element is NULL or
# empty, so a commune lacking one of the fields below (for example one that
# has Listes but no CandidatsMaj, or the reverse) would silently be removed.
# With keep_empty = TRUE such rows are kept and the field becomes NA.

# Step 1: commune-level fields, then the candidates found under CandidatsMaj.
communes <- tbl %>%
  unnest_wider(Communes) %>%
  unnest_longer(CodSubCom, keep_empty = TRUE) %>%
  unnest_longer(LibSubCom, keep_empty = TRUE) %>%
  unnest_longer(TypCom, keep_empty = TRUE) %>%
  unnest_longer(PopSubCom, keep_empty = TRUE) %>%
  unnest_longer(ModeScrutin, keep_empty = TRUE) %>%
  unnest_longer(NbSAP, keep_empty = TRUE) %>%
  unnest_longer(EPCI, keep_empty = TRUE) %>%
  unnest_longer(NbSapEpci, keep_empty = TRUE) %>%
  hoist(CandidatsMaj, NbCandidatsMaj = "NbCandidatsMaj") %>%
  unnest_longer(NbCandidatsMaj, keep_empty = TRUE) %>%
  hoist(CandidatsMaj, NbSapMajRestant = "NbSapMajRestant") %>%
  unnest_longer(NbSapMajRestant, keep_empty = TRUE) %>%
  # One row per entry of ListeCandidatsMaj.
  hoist(CandidatsMaj, candidats = c("ListeCandidatsMaj")) %>%
  unnest_longer(candidats, keep_empty = TRUE) %>%
  hoist(candidats, NomPsn = "NomPsn", PrePsn = "PrePsn", CivilitePsn = "CivilitePsn") %>%
  unnest_longer(NomPsn, keep_empty = TRUE) %>%
  unnest_longer(PrePsn, keep_empty = TRUE) %>%
  unnest_longer(CivilitePsn, keep_empty = TRUE)

# Step 2: the lists under Listes and, within each list, its candidates.
# Their name fields go to temporary *L columns so they do not clash with the
# NomPsn / PrePsn / CivilitePsn columns created in step 1.
communes <- communes %>%
  unnest_longer(Listes, keep_empty = TRUE) %>%
  hoist(Listes, NumListe = "NumListe", CodNuaListe = "CodNuaListe",
        NomListe = "NomListe", LibLisExt = "LibLisExt") %>%
  unnest_longer(NumListe, keep_empty = TRUE) %>%
  unnest_longer(CodNuaListe, keep_empty = TRUE) %>%
  unnest_longer(NomListe, keep_empty = TRUE) %>%
  unnest_longer(LibLisExt, keep_empty = TRUE) %>%
  # One row per entry of CandidatsListe.
  hoist(Listes, candidats_liste = "CandidatsListe") %>%
  unnest_longer(candidats_liste, keep_empty = TRUE) %>%
  hoist(candidats_liste, NumOrdCand = "NumOrdCand",
        NomPsnL = "NomPsn", PrePsnL = "PrePsn", CivilitePsnL = "CivilitePsn",
        TeteListe = "TeteListe", CandidatEPCI = "CandidatEPCI") %>%
  unnest_longer(NumOrdCand, keep_empty = TRUE) %>%
  unnest_longer(NomPsnL, keep_empty = TRUE) %>%
  unnest_longer(PrePsnL, keep_empty = TRUE) %>%
  unnest_longer(CivilitePsnL, keep_empty = TRUE) %>%
  unnest_longer(TeteListe, keep_empty = TRUE) %>%
  unnest_longer(CandidatEPCI, keep_empty = TRUE)

# Fill the shared name columns from the temporary *L columns wherever they are
# NA, then drop the helper columns left over from the unnesting steps.
communes <- communes %>%
  mutate(
    NomPsn = coalesce(NomPsn, NomPsnL),
    PrePsn = coalesce(PrePsn, PrePsnL),
    CivilitePsn = coalesce(CivilitePsn, CivilitePsnL)
  ) %>%
  select(
    -candidats, -candidats_id, -CandidatsMaj,
    -candidats_liste, -candidats_liste_id,
    -Listes, -Listes_id,
    -NomPsnL, -PrePsnL, -CivilitePsnL
  )