如何将此 XML 文件转换为 "tibble"？

Question

我想把这个 XML 文件 https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml 转换成一个 "tibble"（或 data.frame），并包含以下列名：

Type Annee CodDpt CodMinDpt LibDpt CodSubCom LibSubCom TypCom PopSubCom ModeScrutin NbSAP EPCI NbSapEpci NbCandidatsMaj NumListe CodNuaListe NomListe LibLisExt NumOrdCand NomPsn PrePsn CivilitePsn TeteListe CandidatEPCI

我试过了：

library(tidyverse)
library(xml2)
library(rvest)
# Download and parse the candidate file.
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")

# Build one row per <Commune> node: the node's attributes followed by one
# field per direct child, named after the child's tag and holding its text.
tmp <- x %>%
  xml_find_all("//Commune") %>%
  map_df(function(commune) {
    fields <- map(xml_children(commune), function(child) {
      set_names(as.list(xml_text(child)), xml_name(child))
    })
    flatten(c(xml_attrs(commune), fields))
  }) %>%
  type_convert()

但它并没有产生我所期望的结果……

Answer 1

由于各元素之间，以及两组不同的候选人元素（CandidatMaj 和 Candidat）之间存在相当多的细微差别，请考虑使用 XSLT——一种专门用于转换 XML 文件的语言——来扁平化原始的嵌套 XML。使用这种方法，您可以避免在 R 端进行繁琐的数据整理工作。然后，将生成的扁平 XML 逐条绑定成所需的数据框：

XSLT（另存为 .xsl 文件，它是一种特殊的 .xml 文件）

<!--
  Flattens the nested election document into a single level: the output is
  an Election element whose children are Candidat elements, one per
  CandidatMaj or Candidat element found in the input.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" omit-xml-declaration="yes" indent="yes"/>
    <xsl:strip-space elements="*"/>

    <!-- Root: copy the Election element itself, then process every
         descendant CandidatMaj and Candidat with the templates below. -->
    <xsl:template match="/Election">
     <xsl:copy>
       <xsl:apply-templates select="descendant::CandidatMaj|descendant::Candidat"/>
     </xsl:copy>
    </xsl:template>

    <!-- CandidatMaj: emitted under the name Candidat so that both kinds of
         candidate share one output element name. Each copy-of pulls in the
         children of one ancestor level, skipping the child that contains
         the next nesting level, and the last one copies the candidate's
         own children. -->
    <xsl:template match="CandidatMaj">
     <Candidat>
       <xsl:copy-of select="ancestor::Election/Scrutin/*"/>
       <xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
       <xsl:copy-of select="ancestor::Commune/*[name() != 'CandidatsMaj']"/>
       <xsl:copy-of select="ancestor::CandidatsMaj/*[name() != 'ListeCandidatsMaj']"/>
       <xsl:copy-of select="*"/>
     </Candidat>
    </xsl:template>

    <!-- Candidat: same approach as above, but the ancestor chain goes
         through Commune, then Liste, and the skipped container children
         are Listes and CandidatsListe. -->
    <xsl:template match="Candidat">
     <Candidat>
       <xsl:copy-of select="ancestor::Election/Scrutin/*"/>
       <xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
       <xsl:copy-of select="ancestor::Commune/*[name() != 'Listes']"/>
       <xsl:copy-of select="ancestor::Liste/*[name() != 'CandidatsListe']"/>
       <xsl:copy-of select="*"/>
     </Candidat>
    </xsl:template>

</xsl:stylesheet>

R

library(xml2)
library(xslt)
library(dplyr)

# PARSE XML AND XSLT
# `doc` is the nested source document; `style` is the XSLT stylesheet saved
# locally as a .xsl file (replace the placeholder path with its location).
doc <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")
# read_xml() is given the file path only: `package =` is an argument of
# system.file(), not of read_xml(), so it must not be passed here.
style <- read_xml("/path/to/Script.xsl")

# TRANSFORM INPUT INTO OUTPUT
new_xml <- xslt::xml_xslt(doc, style)

# BUILD DATA FRAME LIST
# One single-row data frame per <Candidat> element: the child tag names
# become the column names and the child text values become the row.
df_list <- lapply(xml_find_all(new_xml, "Candidat"), function(candidat) {
  vals <- xml_children(candidat)
  setNames(
    data.frame(t(xml_text(vals)), stringsAsFactors = FALSE),
    xml_name(vals)
  )
})

# ROW BIND ALL DF
# bind_rows() matches columns by name, so candidates lacking some fields
# get NA in those columns.
final_df <- bind_rows(df_list)

Answer 2

我最终写出了下面这段比较难看的代码：

library(tidyverse)
library(xml2)
library(rvest)

x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")

# Convert every <Commune> node into a nested R list.
x <- as_list(xml_find_all(x, "//Commune"))

# Stage 1: spread each commune into columns, unwrap the commune-level
# fields, then pull the fields of the CandidatsMaj branch out of its
# list-column step by step.
communes <- tibble(Communes = x) %>%
  unnest_wider(Communes) %>%
  unnest_longer(CodSubCom) %>%
  unnest_longer(LibSubCom) %>%
  unnest_longer(TypCom) %>%
  unnest_longer(PopSubCom) %>%
  unnest_longer(ModeScrutin) %>%
  unnest_longer(NbSAP) %>%
  unnest_longer(EPCI) %>%
  unnest_longer(NbSapEpci) %>%
  hoist(CandidatsMaj, NbCandidatsMaj = "NbCandidatsMaj") %>%
  unnest_longer(NbCandidatsMaj) %>%
  hoist(CandidatsMaj, NbSapMajRestant = "NbSapMajRestant") %>%
  unnest_longer(NbSapMajRestant) %>%
  hoist(CandidatsMaj, candidats = "ListeCandidatsMaj") %>%
  unnest_longer(candidats) %>%
  hoist(
    candidats,
    NomPsn = "NomPsn",
    PrePsn = "PrePsn",
    CivilitePsn = "CivilitePsn"
  ) %>%
  unnest_longer(NomPsn) %>%
  unnest_longer(PrePsn) %>%
  unnest_longer(CivilitePsn)

# Stage 2: same treatment for the Listes branch. The person fields get an
# "L" suffix so they do not clash with the columns created in stage 1.
communes <- communes %>%
  unnest_longer(Listes) %>%
  hoist(
    Listes,
    NumListe = "NumListe",
    CodNuaListe = "CodNuaListe",
    NomListe = "NomListe",
    LibLisExt = "LibLisExt"
  ) %>%
  unnest_longer(NumListe) %>%
  unnest_longer(CodNuaListe) %>%
  unnest_longer(NomListe) %>%
  unnest_longer(LibLisExt) %>%
  hoist(Listes, candidats_liste = "CandidatsListe") %>%
  unnest_longer(candidats_liste) %>%
  hoist(
    candidats_liste,
    NumOrdCand = "NumOrdCand",
    NomPsnL = "NomPsn",
    PrePsnL = "PrePsn",
    CivilitePsnL = "CivilitePsn",
    TeteListe = "TeteListe",
    CandidatEPCI = "CandidatEPCI"
  ) %>%
  unnest_longer(NumOrdCand) %>%
  unnest_longer(NomPsnL) %>%
  unnest_longer(PrePsnL) %>%
  unnest_longer(CivilitePsnL) %>%
  unnest_longer(TeteListe) %>%
  unnest_longer(CandidatEPCI)

# Stage 3: merge the two sets of person columns, taking the list-branch
# value wherever the CandidatsMaj-branch value is missing.
communes <- communes %>%
  mutate(
    NomPsn = case_when(
      is.na(NomPsn) ~ NomPsnL,
      TRUE ~ NomPsn
    ),
    PrePsn = case_when(
      is.na(PrePsn) ~ PrePsnL,
      TRUE ~ PrePsn
    ),
    CivilitePsn = case_when(
      is.na(CivilitePsn) ~ CivilitePsnL,
      TRUE ~ CivilitePsn
    )
  )

# Stage 4: drop the leftover list-columns, the "L" helper columns and the
# *_id columns (presumably the index columns added by unnest_longer();
# verify against the actual output).
communes <- communes %>%
  select(
    -candidats,
    -candidats_id,
    -CandidatsMaj,
    -candidats_liste,
    -candidats_liste_id,
    -Listes,
    -Listes_id,
    -NomPsnL,
    -PrePsnL,
    -CivilitePsnL
  )

如何将此 XML 文件转换为 "tibble"？

How to transform this XML file into a "tibble"?

xml

r

rvest

xml2

tidyverse