如何将此 XML 文件转换为 "tibble"?
How to transform this XML file into a "tibble"?
我想将此 XML 文件
https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml
转换为具有以下列名的 "tibble"(或 data.frame):
Type Annee CodDpt CodMinDpt LibDpt CodSubCom LibSubCom TypCom PopSubCom ModeScrutin NbSAP EPCI NbSapEpci NbCandidatsMaj NumListe CodNuaListe NomListe LibLisExt NumOrdCand NomPsn PrePsn CivilitePsn TeteListe CandidatEPCI
我试过了:
library(tidyverse)
library(xml2)
library(rvest)
# Read the candidate file and build one row per <Commune> node: the node's
# attributes plus one field per direct child, named after the child's tag and
# holding that child's text.
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")
tmp <- x %>%
  xml_find_all("//Commune") %>%
  map_df(function(commune) {
    child_fields <- map(xml_children(commune), function(child) {
      set_names(as.list(xml_text(child)), xml_name(child))
    })
    flatten(c(xml_attrs(commune), child_fields))
  }) %>%
  # Every field is extracted as text; re-parse the column types afterwards.
  type_convert()
但它并没有产生我所期待的...
由于各元素之间存在不少细微差别,并且有两组不同的候选人元素(CandidatMaj 和 Candidat),请考虑使用 XSLT(一种专门用于转换 XML 文件的语言)来扁平化原始的嵌套 XML。使用这种方法,您可以省去在 R 应用端进行的所有处理工作。然后,将生成的扁平 XML 绑定到所需的数据框:
XSLT (另存为.xsl文件,一个特殊的.xml文件)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Flattens the nested election document: the output root contains one
     <Candidat> element per CandidatMaj or Candidat found in the input, each
     carrying copies of the fields of its enclosing levels. -->
<xsl:output method="xml" omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- Root template: shallow-copy the Election element and process every
     descendant CandidatMaj and Candidat. -->
<xsl:template match="/Election">
<xsl:copy>
<xsl:apply-templates select="descendant::CandidatMaj|descendant::Candidat"/>
</xsl:copy>
</xsl:template>
<!-- CandidatMaj: emitted under the name Candidat so both candidate kinds
     share one element name in the output. -->
<xsl:template match="CandidatMaj">
<Candidat>
<!-- Children of Election/Scrutin. -->
<xsl:copy-of select="ancestor::Election/Scrutin/*"/>
<!-- Departement children, skipping the nested Communes container. -->
<xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
<!-- Commune children, skipping the nested CandidatsMaj container. -->
<xsl:copy-of select="ancestor::Commune/*[name() != 'CandidatsMaj']"/>
<!-- CandidatsMaj children, skipping the nested ListeCandidatsMaj container. -->
<xsl:copy-of select="ancestor::CandidatsMaj/*[name() != 'ListeCandidatsMaj']"/>
<!-- The candidate's own child elements. -->
<xsl:copy-of select="*"/>
</Candidat>
</xsl:template>
<!-- Candidat: same flattening, but through the Listes/Liste branch. -->
<xsl:template match="Candidat">
<Candidat>
<!-- Children of Election/Scrutin. -->
<xsl:copy-of select="ancestor::Election/Scrutin/*"/>
<!-- Departement children, skipping the nested Communes container. -->
<xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
<!-- Commune children, skipping the nested Listes container. -->
<xsl:copy-of select="ancestor::Commune/*[name() != 'Listes']"/>
<!-- Liste children, skipping the nested CandidatsListe container. -->
<xsl:copy-of select="ancestor::Liste/*[name() != 'CandidatsListe']"/>
<!-- The candidate's own child elements. -->
<xsl:copy-of select="*"/>
</Candidat>
</xsl:template>
</xsl:stylesheet>
R
library(xml2)
library(xslt)
library(dplyr)
# PARSE XML AND XSLT
doc <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")
# read_xml() has no `package` argument, so the stray `package = "xslt"`
# was dropped; only the stylesheet path is needed.
style <- read_xml("/path/to/Script.xsl")

# TRANSFORM INPUT INTO OUTPUT
new_xml <- xslt::xml_xslt(doc, style)

# BUILD DATA FRAME LIST
# One single-row data frame per <Candidat> element; columns are named after
# the element's children and hold their text.
df_list <- lapply(xml_find_all(new_xml, "Candidat"), function(x) {
  vals <- xml_children(x)
  # t() turns the character vector into a 1 x n matrix, i.e. a single row.
  row <- data.frame(t(xml_text(vals)), stringsAsFactors = FALSE)
  setNames(row, xml_name(vals))
})

# ROW BIND ALL DF
# bind_rows() matches columns by name, so rows with different field sets
# can be stacked together.
final_df <- bind_rows(df_list)
我确实设法创建了这个丑陋的代码
library(tidyverse)
library(xml2)
library(rvest)
# Parse the file and turn every <Commune> node into a nested R list.
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml") %>%
  xml_find_all("//Commune") %>%
  as_list()
tbl <- tibble(Communes = x)

# Stage 1: commune-level fields, then the CandidatsMaj branch.
communes <- tbl %>%
  unnest_wider(Communes) %>%
  unnest_longer(CodSubCom) %>%
  unnest_longer(LibSubCom) %>%
  unnest_longer(TypCom) %>%
  unnest_longer(PopSubCom) %>%
  unnest_longer(ModeScrutin) %>%
  unnest_longer(NbSAP) %>%
  unnest_longer(EPCI) %>%
  unnest_longer(NbSapEpci) %>%
  hoist(CandidatsMaj, NbCandidatsMaj = "NbCandidatsMaj") %>%
  unnest_longer(NbCandidatsMaj) %>%
  hoist(CandidatsMaj, NbSapMajRestant = "NbSapMajRestant") %>%
  unnest_longer(NbSapMajRestant) %>%
  hoist(CandidatsMaj, candidats = "ListeCandidatsMaj") %>%
  unnest_longer(candidats) %>%
  hoist(
    candidats,
    NomPsn = "NomPsn",
    PrePsn = "PrePsn",
    CivilitePsn = "CivilitePsn"
  ) %>%
  unnest_longer(NomPsn) %>%
  unnest_longer(PrePsn) %>%
  unnest_longer(CivilitePsn)

# Stage 2: the Listes branch. Person fields get an "L" suffix so they do not
# clash with the columns hoisted from the CandidatsMaj branch.
communes <- communes %>%
  unnest_longer(Listes) %>%
  hoist(
    Listes,
    NumListe = "NumListe",
    CodNuaListe = "CodNuaListe",
    NomListe = "NomListe",
    LibLisExt = "LibLisExt"
  ) %>%
  unnest_longer(NumListe) %>%
  unnest_longer(CodNuaListe) %>%
  unnest_longer(NomListe) %>%
  unnest_longer(LibLisExt) %>%
  hoist(Listes, candidats_liste = "CandidatsListe") %>%
  unnest_longer(candidats_liste) %>%
  hoist(
    candidats_liste,
    NumOrdCand = "NumOrdCand",
    NomPsnL = "NomPsn",
    PrePsnL = "PrePsn",
    CivilitePsnL = "CivilitePsn",
    TeteListe = "TeteListe",
    CandidatEPCI = "CandidatEPCI"
  ) %>%
  unnest_longer(NumOrdCand) %>%
  unnest_longer(NomPsnL) %>%
  unnest_longer(PrePsnL) %>%
  unnest_longer(CivilitePsnL) %>%
  unnest_longer(TeteListe) %>%
  unnest_longer(CandidatEPCI)

# Stage 3: where the CandidatsMaj value is missing, fall back to the value
# taken from the Listes branch.
communes <- communes %>%
  mutate(
    NomPsn = coalesce(NomPsn, NomPsnL),
    PrePsn = coalesce(PrePsn, PrePsnL),
    CivilitePsn = coalesce(CivilitePsn, CivilitePsnL)
  )

# Stage 4: drop the leftover nested columns, the *_id helper columns
# (presumably the index columns added by unnest_longer() - confirm) and the
# temporary "L" columns.
communes <- communes %>%
  select(-c(
    candidats, candidats_id, CandidatsMaj,
    candidats_liste, candidats_liste_id,
    Listes, Listes_id,
    NomPsnL, PrePsnL, CivilitePsnL
  ))
我想将此 XML 文件 https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml 转换为具有以下列名的 "tibble"(或 data.frame):
Type Annee CodDpt CodMinDpt LibDpt CodSubCom LibSubCom TypCom PopSubCom ModeScrutin NbSAP EPCI NbSapEpci NbCandidatsMaj NumListe CodNuaListe NomListe LibLisExt NumOrdCand NomPsn PrePsn CivilitePsn TeteListe CandidatEPCI
我试过了:
library(tidyverse)
library(xml2)
library(rvest)
# Read the candidate file and build one row per <Commune> node: the node's
# attributes plus one field per direct child, named after the child's tag and
# holding that child's text.
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")
tmp <- x %>%
  xml_find_all("//Commune") %>%
  map_df(function(commune) {
    child_fields <- map(xml_children(commune), function(child) {
      set_names(as.list(xml_text(child)), xml_name(child))
    })
    flatten(c(xml_attrs(commune), child_fields))
  }) %>%
  # Every field is extracted as text; re-parse the column types afterwards.
  type_convert()
但它并没有产生我所期待的...
由于各元素之间存在不少细微差别,并且有两组不同的候选人元素(CandidatMaj 和 Candidat),请考虑使用 XSLT(一种专门用于转换 XML 文件的语言)来扁平化原始的嵌套 XML。使用这种方法,您可以省去在 R 应用端进行的所有处理工作。然后,将生成的扁平 XML 绑定到所需的数据框:
XSLT (另存为.xsl文件,一个特殊的.xml文件)
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<!-- Flattens the nested election document: the output root contains one
     <Candidat> element per CandidatMaj or Candidat found in the input, each
     carrying copies of the fields of its enclosing levels. -->
<xsl:output method="xml" omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>
<!-- Root template: shallow-copy the Election element and process every
     descendant CandidatMaj and Candidat. -->
<xsl:template match="/Election">
<xsl:copy>
<xsl:apply-templates select="descendant::CandidatMaj|descendant::Candidat"/>
</xsl:copy>
</xsl:template>
<!-- CandidatMaj: emitted under the name Candidat so both candidate kinds
     share one element name in the output. -->
<xsl:template match="CandidatMaj">
<Candidat>
<!-- Children of Election/Scrutin. -->
<xsl:copy-of select="ancestor::Election/Scrutin/*"/>
<!-- Departement children, skipping the nested Communes container. -->
<xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
<!-- Commune children, skipping the nested CandidatsMaj container. -->
<xsl:copy-of select="ancestor::Commune/*[name() != 'CandidatsMaj']"/>
<!-- CandidatsMaj children, skipping the nested ListeCandidatsMaj container. -->
<xsl:copy-of select="ancestor::CandidatsMaj/*[name() != 'ListeCandidatsMaj']"/>
<!-- The candidate's own child elements. -->
<xsl:copy-of select="*"/>
</Candidat>
</xsl:template>
<!-- Candidat: same flattening, but through the Listes/Liste branch. -->
<xsl:template match="Candidat">
<Candidat>
<!-- Children of Election/Scrutin. -->
<xsl:copy-of select="ancestor::Election/Scrutin/*"/>
<!-- Departement children, skipping the nested Communes container. -->
<xsl:copy-of select="ancestor::Departement/*[name() != 'Communes']"/>
<!-- Commune children, skipping the nested Listes container. -->
<xsl:copy-of select="ancestor::Commune/*[name() != 'Listes']"/>
<!-- Liste children, skipping the nested CandidatsListe container. -->
<xsl:copy-of select="ancestor::Liste/*[name() != 'CandidatsListe']"/>
<!-- The candidate's own child elements. -->
<xsl:copy-of select="*"/>
</Candidat>
</xsl:template>
</xsl:stylesheet>
R
library(xml2)
library(xslt)
library(dplyr)
# PARSE XML AND XSLT
doc <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml")
# read_xml() has no `package` argument, so the stray `package = "xslt"`
# was dropped; only the stylesheet path is needed.
style <- read_xml("/path/to/Script.xsl")

# TRANSFORM INPUT INTO OUTPUT
new_xml <- xslt::xml_xslt(doc, style)

# BUILD DATA FRAME LIST
# One single-row data frame per <Candidat> element; columns are named after
# the element's children and hold their text.
df_list <- lapply(xml_find_all(new_xml, "Candidat"), function(x) {
  vals <- xml_children(x)
  # t() turns the character vector into a 1 x n matrix, i.e. a single row.
  row <- data.frame(t(xml_text(vals)), stringsAsFactors = FALSE)
  setNames(row, xml_name(vals))
})

# ROW BIND ALL DF
# bind_rows() matches columns by name, so rows with different field sets
# can be stacked together.
final_df <- bind_rows(df_list)
我确实设法创建了这个丑陋的代码
library(tidyverse)
library(xml2)
library(rvest)
# Parse the file and turn every <Commune> node into a nested R list.
x <- read_xml("https://elections.interieur.gouv.fr/telechargements/MUNICIPALES2020/candidatureT1/001/C1001.xml") %>%
  xml_find_all("//Commune") %>%
  as_list()
tbl <- tibble(Communes = x)

# Stage 1: commune-level fields, then the CandidatsMaj branch.
communes <- tbl %>%
  unnest_wider(Communes) %>%
  unnest_longer(CodSubCom) %>%
  unnest_longer(LibSubCom) %>%
  unnest_longer(TypCom) %>%
  unnest_longer(PopSubCom) %>%
  unnest_longer(ModeScrutin) %>%
  unnest_longer(NbSAP) %>%
  unnest_longer(EPCI) %>%
  unnest_longer(NbSapEpci) %>%
  hoist(CandidatsMaj, NbCandidatsMaj = "NbCandidatsMaj") %>%
  unnest_longer(NbCandidatsMaj) %>%
  hoist(CandidatsMaj, NbSapMajRestant = "NbSapMajRestant") %>%
  unnest_longer(NbSapMajRestant) %>%
  hoist(CandidatsMaj, candidats = "ListeCandidatsMaj") %>%
  unnest_longer(candidats) %>%
  hoist(
    candidats,
    NomPsn = "NomPsn",
    PrePsn = "PrePsn",
    CivilitePsn = "CivilitePsn"
  ) %>%
  unnest_longer(NomPsn) %>%
  unnest_longer(PrePsn) %>%
  unnest_longer(CivilitePsn)

# Stage 2: the Listes branch. Person fields get an "L" suffix so they do not
# clash with the columns hoisted from the CandidatsMaj branch.
communes <- communes %>%
  unnest_longer(Listes) %>%
  hoist(
    Listes,
    NumListe = "NumListe",
    CodNuaListe = "CodNuaListe",
    NomListe = "NomListe",
    LibLisExt = "LibLisExt"
  ) %>%
  unnest_longer(NumListe) %>%
  unnest_longer(CodNuaListe) %>%
  unnest_longer(NomListe) %>%
  unnest_longer(LibLisExt) %>%
  hoist(Listes, candidats_liste = "CandidatsListe") %>%
  unnest_longer(candidats_liste) %>%
  hoist(
    candidats_liste,
    NumOrdCand = "NumOrdCand",
    NomPsnL = "NomPsn",
    PrePsnL = "PrePsn",
    CivilitePsnL = "CivilitePsn",
    TeteListe = "TeteListe",
    CandidatEPCI = "CandidatEPCI"
  ) %>%
  unnest_longer(NumOrdCand) %>%
  unnest_longer(NomPsnL) %>%
  unnest_longer(PrePsnL) %>%
  unnest_longer(CivilitePsnL) %>%
  unnest_longer(TeteListe) %>%
  unnest_longer(CandidatEPCI)

# Stage 3: where the CandidatsMaj value is missing, fall back to the value
# taken from the Listes branch.
communes <- communes %>%
  mutate(
    NomPsn = coalesce(NomPsn, NomPsnL),
    PrePsn = coalesce(PrePsn, PrePsnL),
    CivilitePsn = coalesce(CivilitePsn, CivilitePsnL)
  )

# Stage 4: drop the leftover nested columns, the *_id helper columns
# (presumably the index columns added by unnest_longer() - confirm) and the
# temporary "L" columns.
communes <- communes %>%
  select(-c(
    candidats, candidats_id, CandidatsMaj,
    candidats_liste, candidats_liste_id,
    Listes, Listes_id,
    NomPsnL, PrePsnL, CivilitePsnL
  ))