在 R 中读取多个 xml 文件并合并数据
Read multiple xml files in R and combine the data
我有一个文件夹,里面有 1000 多个扩展名为 .xml 的文件(尽管它们并不是真正的 xml 文件)。
我想自动从这些文件中提取某些内容,最终得到一个矩阵或表格(之后可以在 R 中继续用它做分析,或导出为一个 csv 文件等)。
我编写/修改了一段适用于单个文件的代码,但无法让它自动处理其余的文件。也许应该用循环?
所以我的单个文件的代码如下:
library(xml2)
# Read one file and tabulate the attributes of its fields.
temp <- read_xml("test.xml")

# Select every <ns2:opendataField> node once and reuse the node set below
# instead of re-running the same XPath query for each attribute.
recs <- xml_find_all(temp, "//ns2:opendataField")

# Trimmed text content of each node (not used in the table below).
vals <- trimws(xml_text(recs))

# "key" attribute of each node: the field names.
cols <- xml_attr(recs, "key")

# "value" attribute of each node; NA where a node carries no such attribute.
rows <- xml_attr(recs, "value")

# One row per node, with columns named `cols` and `rows`.
datakvk <- data.frame(cols, rows)
这导致:
> head(datakvk)
cols rows
1 SbiBusinessCode 18129
2 DocumentAdoptionDate 2017-08-22
3 FinancialYear 2016
4 BalanceSheet <NA>
5 BalanceSheetBeforeAfterAppropriationResultsTitle <NA>
6 BalanceSheetBeforeAfterAppropriationResults Na
>
最后,有了这 1000 多个文件,我希望得到这样的东西:
cols file 1 file 2
1 SbiBusinessCode 18129 34234
2 DocumentAdoptionDate 2017-08-22 452454
3 FinancialYear 2016 2016
4 BalanceSheet <NA> 2016
5 BalanceSheetBeforeAfterAppropriationResultsTitle <NA> <NA>
6 BalanceSheetBeforeAfterAppropriationResults Na
>
我尝试了以下代码,但没有成功:
# Attempt to process every .xml file in the working directory in a loop.
list.files(pattern=".xml$") # result is only printed here, not stored
# create a list from these files
list.filenames<-list.files(pattern=".xml$")
# create an empty list that will serve as a container to receive the incoming files
list.data<-list()
# create a loop to read in your data
# NOTE(review): 1:length(x) iterates over c(1, 0) when no file matches;
# seq_along(list.filenames) would be the safe form.
for (i in 1:length(list.filenames))
{
# store the parsed document of file i
list.data[[i]]<-read_xml(list.filenames[i])
# NOTE(review): recs, vals, cols and rows are plain variables that are
# overwritten on every iteration and never stored per file, so after the loop
# they only hold the values of the last file; list.data keeps just the parsed
# documents, not the extracted key/value pairs.
recs <- xml_find_all(list.data[[i]], "//ns2:opendataField")
vals <- trimws(xml_text(recs))
# NOTE(review): "value" is assigned to cols and "key" to rows here, the
# reverse of the single-file snippet.
cols <- xml_attr(xml_find_all(list.data[[i]], "//ns2:opendataField"), "value")
rows <- xml_attr(xml_find_all(list.data[[i]], "//ns2:opendataField"), "key")
}
# add the names of data to the list
names(list.data)<-list.filenames
我错过了什么?我哪里出错了?
在此先感谢您对我的帮助....
为完整起见,下面是其中一个源文件(1000 个文件之一)的内容:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<opendata xmlns:ns2="http://schemas.kvk.nl/xb/query/service/2016/1/0/0">
<ns2:opendataField key="SbiBusinessCode" value="18129"/>
<ns2:opendataField key="DocumentAdoptionDate" value="2017-08-22"/>
<ns2:opendataField key="FinancialYear" value="2016"/>
<ns2:opendataField key="BalanceSheet">
<ns2:opendataField key="BalanceSheetBeforeAfterAppropriationResultsTitle">
<ns2:opendataField key="BalanceSheetBeforeAfterAppropriationResults" value="Na"/>
</ns2:opendataField>
<ns2:opendataField key="BalanceSheetTitle">
<ns2:opendataField key="Assets" value="61296">
<ns2:opendataField key="AssetsNoncurrent" value="8978">
<ns2:opendataField key="IntangibleAssets" value="8978"/>
</ns2:opendataField>
<ns2:opendataField key="AssetsCurrent" value="52318">
<ns2:opendataField key="Inventories" value="2239"/>
<ns2:opendataField key="Receivables" value="40560"/>
<ns2:opendataField key="CashAndCashEquivalents" value="9519"/>
</ns2:opendataField>
</ns2:opendataField>
<ns2:opendataField key="EquityAndLiabilities" value="61296">
<ns2:opendataField key="Equity" value="201">
<ns2:opendataField key="ShareCapital" value="1"/>
<ns2:opendataField key="ReservesOther" value="200"/>
</ns2:opendataField>
<ns2:opendataField key="LiabilitiesCurrent" value="61095"/>
</ns2:opendataField>
</ns2:opendataField>
</ns2:opendataField>
</opendata>
您可以使用 lapply 逐个处理文件,然后通过 do.call 对列表元素执行 cbind:
library(xml2)
library(dplyr)
# Collect every file in the working directory whose name ends in ".xml".
# The dot is escaped so it matches a literal "." rather than any character.
files <- list.files(pattern = "\\.xml$")

# Build one single-column data frame per file: row names are the "key"
# attributes, the column (named after the file) holds the "value" attributes.
data <- lapply(files, function(x) {
  # Query the nodes once; both attributes are read from this node set directly
  # instead of searching the document again for each of them.
  nodes <- xml_find_all(read_xml(x), "//ns2:opendataField")
  cols <- xml_attr(nodes, "key")
  rows <- xml_attr(nodes, "value")
  out <- data.frame(rows, row.names = cols)
  names(out) <- x
  out
})

# Bind the per-file columns side by side.
# NOTE: cbind() needs every file to yield the same number of rows, and
# row.names above needs the keys within a file to be unique.
do.call(cbind, data)
两个内容相同的文件的输出:
file1.xml file2.xml
SbiBusinessCode 18129 18129
DocumentAdoptionDate 2017-08-22 2017-08-22
FinancialYear 2016 2016
BalanceSheet <NA> <NA>
BalanceSheetBeforeAfterAppropriationResultsTitle <NA> <NA>
BalanceSheetBeforeAfterAppropriationResults Na Na
BalanceSheetTitle <NA> <NA>
Assets 61296 61296
AssetsNoncurrent 8978 8978
IntangibleAssets 8978 8978
AssetsCurrent 52318 52318
Inventories 2239 2239
Receivables 40560 40560
CashAndCashEquivalents 9519 9519
EquityAndLiabilities 61296 61296
Equity 201 201
ShareCapital 1 1
ReservesOther 200 200
LiabilitiesCurrent 61095 61095
考虑将您的 for 循环改写为 lapply,并在其中调用 data.frame(),从而得到一个数据框列表。由于各个 XML 文件的 key/value 可能不同,对数据框列表直接做 cbind 行不通,因此改用 Reduce() 进行链式合并,并保留所有行(即完全外连接)。
...
# BUILD DATAFRAME LIST
# One two-column data frame per file: "key" plus a column named after the file.
df_list <- lapply(list.filenames, function(f) {
  # Run the XPath query once and read both attributes from the same node set.
  nodes <- xml_find_all(read_xml(f), "//ns2:opendataField")
  setNames(
    data.frame(xml_attr(nodes, "key"), xml_attr(nodes, "value")),
    c("key", f)
  )
})

# CHAIN MERGE INTO MASTER DATAFRAME
# all = TRUE keeps keys that appear in only some files (full outer join).
final_df <- Reduce(function(x, y) merge(x, y, by = "key", all = TRUE), df_list)
我有一个文件夹,里面有 1000 多个扩展名为 .xml 的文件(尽管它们并不是真正的 xml 文件)。
我想自动从这些文件中提取某些内容,最终得到一个矩阵或表格(之后可以在 R 中继续用它做分析,或导出为一个 csv 文件等)。
我编写/修改了一段适用于单个文件的代码,但无法让它自动处理其余的文件。也许应该用循环?
所以我的单个文件的代码如下:
library(xml2)
# Read one file and tabulate the attributes of its fields.
temp <- read_xml("test.xml")

# Select every <ns2:opendataField> node once and reuse the node set below
# instead of re-running the same XPath query for each attribute.
recs <- xml_find_all(temp, "//ns2:opendataField")

# Trimmed text content of each node (not used in the table below).
vals <- trimws(xml_text(recs))

# "key" attribute of each node: the field names.
cols <- xml_attr(recs, "key")

# "value" attribute of each node; NA where a node carries no such attribute.
rows <- xml_attr(recs, "value")

# One row per node, with columns named `cols` and `rows`.
datakvk <- data.frame(cols, rows)
这导致:
> head(datakvk)
cols rows
1 SbiBusinessCode 18129
2 DocumentAdoptionDate 2017-08-22
3 FinancialYear 2016
4 BalanceSheet <NA>
5 BalanceSheetBeforeAfterAppropriationResultsTitle <NA>
6 BalanceSheetBeforeAfterAppropriationResults Na
>
最后,有了这 1000 多个文件,我希望得到这样的东西:
cols file 1 file 2
1 SbiBusinessCode 18129 34234
2 DocumentAdoptionDate 2017-08-22 452454
3 FinancialYear 2016 2016
4 BalanceSheet <NA> 2016
5 BalanceSheetBeforeAfterAppropriationResultsTitle <NA> <NA>
6 BalanceSheetBeforeAfterAppropriationResults Na
>
我尝试了以下代码,但没有成功:
# Attempt to process every .xml file in the working directory in a loop.
list.files(pattern=".xml$") # result is only printed here, not stored
# create a list from these files
list.filenames<-list.files(pattern=".xml$")
# create an empty list that will serve as a container to receive the incoming files
list.data<-list()
# create a loop to read in your data
# NOTE(review): 1:length(x) iterates over c(1, 0) when no file matches;
# seq_along(list.filenames) would be the safe form.
for (i in 1:length(list.filenames))
{
# store the parsed document of file i
list.data[[i]]<-read_xml(list.filenames[i])
# NOTE(review): recs, vals, cols and rows are plain variables that are
# overwritten on every iteration and never stored per file, so after the loop
# they only hold the values of the last file; list.data keeps just the parsed
# documents, not the extracted key/value pairs.
recs <- xml_find_all(list.data[[i]], "//ns2:opendataField")
vals <- trimws(xml_text(recs))
# NOTE(review): "value" is assigned to cols and "key" to rows here, the
# reverse of the single-file snippet.
cols <- xml_attr(xml_find_all(list.data[[i]], "//ns2:opendataField"), "value")
rows <- xml_attr(xml_find_all(list.data[[i]], "//ns2:opendataField"), "key")
}
# add the names of data to the list
names(list.data)<-list.filenames
我错过了什么?我哪里出错了?
在此先感谢您对我的帮助....
为完整起见,下面是其中一个源文件(1000 个文件之一)的内容:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<opendata xmlns:ns2="http://schemas.kvk.nl/xb/query/service/2016/1/0/0">
<ns2:opendataField key="SbiBusinessCode" value="18129"/>
<ns2:opendataField key="DocumentAdoptionDate" value="2017-08-22"/>
<ns2:opendataField key="FinancialYear" value="2016"/>
<ns2:opendataField key="BalanceSheet">
<ns2:opendataField key="BalanceSheetBeforeAfterAppropriationResultsTitle">
<ns2:opendataField key="BalanceSheetBeforeAfterAppropriationResults" value="Na"/>
</ns2:opendataField>
<ns2:opendataField key="BalanceSheetTitle">
<ns2:opendataField key="Assets" value="61296">
<ns2:opendataField key="AssetsNoncurrent" value="8978">
<ns2:opendataField key="IntangibleAssets" value="8978"/>
</ns2:opendataField>
<ns2:opendataField key="AssetsCurrent" value="52318">
<ns2:opendataField key="Inventories" value="2239"/>
<ns2:opendataField key="Receivables" value="40560"/>
<ns2:opendataField key="CashAndCashEquivalents" value="9519"/>
</ns2:opendataField>
</ns2:opendataField>
<ns2:opendataField key="EquityAndLiabilities" value="61296">
<ns2:opendataField key="Equity" value="201">
<ns2:opendataField key="ShareCapital" value="1"/>
<ns2:opendataField key="ReservesOther" value="200"/>
</ns2:opendataField>
<ns2:opendataField key="LiabilitiesCurrent" value="61095"/>
</ns2:opendataField>
</ns2:opendataField>
</ns2:opendataField>
</opendata>
您可以使用 lapply 逐个处理文件,然后通过 do.call 对列表元素执行 cbind:
library(xml2)
library(dplyr)
# Collect every file in the working directory whose name ends in ".xml".
# The dot is escaped so it matches a literal "." rather than any character.
files <- list.files(pattern = "\\.xml$")

# Build one single-column data frame per file: row names are the "key"
# attributes, the column (named after the file) holds the "value" attributes.
data <- lapply(files, function(x) {
  # Query the nodes once; both attributes are read from this node set directly
  # instead of searching the document again for each of them.
  nodes <- xml_find_all(read_xml(x), "//ns2:opendataField")
  cols <- xml_attr(nodes, "key")
  rows <- xml_attr(nodes, "value")
  out <- data.frame(rows, row.names = cols)
  names(out) <- x
  out
})

# Bind the per-file columns side by side.
# NOTE: cbind() needs every file to yield the same number of rows, and
# row.names above needs the keys within a file to be unique.
do.call(cbind, data)
两个内容相同的文件的输出:
file1.xml file2.xml
SbiBusinessCode 18129 18129
DocumentAdoptionDate 2017-08-22 2017-08-22
FinancialYear 2016 2016
BalanceSheet <NA> <NA>
BalanceSheetBeforeAfterAppropriationResultsTitle <NA> <NA>
BalanceSheetBeforeAfterAppropriationResults Na Na
BalanceSheetTitle <NA> <NA>
Assets 61296 61296
AssetsNoncurrent 8978 8978
IntangibleAssets 8978 8978
AssetsCurrent 52318 52318
Inventories 2239 2239
Receivables 40560 40560
CashAndCashEquivalents 9519 9519
EquityAndLiabilities 61296 61296
Equity 201 201
ShareCapital 1 1
ReservesOther 200 200
LiabilitiesCurrent 61095 61095
考虑将您的 for 循环改写为 lapply,并在其中调用 data.frame(),从而得到一个数据框列表。由于各个 XML 文件的 key/value 可能不同,对数据框列表直接做 cbind 行不通,因此改用 Reduce() 进行链式合并,并保留所有行(即完全外连接)。
...
# BUILD DATAFRAME LIST
# One two-column data frame per file: "key" plus a column named after the file.
df_list <- lapply(list.filenames, function(f) {
  # Run the XPath query once and read both attributes from the same node set.
  nodes <- xml_find_all(read_xml(f), "//ns2:opendataField")
  setNames(
    data.frame(xml_attr(nodes, "key"), xml_attr(nodes, "value")),
    c("key", f)
  )
})

# CHAIN MERGE INTO MASTER DATAFRAME
# all = TRUE keeps keys that appear in only some files (full outer join).
final_df <- Reduce(function(x, y) merge(x, y, by = "key", all = TRUE), df_list)