使用 xml2 在 R 中获取所有 xpaths
Obtain all xpaths in R using xml2
在 xml2 中,可以使用以下方法获取给定节点的 xpath:
xml_path
我想知道从给定文档中提取所有 xpath 的最快方法是什么。也就是说,我想找到最终节点,然后向上迭代。
本质上我正在努力实现这一点:
library(xml2)
# data.table supplies data.table(), `:=`, transpose() and setnames(), all of
# which format_file() relies on.
library(data.table)

# Read the sample catalogue straight from its URL.
doc <- read_xml("http://www.w3schools.com/xml/plant_catalog.xml")
# Define a function to extract all xpaths.
#
# Walks the tree below `x` one level at a time and returns the xml_path() of
# every node that has no child elements (xml_length() == 0).
#
# x: the node set to start from.
#    NOTE(review): the loop iterates over `x` with lapply(), so `x` presumably
#    has to be a list-like node set rather than a bare node -- confirm.
# Returns a character vector of xpaths, collected level by level.
extract_all_paths <- function(x) {
  # A node without child elements is itself a terminal node.
  if (xml_length(x) == 0) {
    return(xml_path(x))
  }

  leaves_by_level <- vector("list")
  level <- 1
  while (length(x) != 0) {
    # Descend one level: replace the current nodes by all of their children.
    x <- do.call(c, lapply(x, xml_children))
    child_counts <- vapply(x, xml_length, numeric(1))
    # Children without children of their own are terminal; the remaining ones
    # are expanded on the next pass.
    leaves_by_level[[level]] <- x[child_counts == 0]
    x <- x[child_counts != 0]
    level <- level + 1
  }

  leaves <- do.call(c, leaves_by_level)
  vapply(leaves, xml_path, character(1))
}
# Extract everything for a given xpath: collect the text of every node in `y`
# that matches the xpath `x` and join the pieces with "&&&&".
function_extract_values <- function(x, y) {
  matched_nodes <- xml_find_all(y, x)
  node_text <- xml_text(matched_nodes)
  paste(node_text, collapse = "&&&&")
}
# Flatten an XML document into a one-row data.table: one column per distinct
# terminal xpath, each holding the "&&&&"-joined text of all matching nodes.
#
# x: an xml2 document. Requires the data.table package to be attached.
# Returns a one-row data.table whose column names are the xpaths with "/"
# replaced by "_".
format_file <- function(x) {
  x <- xml_ns_strip(x)
  data_path <- data.table(x = xml_children(x))

  # One group per top-level child. The `by` expression also produces the
  # helper column `nrow`, which is dropped again below.
  data_xpath <- data_path[, extract_all_paths(x), by = 1:nrow(data_path)]

  # Drop predicates such as "[2]" so that repeated siblings share one path,
  # then keep each path once.
  data_xpath[, V1 := gsub("\\[(.*?)\\]", "", V1)]
  data_xpath <- data_xpath[!duplicated(V1)]

  # Carry the document along in a list column so that every row can be
  # evaluated against it.
  data_xpath[, V2 := list(list(x)), by = seq_len(nrow(data_xpath))]
  data_xpath[
    ,
    value := function_extract_values(V1, V2[[1]]),
    by = seq_len(nrow(data_xpath))
  ]

  # Turn the xpaths into column names.
  data_xpath[, V1 := gsub("/", "_", V1, fixed = TRUE)]
  data_names <- data_xpath$V1

  # Only the `value` column is left before transposing into a single row.
  data_xpath[, c("V1", "nrow", "V2") := NULL]
  data_xpath <- transpose(data_xpath)
  setnames(data_xpath, data_names)
  data_xpath
}

# Parse the document into a single-row table.
data <- format_file(doc)
本质上我想解析一个 .xml 文件,然后将其作为一行放入 data.table。我目前的解决方案在文件很多时会很慢,也许有人可以建议一些更快的解决方案。
可能有更好的方法从文档中获取完整的 xpath 列表,但这里提供一种解决方案。 (也可能有更好的方法来遍历 xml 文档以获得您想要的内容,但您要求提供所有 xpath 的列表):
library(XML) # This may work in xml2 but I usually stick with XML

# Read the document into R and select the root.
myXML <- xmlTreeParse("myXML.xml", useInternalNodes = TRUE)
top <- xmlRoot(myXML)

# Convert the XML to a list of lists.
temp <- xmlToList(top)

# Use the names produced by a recursive apply to list every recursive step
# through the XML.
temp <- unique(names(rapply(temp, summary, how = "unlist")))

# Remove the last item, which was created by the summary function.
temp <- unique(sub("\\.[^.]*$", "", temp))

# Remove attributes (matched literally, not as a regular expression).
temp <- unique(sub("..attrs", "", temp, fixed = TRUE))

# Substitute every "." with "/" to create the xpath.
# NOTE(review): element names that themselves contain "." would be split
# here as well.
temp <- gsub(".", "/", temp, fixed = TRUE)

# Add "/" to start the xpath at the document root.
# NOTE(review): the names come from the list built from `top`, so they may
# not include the root element's own name -- confirm whether it has to be
# prepended for the paths to resolve from the document root.
XPaths <- paste0("/", temp)
在 xml2 中,可以使用以下方法获取给定节点的 xpath:
xml_path
我想知道从给定文档中提取所有 xpath 的最快方法是什么。也就是说,我想找到最终节点,然后向上迭代。
本质上我正在努力实现这一点:
library(xml2)
# data.table supplies data.table(), `:=`, transpose() and setnames(), all of
# which format_file() relies on.
library(data.table)

# Read the sample catalogue straight from its URL.
doc <- read_xml("http://www.w3schools.com/xml/plant_catalog.xml")
# Define a function to extract all xpaths.
#
# Walks the tree below `x` one level at a time and returns the xml_path() of
# every node that has no child elements (xml_length() == 0).
#
# x: the node set to start from.
#    NOTE(review): the loop iterates over `x` with lapply(), so `x` presumably
#    has to be a list-like node set rather than a bare node -- confirm.
# Returns a character vector of xpaths, collected level by level.
extract_all_paths <- function(x) {
  # A node without child elements is itself a terminal node.
  if (xml_length(x) == 0) {
    return(xml_path(x))
  }

  leaves_by_level <- vector("list")
  level <- 1
  while (length(x) != 0) {
    # Descend one level: replace the current nodes by all of their children.
    x <- do.call(c, lapply(x, xml_children))
    child_counts <- vapply(x, xml_length, numeric(1))
    # Children without children of their own are terminal; the remaining ones
    # are expanded on the next pass.
    leaves_by_level[[level]] <- x[child_counts == 0]
    x <- x[child_counts != 0]
    level <- level + 1
  }

  leaves <- do.call(c, leaves_by_level)
  vapply(leaves, xml_path, character(1))
}
# Extract everything for a given xpath: collect the text of every node in `y`
# that matches the xpath `x` and join the pieces with "&&&&".
function_extract_values <- function(x, y) {
  matched_nodes <- xml_find_all(y, x)
  node_text <- xml_text(matched_nodes)
  paste(node_text, collapse = "&&&&")
}
# Flatten an XML document into a one-row data.table: one column per distinct
# terminal xpath, each holding the "&&&&"-joined text of all matching nodes.
#
# x: an xml2 document. Requires the data.table package to be attached.
# Returns a one-row data.table whose column names are the xpaths with "/"
# replaced by "_".
format_file <- function(x) {
  x <- xml_ns_strip(x)
  data_path <- data.table(x = xml_children(x))

  # One group per top-level child. The `by` expression also produces the
  # helper column `nrow`, which is dropped again below.
  data_xpath <- data_path[, extract_all_paths(x), by = 1:nrow(data_path)]

  # Drop predicates such as "[2]" so that repeated siblings share one path,
  # then keep each path once.
  data_xpath[, V1 := gsub("\\[(.*?)\\]", "", V1)]
  data_xpath <- data_xpath[!duplicated(V1)]

  # Carry the document along in a list column so that every row can be
  # evaluated against it.
  data_xpath[, V2 := list(list(x)), by = seq_len(nrow(data_xpath))]
  data_xpath[
    ,
    value := function_extract_values(V1, V2[[1]]),
    by = seq_len(nrow(data_xpath))
  ]

  # Turn the xpaths into column names.
  data_xpath[, V1 := gsub("/", "_", V1, fixed = TRUE)]
  data_names <- data_xpath$V1

  # Only the `value` column is left before transposing into a single row.
  data_xpath[, c("V1", "nrow", "V2") := NULL]
  data_xpath <- transpose(data_xpath)
  setnames(data_xpath, data_names)
  data_xpath
}

# Parse the document into a single-row table.
data <- format_file(doc)
本质上我想解析一个 .xml 文件,然后将其作为一行放入 data.table。我目前的解决方案在文件很多时会很慢,也许有人可以建议一些更快的解决方案。
可能有更好的方法从文档中获取完整的 xpath 列表,但这里提供一种解决方案。 (也可能有更好的方法来遍历 xml 文档以获得您想要的内容,但您要求提供所有 xpath 的列表):
library(XML) # This may work in xml2 but I usually stick with XML

# Read the document into R and select the root.
myXML <- xmlTreeParse("myXML.xml", useInternalNodes = TRUE)
top <- xmlRoot(myXML)

# Convert the XML to a list of lists.
temp <- xmlToList(top)

# Use the names produced by a recursive apply to list every recursive step
# through the XML.
temp <- unique(names(rapply(temp, summary, how = "unlist")))

# Remove the last item, which was created by the summary function.
temp <- unique(sub("\\.[^.]*$", "", temp))

# Remove attributes (matched literally, not as a regular expression).
temp <- unique(sub("..attrs", "", temp, fixed = TRUE))

# Substitute every "." with "/" to create the xpath.
# NOTE(review): element names that themselves contain "." would be split
# here as well.
temp <- gsub(".", "/", temp, fixed = TRUE)

# Add "/" to start the xpath at the document root.
# NOTE(review): the names come from the list built from `top`, so they may
# not include the root element's own name -- confirm whether it has to be
# prepended for the paths to resolve from the document root.
XPaths <- paste0("/", temp)