xml2 从 .atomsvc 文件中提取 URL
xml2 extract URL from .atomsvc file
我正在尝试抓取一个公共数据源,该数据源通过 .atomsvc 文件让用户在 Excel 中配置数据连接。我在 R 中用 XML 包写了一个非常脆弱的解析器来提取 URL。我想知道如何用 xml2 完成同样的事情(最好更简洁、更优雅)。
下面是我使用 XML 包的做法:
# Crystal Reports Parser Sample
library(XML)
library(dplyr)
# Get the .atomsvc file from the Export to Data Feed Option on the PA DEP website.
# Every `&` inside the href attribute is written as `&amp;`: a bare `&` is not
# well-formed XML and makes the parser reject the whole document.
pa_string <- '<?xml version="1.0" encoding="utf-8" standalone="yes"?><service xmlns:atom="http://www.w3.org/2005/Atom" xmlns:app="http://www.w3.org/2007/app" xmlns="http://www.w3.org/2007/app"><workspace><atom:title>Oil_Gas_Well_Production</atom:title><collection href="http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&amp;P_PERIOD_ID=198&amp;P_COUNTY%3Aisnull=True&amp;P_CLIENT%3Aisnull=True&amp;P_PERMIT_NUM%3Aisnull=True&amp;P_OGO_NUM%3Aisnull=True&amp;P_PRODUCING%3Aisnull=True&amp;rs%3AParameterLanguage=&amp;rs%3ACommand=Render&amp;rs%3AFormat=ATOM&amp;rc%3ADataFeed=xAx0x2"><atom:title>Tablix1</atom:title></collection></workspace></service>'
# Parse the document and convert it to a nested R list.
pa_list <- pa_string %>%
  xmlParse() %>%
  xmlToList()
# Extract the URL: pick the `href` attribute by name rather than decoding the
# whole attribute vector, then undo the percent-encoding.
URL <- URLdecode(pa_list$workspace$collection$.attrs[["href"]])
这是我目前写出的 xml2 版本:
# Crystal Reports xml2 Parser
library(xml2)
library(dplyr)
# Get the .atomsvc file from the Export to Data Feed Option on the PA DEP website.
# Every `&` inside the href attribute is written as `&amp;`: a bare `&` is not
# well-formed XML and makes read_xml() fail before anything can be extracted.
pa_string <- '<service xmlns:atom="http://www.w3.org/2005/Atom" xmlns:app="http://www.w3.org/2007/app" xmlns="http://www.w3.org/2007/app"><workspace><atom:title>Oil_Gas_Well_Production</atom:title><collection href="http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&amp;P_PERIOD_ID=198&amp;P_COUNTY%3Aisnull=True&amp;P_CLIENT%3Aisnull=True&amp;P_PERMIT_NUM%3Aisnull=True&amp;P_OGO_NUM%3Aisnull=True&amp;P_PRODUCING%3Aisnull=True&amp;rs%3AParameterLanguage=&amp;rs%3ACommand=Render&amp;rs%3AFormat=ATOM&amp;rc%3ADataFeed=xAx0x2"><atom:title>Tablix1</atom:title></collection></workspace></service>'
# Parse the document and convert it to a nested R list.
pa_list <- pa_string %>%
  read_xml() %>%
  as_list()
我不知道如何从这里提取 URL,或者这是否是正确的思考方式。任何帮助将不胜感激!
这是一种方法,通过从指定节点中提取属性:
library(xml2)
library(tidyverse)
# Every `&` inside the href attribute is written as `&amp;`: a bare `&` is not
# well-formed XML and makes read_xml() fail. The parsed attribute value still
# contains plain `&`.
pa_string <- '<?xml version="1.0" encoding="utf-8" standalone="yes"?><service xmlns:atom="http://www.w3.org/2005/Atom" xmlns:app="http://www.w3.org/2007/app" xmlns="http://www.w3.org/2007/app"><workspace><atom:title>Oil_Gas_Well_Production</atom:title><collection href="http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&amp;P_PERIOD_ID=198&amp;P_COUNTY%3Aisnull=True&amp;P_CLIENT%3Aisnull=True&amp;P_PERMIT_NUM%3Aisnull=True&amp;P_OGO_NUM%3Aisnull=True&amp;P_PRODUCING%3Aisnull=True&amp;rs%3AParameterLanguage=&amp;rs%3ACommand=Render&amp;rs%3AFormat=ATOM&amp;rc%3ADataFeed=xAx0x2"><atom:title>Tablix1</atom:title></collection></workspace></service>'
# Find every element named `collection` and read its `href` attribute.
pa_string %>%
  read_xml() %>%
  xml_find_all("//*[name()='collection']") %>%
  xml_attr("href")
#output
[1] "http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&P_PERIOD_ID=198&P_COUNTY%3Aisnull=True&P_CLIENT%3Aisnull=True&P_PERMIT_NUM%3Aisnull=True&P_OGO_NUM%3Aisnull=True&P_PRODUCING%3Aisnull=True&rs%3AParameterLanguage=&rs%3ACommand=Render&rs%3AFormat=ATOM&rc%3ADataFeed=xAx0x2"
xpath:
#// - Recursive descent; searches for the specified element at any depth.
#* - Matches any element node
#[ ] - Applies a filter pattern.
#name()='collection' - self explanatory
更短:
# Parse the string, then select every attribute named `href`,
# wherever it occurs in the document.
xml_find_all(read_xml(pa_string), "//@href")
由于文档中只有一个元素带有属性,也可以这样写:
# Parse the string, then match every attribute node, whatever its name.
xml_find_all(read_xml(pa_string), "//@*")
我正在尝试抓取一个公共数据源,该数据源通过 .atomsvc 文件让用户在 Excel 中配置数据连接。我在 R 中用 XML 包写了一个非常脆弱的解析器来提取 URL。我想知道如何用 xml2 完成同样的事情(最好更简洁、更优雅)。
下面是我使用 XML 包的做法:
# Crystal Reports Parser Sample
library(XML)
library(dplyr)
# Get the .atomsvc file from the Export to Data Feed Option on the PA DEP website.
# Every `&` inside the href attribute is written as `&amp;`: a bare `&` is not
# well-formed XML and makes the parser reject the whole document.
pa_string <- '<?xml version="1.0" encoding="utf-8" standalone="yes"?><service xmlns:atom="http://www.w3.org/2005/Atom" xmlns:app="http://www.w3.org/2007/app" xmlns="http://www.w3.org/2007/app"><workspace><atom:title>Oil_Gas_Well_Production</atom:title><collection href="http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&amp;P_PERIOD_ID=198&amp;P_COUNTY%3Aisnull=True&amp;P_CLIENT%3Aisnull=True&amp;P_PERMIT_NUM%3Aisnull=True&amp;P_OGO_NUM%3Aisnull=True&amp;P_PRODUCING%3Aisnull=True&amp;rs%3AParameterLanguage=&amp;rs%3ACommand=Render&amp;rs%3AFormat=ATOM&amp;rc%3ADataFeed=xAx0x2"><atom:title>Tablix1</atom:title></collection></workspace></service>'
# Parse the document and convert it to a nested R list.
pa_list <- pa_string %>%
  xmlParse() %>%
  xmlToList()
# Extract the URL: pick the `href` attribute by name rather than decoding the
# whole attribute vector, then undo the percent-encoding.
URL <- URLdecode(pa_list$workspace$collection$.attrs[["href"]])
这是我目前写出的 xml2 版本:
# Crystal Reports xml2 Parser
library(xml2)
library(dplyr)
# Get the .atomsvc file from the Export to Data Feed Option on the PA DEP website.
# Every `&` inside the href attribute is written as `&amp;`: a bare `&` is not
# well-formed XML and makes read_xml() fail before anything can be extracted.
pa_string <- '<service xmlns:atom="http://www.w3.org/2005/Atom" xmlns:app="http://www.w3.org/2007/app" xmlns="http://www.w3.org/2007/app"><workspace><atom:title>Oil_Gas_Well_Production</atom:title><collection href="http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&amp;P_PERIOD_ID=198&amp;P_COUNTY%3Aisnull=True&amp;P_CLIENT%3Aisnull=True&amp;P_PERMIT_NUM%3Aisnull=True&amp;P_OGO_NUM%3Aisnull=True&amp;P_PRODUCING%3Aisnull=True&amp;rs%3AParameterLanguage=&amp;rs%3ACommand=Render&amp;rs%3AFormat=ATOM&amp;rc%3ADataFeed=xAx0x2"><atom:title>Tablix1</atom:title></collection></workspace></service>'
# Parse the document and convert it to a nested R list.
pa_list <- pa_string %>%
  read_xml() %>%
  as_list()
我不知道如何从这里提取 URL,或者这是否是正确的思考方式。任何帮助将不胜感激!
这是一种方法,通过从指定节点中提取属性:
library(xml2)
library(tidyverse)
# Every `&` inside the href attribute is written as `&amp;`: a bare `&` is not
# well-formed XML and makes read_xml() fail. The parsed attribute value still
# contains plain `&`.
pa_string <- '<?xml version="1.0" encoding="utf-8" standalone="yes"?><service xmlns:atom="http://www.w3.org/2005/Atom" xmlns:app="http://www.w3.org/2007/app" xmlns="http://www.w3.org/2007/app"><workspace><atom:title>Oil_Gas_Well_Production</atom:title><collection href="http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&amp;P_PERIOD_ID=198&amp;P_COUNTY%3Aisnull=True&amp;P_CLIENT%3Aisnull=True&amp;P_PERMIT_NUM%3Aisnull=True&amp;P_OGO_NUM%3Aisnull=True&amp;P_PRODUCING%3Aisnull=True&amp;rs%3AParameterLanguage=&amp;rs%3ACommand=Render&amp;rs%3AFormat=ATOM&amp;rc%3ADataFeed=xAx0x2"><atom:title>Tablix1</atom:title></collection></workspace></service>'
# Find every element named `collection` and read its `href` attribute.
pa_string %>%
  read_xml() %>%
  xml_find_all("//*[name()='collection']") %>%
  xml_attr("href")
#output
[1] "http://www.depreportingservices.state.pa.us/ReportServer?%2FOil_Gas%2FOil_Gas_Well_Production&P_PERIOD_ID=198&P_COUNTY%3Aisnull=True&P_CLIENT%3Aisnull=True&P_PERMIT_NUM%3Aisnull=True&P_OGO_NUM%3Aisnull=True&P_PRODUCING%3Aisnull=True&rs%3AParameterLanguage=&rs%3ACommand=Render&rs%3AFormat=ATOM&rc%3ADataFeed=xAx0x2"
xpath:
#// - Recursive descent; searches for the specified element at any depth.
#* - Matches any element node
#[ ] - Applies a filter pattern.
#name()='collection' - self explanatory
更短:
# Parse the string, then select every attribute named `href`,
# wherever it occurs in the document.
xml_find_all(read_xml(pa_string), "//@href")
由于文档中只有一个元素带有属性,也可以这样写:
# Parse the string, then match every attribute node, whatever its name.
xml_find_all(read_xml(pa_string), "//@*")