xml2:在同一行中的指定文本之后抓取数组中的文本
xml2: grab text in array after specified text in same row
我有一个 XML,其中包含多个 envelope 元素。每个 envelope 里都有一个数组,数组中的每一行有 2 个元素:第一个是标识符,第二个是我要抓取的文本。我需要用每行的第一个值来识别正确的行,然后才能取到对应的值。
在下面的示例中,代码 610954 所在的行表示 'food',我想获取该代码之后的两个值,即 c('pizza', 'burger')。同样,代码 605380 表示 'drinks',我想获取 c('coke', 'pepsi')。如何使用 xml2 包来实现?
library(xml2)
library(magrittr)
# Build the sample document: two <envelope> elements, each holding a
# <card-entries> array whose <card-entry> rows pair a <card-id> with a <value>.
myxml <- read_xml('
<inside>
<envelope>
<card-entries type="array">
<card-entry>
<card-id type="integer">605380</card-id>
<value>coke</value>
</card-entry>
<card-entry>
<card-id type="integer">610954</card-id>
<value>pizza</value>
</card-entry>
</card-entries>
</envelope>
<envelope>
<card-entries type="array">
<card-entry>
<card-id type="integer">605380</card-id>
<value>pepsi</value>
</card-entry>
<card-entry>
<card-id type="integer">610954</card-id>
<value>burger</value>
</card-entry>
</card-entries>
</envelope>
</inside>
'
)
## as far as I can parse it (but not specific enough)
# Selects every card-entry row and returns each row's combined text, so the
# card-id and the value are neither separated nor filtered by id yet.
myxml %>%
xml_find_all('//envelope/card-entries[@type="array"]/card-entry') %>%
xml_text()
# Placeholders for the desired results (pseudo-code, not runnable R):
food <- -CODE THAT GIVES HERE c('pizza', 'burger')- # 610954
drinks <- -CODE THAT GIVES HERE c('coke', 'pepsi')- # 605380
你原来的方法可以这样修改来得到饮料:
# Same path as the original attempt, narrowed down: the [card-id = "605380"]
# predicate keeps only the drinks rows, and the trailing /value step selects
# the text element of each kept row.
xml_text(
  xml_find_all(
    myxml,
    '//envelope/card-entries[@type="array"]/card-entry[card-id = "605380"]/value'
  )
)
#[1] "coke" "pepsi"
不过,您也可以采用其他多种方法:
# Option 1 (foods): from each matching card-id, step sideways to the sibling
# element named value.
xml_text(
  xml_find_all(myxml, '//card-id[text()="610954"]/following-sibling::value')
)
#[1] "pizza" "burger"
# Option 2 (foods): use the following:: axis. The [1] is required: without it
# every later value in the document is returned, including "pepsi". With
# value[1] only the first value after each matching card-id is kept.
xml_text(
  xml_find_all(myxml, '//card-id[text()="610954"]/following::value[1]')
)
#[1] "pizza" "burger"
# Option 3 (drinks): start from the value nodes and keep those that have a
# preceding sibling card-id with the wanted id.
xml_text(
  xml_find_all(myxml, '//value[preceding-sibling::card-id[text()="605380"]]')
)
#[1] "coke" "pepsi"
# Option 4 (drinks): take the value child of card-entry nodes whose card-id
# matches, looking specifically inside envelope elements.
xml_text(
  xml_find_all(
    myxml,
    '//envelope/card-entries/card-entry[card-id = "605380"]/value'
  )
)
#[1] "coke" "pepsi"
# Option 5 (drinks): the same filter, but less specific about where the
# card-entry nodes sit in the document.
xml_text(
  xml_find_all(myxml, '//card-entry[card-id = "605380"]/value')
)
#[1] "coke" "pepsi"
或者试试下面这种方法:
library(tidyverse)
library(stringr)
# Collect every card-entry row, then read the id and the value from their own
# child elements. This avoids slicing the concatenated row text at a fixed
# width (which only works while every card-id is exactly 6 characters long)
# and gives a well-formed table for zero or one matching rows as well.
entries <- xml_find_all(
  myxml,
  '//envelope/card-entries[@type="array"]/card-entry'
)
# xml_find_first() returns one node per entry, so both columns stay aligned
# with the rows; an entry lacking the child yields NA instead of shifting
# later values up. Column names V1/V2 are kept from the earlier version.
tibble(
  V1 = xml_text(xml_find_first(entries, "./card-id")),
  V2 = xml_text(xml_find_first(entries, "./value"))
) %>%
  # V1 is character, so compare against a string rather than relying on
  # implicit number-to-string coercion.
  mutate(type = ifelse(V1 == "605380", yes = "drinks", no = "food"))
然后您可以轻松地将饮料和食物分开。
我有一个 XML,其中包含多个 envelope 元素。每个 envelope 里都有一个数组,数组中的每一行有 2 个元素:第一个是标识符,第二个是我要抓取的文本。我需要用每行的第一个值来识别正确的行,然后才能取到对应的值。
在下面的示例中,代码 610954 所在的行表示 'food',我想获取该代码之后的两个值,即 c('pizza', 'burger')。同样,代码 605380 表示 'drinks',我想获取 c('coke', 'pepsi')。如何使用 xml2 包来实现?
library(xml2)
library(magrittr)
# Build the sample document: two <envelope> elements, each holding a
# <card-entries> array whose <card-entry> rows pair a <card-id> with a <value>.
myxml <- read_xml('
<inside>
<envelope>
<card-entries type="array">
<card-entry>
<card-id type="integer">605380</card-id>
<value>coke</value>
</card-entry>
<card-entry>
<card-id type="integer">610954</card-id>
<value>pizza</value>
</card-entry>
</card-entries>
</envelope>
<envelope>
<card-entries type="array">
<card-entry>
<card-id type="integer">605380</card-id>
<value>pepsi</value>
</card-entry>
<card-entry>
<card-id type="integer">610954</card-id>
<value>burger</value>
</card-entry>
</card-entries>
</envelope>
</inside>
'
)
## as far as I can parse it (but not specific enough)
# Selects every card-entry row and returns each row's combined text, so the
# card-id and the value are neither separated nor filtered by id yet.
myxml %>%
xml_find_all('//envelope/card-entries[@type="array"]/card-entry') %>%
xml_text()
# Placeholders for the desired results (pseudo-code, not runnable R):
food <- -CODE THAT GIVES HERE c('pizza', 'burger')- # 610954
drinks <- -CODE THAT GIVES HERE c('coke', 'pepsi')- # 605380
你原来的方法可以这样修改来得到饮料:
# Same path as the original attempt, narrowed down: the [card-id = "605380"]
# predicate keeps only the drinks rows, and the trailing /value step selects
# the text element of each kept row.
xml_text(
  xml_find_all(
    myxml,
    '//envelope/card-entries[@type="array"]/card-entry[card-id = "605380"]/value'
  )
)
#[1] "coke" "pepsi"
不过,您也可以采用其他多种方法:
# Option 1 (foods): from each matching card-id, step sideways to the sibling
# element named value.
xml_text(
  xml_find_all(myxml, '//card-id[text()="610954"]/following-sibling::value')
)
#[1] "pizza" "burger"
# Option 2 (foods): use the following:: axis. The [1] is required: without it
# every later value in the document is returned, including "pepsi". With
# value[1] only the first value after each matching card-id is kept.
xml_text(
  xml_find_all(myxml, '//card-id[text()="610954"]/following::value[1]')
)
#[1] "pizza" "burger"
# Option 3 (drinks): start from the value nodes and keep those that have a
# preceding sibling card-id with the wanted id.
xml_text(
  xml_find_all(myxml, '//value[preceding-sibling::card-id[text()="605380"]]')
)
#[1] "coke" "pepsi"
# Option 4 (drinks): take the value child of card-entry nodes whose card-id
# matches, looking specifically inside envelope elements.
xml_text(
  xml_find_all(
    myxml,
    '//envelope/card-entries/card-entry[card-id = "605380"]/value'
  )
)
#[1] "coke" "pepsi"
# Option 5 (drinks): the same filter, but less specific about where the
# card-entry nodes sit in the document.
xml_text(
  xml_find_all(myxml, '//card-entry[card-id = "605380"]/value')
)
#[1] "coke" "pepsi"
或者试试下面这种方法:
library(tidyverse)
library(stringr)
# Collect every card-entry row, then read the id and the value from their own
# child elements. This avoids slicing the concatenated row text at a fixed
# width (which only works while every card-id is exactly 6 characters long)
# and gives a well-formed table for zero or one matching rows as well.
entries <- xml_find_all(
  myxml,
  '//envelope/card-entries[@type="array"]/card-entry'
)
# xml_find_first() returns one node per entry, so both columns stay aligned
# with the rows; an entry lacking the child yields NA instead of shifting
# later values up. Column names V1/V2 are kept from the earlier version.
tibble(
  V1 = xml_text(xml_find_first(entries, "./card-id")),
  V2 = xml_text(xml_find_first(entries, "./value"))
) %>%
  # V1 is character, so compare against a string rather than relying on
  # implicit number-to-string coercion.
  mutate(type = ifelse(V1 == "605380", yes = "drinks", no = "food"))
然后您可以轻松地将饮料和食物分开。