Create data frame from TEI XML using xml2
I'm trying to use Hadley Wickham's xml2 package to create a data frame from a TEI-XML edition of Moby Dick. I would like the data frame to end up looking like this (for all the words in the novel):
df <- data.frame(
  chapter = c("1", "1", "1"),
  words = c("call", "me", "ishmael"))
I can get the pieces, but not the whole thing. Here is what I have so far:
library("xml2")
# Read file
melville <- read_xml("data/melville.xml")
# Get chapter divs (remember, doesn't include epilogue)
chap_frames <- xml_find_all(melville, "//d1:div1[@type='chapter']", xml_ns(melville))
This gives us a list of length 134 (i.e., one element per chapter). We can get the chapter number of a particular element like this:
xml_attr(chap_frames[[1]], "n")
We can get the paragraphs of a particular chapter (i.e., minus the chapter heading) like this:
words <- xml_find_all(chap_frames[[1]], ".//d1:p", xml_ns(melville)) %>% # remember doesn't include epilogue
  xml_text()
And we can get the words of the chapter like this:
# Split words function
split_words <- function(ll) {
  result <- unlist(strsplit(ll, "\\W+"))
  result <- result[result != ""]
  tolower(result)
}
# Apply function
words <- split_words(words)
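For illustration, split_words() on a short string behaves like this (result shown for reference only):
split_words("Call me Ishmael.")
## [1] "call"    "me"      "ishmael"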
What I can't figure out is how to get the chapter number for each word. I have a toy example that works:
mini <- read_xml(
'
<div1 type="chapter" n="1" id="_75784">
<head>Loomings</head>
<p rend="fiction">Call me Ishmael.</p>
<p rend="fiction">There now is your insular city of the Manhattoes, belted round by wharves as Indian isles by coral reefs- commerce surrounds it with her surf.</p>
</div1>
')
# Function
process_chap <- function(div){
  chapter <- xml_attr(div, "n")
  words <- xml_find_all(div, "//p") %>%
    xml_text()
  data.frame(chapter = chapter,
             word = split_words(words))
}
process_chap(mini)
But it doesn't work on the longer example:
process_chap2 <- function(div){
  chapter <- xml_attr(div, "n")
  words <- xml_find_all(div, ".//d1:p", xml_ns(melville)) %>% # remember doesn't include epilogue
    xml_text()
  data.frame(chapter = chapter,
             word = split_words(words))
}
# Fails because there are more words than chapter names
df <- process_chap2(chap_frames)
# Gives all the words from every p (not split by chapter); chapter number is `NULL`.
df2 <- process_chap2(melville)
(I understand why the toy example works and the Melville one doesn't, but I wanted to include it to show what I'm trying to do.) I'm guessing I may need some kind of loop, but I'm not sure where to begin. Any suggestions?
PS: I'm not entirely sure whether I should link to the XML version of Moby Dick that I found on GitHub, but you can find it easily by searching for melville1.xml.
The approach is to pull the data for one chapter at a time, then combine that chapter's words with its chapter number into a data frame. R will recycle the single chapter-number value as many times as needed:
words <- letters[1:3]
n <- 1
df <- data.frame(words, n)
df
## words n
## 1 a 1
## 2 b 1
## 3 c 1
Once you have gathered the information for all chapters into tidy data frames, you can combine them into one big data frame with rbind().
Here is what that looks like for the first two chapters of your data....
library(xml2)
library(dplyr)
library(stringr)
# Read file
url <- "https://raw.githubusercontent.com/reganna/TextAnalysisWithR/master/data/XML1/melville1.xml"
melville <- read_xml(url)
# get chapter frame and number
chap_frames <- xml_find_all(melville, "//d1:div1[@type='chapter']", xml_ns(melville))
chap_n <- xml_attr(chap_frames, "n")
# get the data for the first chapter
words1 <-
  xml_find_all(chap_frames[[1]], ".//d1:p", xml_ns(melville)) %>%
  xml_text() %>%
  unlist() %>%
  str_split("\\W+") %>%
  unlist() %>%
  tolower()
n1 <- xml_attr(chap_frames[[1]], "n")
# get the data for the second chapter
words2 <-
  xml_find_all(chap_frames[[2]], ".//d1:p", xml_ns(melville)) %>%
  xml_text() %>%
  unlist() %>%
  str_split("\\W+") %>%
  unlist() %>%
  tolower()
n2 <- xml_attr(chap_frames[[2]], "n")
# put it together
df <-
  rbind(
    data_frame(words=words1, chapter=n1),
    data_frame(words=words2, chapter=n2)
  )
df
## Source: local data frame [3,719 x 2]
##
## words chapter
## 1 call 1
## 2 me 1
## 3 ishmael 1
## 4 some 1
## 5 years 1
## 6 ago 1
## 7 never 1
## 8 mind 1
## 9 how 1
## 10 long 1
## .. ... ...
To do this more efficiently for all the chapters, you could either build a loop that repeats these steps for every chapter, or write a function that does the extraction, apply it to all chapters, and rbind() the pieces together afterwards.
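For completeness, a loop version might look roughly like this (just a sketch, reusing chap_frames, the namespace lookup, and the str_split() pipeline from above):
# loop version (sketch)
chapter_list <- vector("list", length(chap_frames))
for (i in seq_along(chap_frames)) {
  words <-
    xml_find_all(chap_frames[[i]], ".//d1:p", xml_ns(melville)) %>%
    xml_text() %>%
    str_split("\\W+") %>%
    unlist() %>%
    tolower()
  chapter_list[[i]] <- data_frame(words = words,
                                  chapter = xml_attr(chap_frames[[i]], "n"))
}
df_all <- do.call(rbind, chapter_list)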
The function approach, though, is probably how I would do it:
# building function
extract_data <- function(chapter_frame){
  words <-
    xml_find_all(chapter_frame, ".//d1:p", xml_ns(melville)) %>%
    xml_text() %>%
    unlist() %>%
    str_split("\\W+") %>%
    unlist() %>%
    tolower()
  n <- xml_attr(chapter_frame, "n")
  pos <- seq_along(words)  # running position of the word within the chapter
  data_frame(words, chapter=n, paragraph=pos)
}
# using function
chapter_words <-
  lapply(chap_frames, extract_data)
# `rbind()`ing data
chapter_words <- do.call(rbind, chapter_words)
chapter_words
## Source: local data frame [216,669 x 3]
##
## words chapter paragraph
## 1 call 1 1
## 2 me 1 2
## 3 ishmael 1 3
## 4 some 1 4
## 5 years 1 5
## 6 ago 1 6
## 7 never 1 7
## 8 mind 1 8
## 9 how 1 9
## 10 long 1 10
## .. ... ... ...
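(If you are loading dplyr anyway, note that bind_rows() accepts a list of data frames, so it could replace the do.call(rbind, ...) step in a single call:)
chapter_words <- bind_rows(lapply(chap_frames, extract_data))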