使用 R 中的 melt 或 dplyr 将 JSON 行与不同数量的列组合在一起,有些未标记
Combine JSON rows with different numbers of columns, some unlabeled, using either melt or dplyr in R
这与上一个问题有关。然而,问题已经演变。我有 JSON 数据,分为三列:"Left"、"Kwic" 和 "Right"。 "Left" 和 "Right" 列有时会进一步细分。此细分在 JSON 文件中表示为 "class"。然而,这个 "class" 通常没有标签。在细分的栏目中,总会有一个名为"coll"的class。
上一个问题是获取 "pre" 和 "post" 列并重命名它们,以便包含到数据框中。但是,现在我们有混合的列,有些被细分,有些则没有。
我想做的是获取未分割的数据并将其添加到中心列"coll"。这适用于左分区和右分区。但是,现在我只能将它们作为单独的列进行捕获。我用 melt 和 dplyr2 尝试过各种方法,但都无济于事。
数据:
structure(list(Left = list(structure(list(class = "", str = " children tend to view authority figures"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "children have a computer . Wireless resources"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "unclear if increases in physical activity are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "filtration pressure . Where recurrent disease is"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "strc", ""), str = c("multiply .", "</p><p>",
"When nevirapine is no longer")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = "", str = "white . We don't provide enough services ,"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = ", a sexually transmitted infection , are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "continuous lowgrade itching and linear lesions"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " radiation oncology community is largely"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "the variability in response time that was"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "incremental cost effectiveness ratio that is"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "Through the use of warming , acrid herbs"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "start using tobacco : psychosocial factors"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "determining the severity because the fetus was"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "coll", ""), str = c("This occurred despite the ",
"significantly", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = "mission to eliminate the suffering and death"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "are more likely to be present , or to be"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "demonstrated primarily pulmonary signs and symptoms"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "criminal involvement . These findings are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "model . There is a danger in using herbs"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class", "str"
), class = "data.frame", row.names = 1L), structure(list(class = "col0 coll",
str = " responsible"), .Names = c("class", "str"), class = "data.frame", row.names = 1L),
structure(list(class = "col0 coll", str = " present"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " responsible"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " consistent"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " responsible"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " less"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " less"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "higher"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " present"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " consistent"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Right = list(
structure(list(class = c("", "coll", ""), str = c(" ", "as",
" physicians and parents as legitimate")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "as", " radio / CD headsets , handheld televisions"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"to", " the physical environment itself , or")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "for", " blockage of lymphatic collaterals ,"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"in", " the blood , the HIV strains that are")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", "", "strc", ""), str = c(" ", "as",
" Spanish services . \"", "</p><p>", "She admits")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c(" ", "for", " the majority of cervical cancer cases"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", "", "strc", ""
), str = c(" ", "with", " vigorous scratching .", "</p><p>",
"Psoriasis")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
5L)), structure(list(class = c("", "coll", ""), str = c(" ",
"for", " having treated hundreds of thousands")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "to", " the distractor-ratio manipulation and"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"than", " £ 30 000 per quality adjusted life")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "as", " aconitum carmichaeli praeparatum ( fu"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"as", " personality or parental role modeling")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "than", " 28 weeks old , and the bilirubin had"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = " level of psychiatric symptoms observed "), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "coll", "", "strc", ""), str = c(" ", "to",
" all cancers by 2015 .", "</p><p>", "The primary")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c(" ", "in", " higher numbers , in sputum cultures "
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", "", "strc"), str = c(" ",
"as", " wheezing and shortness of breath .", "</p>")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 4L)), structure(list(
class = c("", "coll", ""), str = c(" ", "with", " those from DeLeon and Jainchill 's"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"as", " mahuang in highly concentrated extracts")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)))), .Names = c("Left",
"Kwic", "Right"), class = "data.frame", row.names = c(NA, 20L
))
关键的挑战是一些列没有标签,但可以根据数据的结构来识别。下面的代码生成的输出已经非常接近目标,但要消除这点小差异快把我逼疯了。
代码:
## Question's original attempt at tidying the concordance data.
## NOTE(review): `document` and `query` are not defined in this snippet, and
## melt()/dcast() are presumably from reshape2 (no library() call is shown).
## generate raw output
# Keep only the Left, Kwic, and Right columns of document$Lines.
documentdata <- document$Lines[, c("Left", "Kwic", "Right")]
# Append `query` as a SeekID column.
documentdata = cbind(documentdata,SeekID=query)
## generate tidied output
## generate left columns
# Melt the list of Left data frames into one long frame; the dcast() formula
# below relies on its L1 and variable columns.
docx <- melt(documentdata$Left, id.vars = c("class"))
# Row positions directly before and after every "coll" row.
# NOTE(review): the offsets are taken on the whole melted frame and are not
# bounds-checked (a match in the first row yields index 0).
pre <- which(docx$class %in% c("coll")) - 1
post <- which(docx$class %in% c("coll")) + 1
# Relabel those neighbouring rows so dcast() gives them their own columns.
docx$class[pre] = "l.pre"
docx$class[post] = "l.post"
# One row per L1/variable pair, one list-valued column per class value.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
# NOTE(review): "Var.3" is presumably the name dcast() gives the column for
# the empty-string class; confirm against the actual dcast() output.
names(docx)[names(docx)=="Var.3"] <- "l.full"
names(docx)[names(docx)=="coll"] <- "l.coll"
docx.left <- docx[, c("l.full", "l.pre", "l.coll", "l.post")]
# Earlier dplyr-based attempt, left disabled:
#docx.left <- documentdata$Left %>% do.call(rbind, .) %>%
# do(data.frame(l.pre = .[["str"]][which(.[["class"]]=="coll")-1],
# l.coll = .[["str"]][which(.[["class"]]=="coll")],
# l.post = .[["str"]][which(.[["class"]]=="coll")+1]))
## generate center columns
# Melt the Kwic list and keep only the value column, renamed to k.coll;
# drop = FALSE keeps the result a one-column data frame.
docx <- melt(documentdata$Kwic, id.vars = c("class"))
names(docx)[names(docx)=="value"] <- "k.coll"
docx.kwic = docx[, c("k.coll"), drop = FALSE]
## generate right columns
# Same approach as the left side, but only the row after each "coll" row is
# relabelled; the "Var.3" column is renamed to r.pre instead.
docx <- melt(documentdata$Right, id.vars = c("class"))
post <- which(docx$class %in% c("coll")) + 1
docx$class[post] = "r.post"
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
names(docx)[names(docx)=="coll"] <- "r.coll"
names(docx)[names(docx)=="Var.3"] <- "r.pre"
docx.right <- docx[, c("r.pre", "r.coll", "r.post")]
## final output
# Bind the three column sets side by side, add SeekID, then fix the column
# order.
docx.output = cbind(docx.left, docx.kwic, docx.right)
docx.output = cbind(docx.output,SeekID=query)
docx.output <- docx.output[, c("SeekID", "l.full", "l.pre", "l.coll", "l.post", "k.coll", "r.pre", "r.coll", "r.post")]
更正代码(由@cgjeremy 提供)[已解决]
## General parsing function that handles the "r" and "l" sides differently.
##
## Splits one concordance cell into the text before, at, and after the row
## whose class is "coll".
##
## Args:
##   x:    data frame with `class` and `str` columns.
##   side: prefix for the output column names. When `x` has no "coll" row,
##         "l" puts the concatenated text in the pre column, "r" puts it in
##         the post column, and any other value leaves all columns empty.
##
## Returns:
##   A data frame with character columns <side>.pre, <side>.coll and
##   <side>.post: one row per "coll" match, or a single row when there is
##   no match.
myparse <- function(x, side) {
  txt <- as.character(x$str)
  # which() drops NA, so a missing class never counts as a match and never
  # reaches `if` as an NA condition.
  hits <- which(x$class == "coll")

  # Text at each position in `i`, or "" where the position is outside the
  # cell. Guards the cases where "coll" is the first or the last row.
  neighbour <- function(i) {
    out <- rep("", length(i))
    inside <- i >= 1L & i <= length(txt)
    out[inside] <- txt[i[inside]]
    out
  }

  if (length(hits) > 0L) {
    pre <- neighbour(hits - 1L)
    coll <- txt[hits]
    post <- neighbour(hits + 1L)
  } else if (side == "l") {
    pre <- paste0(txt, collapse = "")
    coll <- ""
    post <- ""
  } else if (side == "r") {
    pre <- ""
    coll <- ""
    post <- paste0(txt, collapse = "")
  } else {
    pre <- ""
    coll <- ""
    post <- ""
  }

  # Keep plain character columns regardless of the R version's default.
  z <- data.frame(pre, coll, post, stringsAsFactors = FALSE)
  names(z) <- paste0(side, c(".pre", ".coll", ".post"))
  z
}
## Run the parsing function over each side to build the left, mid, and right
## column sets.
library(dplyr)
left <- do.call(rbind, lapply(documentdata$Left, myparse, side = "l"))
mid <- do.call(rbind, documentdata$Kwic)[["str"]]
right <- do.call(rbind, lapply(documentdata$Right, myparse, side = "r"))
## Bind the three column sets side by side to form the final output.
docx.output <- cbind(left, mid, right)
在此先感谢您的帮助或建议,
我不太确定你对右侧的处理规则,但我认为这就是你想要的。
首先让我们定义一个解析函数:
## Split one concordance cell around the row whose class is "coll".
##
## Args:
##   x:    data frame with `class` and `str` columns.
##   side: prefix used for the output column names.
##
## Returns:
##   A data frame with character columns <side>.pre, <side>.coll,
##   <side>.post and <side>.all. When `x` has a "coll" row, pre/coll/post
##   hold the text before, at, and after it (one row per match) and all is
##   "". Otherwise pre/coll/post are "" and all holds the concatenated text.
myparse <- function(x, side) {
  txt <- as.character(x$str)
  # which() drops NA, so a missing class never counts as a match and never
  # reaches `if` as an NA condition.
  hits <- which(x$class == "coll")

  # Text at each position in `i`, or "" where the position is outside the
  # cell. Guards the cases where "coll" is the first or the last row.
  neighbour <- function(i) {
    out <- rep("", length(i))
    inside <- i >= 1L & i <= length(txt)
    out[inside] <- txt[i[inside]]
    out
  }

  if (length(hits) > 0L) {
    pre <- neighbour(hits - 1L)
    coll <- txt[hits]
    post <- neighbour(hits + 1L)
    whole <- ""
  } else {
    pre <- ""
    coll <- ""
    post <- ""
    whole <- paste0(txt, collapse = "")
  }

  # `whole` avoids shadowing base::all(); the output column is still named
  # <side>.all. Keep plain character columns regardless of R version.
  z <- data.frame(pre, coll, post, whole, stringsAsFactors = FALSE)
  names(z) <- paste0(side, c(".pre", ".coll", ".post", ".all"))
  z
}
此函数检查 `documentdata$Left` 或 `Right` 的每个成员:如果存在 `class=="coll"` 的行,我们就进行拆分;否则我们将所有内容粘贴到 `all` 列。
要运行一次,试试 myparse(documentdata$Left[[1]], side="whatever")
你可以把1改成列表的其他成员。
然后我们可以使用 `lapply` 得到左右两部分(它接受 `myparse` 并将其应用于列表的每个元素),再用 `rbind` 把得到的列表合并成一个 data.frame。中间部分更容易:
library(dplyr)
# Parse every Left cell and stack the per-cell results into one data frame.
left <- do.call(rbind, lapply(documentdata$Left, myparse, side = "l"))
# The Kwic cells need no parsing: stack them and keep the text column.
mid <- do.call(rbind, documentdata$Kwic)[["str"]]
# Same as the left side, with the "r" prefix.
right <- do.call(rbind, lapply(documentdata$Right, myparse, side = "r"))
然后我们用 `cbind` 把它们合并在一起:
# Bind the left, mid, and right pieces column-wise. The result is not
# assigned here, so at top level it is simply printed.
cbind(left, mid, right)
这与上一个问题有关。然而,问题已经演变。我有 JSON 数据,分为三列:"Left"、"Kwic" 和 "Right"。 "Left" 和 "Right" 列有时会进一步细分。此细分在 JSON 文件中表示为 "class"。然而,这个 "class" 通常没有标签。在细分的栏目中,总会有一个名为"coll"的class。
我想做的是获取未分割的数据并将其添加到中心列"coll"。这适用于左分区和右分区。但是,现在我只能将它们作为单独的列进行捕获。我用 melt 和 dplyr2 尝试过各种方法,但都无济于事。
数据:
structure(list(Left = list(structure(list(class = "", str = " children tend to view authority figures"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "children have a computer . Wireless resources"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "unclear if increases in physical activity are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "filtration pressure . Where recurrent disease is"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "strc", ""), str = c("multiply .", "</p><p>",
"When nevirapine is no longer")), .Names = c("class", "str"
), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = "", str = "white . We don't provide enough services ,"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = ", a sexually transmitted infection , are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "continuous lowgrade itching and linear lesions"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = " radiation oncology community is largely"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "the variability in response time that was"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "incremental cost effectiveness ratio that is"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "Through the use of warming , acrid herbs"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "start using tobacco : psychosocial factors"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "determining the severity because the fetus was"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "coll", ""), str = c("This occurred despite the ",
"significantly", " ")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = "mission to eliminate the suffering and death"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "are more likely to be present , or to be"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "demonstrated primarily pulmonary signs and symptoms"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "criminal involvement . These findings are"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "", str = "model . There is a danger in using herbs"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Kwic = list(structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class", "str"
), class = "data.frame", row.names = 1L), structure(list(class = "col0 coll",
str = " responsible"), .Names = c("class", "str"), class = "data.frame", row.names = 1L),
structure(list(class = "col0 coll", str = " present"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " responsible"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " consistent"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " responsible"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " less"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " less"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = "higher"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " due"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " present"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " consistent"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = "col0 coll", str = " such"), .Names = c("class",
"str"), class = "data.frame", row.names = 1L)), Right = list(
structure(list(class = c("", "coll", ""), str = c(" ", "as",
" physicians and parents as legitimate")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "as", " radio / CD headsets , handheld televisions"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"to", " the physical environment itself , or")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "for", " blockage of lymphatic collaterals ,"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"in", " the blood , the HIV strains that are")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", "", "strc", ""), str = c(" ", "as",
" Spanish services . \"", "</p><p>", "She admits")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c(" ", "for", " the majority of cervical cancer cases"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", "", "strc", ""
), str = c(" ", "with", " vigorous scratching .", "</p><p>",
"Psoriasis")), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
5L)), structure(list(class = c("", "coll", ""), str = c(" ",
"for", " having treated hundreds of thousands")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "to", " the distractor-ratio manipulation and"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"than", " £ 30 000 per quality adjusted life")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "as", " aconitum carmichaeli praeparatum ( fu"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"as", " personality or parental role modeling")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)), structure(list(
class = c("", "coll", ""), str = c(" ", "than", " 28 weeks old , and the bilirubin had"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = "", str = " level of psychiatric symptoms observed "), .Names = c("class",
"str"), class = "data.frame", row.names = 1L), structure(list(
class = c("", "coll", "", "strc", ""), str = c(" ", "to",
" all cancers by 2015 .", "</p><p>", "The primary")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 5L)), structure(list(
class = c("", "coll", ""), str = c(" ", "in", " higher numbers , in sputum cultures "
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", "", "strc"), str = c(" ",
"as", " wheezing and shortness of breath .", "</p>")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 4L)), structure(list(
class = c("", "coll", ""), str = c(" ", "with", " those from DeLeon and Jainchill 's"
)), .Names = c("class", "str"), class = "data.frame", row.names = c(NA,
3L)), structure(list(class = c("", "coll", ""), str = c(" ",
"as", " mahuang in highly concentrated extracts")), .Names = c("class",
"str"), class = "data.frame", row.names = c(NA, 3L)))), .Names = c("Left",
"Kwic", "Right"), class = "data.frame", row.names = c(NA, 20L
))
关键的挑战是一些列没有标签,但可以根据数据的结构来识别。下面的代码生成的输出已经非常接近目标,但要消除这点小差异快把我逼疯了。
代码:
## Question's original attempt at tidying the concordance data.
## NOTE(review): `document` and `query` are not defined in this snippet, and
## melt()/dcast() are presumably from reshape2 (no library() call is shown).
## generate raw output
# Keep only the Left, Kwic, and Right columns of document$Lines.
documentdata <- document$Lines[, c("Left", "Kwic", "Right")]
# Append `query` as a SeekID column.
documentdata = cbind(documentdata,SeekID=query)
## generate tidied output
## generate left columns
# Melt the list of Left data frames into one long frame; the dcast() formula
# below relies on its L1 and variable columns.
docx <- melt(documentdata$Left, id.vars = c("class"))
# Row positions directly before and after every "coll" row.
# NOTE(review): the offsets are taken on the whole melted frame and are not
# bounds-checked (a match in the first row yields index 0).
pre <- which(docx$class %in% c("coll")) - 1
post <- which(docx$class %in% c("coll")) + 1
# Relabel those neighbouring rows so dcast() gives them their own columns.
docx$class[pre] = "l.pre"
docx$class[post] = "l.post"
# One row per L1/variable pair, one list-valued column per class value.
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
# NOTE(review): "Var.3" is presumably the name dcast() gives the column for
# the empty-string class; confirm against the actual dcast() output.
names(docx)[names(docx)=="Var.3"] <- "l.full"
names(docx)[names(docx)=="coll"] <- "l.coll"
docx.left <- docx[, c("l.full", "l.pre", "l.coll", "l.post")]
# Earlier dplyr-based attempt, left disabled:
#docx.left <- documentdata$Left %>% do.call(rbind, .) %>%
# do(data.frame(l.pre = .[["str"]][which(.[["class"]]=="coll")-1],
# l.coll = .[["str"]][which(.[["class"]]=="coll")],
# l.post = .[["str"]][which(.[["class"]]=="coll")+1]))
## generate center columns
# Melt the Kwic list and keep only the value column, renamed to k.coll;
# drop = FALSE keeps the result a one-column data frame.
docx <- melt(documentdata$Kwic, id.vars = c("class"))
names(docx)[names(docx)=="value"] <- "k.coll"
docx.kwic = docx[, c("k.coll"), drop = FALSE]
## generate right columns
# Same approach as the left side, but only the row after each "coll" row is
# relabelled; the "Var.3" column is renamed to r.pre instead.
docx <- melt(documentdata$Right, id.vars = c("class"))
post <- which(docx$class %in% c("coll")) + 1
docx$class[post] = "r.post"
docx <- dcast(docx, L1 + variable ~ class, fun.aggregate=list)
names(docx)[names(docx)=="coll"] <- "r.coll"
names(docx)[names(docx)=="Var.3"] <- "r.pre"
docx.right <- docx[, c("r.pre", "r.coll", "r.post")]
## final output
# Bind the three column sets side by side, add SeekID, then fix the column
# order.
docx.output = cbind(docx.left, docx.kwic, docx.right)
docx.output = cbind(docx.output,SeekID=query)
docx.output <- docx.output[, c("SeekID", "l.full", "l.pre", "l.coll", "l.post", "k.coll", "r.pre", "r.coll", "r.post")]
更正代码(由@cgjeremy 提供)[已解决]
## General parsing function that handles the "r" and "l" sides differently.
##
## Splits one concordance cell into the text before, at, and after the row
## whose class is "coll".
##
## Args:
##   x:    data frame with `class` and `str` columns.
##   side: prefix for the output column names. When `x` has no "coll" row,
##         "l" puts the concatenated text in the pre column, "r" puts it in
##         the post column, and any other value leaves all columns empty.
##
## Returns:
##   A data frame with character columns <side>.pre, <side>.coll and
##   <side>.post: one row per "coll" match, or a single row when there is
##   no match.
myparse <- function(x, side) {
  txt <- as.character(x$str)
  # which() drops NA, so a missing class never counts as a match and never
  # reaches `if` as an NA condition.
  hits <- which(x$class == "coll")

  # Text at each position in `i`, or "" where the position is outside the
  # cell. Guards the cases where "coll" is the first or the last row.
  neighbour <- function(i) {
    out <- rep("", length(i))
    inside <- i >= 1L & i <= length(txt)
    out[inside] <- txt[i[inside]]
    out
  }

  if (length(hits) > 0L) {
    pre <- neighbour(hits - 1L)
    coll <- txt[hits]
    post <- neighbour(hits + 1L)
  } else if (side == "l") {
    pre <- paste0(txt, collapse = "")
    coll <- ""
    post <- ""
  } else if (side == "r") {
    pre <- ""
    coll <- ""
    post <- paste0(txt, collapse = "")
  } else {
    pre <- ""
    coll <- ""
    post <- ""
  }

  # Keep plain character columns regardless of the R version's default.
  z <- data.frame(pre, coll, post, stringsAsFactors = FALSE)
  names(z) <- paste0(side, c(".pre", ".coll", ".post"))
  z
}
## Run the parsing function over each side to build the left, mid, and right
## column sets.
library(dplyr)
left <- do.call(rbind, lapply(documentdata$Left, myparse, side = "l"))
mid <- do.call(rbind, documentdata$Kwic)[["str"]]
right <- do.call(rbind, lapply(documentdata$Right, myparse, side = "r"))
## Bind the three column sets side by side to form the final output.
docx.output <- cbind(left, mid, right)
在此先感谢您的帮助或建议,
我不太确定你对右侧的处理规则,但我认为这就是你想要的。
首先让我们定义一个解析函数:
## Split one concordance cell around the row whose class is "coll".
##
## Args:
##   x:    data frame with `class` and `str` columns.
##   side: prefix used for the output column names.
##
## Returns:
##   A data frame with character columns <side>.pre, <side>.coll,
##   <side>.post and <side>.all. When `x` has a "coll" row, pre/coll/post
##   hold the text before, at, and after it (one row per match) and all is
##   "". Otherwise pre/coll/post are "" and all holds the concatenated text.
myparse <- function(x, side) {
  txt <- as.character(x$str)
  # which() drops NA, so a missing class never counts as a match and never
  # reaches `if` as an NA condition.
  hits <- which(x$class == "coll")

  # Text at each position in `i`, or "" where the position is outside the
  # cell. Guards the cases where "coll" is the first or the last row.
  neighbour <- function(i) {
    out <- rep("", length(i))
    inside <- i >= 1L & i <= length(txt)
    out[inside] <- txt[i[inside]]
    out
  }

  if (length(hits) > 0L) {
    pre <- neighbour(hits - 1L)
    coll <- txt[hits]
    post <- neighbour(hits + 1L)
    whole <- ""
  } else {
    pre <- ""
    coll <- ""
    post <- ""
    whole <- paste0(txt, collapse = "")
  }

  # `whole` avoids shadowing base::all(); the output column is still named
  # <side>.all. Keep plain character columns regardless of R version.
  z <- data.frame(pre, coll, post, whole, stringsAsFactors = FALSE)
  names(z) <- paste0(side, c(".pre", ".coll", ".post", ".all"))
  z
}
此函数检查 `documentdata$Left` 或 `Right` 的每个成员:如果存在 `class=="coll"` 的行,我们就进行拆分;否则我们将所有内容粘贴到 `all` 列。
要运行一次,试试 myparse(documentdata$Left[[1]], side="whatever")
你可以把1改成列表的其他成员。
然后我们可以使用 `lapply` 得到左右两部分(它接受 `myparse` 并将其应用于列表的每个元素),再用 `rbind` 把得到的列表合并成一个 data.frame。中间部分更容易:
library(dplyr)
# Parse every Left cell and stack the per-cell results into one data frame.
left <- do.call(rbind, lapply(documentdata$Left, myparse, side = "l"))
# The Kwic cells need no parsing: stack them and keep the text column.
mid <- do.call(rbind, documentdata$Kwic)[["str"]]
# Same as the left side, with the "r" prefix.
right <- do.call(rbind, lapply(documentdata$Right, myparse, side = "r"))
然后我们用 `cbind` 把它们合并在一起:
# Bind the left, mid, and right pieces column-wise. The result is not
# assigned here, so at top level it is simply printed.
cbind(left, mid, right)