R:句子转 word-table 时行数不匹配
R: Row numbers unmatched for Sentence to word-table
接着我之前的问题,我在不同的行中有一些 texts,并且我试图为文本中的每个单词生成 word-table。但是当文本列的行数与 word-table 的行数不同时,就会出现问题。我发现某些文本会生成两行或更多行,所以最后我无法用 cbind 把这两者合并在一起。代码如下。我只希望结果的行数与文本的行数完全相同,这样我就可以把它们绑定在一起,以显示每个 word-table 对应的是哪条文本。
# Reproduction script for the row-count mismatch described in the question.
# Sample input: 14 free-text strings, some with leading or trailing blanks.
texts <- c(
  "concratulations successfully signed company please find attached quick guide can t figure immediately ",
  " conversation laughing services sweden",
  "p please find attached drafted budget p ",
  "p please finad attached agenda today s board meeting p ",
  "p hi nbsp p p please find attached darft meeting minutes today s meeting p ",
  "p please find attached final version minutes updated action log please let know actions done ll update excel nbsp p ",
  "p hi p p please find attached draft meeting minutes action log please provide comments end next week p p nice spring party saturday p p tuija p ",
  " p welcome team priority hope enjoy yo p ",
  "p please find attached flyer can study share p ",
  "p attached new version voice receiver p p minor change request invitation code mentioned invitation code may tell check code invitation email end alarm bell example telling new comments ",
  "comment etc front page now seemed end without warning p ",
  "p memo attached actions p ",
  "p please find attached updated board roles responsibilities made changes red document please review especially role relevant contact info prepare comment meeting wednesday nbsp p ",
  "p attached documents review please comment soonest p "
)
# cbind() on a single vector yields a one-column character matrix.
texts <- cbind(texts)
## to remove multi-white spaces
# A backslash inside an R string literal has to be doubled: "\s" is not a
# recognised escape and prevents the whole script from being parsed.
MyDf <- gsub("\\s+", " ", texts)
MyDf <- gsub("\r?\n|\r", " ", MyDf)
MyDf <- cbind(MyDf)
colnames(MyDf) <- c("Introduction")
## this way, extra rows are being generated
# Attempt 1 (kept to show the problem): with fill = TRUE read.table() sizes
# the table from the first five input lines only, so a later line holding
# more words than that is continued on additional rows.
word_table <- read.table(
  text = paste(gsub("\n", " ", MyDf), collapse = "\n"),
  fill = TRUE
)
## this way, the words are being repeated to match with the largest text
# Attempt 2 (kept to show the problem): rbind() recycles the shorter word
# vectors to the length of the longest one, which repeats words.
word_table <- do.call(rbind, strsplit(as.character(MyDf), " "))
更多细节:文本有多个空格或制表符。最初的假设是,可能是额外的空格造成了问题,但在删除额外的空格后,它仍然是同一个问题。
请帮忙
解决方案:这要归功于 breaker 和 cSplit 函数。
# Input for the answer: the same 14 sample strings as in the question.
texts <- c(
  "concratulations successfully signed company please find attached quick guide can t figure immediately ",
  " conversation laughing services sweden",
  "p please find attached drafted budget p ",
  "p please finad attached agenda today s board meeting p ",
  "p hi nbsp p p please find attached darft meeting minutes today s meeting p ",
  "p please find attached final version minutes updated action log please let know actions done ll update excel nbsp p ",
  "p hi p p please find attached draft meeting minutes action log please provide comments end next week p p nice spring party saturday p p tuija p ",
  " p welcome team priority hope enjoy yo p ",
  "p please find attached flyer can study share p ",
  "p attached new version voice receiver p p minor change request invitation code mentioned invitation code may tell check code invitation email end alarm bell example telling new comments ",
  "comment etc front page now seemed end without warning p ",
  "p memo attached actions p ",
  "p please find attached updated board roles responsibilities made changes red document please review especially role relevant contact info prepare comment meeting wednesday nbsp p ",
  "p attached documents review please comment soonest p "
)
# cbind() on a single vector yields a one-column character matrix.
texts <- cbind(texts)
## to remove multi-white spaces
# A backslash inside an R string literal has to be doubled: "\s" is not a
# recognised escape and prevents the whole script from being parsed.
MyDf <- gsub("\\s+", " ", texts)
MyDf <- gsub("\r?\n|\r", " ", MyDf)
MyDf <- cbind(MyDf)
colnames(MyDf) <- c("Introduction")
# Plain matrix copy of the first column of `texts`, without dimnames.
# Note that it is built from the raw `texts`, not from the cleaned `MyDf`.
n <- matrix(texts[, 1], nrow = nrow(texts), ncol = ncol(texts))
library(splitstackshape)
library(data.table)
# Tokenise every element of X. A split happens at each single whitespace
# character and also right before ".", "!" or "?" (zero-width lookahead, which
# is why the pattern is evaluated with perl = TRUE).
# Returns the list produced by strsplit(): one character vector per element.
breaker <- function(X) {
  token_boundary <- "[[:space:]]|(?=[.!?])"
  strsplit(X, split = token_boundary, perl = TRUE)
}
# Tokenise every text and bind the resulting list into a single column
# named "aaa".
aaa <- cbind(aaa = breaker(n))
#############################################################################################
# Split delimited columns of a table into several columns ("wide") or
# several rows ("long").
#
# Arguments:
#   indt        Input table. Anything that is not already a data.table is
#               converted with setDT().
#   splitCols   Names, or numeric positions, of the columns to split.
#   sep         Separator: one value shared by all split columns, or one value
#               per entry of splitCols (any other length is an error).
#   direction   "wide" (default) or "long".
#   makeEqual   Whether every row is padded with NA to the same number of
#               pieces. Always forced to TRUE for "wide"; when NULL and the
#               direction is "long" it is derived from the piece counts.
#   fixed       Passed to strsplit(): TRUE treats sep as a literal string.
#   drop        For "wide": remove the original split columns when TRUE.
#   stripWhite  When TRUE, whitespace around each separator is removed before
#               splitting.
#
# Returns the reshaped table.
cSplit <- function(indt, splitCols, sep = ",", direction = "wide",
                   makeEqual = NULL, fixed = TRUE, drop = TRUE,
                   stripWhite = FALSE) {
  message("`cSplit` is now part of the 'splitstackshape' package (V1.4.0)")
  ## requires data.table >= 1.8.11
  # library() stops immediately when data.table is missing; require() would
  # only return FALSE and let the function fail later with an obscure error.
  library(data.table)
  if (!is.data.table(indt)) setDT(indt)
  if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
  # Non-character split columns are coerced so that strsplit() accepts them.
  if (any(!vapply(indt[, splitCols, with = FALSE],
                  is.character, logical(1L)))) {
    indt[, eval(splitCols) := lapply(.SD, as.character),
         .SDcols = splitCols]
  }
  if (length(sep) == 1)
    sep <- rep(sep, length(splitCols))
  if (length(sep) != length(splitCols)) {
    stop("Verify you have entered the correct number of sep")
  }
  if (isTRUE(stripWhite)) {
    # The backslashes must be doubled: "\s" is not a valid escape in an R
    # string literal and stops the file from parsing.
    indt[, eval(splitCols) := mapply(function(x, y)
      gsub(sprintf("\\s+%s\\s+|\\s+%s|%s\\s+",
                   x, x, x), x, y),
      sep, indt[, splitCols, with = FALSE],
      SIMPLIFY = FALSE)]
  }
  # X holds, per split column, a list with the pieces of every row.
  X <- lapply(seq_along(splitCols), function(x) {
    strsplit(indt[[splitCols[x]]], split = sep[x], fixed = fixed)
  })
  if (direction == "long") {
    if (is.null(makeEqual)) {
      # Derive makeEqual from the per-row piece counts of the split columns.
      IV <- function(x, y) identical(x, y)
      makeEqual <- ifelse(Reduce(IV, rapply(X, length, how = "list")),
                          FALSE, TRUE)
    }
  } else if (direction == "wide") {
    # The wide layout always needs padding; tell the caller when an explicit
    # non-TRUE value is being overridden.
    if (!is.null(makeEqual) && !isTRUE(makeEqual)) {
      message("makeEqual specified as FALSE but set to TRUE")
    }
    makeEqual <- TRUE
  }
  if (isTRUE(makeEqual)) {
    SetUp <- lapply(seq_along(X), function(y) {
      A <- vapply(X[[y]], length, 1L)
      # Mat is a two-column (row, piece position) index; Val the flat pieces.
      list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
           Val = unlist(X[[y]]))
    })
    # Widest row over all split columns decides the number of output columns.
    Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]),
                       use.names = FALSE))
    X <- lapply(seq_along(SetUp), function(y) {
      # NA-filled matrix, then every piece is placed by (row, position).
      M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
      M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
      M
    })
    if (direction == "wide") {
      # New columns are named <original column>_<position>.
      X <- lapply(seq_along(X), function(x) {
        colnames(X[[x]]) <- paste(splitCols[x],
                                  sequence(ncol(X[[x]])),
                                  sep = "_")
        X[[x]]
      })
      if (isTRUE(drop)) {
        cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
      } else {
        cbind(indt, do.call(cbind, X))
      }
    } else {
      # Every input row is repeated Ncol times and receives one piece per copy.
      indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
      X <- lapply(X, function(y) as.vector(t(y)))
      indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
    }
  } else {
    # No padding: each row is repeated once per piece of the first split column.
    Rep <- vapply(X[[1]], length, integer(1L))
    indt <- indt[rep(sequence(nrow(indt)), Rep)]
    indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
  }
}
# Spread the tokens of every text over the columns of a single row, then put
# the original text in front of them.
df <- cSplit(as.data.frame(aaa), "aaa", ",")
df <- data.frame(cbind(texts, df))
######################################################################################
## Heading
Heading <- df[, 1]
## Word Table
df <- df[, 2:ncol(df)]
## first column
aaa_first <- df[, 1]
aaa_first <- cbind(aaa_first)
# Keep everything from the third character on.
# NOTE(review): presumably this removes a leading `c(` left over from turning
# each token vector into text - confirm against the cSplit() output.
# `c` is only a variable here; calls such as c(...) still reach base::c.
c <- substring(aaa_first, 3)
## last column
aaa_end <- df[, ncol(df)]
aaa_end <- cbind(aaa_end)
# Drop the final character of every value in the last column.
# NOTE(review): presumably the closing `)` of the longest row - confirm.
e <- substr(aaa_end, 1, nchar(aaa_end) - 1)
## Middle columns
# The former `3:ncol(df)-1` parses as `(3:ncol(df)) - 1`; the same range
# (second to second-to-last column) is now written out explicitly.
d <- df[, 2:(ncol(df) - 1)]
cc <- cbind(Heading, c, d, e)
## cc <- cbind( c, d, e )
cc <- data.frame(lapply(cc, as.character), stringsAsFactors = FALSE)
# fixed = TRUE matches ")" literally instead of relying on the regex engine
# tolerating an unbalanced parenthesis.
df2 <- as.data.frame(
  sapply(cc, gsub, pattern = ")", replacement = "", fixed = TRUE)
)
# df2 <- as.data.frame(sapply(df2,gsub,pattern="(",replacement=""))
# Finally remove the double quotes around the tokens.
df3 <- as.data.frame(sapply(df2, function(x) gsub("\"", "", x)))
接着我之前的问题,我在不同的行中有一些 texts,并且我试图为文本中的每个单词生成 word-table。但是当文本列的行数与 word-table 的行数不同时,就会出现问题。我发现某些文本会生成两行或更多行,所以最后我无法用 cbind 把这两者合并在一起。代码如下。我只希望结果的行数与文本的行数完全相同,这样我就可以把它们绑定在一起,以显示每个 word-table 对应的是哪条文本。
# Reproduction script for the row-count mismatch described in the question.
# Sample input: 14 free-text strings, some with leading or trailing blanks.
texts <- c(
  "concratulations successfully signed company please find attached quick guide can t figure immediately ",
  " conversation laughing services sweden",
  "p please find attached drafted budget p ",
  "p please finad attached agenda today s board meeting p ",
  "p hi nbsp p p please find attached darft meeting minutes today s meeting p ",
  "p please find attached final version minutes updated action log please let know actions done ll update excel nbsp p ",
  "p hi p p please find attached draft meeting minutes action log please provide comments end next week p p nice spring party saturday p p tuija p ",
  " p welcome team priority hope enjoy yo p ",
  "p please find attached flyer can study share p ",
  "p attached new version voice receiver p p minor change request invitation code mentioned invitation code may tell check code invitation email end alarm bell example telling new comments ",
  "comment etc front page now seemed end without warning p ",
  "p memo attached actions p ",
  "p please find attached updated board roles responsibilities made changes red document please review especially role relevant contact info prepare comment meeting wednesday nbsp p ",
  "p attached documents review please comment soonest p "
)
# cbind() on a single vector yields a one-column character matrix.
texts <- cbind(texts)
## to remove multi-white spaces
# A backslash inside an R string literal has to be doubled: "\s" is not a
# recognised escape and prevents the whole script from being parsed.
MyDf <- gsub("\\s+", " ", texts)
MyDf <- gsub("\r?\n|\r", " ", MyDf)
MyDf <- cbind(MyDf)
colnames(MyDf) <- c("Introduction")
## this way, extra rows are being generated
# Attempt 1 (kept to show the problem): with fill = TRUE read.table() sizes
# the table from the first five input lines only, so a later line holding
# more words than that is continued on additional rows.
word_table <- read.table(
  text = paste(gsub("\n", " ", MyDf), collapse = "\n"),
  fill = TRUE
)
## this way, the words are being repeated to match with the largest text
# Attempt 2 (kept to show the problem): rbind() recycles the shorter word
# vectors to the length of the longest one, which repeats words.
word_table <- do.call(rbind, strsplit(as.character(MyDf), " "))
更多细节:文本有多个空格或制表符。最初的假设是,可能是额外的空格造成了问题,但在删除额外的空格后,它仍然是同一个问题。
请帮忙
解决方案:这要归功于 breaker 和 cSplit 函数。
# Input for the answer: the same 14 sample strings as in the question.
texts <- c(
  "concratulations successfully signed company please find attached quick guide can t figure immediately ",
  " conversation laughing services sweden",
  "p please find attached drafted budget p ",
  "p please finad attached agenda today s board meeting p ",
  "p hi nbsp p p please find attached darft meeting minutes today s meeting p ",
  "p please find attached final version minutes updated action log please let know actions done ll update excel nbsp p ",
  "p hi p p please find attached draft meeting minutes action log please provide comments end next week p p nice spring party saturday p p tuija p ",
  " p welcome team priority hope enjoy yo p ",
  "p please find attached flyer can study share p ",
  "p attached new version voice receiver p p minor change request invitation code mentioned invitation code may tell check code invitation email end alarm bell example telling new comments ",
  "comment etc front page now seemed end without warning p ",
  "p memo attached actions p ",
  "p please find attached updated board roles responsibilities made changes red document please review especially role relevant contact info prepare comment meeting wednesday nbsp p ",
  "p attached documents review please comment soonest p "
)
# cbind() on a single vector yields a one-column character matrix.
texts <- cbind(texts)
## to remove multi-white spaces
# A backslash inside an R string literal has to be doubled: "\s" is not a
# recognised escape and prevents the whole script from being parsed.
MyDf <- gsub("\\s+", " ", texts)
MyDf <- gsub("\r?\n|\r", " ", MyDf)
MyDf <- cbind(MyDf)
colnames(MyDf) <- c("Introduction")
# Plain matrix copy of the first column of `texts`, without dimnames.
# Note that it is built from the raw `texts`, not from the cleaned `MyDf`.
n <- matrix(texts[, 1], nrow = nrow(texts), ncol = ncol(texts))
library(splitstackshape)
library(data.table)
# Tokenise every element of X. A split happens at each single whitespace
# character and also right before ".", "!" or "?" (zero-width lookahead, which
# is why the pattern is evaluated with perl = TRUE).
# Returns the list produced by strsplit(): one character vector per element.
breaker <- function(X) {
  token_boundary <- "[[:space:]]|(?=[.!?])"
  strsplit(X, split = token_boundary, perl = TRUE)
}
# Tokenise every text and bind the resulting list into a single column
# named "aaa".
aaa <- cbind(aaa = breaker(n))
#############################################################################################
# Split delimited columns of a table into several columns ("wide") or
# several rows ("long").
#
# Arguments:
#   indt        Input table. Anything that is not already a data.table is
#               converted with setDT().
#   splitCols   Names, or numeric positions, of the columns to split.
#   sep         Separator: one value shared by all split columns, or one value
#               per entry of splitCols (any other length is an error).
#   direction   "wide" (default) or "long".
#   makeEqual   Whether every row is padded with NA to the same number of
#               pieces. Always forced to TRUE for "wide"; when NULL and the
#               direction is "long" it is derived from the piece counts.
#   fixed       Passed to strsplit(): TRUE treats sep as a literal string.
#   drop        For "wide": remove the original split columns when TRUE.
#   stripWhite  When TRUE, whitespace around each separator is removed before
#               splitting.
#
# Returns the reshaped table.
cSplit <- function(indt, splitCols, sep = ",", direction = "wide",
                   makeEqual = NULL, fixed = TRUE, drop = TRUE,
                   stripWhite = FALSE) {
  message("`cSplit` is now part of the 'splitstackshape' package (V1.4.0)")
  ## requires data.table >= 1.8.11
  # library() stops immediately when data.table is missing; require() would
  # only return FALSE and let the function fail later with an obscure error.
  library(data.table)
  if (!is.data.table(indt)) setDT(indt)
  if (is.numeric(splitCols)) splitCols <- names(indt)[splitCols]
  # Non-character split columns are coerced so that strsplit() accepts them.
  if (any(!vapply(indt[, splitCols, with = FALSE],
                  is.character, logical(1L)))) {
    indt[, eval(splitCols) := lapply(.SD, as.character),
         .SDcols = splitCols]
  }
  if (length(sep) == 1)
    sep <- rep(sep, length(splitCols))
  if (length(sep) != length(splitCols)) {
    stop("Verify you have entered the correct number of sep")
  }
  if (isTRUE(stripWhite)) {
    # The backslashes must be doubled: "\s" is not a valid escape in an R
    # string literal and stops the file from parsing.
    indt[, eval(splitCols) := mapply(function(x, y)
      gsub(sprintf("\\s+%s\\s+|\\s+%s|%s\\s+",
                   x, x, x), x, y),
      sep, indt[, splitCols, with = FALSE],
      SIMPLIFY = FALSE)]
  }
  # X holds, per split column, a list with the pieces of every row.
  X <- lapply(seq_along(splitCols), function(x) {
    strsplit(indt[[splitCols[x]]], split = sep[x], fixed = fixed)
  })
  if (direction == "long") {
    if (is.null(makeEqual)) {
      # Derive makeEqual from the per-row piece counts of the split columns.
      IV <- function(x, y) identical(x, y)
      makeEqual <- ifelse(Reduce(IV, rapply(X, length, how = "list")),
                          FALSE, TRUE)
    }
  } else if (direction == "wide") {
    # The wide layout always needs padding; tell the caller when an explicit
    # non-TRUE value is being overridden.
    if (!is.null(makeEqual) && !isTRUE(makeEqual)) {
      message("makeEqual specified as FALSE but set to TRUE")
    }
    makeEqual <- TRUE
  }
  if (isTRUE(makeEqual)) {
    SetUp <- lapply(seq_along(X), function(y) {
      A <- vapply(X[[y]], length, 1L)
      # Mat is a two-column (row, piece position) index; Val the flat pieces.
      list(Mat = cbind(rep(seq_along(A), A), sequence(A)),
           Val = unlist(X[[y]]))
    })
    # Widest row over all split columns decides the number of output columns.
    Ncol <- max(unlist(lapply(SetUp, function(y) y[["Mat"]][, 2]),
                       use.names = FALSE))
    X <- lapply(seq_along(SetUp), function(y) {
      # NA-filled matrix, then every piece is placed by (row, position).
      M <- matrix(NA_character_, nrow = nrow(indt), ncol = Ncol)
      M[SetUp[[y]][["Mat"]]] <- SetUp[[y]][["Val"]]
      M
    })
    if (direction == "wide") {
      # New columns are named <original column>_<position>.
      X <- lapply(seq_along(X), function(x) {
        colnames(X[[x]]) <- paste(splitCols[x],
                                  sequence(ncol(X[[x]])),
                                  sep = "_")
        X[[x]]
      })
      if (isTRUE(drop)) {
        cbind(indt, do.call(cbind, X))[, eval(splitCols) := NULL][]
      } else {
        cbind(indt, do.call(cbind, X))
      }
    } else {
      # Every input row is repeated Ncol times and receives one piece per copy.
      indt <- indt[rep(sequence(nrow(indt)), each = Ncol)]
      X <- lapply(X, function(y) as.vector(t(y)))
      indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
    }
  } else {
    # No padding: each row is repeated once per piece of the first split column.
    Rep <- vapply(X[[1]], length, integer(1L))
    indt <- indt[rep(sequence(nrow(indt)), Rep)]
    indt[, eval(splitCols) := lapply(X, unlist, use.names = FALSE)][]
  }
}
# Spread the tokens of every text over the columns of a single row, then put
# the original text in front of them.
df <- cSplit(as.data.frame(aaa), "aaa", ",")
df <- data.frame(cbind(texts, df))
######################################################################################
## Heading
Heading <- df[, 1]
## Word Table
df <- df[, 2:ncol(df)]
## first column
aaa_first <- df[, 1]
aaa_first <- cbind(aaa_first)
# Keep everything from the third character on.
# NOTE(review): presumably this removes a leading `c(` left over from turning
# each token vector into text - confirm against the cSplit() output.
# `c` is only a variable here; calls such as c(...) still reach base::c.
c <- substring(aaa_first, 3)
## last column
aaa_end <- df[, ncol(df)]
aaa_end <- cbind(aaa_end)
# Drop the final character of every value in the last column.
# NOTE(review): presumably the closing `)` of the longest row - confirm.
e <- substr(aaa_end, 1, nchar(aaa_end) - 1)
## Middle columns
# The former `3:ncol(df)-1` parses as `(3:ncol(df)) - 1`; the same range
# (second to second-to-last column) is now written out explicitly.
d <- df[, 2:(ncol(df) - 1)]
cc <- cbind(Heading, c, d, e)
## cc <- cbind( c, d, e )
cc <- data.frame(lapply(cc, as.character), stringsAsFactors = FALSE)
# fixed = TRUE matches ")" literally instead of relying on the regex engine
# tolerating an unbalanced parenthesis.
df2 <- as.data.frame(
  sapply(cc, gsub, pattern = ")", replacement = "", fixed = TRUE)
)
# df2 <- as.data.frame(sapply(df2,gsub,pattern="(",replacement=""))
# Finally remove the double quotes around the tokens.
df3 <- as.data.frame(sapply(df2, function(x) gsub("\"", "", x)))