在 R 中为列表中的标点符号添加空格并强制转换为 data.frame
Add punctuation spacing to a list and coerce it to a data.frame in R
我有一个单词和标点符号库。我正在尝试用它制作一个数据框,以便以后使用。原始数据集有 2,000,000 行带标点符号的文本,但它是一个列表。我无法把列表中的标点符号与其余单词分开解析。我想在单词和每个标点符号之间加上空格。我可以在 Excel 中通过查找替换轻松地做到这一点,但是想在 R 中做。我有一个名为 df 的示例,我希望在 R 中得到的输出名为 output。我在下面附上了代码和我目前的进展。我尝试用 str_split 按 "How" 拆分,但它把 "How " 删掉了,只返回了空字符串 ""。
# ---- Example input -------------------------------------------------------
library(stringr)

# Three sample sentences that mix words, numbers and punctuation.
sent1 <- "How did Quebec? 1 2 3"
sent2 <- "Why does valve = .245? .66"
sent3 <- "How do I use a period (.) comma [,] and hyphen {-} to columns?"

# One-column character matrix (column name "text") holding the sentences.
df <- as.matrix(data.frame(text = c(sent1, sent2, sent3)))

# Attempt so far: split on single spaces only, so punctuation stays glued
# to the neighbouring word.
str_split(string = df, pattern = " ")

# ---- Desired output ------------------------------------------------------
# One token vector per sentence, with punctuation as separate tokens.
words1 <- c("How", "did", "Quebec", " ? ", "1", "2", "3")
words2 <- c("Why", "does", "valve", " = ", ".245", "?", ".66")
words3 <- c(
  "How", "do", "I", "use", "a", "period", "(", ".", ")",
  "comma", "[", ",", "]", "and", "hyphen", "{", "-", "}",
  "to", "columns", "?"
)
# data.frame() recycles the two 7-element vectors to the 21 rows of words3.
output <- data.frame(words1, words2, words3)
这是完成工作的粗略概念:
首先在所有非单词字符处进行拆分(受另一个答案启发)。然后取最大长度,把其他的都循环填充成一样的长度。
# Split every sentence into word and punctuation tokens, then recycle the
# shorter token vectors so that all of them can sit side by side as columns.
#
# Backslashes must be doubled inside R string literals: "\\w" reaches the
# regex engine as \w and "\\1" as the back-reference \1. A single "\w" is a
# parse error and a single "\1" is the control character U+0001.
# Every non-word character is wrapped in "~" so strsplit() can cut around it.
# NOTE(review): "~" is the sentinel, so a literal "~" in the text is dropped.
dfsplt <- strsplit(gsub("([^\\w])", "~\\1~", df, perl = TRUE), "~")
# Discard the empty strings and lone spaces produced by the split.
dfsplt <- lapply(dfsplt, function(x) x[!x %in% c("", " ")])
# Length of the longest token vector; every column is recycled up to it.
n <- max(lengths(dfsplt))
# Variant 1: repeat each vector often enough, then truncate to n tokens.
sapply(dfsplt, function(x) {
  x <- rep(x, ceiling(n / length(x)))
  x[seq_len(n)]
})
# or
# Variant 2: index cyclically with modular arithmetic (same result).
sapply(dfsplt, function(x) x[(seq_len(n) - 1) %% length(x) + 1])
[,1] [,2] [,3]
[1,] "How" "Why" "How"
[2,] "did" "does" "do"
[3,] "Quebec" "valve" "I"
[4,] "?" "=" "use"
[5,] "1" "." "a"
[6,] "2" "245" "period"
[7,] "3" "?" "("
[8,] "How" "." "."
[9,] "did" "66" ")"
[10,] "Quebec" "Why" "comma"
[11,] "?" "does" "["
[12,] "1" "valve" ","
[13,] "2" "=" "]"
[14,] "3" "." "and"
[15,] "How" "245" "hyphen"
[16,] "did" "?" "{"
[17,] "Quebec" "." "-"
[18,] "?" "66" "}"
[19,] "1" "Why" "to"
[20,] "2" "does" "columns"
[21,] "3" "valve" "?"
这是另一种做法:先在每个标点字符两侧插入空格,然后对每个句子分别用 scan 读取
# Put a space on both sides of every punctuation character, read each
# sentence as whitespace-separated tokens, and bind the token vectors as
# columns (cbind() recycles the shorter ones).
#
# "\\1" is the back-reference to the captured character; an un-doubled "\1"
# in an R string literal is the control character U+0001 instead.
# df[, "text", drop = TRUE] works whether df is still a data.frame or has
# already been converted with as.matrix(); `$` fails on a matrix.
# quote = "" stops scan() from treating ' and " in the text as quote marks.
spaced <- gsub("([[:punct:]])", " \\1 ", df[, "text", drop = TRUE])
do.call(cbind, lapply(spaced, function(x) {
  scan(text = x, what = "", quiet = TRUE, quote = "")
}))
# [,1] [,2] [,3]
# [1,] "How" "Why" "How"
# [2,] "did" "does" "do"
# [3,] "Quebec" "valve" "I"
# [4,] "?" "=" "use"
# [5,] "1" "." "a"
# [6,] "2" "245" "period"
# [7,] "3" "?" "("
# [8,] "How" "." "."
# [9,] "did" "66" ")"
#[10,] "Quebec" "Why" "comma"
#[11,] "?" "does" "["
#[12,] "1" "valve" ","
#[13,] "2" "=" "]"
#[14,] "3" "." "and"
#[15,] "How" "245" "hyphen"
#[16,] "did" "?" "{"
#[17,] "Quebec" "." "-"
#[18,] "?" "66" "}"
#[19,] "1" "Why" "to"
#[20,] "2" "does" "columns"
#[21,] "3" "valve" "?"
我有一个单词和标点符号库。我正在尝试用它制作一个数据框,以便以后使用。原始数据集有 2,000,000 行带标点符号的文本,但它是一个列表。我无法把列表中的标点符号与其余单词分开解析。我想在单词和每个标点符号之间加上空格。我可以在 Excel 中通过查找替换轻松地做到这一点,但是想在 R 中做。我有一个名为 df 的示例,我希望在 R 中得到的输出名为 output。我在下面附上了代码和我目前的进展。我尝试用 str_split 按 "How" 拆分,但它把 "How " 删掉了,只返回了空字符串 ""。
# ---- Example input -------------------------------------------------------
library(stringr)

# Three sample sentences that mix words, numbers and punctuation.
sent1 <- "How did Quebec? 1 2 3"
sent2 <- "Why does valve = .245? .66"
sent3 <- "How do I use a period (.) comma [,] and hyphen {-} to columns?"

# One-column character matrix (column name "text") holding the sentences.
df <- as.matrix(data.frame(text = c(sent1, sent2, sent3)))

# Attempt so far: split on single spaces only, so punctuation stays glued
# to the neighbouring word.
str_split(string = df, pattern = " ")

# ---- Desired output ------------------------------------------------------
# One token vector per sentence, with punctuation as separate tokens.
words1 <- c("How", "did", "Quebec", " ? ", "1", "2", "3")
words2 <- c("Why", "does", "valve", " = ", ".245", "?", ".66")
words3 <- c(
  "How", "do", "I", "use", "a", "period", "(", ".", ")",
  "comma", "[", ",", "]", "and", "hyphen", "{", "-", "}",
  "to", "columns", "?"
)
# data.frame() recycles the two 7-element vectors to the 21 rows of words3.
output <- data.frame(words1, words2, words3)
这是完成工作的粗略概念:
首先在所有非单词字符处进行拆分(受另一个答案启发)。然后取最大长度,把其他的都循环填充成一样的长度。
# Split every sentence into word and punctuation tokens, then recycle the
# shorter token vectors so that all of them can sit side by side as columns.
#
# Backslashes must be doubled inside R string literals: "\\w" reaches the
# regex engine as \w and "\\1" as the back-reference \1. A single "\w" is a
# parse error and a single "\1" is the control character U+0001.
# Every non-word character is wrapped in "~" so strsplit() can cut around it.
# NOTE(review): "~" is the sentinel, so a literal "~" in the text is dropped.
dfsplt <- strsplit(gsub("([^\\w])", "~\\1~", df, perl = TRUE), "~")
# Discard the empty strings and lone spaces produced by the split.
dfsplt <- lapply(dfsplt, function(x) x[!x %in% c("", " ")])
# Length of the longest token vector; every column is recycled up to it.
n <- max(lengths(dfsplt))
# Variant 1: repeat each vector often enough, then truncate to n tokens.
sapply(dfsplt, function(x) {
  x <- rep(x, ceiling(n / length(x)))
  x[seq_len(n)]
})
# or
# Variant 2: index cyclically with modular arithmetic (same result).
sapply(dfsplt, function(x) x[(seq_len(n) - 1) %% length(x) + 1])
[,1] [,2] [,3]
[1,] "How" "Why" "How"
[2,] "did" "does" "do"
[3,] "Quebec" "valve" "I"
[4,] "?" "=" "use"
[5,] "1" "." "a"
[6,] "2" "245" "period"
[7,] "3" "?" "("
[8,] "How" "." "."
[9,] "did" "66" ")"
[10,] "Quebec" "Why" "comma"
[11,] "?" "does" "["
[12,] "1" "valve" ","
[13,] "2" "=" "]"
[14,] "3" "." "and"
[15,] "How" "245" "hyphen"
[16,] "did" "?" "{"
[17,] "Quebec" "." "-"
[18,] "?" "66" "}"
[19,] "1" "Why" "to"
[20,] "2" "does" "columns"
[21,] "3" "valve" "?"
这是另一种做法:先在每个标点字符两侧插入空格,然后对每个句子分别用 scan 读取
# Put a space on both sides of every punctuation character, read each
# sentence as whitespace-separated tokens, and bind the token vectors as
# columns (cbind() recycles the shorter ones).
#
# "\\1" is the back-reference to the captured character; an un-doubled "\1"
# in an R string literal is the control character U+0001 instead.
# df[, "text", drop = TRUE] works whether df is still a data.frame or has
# already been converted with as.matrix(); `$` fails on a matrix.
# quote = "" stops scan() from treating ' and " in the text as quote marks.
spaced <- gsub("([[:punct:]])", " \\1 ", df[, "text", drop = TRUE])
do.call(cbind, lapply(spaced, function(x) {
  scan(text = x, what = "", quiet = TRUE, quote = "")
}))
# [,1] [,2] [,3]
# [1,] "How" "Why" "How"
# [2,] "did" "does" "do"
# [3,] "Quebec" "valve" "I"
# [4,] "?" "=" "use"
# [5,] "1" "." "a"
# [6,] "2" "245" "period"
# [7,] "3" "?" "("
# [8,] "How" "." "."
# [9,] "did" "66" ")"
#[10,] "Quebec" "Why" "comma"
#[11,] "?" "does" "["
#[12,] "1" "valve" ","
#[13,] "2" "=" "]"
#[14,] "3" "." "and"
#[15,] "How" "245" "hyphen"
#[16,] "did" "?" "{"
#[17,] "Quebec" "." "-"
#[18,] "?" "66" "}"
#[19,] "1" "Why" "to"
#[20,] "2" "does" "columns"
#[21,] "3" "valve" "?"