在 R 中解析带标点符号的字符串:大型列表产生长度不等的行
Parsing strings with punctuation, in uneven rows from a large list in r
我的数据是一个要导入的词库。这个问题的不同之处在于,我需要把单词和标点符号分开。我的原始文件大约有 1,000,000 行。我已经弄清楚如何把标点符号分隔出来,但发现各句得到的词元数量不一致,较短的列会被循环补齐(recycling)到最长一列的长度。我希望停止循环补齐,改为用 NA 值填充。下面附上我要导入的数据(df)和期望的输出(output)。
#--------Upload 1st dataset and edit-------#
library("stringr")
# Three sample sentences; once tokenised they have different lengths.
sent1 <- "How did Quebec? 1 2 3"
sent2 <- "Why does valve = .245"
sent3 <- "How do I use a period (.) comma [,] and hyphen {-} to columns?"
df <- data.frame(text = c(sent1, sent2, sent3))
#--Parse the punctuation and the words from df
# Pad every punctuation character with spaces so scan() returns it as its own
# token. The backreference must be written "\\1": inside an R string literal a
# bare "\1" is the control character with octal code 1, not the captured group.
# NOTE: cbind() recycles the shorter token vectors up to the longest length,
# and this assignment replaces the data frame `df` with the resulting
# character matrix, so `df$text` is no longer available afterwards.
df <- do.call(
  cbind,
  lapply(
    gsub("([[:punct:]])", " \\1 ", df$text),
    function(x) scan(text = x, what = "", quiet = TRUE)
  )
)
这是我现在得到的结果...
> do.call(cbind, lapply(gsub("([[:punct:]])", " \\1 ",
+ df$text), function(x) scan(text = x, what = "", quiet = TRUE)))
[,1] [,2] [,3]
[1,] "How" "Why" "How"
[2,] "did" "does" "do"
[3,] "Quebec" "valve" "I"
[4,] "?" "=" "use"
[5,] "1" "." "a"
[6,] "2" "245" "period"
[7,] "3" "Why" "("
[8,] "How" "does" "."
[9,] "did" "valve" ")"
[10,] "Quebec" "=" "comma"
[11,] "?" "." "["
[12,] "1" "245" ","
[13,] "2" "Why" "]"
[14,] "3" "does" "and"
[15,] "How" "valve" "hyphen"
[16,] "did" "=" "{"
[17,] "Quebec" "." "-"
[18,] "?" "245" "}"
[19,] "1" "Why" "to"
[20,] "2" "does" "columns"
[21,] "3" "valve" "?"
这是我想要的输出...
#-------------output-------------#
# Expected result: one column of tokens per sentence. The longest sentence
# sets the row count; the shorter columns are extended with NA.
words3 <- c(
  "How", "do", "I", "use", "a", "period", "(", ".", ")", "comma",
  "[", ",", "]", "and", "hyphen", "{", "-", "}", "to", "columns", "?"
)
words1 <- c("How", "did", "Quebec", " ? ", "1", "2", "3")
words2 <- c("Why", "does", "valve", " = ", ".", "245")
# Growing a vector through `length<-` fills the new positions with NA.
length(words1) <- length(words3)
length(words2) <- length(words3)
output <- data.frame(words1, words2, words3)
输出
> data.frame(words1,words2,words3)
words1 words2 words3
1 How Why How
2 did does do
3 Quebec valve I
4 ? = use
5 1 . a
6 2 245 period
7 3 <NA> (
8 <NA> <NA> .
9 <NA> <NA> )
10 <NA> <NA> comma
11 <NA> <NA> [
12 <NA> <NA> ,
13 <NA> <NA> ]
14 <NA> <NA> and
15 <NA> <NA> hyphen
16 <NA> <NA> {
17 <NA> <NA> -
18 <NA> <NA> }
19 <NA> <NA> to
20 <NA> <NA> columns
21 <NA> <NA> ?
您可以使用 rowr
包中的 cbind.fill
和附加参数 fill = NA
library(rowr)
# Expects `df` to be the data frame with a `text` column.
# Pad each punctuation character with spaces ("\\1" is the regex
# backreference; a bare "\1" would be the control character with octal
# code 1), tokenise every sentence with scan(), then bind the token vectors
# column-wise with cbind.fill(), passing fill = NA so that shorter columns
# are padded with NA rather than recycled.
do.call(
  cbind.fill,
  c(
    lapply(
      gsub("([[:punct:]])", " \\1 ", df$text),
      function(x) scan(text = x, what = "", quiet = TRUE)
    ),
    fill = NA
  )
)
# object object object
#1 How Why How
#2 did does do
#3 Quebec valve I
#4 ? = use
#5 1 . a
#6 2 245 period
#7 3 <NA> (
#8 <NA> <NA> .
#9 <NA> <NA> )
#10 <NA> <NA> comma
#11 <NA> <NA> [
#12 <NA> <NA> ,
#13 <NA> <NA> ]
#14 <NA> <NA> and
#15 <NA> <NA> hyphen
#16 <NA> <NA> {
#17 <NA> <NA> -
#18 <NA> <NA> }
#19 <NA> <NA> to
#20 <NA> <NA> columns
#21 <NA> <NA> ?
或使用 sapply
的基础 R 选项
# Expects `df` to be the data frame with a `text` column.
# Tokenise each sentence: pad punctuation with spaces ("\\1" is the regex
# backreference; a bare "\1" is the control character with octal code 1),
# then split on whitespace with scan().
list_df <- lapply(
  gsub("([[:punct:]])", " \\1 ", df$text),
  function(x) scan(text = x, what = "", quiet = TRUE)
)
# Indexing past the end of a vector yields NA, so extracting positions
# 1..longest from every token vector pads the short ones instead of
# recycling them. seq_len() is used instead of 1:n for a safe sequence.
sapply(list_df, "[", seq_len(max(lengths(list_df))))
# [,1] [,2] [,3]
# [1,] "How" "Why" "How"
# [2,] "did" "does" "do"
# [3,] "Quebec" "valve" "I"
# [4,] "?" "=" "use"
# [5,] "1" "." "a"
# [6,] "2" "245" "period"
# [7,] "3" NA "("
# [8,] NA NA "."
# [9,] NA NA ")"
#[10,] NA NA "comma"
#[11,] NA NA "["
#......
我的数据是一个要导入的词库。这个问题的不同之处在于,我需要把单词和标点符号分开。我的原始文件大约有 1,000,000 行。我已经弄清楚如何把标点符号分隔出来,但发现各句得到的词元数量不一致,较短的列会被循环补齐(recycling)到最长一列的长度。我希望停止循环补齐,改为用 NA 值填充。下面附上我要导入的数据(df)和期望的输出(output)。
#--------Upload 1st dataset and edit-------#
library("stringr")
# Three sample sentences; once tokenised they have different lengths.
sent1 <- "How did Quebec? 1 2 3"
sent2 <- "Why does valve = .245"
sent3 <- "How do I use a period (.) comma [,] and hyphen {-} to columns?"
df <- data.frame(text = c(sent1, sent2, sent3))
#--Parse the punctuation and the words from df
# Pad every punctuation character with spaces so scan() returns it as its own
# token. The backreference must be written "\\1": inside an R string literal a
# bare "\1" is the control character with octal code 1, not the captured group.
# NOTE: cbind() recycles the shorter token vectors up to the longest length,
# and this assignment replaces the data frame `df` with the resulting
# character matrix, so `df$text` is no longer available afterwards.
df <- do.call(
  cbind,
  lapply(
    gsub("([[:punct:]])", " \\1 ", df$text),
    function(x) scan(text = x, what = "", quiet = TRUE)
  )
)
这是我现在得到的结果...
> do.call(cbind, lapply(gsub("([[:punct:]])", " \\1 ",
+ df$text), function(x) scan(text = x, what = "", quiet = TRUE)))
[,1] [,2] [,3]
[1,] "How" "Why" "How"
[2,] "did" "does" "do"
[3,] "Quebec" "valve" "I"
[4,] "?" "=" "use"
[5,] "1" "." "a"
[6,] "2" "245" "period"
[7,] "3" "Why" "("
[8,] "How" "does" "."
[9,] "did" "valve" ")"
[10,] "Quebec" "=" "comma"
[11,] "?" "." "["
[12,] "1" "245" ","
[13,] "2" "Why" "]"
[14,] "3" "does" "and"
[15,] "How" "valve" "hyphen"
[16,] "did" "=" "{"
[17,] "Quebec" "." "-"
[18,] "?" "245" "}"
[19,] "1" "Why" "to"
[20,] "2" "does" "columns"
[21,] "3" "valve" "?"
这是我想要的输出...
#-------------output-------------#
# Expected result: one column of tokens per sentence. The longest sentence
# sets the row count; the shorter columns are extended with NA.
words3 <- c(
  "How", "do", "I", "use", "a", "period", "(", ".", ")", "comma",
  "[", ",", "]", "and", "hyphen", "{", "-", "}", "to", "columns", "?"
)
words1 <- c("How", "did", "Quebec", " ? ", "1", "2", "3")
words2 <- c("Why", "does", "valve", " = ", ".", "245")
# Growing a vector through `length<-` fills the new positions with NA.
length(words1) <- length(words3)
length(words2) <- length(words3)
output <- data.frame(words1, words2, words3)
输出
> data.frame(words1,words2,words3)
words1 words2 words3
1 How Why How
2 did does do
3 Quebec valve I
4 ? = use
5 1 . a
6 2 245 period
7 3 <NA> (
8 <NA> <NA> .
9 <NA> <NA> )
10 <NA> <NA> comma
11 <NA> <NA> [
12 <NA> <NA> ,
13 <NA> <NA> ]
14 <NA> <NA> and
15 <NA> <NA> hyphen
16 <NA> <NA> {
17 <NA> <NA> -
18 <NA> <NA> }
19 <NA> <NA> to
20 <NA> <NA> columns
21 <NA> <NA> ?
您可以使用 rowr
包中的 cbind.fill
和附加参数 fill = NA
library(rowr)
# Expects `df` to be the data frame with a `text` column.
# Pad each punctuation character with spaces ("\\1" is the regex
# backreference; a bare "\1" would be the control character with octal
# code 1), tokenise every sentence with scan(), then bind the token vectors
# column-wise with cbind.fill(), passing fill = NA so that shorter columns
# are padded with NA rather than recycled.
do.call(
  cbind.fill,
  c(
    lapply(
      gsub("([[:punct:]])", " \\1 ", df$text),
      function(x) scan(text = x, what = "", quiet = TRUE)
    ),
    fill = NA
  )
)
# object object object
#1 How Why How
#2 did does do
#3 Quebec valve I
#4 ? = use
#5 1 . a
#6 2 245 period
#7 3 <NA> (
#8 <NA> <NA> .
#9 <NA> <NA> )
#10 <NA> <NA> comma
#11 <NA> <NA> [
#12 <NA> <NA> ,
#13 <NA> <NA> ]
#14 <NA> <NA> and
#15 <NA> <NA> hyphen
#16 <NA> <NA> {
#17 <NA> <NA> -
#18 <NA> <NA> }
#19 <NA> <NA> to
#20 <NA> <NA> columns
#21 <NA> <NA> ?
或使用 sapply
# Expects `df` to be the data frame with a `text` column.
# Tokenise each sentence: pad punctuation with spaces ("\\1" is the regex
# backreference; a bare "\1" is the control character with octal code 1),
# then split on whitespace with scan().
list_df <- lapply(
  gsub("([[:punct:]])", " \\1 ", df$text),
  function(x) scan(text = x, what = "", quiet = TRUE)
)
# Indexing past the end of a vector yields NA, so extracting positions
# 1..longest from every token vector pads the short ones instead of
# recycling them. seq_len() is used instead of 1:n for a safe sequence.
sapply(list_df, "[", seq_len(max(lengths(list_df))))
# [,1] [,2] [,3]
# [1,] "How" "Why" "How"
# [2,] "did" "does" "do"
# [3,] "Quebec" "valve" "I"
# [4,] "?" "=" "use"
# [5,] "1" "." "a"
# [6,] "2" "245" "period"
# [7,] "3" NA "("
# [8,] NA NA "."
# [9,] NA NA ")"
#[10,] NA NA "comma"
#[11,] NA NA "["
#......