字符串包含标点符号的字符串匹配
String matching where strings contain punctuation
我想使用 grepl() 查找不区分大小写的匹配项。
我想在数据框 df 的 Text 列中找到以下关键字列表:
# There is a long list of words, but for simplification I have provided only a subset.
I, I'm, the, and, to, a, of
我想为每个数据行分别计算这些单词的数量。
我将要在代码中使用的单词列表定义为:
# Regex patterns for the keywords, one per counter column.
# The backslash has to be doubled inside an R string literal: "\\b" reaches
# the regex engine as the word-boundary escape \b, whereas a single-backslash
# "\b" is a literal backspace character and never matches ordinary text.
word_list <- c(
  "\\bI\\b", "\\bthe\\b", "\\band\\b", "\\bto\\b", "\\ba\\b", "\\bof\\b"
)
# Note that I'm is not currently in this word_list
我在数据框 df 中添加了如下几列,用来保存上述单词的计数:
# Add one zero-initialised counter column per keyword, in this order.
# IM is the counter for "I'm" -- this is where I need help.
df[c("I", "IM", "THE", "AND", "TO", "A", "OF")] <- 0
然后我对单词列表中的每个单词使用以下 for 循环来遍历所需列的每一行。
# For each pattern in word_list, mark the rows of df$Text that contain it.
# The matching counter cell is incremented by one; the earlier version
# assigned the cell back to itself, so the counters never left 0.
# grepl() only reports presence, so a row scores at most 1 per pattern.
for (i in seq_along(word_list)) {
  # grepl() is vectorised over the text, so no explicit loop over rows is
  # needed to search each row of text response
  hits <- grepl(word_list[i], df$Text, ignore.case = TRUE)
  # 4 is added to go to the specific column
  # NOTE(review): this assumes the counter columns start at column 5 of df and
  # follow the order of word_list. In the dput(df) sample shown in this
  # document the I counter is column 4 and the order differs -- confirm the
  # offset against the real column layout.
  df[hits, i + 4] <- df[hits, i + 4] + 1
}
对于可重现的示例 dput(df) 如下:
# Reproducible example: dput(df) prints the structure() call below, which
# rebuilds the data frame. It has 10 rows; Text is a factor, and the counter
# columns present are I, IM, AND, THE, TO and OF (all zero) -- there is no A
# column in this sample.
dput(df)
structure(list(cluster3 = c(2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L), userID = c(3016094L, 3042038L, 3079341L, 3079396L, 3130832L, 3130864L, 3148118L, 3148914L, 3149040L, 3150222L), Text = structure(c(3L, 4L, 2L, 9L, 6L, 10L, 7L, 1L, 5L, 8L), .Label = c("I'm alright","I'm stressed", "I am a good person.", "I don't care", "I have a difficult task", "I like it", "I think it doesn't matter", "Let's not argue about this", "Let's see if I can run", "No, I'm not in a mood"), class = "factor"), I = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), IM = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AND = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), THE = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), TO = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), OF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -10L))
把含有撇号的表达式放在双引号中,我的代码就可以工作了:
# Keyword patterns including "I'm". Double quotes are used throughout so the
# apostrophe needs no escaping. The backslash is doubled because "\\b" is the
# regex word boundary, whereas a single-backslash "\b" in an R string is a
# literal backspace character and never matches ordinary text.
# The apostrophe counts as a word boundary, so the plain I pattern also
# matches the "I" of "I'm".
word_list <- c(
  "\\bI\\b", "\\bI'm\\b", "\\bthe\\b", "\\band\\b", "\\bto\\b", "\\ba\\b",
  "\\bof\\b"
)
我建议采用更精简的方法:
## use a named vector for the word patterns
## with the column names you want to add to `df`
## ("\\b" is the regex word boundary; a single-backslash "\b" would be a
## literal backspace character and match nothing)
word_list <- c(
  "I" = "\\bi\\b", "THE" = "\\bthe\\b", "AND" = "\\band\\b",
  "TO" = "\\bto\\b", "A" = "\\ba\\b", "OF" = "\\bof\\b",
  ## "I'm" becomes "im" once the text is lower-cased and punctuation is
  ## stripped below; the closing boundary keeps words such as "image" from
  ## being counted
  "IM" = "\\bim\\b"
)
## use `stringr::str_count` instead of `grepl`
library(stringr)
## lower-case the text and drop punctuation once, up front
clean_text <- gsub("[[:punct:]]", "", tolower(df$Text))
## lapply does the looping; cbind gathers one column of per-row counts per
## pattern and always yields a matrix whose column names are the pattern
## names, even when `df` has a single row
results <- do.call(cbind, lapply(word_list, str_count, string = clean_text))
results
## NOTE(review): the sample output below does not correspond to the 10-row
## dput(df) example (row 1, "I am a good person.", cannot give THE = 3) --
## presumably it was produced from different text.
# I THE AND TO A OF IM
# [1,] 1 3 2 1 1 1 0
# [2,] 0 0 1 0 0 0 0
# [3,] 0 0 0 0 0 0 0
# [4,] 2 2 3 2 1 1 1
# [5,] 0 0 0 1 1 0 0
# [6,] 0 3 2 2 0 0 0
# [7,] 1 3 0 1 1 0 0
# [8,] 1 2 0 1 1 1 0
# [9,] 0 0 0 0 0 0 0
# [10,] 0 0 0 1 2 0 0
## put the results into the data frame based on the names
df[colnames(results)] <- data.frame(results)
由于我们依赖于矢量化的 str_count,因此这应该比逐行方法快。
我想使用 grepl() 查找不区分大小写的匹配项。
我想在数据框 df 的 Text 列中找到以下关键字列表:
# There is a long list of words, but for simplification I have provided only a subset.
I, I'm, the, and, to, a, of
我想为每个数据行分别计算这些单词的数量。 我将要在代码中使用的单词列表定义为:
# Regex patterns for the keywords, one per counter column.
# The backslash has to be doubled inside an R string literal: "\\b" reaches
# the regex engine as the word-boundary escape \b, whereas a single-backslash
# "\b" is a literal backspace character and never matches ordinary text.
word_list <- c(
  "\\bI\\b", "\\bthe\\b", "\\band\\b", "\\bto\\b", "\\ba\\b", "\\bof\\b"
)
# Note that I'm is not currently in this word_list
我在数据框 df 中添加了如下几列,用来保存上述单词的计数:
# Add one zero-initialised counter column per keyword, in this order.
# IM is the counter for "I'm" -- this is where I need help.
df[c("I", "IM", "THE", "AND", "TO", "A", "OF")] <- 0
然后我对单词列表中的每个单词使用以下 for 循环来遍历所需列的每一行。
# For each pattern in word_list, mark the rows of df$Text that contain it.
# The matching counter cell is incremented by one; the earlier version
# assigned the cell back to itself, so the counters never left 0.
# grepl() only reports presence, so a row scores at most 1 per pattern.
for (i in seq_along(word_list)) {
  # grepl() is vectorised over the text, so no explicit loop over rows is
  # needed to search each row of text response
  hits <- grepl(word_list[i], df$Text, ignore.case = TRUE)
  # 4 is added to go to the specific column
  # NOTE(review): this assumes the counter columns start at column 5 of df and
  # follow the order of word_list. In the dput(df) sample shown in this
  # document the I counter is column 4 and the order differs -- confirm the
  # offset against the real column layout.
  df[hits, i + 4] <- df[hits, i + 4] + 1
}
对于可重现的示例 dput(df) 如下:
# Reproducible example: dput(df) prints the structure() call below, which
# rebuilds the data frame. It has 10 rows; Text is a factor, and the counter
# columns present are I, IM, AND, THE, TO and OF (all zero) -- there is no A
# column in this sample.
dput(df)
structure(list(cluster3 = c(2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L), userID = c(3016094L, 3042038L, 3079341L, 3079396L, 3130832L, 3130864L, 3148118L, 3148914L, 3149040L, 3150222L), Text = structure(c(3L, 4L, 2L, 9L, 6L, 10L, 7L, 1L, 5L, 8L), .Label = c("I'm alright","I'm stressed", "I am a good person.", "I don't care", "I have a difficult task", "I like it", "I think it doesn't matter", "Let's not argue about this", "Let's see if I can run", "No, I'm not in a mood"), class = "factor"), I = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), IM = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), AND = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), THE = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), TO = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), OF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA, -10L))
把含有撇号的表达式放在双引号中,我的代码就可以工作了:
# Keyword patterns including "I'm". Double quotes are used throughout so the
# apostrophe needs no escaping. The backslash is doubled because "\\b" is the
# regex word boundary, whereas a single-backslash "\b" in an R string is a
# literal backspace character and never matches ordinary text.
# The apostrophe counts as a word boundary, so the plain I pattern also
# matches the "I" of "I'm".
word_list <- c(
  "\\bI\\b", "\\bI'm\\b", "\\bthe\\b", "\\band\\b", "\\bto\\b", "\\ba\\b",
  "\\bof\\b"
)
我建议采用更精简的方法:
## use a named vector for the word patterns
## with the column names you want to add to `df`
## ("\\b" is the regex word boundary; a single-backslash "\b" would be a
## literal backspace character and match nothing)
word_list <- c(
  "I" = "\\bi\\b", "THE" = "\\bthe\\b", "AND" = "\\band\\b",
  "TO" = "\\bto\\b", "A" = "\\ba\\b", "OF" = "\\bof\\b",
  ## "I'm" becomes "im" once the text is lower-cased and punctuation is
  ## stripped below; the closing boundary keeps words such as "image" from
  ## being counted
  "IM" = "\\bim\\b"
)
## use `stringr::str_count` instead of `grepl`
library(stringr)
## lower-case the text and drop punctuation once, up front
clean_text <- gsub("[[:punct:]]", "", tolower(df$Text))
## lapply does the looping; cbind gathers one column of per-row counts per
## pattern and always yields a matrix whose column names are the pattern
## names, even when `df` has a single row
results <- do.call(cbind, lapply(word_list, str_count, string = clean_text))
results
## NOTE(review): the sample output below does not correspond to the 10-row
## dput(df) example (row 1, "I am a good person.", cannot give THE = 3) --
## presumably it was produced from different text.
# I THE AND TO A OF IM
# [1,] 1 3 2 1 1 1 0
# [2,] 0 0 1 0 0 0 0
# [3,] 0 0 0 0 0 0 0
# [4,] 2 2 3 2 1 1 1
# [5,] 0 0 0 1 1 0 0
# [6,] 0 3 2 2 0 0 0
# [7,] 1 3 0 1 1 0 0
# [8,] 1 2 0 1 1 1 0
# [9,] 0 0 0 0 0 0 0
# [10,] 0 0 0 1 2 0 0
## put the results into the data frame based on the names
df[colnames(results)] <- data.frame(results)
由于我们依赖于矢量化的 str_count,因此这应该比逐行方法快。