使用 stringr 查找出现在另一个单词附近的单词
find word near another using stringr
我有一个简单的问题,考虑这个例子
library(dplyr)
library(stringr)

# Two example sentences; only the first contains both "Whosebug" and "good".
# tibble() replaces the deprecated data_frame() constructor and is
# re-exported by dplyr, so no extra package needs to be attached.
dataframe <- tibble(
  mytext = c(
    "Whosebug is pretty good my friend",
    "but sometimes pretty bad as well"
  )
)
# A tibble: 2 x 1
mytext
<chr>
1 Whosebug is pretty good my friend
2 but sometimes pretty bad as well
我想统计 Whosebug 出现在 good 附近的次数。我使用了以下正则表达式,但它不起作用。
# First attempt: allow up to five "words" between "Whosebug" and "good".
# It counts 0 for every row because `\w+` matches word characters only, so
# the pattern has no way to consume the spaces that separate the words.
# Backslashes are doubled because an R string literal needs `\\` to pass a
# single `\` through to the regex engine; a bare '\w' does not even parse.
dataframe %>%
  mutate(
    mycount = str_count(
      mytext,
      regex("Whosebug(?:\\w+){0,5}good", ignore_case = TRUE)
    )
  )
# A tibble: 2 x 2
mytext mycount
<chr> <int>
1 Whosebug is pretty good my friend 0
2 but sometimes pretty bad as well 0
有人能告诉我我在这里缺少什么吗?
谢谢!
我在这方面也遇到了不少麻烦,而且仍然不确定为什么我之前尝试的写法不起作用。我的正则表达式水平只能算还可以,并非专家。不过,我用后行断言(lookbehind)和先行断言(lookahead)让它正常工作了。
library(dplyr)
library(stringr)

# Test sentences: a near match, no match, a match that is too far apart
# (six words in between), and an adjacent match.
# tibble() replaces the deprecated data_frame() constructor.
dataframe <- tibble(
  mytext = c(
    "Whosebug is pretty good my friend",
    "but sometimes pretty bad as well",
    "Whosebug one two three four five six good",
    "Whosebug good"
  )
)
dataframe

# Match the gap between the two words: one whitespace character right after
# "Whosebug" (lookbehind), then at most five "word + whitespace" groups,
# ending immediately before "good" (lookahead).
# Backslashes are doubled so that R passes `\s` and `\w` to the regex engine.
near_pattern <- regex(
  "(?<=Whosebug)\\s(?:\\w+\\s){0,5}(?=good)",
  ignore_case = TRUE
)

dataframe %>%
  mutate(mycount = str_count(mytext, near_pattern))
## A tibble: 4 x 2
# mytext mycount
# <chr> <int>
#1 Whosebug is pretty good my friend 1
#2 but sometimes pretty bad as well 0
#3 Whosebug one two three four five six good 0
#4 Whosebug good 1
我想我明白了
# `\\W+` requires at least one non-word character (space, punctuation) right
# after "Whosebug"; each of the up-to-five intermediate words must then be
# followed by a single space before "good".
# Backslashes are doubled so that R passes `\W` and `\w` to the regex engine.
dataframe %>%
  mutate(
    mycount = str_count(
      mytext,
      regex("Whosebug\\W+(?:\\w+ ){0,5}good", ignore_case = TRUE)
    )
  )
# A tibble: 4 x 2
mytext mycount
<chr> <int>
1 Whosebug is pretty good my friend 1
2 but sometimes pretty bad as well 0
3 Whosebug good good Whosebug 1
4 Whosebuggood 0
关键是添加 \W+ 元字符,它匹配两个单词之间的任何非单词字符(例如空格和标点)。
corpus 包使这变得非常简单:
library(corpus)

# Example sentences held in a plain data frame.
dataframe <- data.frame(
  mytext = c(
    "Whosebug is pretty good my friend",
    "but sometimes pretty bad as well"
  )
)

# Locate every occurrence of "Whosebug" in the texts.
loc <- text_locate(dataframe$mytext, "Whosebug")

# Flag each occurrence that has "good" among the last four tokens of its
# `before` context or the first four tokens of its `after` context.
good_before <- text_detect(text_sub(loc$before, -4, -1), "good")
good_after <- text_detect(text_sub(loc$after, 1, 4), "good")
near_good <- good_before | good_after

# Sum the flags per source text; groups with no occurrence are filled with 0.
count <- tapply(near_good, loc$text, sum, default = 0)
从概念上讲,corpus 将文本视为一系列标记(token)。该库允许您使用 text_sub() 函数对这些序列进行索引。您还可以使用 text_filter() 更改标记的定义。
这是一个以相同方式工作但忽略标点符号的示例:
# Same proximity count as before, this time on a corpus frame whose text
# filter is set to drop punctuation.
corpus <- corpus_frame(
  text = c(
    "Whosebug, is pretty (?) GOOD my friend!",
    "But sometimes pretty bad as well"
  )
)
text_filter(corpus)$drop_punct <- TRUE

# Locate every occurrence of "Whosebug" in the corpus.
loc <- text_locate(corpus, "Whosebug")

# Flag each occurrence that has "good" among the last four tokens of its
# `before` context or the first four tokens of its `after` context.
good_before <- text_detect(text_sub(loc$before, -4, -1), "good")
good_after <- text_detect(text_sub(loc$after, 1, 4), "good")
near_good <- good_before | good_after

# Sum the flags per source text; groups with no occurrence are filled with 0.
count <- tapply(near_good, loc$text, sum, default = 0)
我有一个简单的问题,考虑这个例子
library(dplyr)
library(stringr)

# Two example sentences; only the first contains both "Whosebug" and "good".
# tibble() replaces the deprecated data_frame() constructor and is
# re-exported by dplyr, so no extra package needs to be attached.
dataframe <- tibble(
  mytext = c(
    "Whosebug is pretty good my friend",
    "but sometimes pretty bad as well"
  )
)
# A tibble: 2 x 1
mytext
<chr>
1 Whosebug is pretty good my friend
2 but sometimes pretty bad as well
我想统计 Whosebug 出现在 good 附近的次数。我使用了以下正则表达式,但它不起作用。
# First attempt: allow up to five "words" between "Whosebug" and "good".
# It counts 0 for every row because `\w+` matches word characters only, so
# the pattern has no way to consume the spaces that separate the words.
# Backslashes are doubled because an R string literal needs `\\` to pass a
# single `\` through to the regex engine; a bare '\w' does not even parse.
dataframe %>%
  mutate(
    mycount = str_count(
      mytext,
      regex("Whosebug(?:\\w+){0,5}good", ignore_case = TRUE)
    )
  )
# A tibble: 2 x 2
mytext mycount
<chr> <int>
1 Whosebug is pretty good my friend 0
2 but sometimes pretty bad as well 0
有人能告诉我我在这里缺少什么吗?
谢谢!
我在这方面也遇到了不少麻烦,而且仍然不确定为什么我之前尝试的写法不起作用。我的正则表达式水平只能算还可以,并非专家。不过,我用后行断言(lookbehind)和先行断言(lookahead)让它正常工作了。
library(dplyr)
library(stringr)

# Test sentences: a near match, no match, a match that is too far apart
# (six words in between), and an adjacent match.
# tibble() replaces the deprecated data_frame() constructor.
dataframe <- tibble(
  mytext = c(
    "Whosebug is pretty good my friend",
    "but sometimes pretty bad as well",
    "Whosebug one two three four five six good",
    "Whosebug good"
  )
)
dataframe

# Match the gap between the two words: one whitespace character right after
# "Whosebug" (lookbehind), then at most five "word + whitespace" groups,
# ending immediately before "good" (lookahead).
# Backslashes are doubled so that R passes `\s` and `\w` to the regex engine.
near_pattern <- regex(
  "(?<=Whosebug)\\s(?:\\w+\\s){0,5}(?=good)",
  ignore_case = TRUE
)

dataframe %>%
  mutate(mycount = str_count(mytext, near_pattern))
## A tibble: 4 x 2
# mytext mycount
# <chr> <int>
#1 Whosebug is pretty good my friend 1
#2 but sometimes pretty bad as well 0
#3 Whosebug one two three four five six good 0
#4 Whosebug good 1
我想我明白了
# `\\W+` requires at least one non-word character (space, punctuation) right
# after "Whosebug"; each of the up-to-five intermediate words must then be
# followed by a single space before "good".
# Backslashes are doubled so that R passes `\W` and `\w` to the regex engine.
dataframe %>%
  mutate(
    mycount = str_count(
      mytext,
      regex("Whosebug\\W+(?:\\w+ ){0,5}good", ignore_case = TRUE)
    )
  )
# A tibble: 4 x 2
mytext mycount
<chr> <int>
1 Whosebug is pretty good my friend 1
2 but sometimes pretty bad as well 0
3 Whosebug good good Whosebug 1
4 Whosebuggood 0
关键是添加 \W+ 元字符,它匹配两个单词之间的任何非单词字符(例如空格和标点)。
corpus 包使这变得非常简单:
library(corpus)

# Example sentences held in a plain data frame.
dataframe <- data.frame(
  mytext = c(
    "Whosebug is pretty good my friend",
    "but sometimes pretty bad as well"
  )
)

# Locate every occurrence of "Whosebug" in the texts.
loc <- text_locate(dataframe$mytext, "Whosebug")

# Flag each occurrence that has "good" among the last four tokens of its
# `before` context or the first four tokens of its `after` context.
good_before <- text_detect(text_sub(loc$before, -4, -1), "good")
good_after <- text_detect(text_sub(loc$after, 1, 4), "good")
near_good <- good_before | good_after

# Sum the flags per source text; groups with no occurrence are filled with 0.
count <- tapply(near_good, loc$text, sum, default = 0)
从概念上讲,corpus 将文本视为一系列标记(token)。该库允许您使用 text_sub() 函数对这些序列进行索引。您还可以使用 text_filter() 更改标记的定义。
这是一个以相同方式工作但忽略标点符号的示例:
# Same proximity count as before, this time on a corpus frame whose text
# filter is set to drop punctuation.
corpus <- corpus_frame(
  text = c(
    "Whosebug, is pretty (?) GOOD my friend!",
    "But sometimes pretty bad as well"
  )
)
text_filter(corpus)$drop_punct <- TRUE

# Locate every occurrence of "Whosebug" in the corpus.
loc <- text_locate(corpus, "Whosebug")

# Flag each occurrence that has "good" among the last four tokens of its
# `before` context or the first four tokens of its `after` context.
good_before <- text_detect(text_sub(loc$before, -4, -1), "good")
good_after <- text_detect(text_sub(loc$after, 1, 4), "good")
near_good <- good_before | good_after

# Sum the flags per source text; groups with no occurrence are filled with 0.
count <- tapply(near_good, loc$text, sum, default = 0)