Create keyword column with dictionary discarding longer matches
I'm using tokens_lookup to check whether certain texts contain words from my dictionary, and to discard words that are nested within longer word patterns by matching with nested_scope = "dictionary", as described in this post. The idea is to discard the longer dictionary matches that contain the nested target word (e.g. include Ireland but not Northern Ireland).
Now I would like to:
(1) Create a dummy variable indicating whether a text contains a word from the dictionary. I managed to do it with the code below, but I don't understand why I have to write IE in lower case in as.logical.
library("quanteda")
library("dplyr")
library("tidyr")

df <- structure(list(num = c(2345, 3564, 3636), text = c("Ireland lorem ipsum", "Lorem ipsum Northern
Ireland", "Ireland lorem ipsum Northern Ireland")), row.names = c(NA, -3L),
class = c("tbl_df", "tbl", "data.frame"))
dict <- dictionary(list(IE = "Ireland", "Northern Ireland" = "Northern Ireland"),
                   tolower = FALSE)
corpus <- corpus(df, text_field = "text")
toks <- tokens(corpus)
dfm <- tokens_lookup(toks, dictionary = dict, nested_scope = "dictionary", case_insensitive = FALSE) %>%
  tokens_remove("Northern Ireland") %>%
  dfm()
df$contains <- as.logical(dfm[, "ie"], case_insensitive = FALSE)
(2) Use kwic to store the matches in another column. Is there a way to exclude the dictionary key (Northern Ireland in the example) from the kwic matches? With my attempt below I get a keyword column containing matches for both Ireland and Northern Ireland. (I don't know whether it makes any difference, but in my full dataset I have multiple matches per row.) Thanks.
words <- kwic(toks, pattern = dict, case_insensitive = FALSE)
df$docname = dfm@Dimnames[["docs"]]
df_keywords <- merge(df, words[ , c("docname", "keyword")], by = 'docname', all.x = TRUE)
df_keywords <- df_keywords %>% group_by(docname, num) %>%
mutate(n = row_number()) %>%
pivot_wider(id_cols = c(docname, num, text, contains),
values_from = keyword, names_from = n, names_prefix = 'keyword')
You can do it like this:
df <- structure(list(
num = c(2345, 3564, 3636),
text = c("Ireland lorem ipsum", "Lorem ipsum Northern
Ireland", "Ireland lorem ipsum Northern Ireland")
),
row.names = c(NA, -3L),
class = c("tbl_df", "tbl", "data.frame")
)
library("quanteda")
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 12 of 12 threads used.
## See https://quanteda.io for tutorials and examples.
dict <- dictionary(list(IE = "Ireland", "Northern Ireland" = "Northern Ireland"),
tolower = FALSE
)
corpus <- corpus(df, text_field = "text", docid_field = "num")
toks <- tokens(corpus)
Here you need to flip tolower = FALSE in the dfm() call, otherwise it will lowercase the keys produced by tokens_lookup().
dfmat <- tokens_lookup(toks, dict, nested_scope = "dictionary", case_insensitive = FALSE) %>%
dfm(tolower = FALSE)
dfmat
## Document-feature matrix of: 3 documents, 2 features (33.33% sparse) and 0 docvars.
## features
## docs IE Northern Ireland
## 2345 1 0
## 3564 0 1
## 3636 1 1
df$contains_Ireland <- as.logical(dfmat[, "IE"])
df
## # A tibble: 3 × 3
## num text contains_Ireland
## <dbl> <chr> <lgl>
## 1 2345 "Ireland lorem ipsum" TRUE
## 2 3564 "Lorem ipsum Northern\nIreland" FALSE
## 3 3636 "Ireland lorem ipsum Northern Ireland" TRUE
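As a quick illustration of why the original attempt had to index the lowercased key "ie": with dfm()'s default tolower = TRUE, the keys created by tokens_lookup() are lowercased before they become features. A minimal check, building on the objects above:
# with the default tolower = TRUE, dfm() lowercases the keys,
# so "IE" becomes "ie" and "Northern Ireland" becomes "northern ireland"
featnames(dfm(tokens_lookup(toks, dict, nested_scope = "dictionary", case_insensitive = FALSE)))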
For part 2, we have not implemented nested matching for kwic(). But you could search for "Ireland" and then exclude the matches in which "Northern" occurs immediately before it:
words <- kwic(toks, pattern = "Ireland", case_insensitive = FALSE, window = 2) %>%
as.data.frame() %>%
  # removes the matches on the IE value "Ireland" nested within "Northern Ireland"
dplyr::filter(!stringr::str_detect(pre, "Northern$")) %>%
dplyr::mutate(num = as.numeric(docname))
words
## docname from to pre keyword post pattern num
## 1 2345 1 1 Ireland lorem ipsum Ireland 2345
## 2 3636 1 1 Ireland lorem ipsum Ireland 3636
dplyr::full_join(df, words, by = "num")
## # A tibble: 3 × 10
## num text contains_Ireland docname from to pre keyword post pattern
## <dbl> <chr> <lgl> <chr> <int> <int> <chr> <chr> <chr> <fct>
## 1 2345 "Irela… TRUE 2345 1 1 "" Ireland lore… Ireland
## 2 3564 "Lorem… FALSE <NA> NA NA <NA> <NA> <NA> <NA>
## 3 3636 "Irela… TRUE 3636 1 1 "" Ireland lore… Ireland
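If a document can have several matches, as in the full dataset, one option is to apply the same dplyr/tidyr reshaping from the question to the filtered kwic result, spreading the keywords into one column per match. A sketch building on the objects above; the keyword1, keyword2, … column names are just illustrative:
library("dplyr")
library("tidyr")

df_keywords <- words %>%
  group_by(num) %>%
  mutate(n = row_number()) %>%   # index the matches within each document
  ungroup() %>%
  pivot_wider(id_cols = num,
              values_from = keyword, names_from = n, names_prefix = "keyword") %>%
  right_join(df, by = "num")     # keep documents without any remaining match (NA keywords)
df_keywords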