计算字符串中每个元素的每次出现
Count each occurrence of each element in a string
我想计算字符串中每个字符的出现次数并将它们输出到数据框。
这是我的尝试:
# Asker's attempt: a one-row data frame holding the string to analyse.
q.data<-data.frame(number=1,string=c("COUNTTHESECHARACTERS"))
# Count a single character with string.counter(), which is not defined in
# this snippet -- presumably a helper or package function; TODO confirm.
q.data[,3]<-string.counter(strings=q.data$string, pattern="A")
# NOTE(review): this assigns to column 3 again, so the "A" count written on
# the previous line is overwritten by the "B" count.
q.data[,3]<-string.counter(strings=q.data$string, pattern="B")
我想得到类似这样的输出:
# Sketch of the desired result: a "string" column followed by one column per
# character ("..." stands for the remaining characters).
x <- c("string","C","O","U","...")
# NOTE(review): df is not created in this snippet; it is assumed to exist.
colnames(df) <- x
# First row: the input string, then the count of each character.
df[1,] <- c("COUNTTHESECHARACTERS","3","1","1","...")
df
我在您的示例中添加了另一行以增加更多变化。这应该非常有效:
library(tibble)
library(purrr)
library(dplyr)
library(stringr)
# Example input: the same text in upper and lower case, so the
# case-insensitive matching below is visible in the result.
q_data <- tibble(
  number = 1:2,
  string = c("COUNTTHESECHARACTERS", "countthesecharacters")
)

# Build one row of letter counts per string. map_df() already row-binds the
# per-string tibbles, so no separate bind_rows() step is needed.
tmp_data <- map_df(q_data$string, function(s) {
  # str_count() is vectorised over the pattern: one count per letter A-Z.
  counts <- t(str_count(s, fixed(LETTERS, ignore_case = TRUE)))
  # t() yields a 1 x 26 matrix without column names: convert first, then name.
  counts <- as_tibble(counts, .name_repair = "minimal")
  colnames(counts) <- LETTERS
  counts
})

# Original columns followed by the A-Z count columns.
q_data_new <- cbind(q_data, tmp_data)
q_data_new
#> number string A B C D E F G H I J K L M N O P Q R S T U V
#> 1 1 COUNTTHESECHARACTERS 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
#> 2 2 countthesecharacters 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
#> W X Y Z
#> 1 0 0 0 0
#> 2 0 0 0 0
于 2019-10-18 由 reprex package (v0.3.0) 创建
如果您查看 stringr 中的 ?str_count,您会看到更多可能对您的情况有用的选项。
更新
我只是从其他答案中才意识到,您要做的是计算字符串的所有元素,而不仅仅是字母。在这种情况下,您基本上是在寻找文档特征矩阵:
library(quanteda)
# Character-level document-feature matrix: one row per string, one column
# per distinct character (dfm() lower-cases features by default).
tmp <- q_data$string %>%
  tokens(what = "character", remove_separators = FALSE) %>%
  dfm() %>%
  convert(to = "data.frame") %>%
  # Drop the leading document-id column by position: it is named `document`
  # in older quanteda releases and `doc_id` in newer ones.
  select(-1) %>%
  # Sort the character columns by name; order() already returns plain column
  # positions, so no noquote() wrapper is needed.
  select(order(colnames(.))) %>%
  as_tibble() # tibble only for easier comparison with the other results

# Original columns followed by the per-character count columns.
q_data_new <- cbind(q_data, tmp)
q_data_new
这比答案中已经给出的两个选项还要快得多。基准测试:
# Benchmark input: 2000 rows, each holding a random string of length 20.
q_data <- tibble(number = 1:2000, string = stringi::stri_rand_strings(2000, 20))
#' Count characters per string with stringr (benchmark candidate).
#'
#' @param q_data Data frame with a character column `string`.
#' @param pattern Characters to count; matching ignores case. Defaults to the
#'   digits 0-9 and the lower-case letters.
#' @return `q_data` with one numeric count column per element of `pattern`
#'   appended.
stringr <- function(q_data, pattern = c(0:9, letters)) {
  # map_df() row-binds the one-row tibbles itself, so no extra bind_rows()
  # step is needed on its result.
  tmp_data <- map_df(q_data$string, function(s) {
    # One count per element of `pattern`, as a 1 x length(pattern) matrix.
    counts <- t(str_count(s, fixed(pattern, ignore_case = TRUE)))
    counts <- as_tibble(counts, .name_repair = "minimal")
    colnames(counts) <- pattern
    counts
  }) %>%
    # Convert the integer counts to double.
    mutate_if(is.integer, as.numeric)

  bind_cols(q_data, tmp_data)
}
#' Count characters per string with tidytext (benchmark candidate).
#'
#' @param q_data Data frame with columns `number` and `string`.
#' @return An ungrouped tibble with `number`, `string` and one count column
#'   per character; characters that do not occur in a string get 0.
tidytext <- function(q_data) {
  by_row <- group_by(q_data, number, string)
  # One row per single character; drop = FALSE keeps the `string` column.
  chars <- unnest_tokens(
    by_row, character, string,
    token = "characters", drop = FALSE
  )
  tallies <- count(chars, number, character)
  # Add rows for letters that never occur so every a-z column exists.
  all_letters <- complete(tallies, character = letters)
  wide <- spread(all_letters, character, n, fill = 0)
  ungroup(wide)
}
#' Count characters per string with quanteda (benchmark candidate).
#'
#' @param q_data Data frame with a character column `string`.
#' @return A data frame: `q_data` followed by one count column per distinct
#'   character, sorted by column name.
quanteda <- function(q_data) {
  tmp <- q_data$string %>%
    tokens(what = "character", remove_separators = FALSE) %>%
    dfm() %>%
    convert(to = "data.frame") %>%
    # Drop the leading document-id column by position: it is named `document`
    # in older quanteda releases and `doc_id` in newer ones.
    select(-1) %>%
    # order() already returns plain column positions; no noquote() needed.
    select(order(colnames(.))) %>%
    as_tibble()

  cbind(q_data, tmp)
}
结果
# Time the three implementations on the same benchmark input; the argument
# names become the labels in the `expression` column of the result.
res <- bench::mark(
stringr = stringr(q_data),
tidytext = tidytext(q_data),
quanteda = quanteda(q_data)
)
res
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 stringr 1.82s 1.82s 0.549 17.05MB 3.84
#> 2 tidytext 6.06s 6.06s 0.165 35.17MB 2.31
#> 3 quanteda 56.4ms 70.74ms 13.9 8.75MB 5.95
基本上,你想要的是把字符串按字符进行分词(tokenize),之后只需再做一些整理就能得到想要的结果。
library(dplyr)
library(tidyr)
library(tidytext)
# Two example strings, kept as character rather than factor.
q.data <- data.frame(
  number = c(1, 2),
  string = c("COUNTTHESECHARACTERS", "COUNTTHISTOO"),
  stringsAsFactors = FALSE
)

# Split every string into single characters and tally them per row.
by_row <- group_by(q.data, number, string)
chars <- unnest_tokens(
  by_row, character, string,
  token = "characters", drop = FALSE
)
tallies <- count(chars, number, character)
# Add rows for letters that never occur so every a-z column exists.
all_letters <- complete(tallies, character = letters)
# One column per letter; absent letters are filled with 0.
spread(all_letters, character, n, fill = 0)
# A tibble: 2 x 28
# Groups: number, string [2]
number string a b c d e f g h i j k
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 COUNTTHE~ 2 0 3 0 3 0 0 2 0 0 0
2 2 COUNTTHI~ 0 0 1 0 0 0 0 1 1 0 0
# ... with 15 more variables: l <dbl>, m <dbl>, n <dbl>, o <dbl>, p <dbl>, q <dbl>,
# r <dbl>, s <dbl>, t <dbl>, u <dbl>, v <dbl>, w <dbl>, x <dbl>, y <dbl>, z <dbl>
如果你想保留所有的原始大小写(即不转换为小写),那么你可以将 to_lower = FALSE 添加到 unnest_tokens() 中。
我想计算字符串中每个字符的出现次数并将它们输出到数据框。
这是我的尝试:
# Asker's attempt: a one-row data frame holding the string to analyse.
q.data<-data.frame(number=1,string=c("COUNTTHESECHARACTERS"))
# Count a single character with string.counter(), which is not defined in
# this snippet -- presumably a helper or package function; TODO confirm.
q.data[,3]<-string.counter(strings=q.data$string, pattern="A")
# NOTE(review): this assigns to column 3 again, so the "A" count written on
# the previous line is overwritten by the "B" count.
q.data[,3]<-string.counter(strings=q.data$string, pattern="B")
我想得到类似这样的输出:
# Sketch of the desired result: a "string" column followed by one column per
# character ("..." stands for the remaining characters).
x <- c("string","C","O","U","...")
# NOTE(review): df is not created in this snippet; it is assumed to exist.
colnames(df) <- x
# First row: the input string, then the count of each character.
df[1,] <- c("COUNTTHESECHARACTERS","3","1","1","...")
df
我在您的示例中添加了另一行以增加更多变化。这应该非常有效:
library(tibble)
library(purrr)
library(dplyr)
library(stringr)
# Example input: the same text in upper and lower case, so the
# case-insensitive matching below is visible in the result.
q_data <- tibble(
  number = 1:2,
  string = c("COUNTTHESECHARACTERS", "countthesecharacters")
)

# Build one row of letter counts per string. map_df() already row-binds the
# per-string tibbles, so no separate bind_rows() step is needed.
tmp_data <- map_df(q_data$string, function(s) {
  # str_count() is vectorised over the pattern: one count per letter A-Z.
  counts <- t(str_count(s, fixed(LETTERS, ignore_case = TRUE)))
  # t() yields a 1 x 26 matrix without column names: convert first, then name.
  counts <- as_tibble(counts, .name_repair = "minimal")
  colnames(counts) <- LETTERS
  counts
})

# Original columns followed by the A-Z count columns.
q_data_new <- cbind(q_data, tmp_data)
q_data_new
#> number string A B C D E F G H I J K L M N O P Q R S T U V
#> 1 1 COUNTTHESECHARACTERS 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
#> 2 2 countthesecharacters 2 0 3 0 3 0 0 2 0 0 0 0 0 1 1 0 0 2 2 3 1 0
#> W X Y Z
#> 1 0 0 0 0
#> 2 0 0 0 0
于 2019-10-18 由 reprex package (v0.3.0) 创建
如果您查看 stringr 中的 ?str_count,您会看到更多可能对您的情况有用的选项。
更新
我只是从其他答案中才意识到,您要做的是计算字符串的所有元素,而不仅仅是字母。在这种情况下,您基本上是在寻找文档特征矩阵:
library(quanteda)
# Character-level document-feature matrix: one row per string, one column
# per distinct character (dfm() lower-cases features by default).
tmp <- q_data$string %>%
  tokens(what = "character", remove_separators = FALSE) %>%
  dfm() %>%
  convert(to = "data.frame") %>%
  # Drop the leading document-id column by position: it is named `document`
  # in older quanteda releases and `doc_id` in newer ones.
  select(-1) %>%
  # Sort the character columns by name; order() already returns plain column
  # positions, so no noquote() wrapper is needed.
  select(order(colnames(.))) %>%
  as_tibble() # tibble only for easier comparison with the other results

# Original columns followed by the per-character count columns.
q_data_new <- cbind(q_data, tmp)
q_data_new
这比答案中已经给出的两个选项还要快得多。基准测试:
# Benchmark input: 2000 rows, each holding a random string of length 20.
q_data <- tibble(number = 1:2000, string = stringi::stri_rand_strings(2000, 20))
#' Count characters per string with stringr (benchmark candidate).
#'
#' @param q_data Data frame with a character column `string`.
#' @param pattern Characters to count; matching ignores case. Defaults to the
#'   digits 0-9 and the lower-case letters.
#' @return `q_data` with one numeric count column per element of `pattern`
#'   appended.
stringr <- function(q_data, pattern = c(0:9, letters)) {
  # map_df() row-binds the one-row tibbles itself, so no extra bind_rows()
  # step is needed on its result.
  tmp_data <- map_df(q_data$string, function(s) {
    # One count per element of `pattern`, as a 1 x length(pattern) matrix.
    counts <- t(str_count(s, fixed(pattern, ignore_case = TRUE)))
    counts <- as_tibble(counts, .name_repair = "minimal")
    colnames(counts) <- pattern
    counts
  }) %>%
    # Convert the integer counts to double.
    mutate_if(is.integer, as.numeric)

  bind_cols(q_data, tmp_data)
}
#' Count characters per string with tidytext (benchmark candidate).
#'
#' @param q_data Data frame with columns `number` and `string`.
#' @return An ungrouped tibble with `number`, `string` and one count column
#'   per character; characters that do not occur in a string get 0.
tidytext <- function(q_data) {
  by_row <- group_by(q_data, number, string)
  # One row per single character; drop = FALSE keeps the `string` column.
  chars <- unnest_tokens(
    by_row, character, string,
    token = "characters", drop = FALSE
  )
  tallies <- count(chars, number, character)
  # Add rows for letters that never occur so every a-z column exists.
  all_letters <- complete(tallies, character = letters)
  wide <- spread(all_letters, character, n, fill = 0)
  ungroup(wide)
}
#' Count characters per string with quanteda (benchmark candidate).
#'
#' @param q_data Data frame with a character column `string`.
#' @return A data frame: `q_data` followed by one count column per distinct
#'   character, sorted by column name.
quanteda <- function(q_data) {
  tmp <- q_data$string %>%
    tokens(what = "character", remove_separators = FALSE) %>%
    dfm() %>%
    convert(to = "data.frame") %>%
    # Drop the leading document-id column by position: it is named `document`
    # in older quanteda releases and `doc_id` in newer ones.
    select(-1) %>%
    # order() already returns plain column positions; no noquote() needed.
    select(order(colnames(.))) %>%
    as_tibble()

  cbind(q_data, tmp)
}
结果
# Time the three implementations on the same benchmark input; the argument
# names become the labels in the `expression` column of the result.
res <- bench::mark(
stringr = stringr(q_data),
tidytext = tidytext(q_data),
quanteda = quanteda(q_data)
)
res
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 stringr 1.82s 1.82s 0.549 17.05MB 3.84
#> 2 tidytext 6.06s 6.06s 0.165 35.17MB 2.31
#> 3 quanteda 56.4ms 70.74ms 13.9 8.75MB 5.95
基本上,你想要的是把字符串按字符进行分词(tokenize),之后只需再做一些整理就能得到想要的结果。
library(dplyr)
library(tidyr)
library(tidytext)
# Two example strings, kept as character rather than factor.
q.data <- data.frame(
  number = c(1, 2),
  string = c("COUNTTHESECHARACTERS", "COUNTTHISTOO"),
  stringsAsFactors = FALSE
)

# Split every string into single characters and tally them per row.
by_row <- group_by(q.data, number, string)
chars <- unnest_tokens(
  by_row, character, string,
  token = "characters", drop = FALSE
)
tallies <- count(chars, number, character)
# Add rows for letters that never occur so every a-z column exists.
all_letters <- complete(tallies, character = letters)
# One column per letter; absent letters are filled with 0.
spread(all_letters, character, n, fill = 0)
# A tibble: 2 x 28
# Groups: number, string [2]
number string a b c d e f g h i j k
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 COUNTTHE~ 2 0 3 0 3 0 0 2 0 0 0
2 2 COUNTTHI~ 0 0 1 0 0 0 0 1 1 0 0
# ... with 15 more variables: l <dbl>, m <dbl>, n <dbl>, o <dbl>, p <dbl>, q <dbl>,
# r <dbl>, s <dbl>, t <dbl>, u <dbl>, v <dbl>, w <dbl>, x <dbl>, y <dbl>, z <dbl>
如果你想保留所有的原始大小写(即不转换为小写),那么你可以将 to_lower = FALSE 添加到 unnest_tokens() 中。