检查 non-unique 个字符的字符串模式
Check string pattern for non-unique characters
我有一个包含两列的数据框:id
和 gradelist
。
gradelist
列中的值包括不同长度的成绩列表(由 ;
分隔)。
这是数据:
id <- seq(1,7)
gradelist <- c("a;b;b",
"c;c",
"d;d;d;f",
"f;f;f;f;f;f",
"a;a;a;a",
"f;b;b;b;b;b;b;b",
"c;c;d;d;a;a")
df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)
我需要再添加一个cloumn来检查是否所有成绩都是每个id的smae。
输出如下:
我们可以提取字符并检查 n_distinct
发现不同元素的数量是 1
library(dplyr)
library(purrr)
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)]))
# id gradelist same
#1 1 a;b;b no
#2 2 c;c yes
#3 3 d;d;d;f no
#4 4 f;f;f;f;f;f yes
#5 5 a;a;a;a yes
#6 6 f;b;b;b;b;b;b;b no
#7 7 c;c;d;d;a;a no
或利用case_when
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))
或者另一个选项是 'gradelist' 上的 separate_rows
来扩展数据,找到 n_distinct
library(tidyr)
df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df)
尝试:
transform(df, same = c('No', 'Yes')[grepl("^(.)\1*$", gsub(';', '', gradelist)) + 1])
输出:
id gradelist same
1 1 a;b;b No
2 2 c;c Yes
3 3 d;d;d;f No
4 4 f;f;f;f;f;f Yes
5 5 a;a;a;a Yes
6 6 f;b;b;b;b;b;b;b No
7 7 c;c;d;d;a;a No
您也可以选择 strsplit
方式,如下所示:
transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])
基准
我们重复字符串几次。我们还重复 df
的行,以便我们最终得到略多于 100k 行,并分配 @ThomasIsCoding 使用的函数。
df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))
df <- df[rep(seq_len(nrow(df)), each = 15000), ]
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
我们对所有 base
函数使用 transform
来模拟 mutate
在 tidy
解和 microbenchmark
10 次情况下的行为:
mBench <- microbenchmark::microbenchmark(
akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
akrun3 = { df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df) },
db = { transform(df, same = sapply(gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
`M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\1*$", gsub(';', '', df$gradelist)) + 1]) },
arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },
times = 10
)
结果:
Unit: seconds
expr min lq mean median uq max neval
akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420 10
akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535 10
akrun3 6.378463 7.190472 7.379439 7.373730 7.704365 8.321929 10
db 3.738271 3.785858 3.935769 3.911479 3.926385 4.523876 10
M-- 3.551592 3.648720 3.723315 3.741075 3.798664 3.915588 10
ThomasIsCoding1 4.453528 4.498858 4.702160 4.613088 4.823517 5.379984 10
ThomasIsCoding2 3.368358 3.532593 3.752111 3.610664 3.773345 4.969414 10
arg0naut91_1 1.638212 1.683986 1.699327 1.704614 1.716077 1.759059 10
arg0naut91_2 3.665604 3.739662 3.774542 3.750144 3.774753 4.071887 10
剧情:
检查哪个字符排在首位,并将所有出现的该字符替换为空字符串。如果什么都没有留下,那就意味着所有的字符都是一样的。
sapply(df$gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE
df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x)
length(unique(x))))==1, labels=c("No", "Yes"))
df
#> id gradelist same
#> 1 1 a;b;b No
#> 2 2 c;c Yes
#> 3 3 d;d;d;f No
#> 4 4 f;f;f;f;f;f Yes
#> 5 5 a;a;a;a Yes
#> 6 6 f;b;b;b;b;b;b;b No
#> 7 7 c;c;d;d;a;a No
这里有一些基本的 R 解决方案。
- 定义您的自定义函数
f
,即
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
然后您可以通过
添加列same
df$same <- f(df$gradelist)
- 使用
regmatches
+ sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))
这样
> df
id gradelist same
1 1 a;b;b no
2 2 c;c yes
3 3 d;d;d;f no
4 4 f;f;f;f;f;f yes
5 5 a;a;a;a yes
6 6 f;b;b;b;b;b;b;b no
7 7 c;c;d;d;a;a no
我有一个包含两列的数据框:id
和 gradelist
。
gradelist
列中的值包括不同长度的成绩列表(由 ;
分隔)。
这是数据:
id <- seq(1,7)
gradelist <- c("a;b;b",
"c;c",
"d;d;d;f",
"f;f;f;f;f;f",
"a;a;a;a",
"f;b;b;b;b;b;b;b",
"c;c;d;d;a;a")
df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)
我需要再添加一个cloumn来检查是否所有成绩都是每个id的smae。
输出如下:
我们可以提取字符并检查 n_distinct
发现不同元素的数量是 1
library(dplyr)
library(purrr)
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)]))
# id gradelist same
#1 1 a;b;b no
#2 2 c;c yes
#3 3 d;d;d;f no
#4 4 f;f;f;f;f;f yes
#5 5 a;a;a;a yes
#6 6 f;b;b;b;b;b;b;b no
#7 7 c;c;d;d;a;a no
或利用case_when
df %>%
mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))
或者另一个选项是 'gradelist' 上的 separate_rows
来扩展数据,找到 n_distinct
library(tidyr)
df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df)
尝试:
transform(df, same = c('No', 'Yes')[grepl("^(.)\1*$", gsub(';', '', gradelist)) + 1])
输出:
id gradelist same
1 1 a;b;b No
2 2 c;c Yes
3 3 d;d;d;f No
4 4 f;f;f;f;f;f Yes
5 5 a;a;a;a Yes
6 6 f;b;b;b;b;b;b;b No
7 7 c;c;d;d;a;a No
您也可以选择 strsplit
方式,如下所示:
transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])
基准
我们重复字符串几次。我们还重复 df
的行,以便我们最终得到略多于 100k 行,并分配 @ThomasIsCoding 使用的函数。
df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))
df <- df[rep(seq_len(nrow(df)), each = 15000), ]
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
我们对所有 base
函数使用 transform
来模拟 mutate
在 tidy
解和 microbenchmark
10 次情况下的行为:
mBench <- microbenchmark::microbenchmark(
akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
akrun3 = { df %>%
separate_rows(gradelist) %>%
distinct %>%
group_by(id) %>%
summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>%
left_join(df) },
db = { transform(df, same = sapply(gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
`M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\1*$", gsub(';', '', df$gradelist)) + 1]) },
arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },
times = 10
)
结果:
Unit: seconds
expr min lq mean median uq max neval
akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420 10
akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535 10
akrun3 6.378463 7.190472 7.379439 7.373730 7.704365 8.321929 10
db 3.738271 3.785858 3.935769 3.911479 3.926385 4.523876 10
M-- 3.551592 3.648720 3.723315 3.741075 3.798664 3.915588 10
ThomasIsCoding1 4.453528 4.498858 4.702160 4.613088 4.823517 5.379984 10
ThomasIsCoding2 3.368358 3.532593 3.752111 3.610664 3.773345 4.969414 10
arg0naut91_1 1.638212 1.683986 1.699327 1.704614 1.716077 1.759059 10
arg0naut91_2 3.665604 3.739662 3.774542 3.750144 3.774753 4.071887 10
剧情:
检查哪个字符排在首位,并将所有出现的该字符替换为空字符串。如果什么都没有留下,那就意味着所有的字符都是一样的。
sapply(df$gradelist, function(x) {
nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE TRUE FALSE TRUE TRUE FALSE FALSE
df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x)
length(unique(x))))==1, labels=c("No", "Yes"))
df
#> id gradelist same
#> 1 1 a;b;b No
#> 2 2 c;c Yes
#> 3 3 d;d;d;f No
#> 4 4 f;f;f;f;f;f Yes
#> 5 5 a;a;a;a Yes
#> 6 6 f;b;b;b;b;b;b;b No
#> 7 7 c;c;d;d;a;a No
这里有一些基本的 R 解决方案。
- 定义您的自定义函数
f
,即
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))
然后您可以通过
添加列same
df$same <- f(df$gradelist)
- 使用
regmatches
+sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))
这样
> df
id gradelist same
1 1 a;b;b no
2 2 c;c yes
3 3 d;d;d;f no
4 4 f;f;f;f;f;f yes
5 5 a;a;a;a yes
6 6 f;b;b;b;b;b;b;b no
7 7 c;c;d;d;a;a no