检查 non-unique 个字符的字符串模式

Check string pattern for non-unique characters

我有一个包含两列的数据框:idgradelist

gradelist 列中的值包括不同长度的成绩列表(由 ; 分隔)。

这是数据:

id <- seq(1,7)
gradelist <- c("a;b;b",
            "c;c",
            "d;d;d;f",
            "f;f;f;f;f;f",
            "a;a;a;a",
            "f;b;b;b;b;b;b;b",
            "c;c;d;d;a;a")

df <- data.frame(id, gradelist)
df$gradelist <- as.character(df$gradelist)

我需要再添加一个cloumn来检查是否所有成绩都是每个id的smae。

输出如下:

我们可以提取字符并检查 n_distinct 发现不同元素的数量是 1

library(dplyr)
library(purrr)
df %>% 
   mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), 
       ~ c("no", "yes")[1+(n_distinct(.x)==1)]))
#   id       gradelist same
#1  1           a;b;b   no
#2  2             c;c  yes
#3  3         d;d;d;f   no
#4  4     f;f;f;f;f;f  yes
#5  5         a;a;a;a  yes
#6  6 f;b;b;b;b;b;b;b   no
#7  7     c;c;d;d;a;a   no

或利用case_when

df %>% 
   mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
         case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no")))

或者另一个选项是 'gradelist' 上的 separate_rows 来扩展数据,找到 n_distinct

library(tidyr)
df %>% 
    separate_rows(gradelist) %>%
    distinct %>% 
    group_by(id) %>% 
    summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>% 
    left_join(df)

尝试:

transform(df, same = c('No', 'Yes')[grepl("^(.)\1*$", gsub(';', '', gradelist)) + 1])

输出:

  id       gradelist same
1  1           a;b;b   No
2  2             c;c  Yes
3  3         d;d;d;f   No
4  4     f;f;f;f;f;f  Yes
5  5         a;a;a;a  Yes
6  6 f;b;b;b;b;b;b;b   No
7  7     c;c;d;d;a;a   No

您也可以选择 strsplit 方式,如下所示:

transform(df, same = c('No', 'Yes')[sapply(strsplit(gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1])

基准

我们重复字符串几次。我们还重复 df 的行,以便我们最终得到略多于 100k 行,并分配 @ThomasIsCoding 使用的函数。

df$gradelist <- sapply(df$gradelist, function(x) paste(replicate(20, x), collapse = ";"))

df <- df[rep(seq_len(nrow(df)), each = 15000), ]

f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))

我们对所有 base 函数使用 transform 来模拟 mutatetidy 解和 microbenchmark 10 次情况下的行为:

mBench <- microbenchmark::microbenchmark(

  akrun1 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"),
                                          ~ c("no", "yes")[1+(n_distinct(.x)==1)])) },
  akrun2 = { df %>% mutate(same = map_chr(str_extract_all(gradelist, "[a-z]"), ~
                                            case_when(n_distinct(.x) == 1 ~ "yes", TRUE ~ "no"))) },
  akrun3 = { df %>%
    separate_rows(gradelist) %>%
    distinct %>% 
    group_by(id) %>% 
    summarise(same = c("no", "yes")[1 + (n_distinct(gradelist) == 1)]) %>% 
    left_join(df) },
  db = { transform(df, same = sapply(gradelist, function(x) { 
    nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0}, USE.NAMES = FALSE)) },
  `M--` = { transform(df, same = factor(unlist(lapply(strsplit(gradelist, ";"), function(x) length(unique(x))))==1, labels=c("No", "Yes"))) },
  ThomasIsCoding1 = { transform(df, same = f(gradelist)) },
  ThomasIsCoding2 = { transform(df, same = sapply(regmatches(df$gradelist,gregexpr("\w",df$gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no"))) },
  arg0naut91_1 = { transform(df, same = c('No', 'Yes')[grepl("^(.)\1*$", gsub(';', '', df$gradelist)) + 1]) },
  arg0naut91_2 = { transform(df, same = c('No', 'Yes')[sapply(strsplit(df$gradelist, split = ';'), function(x) length(unique(unlist(x))) == 1) + 1]) },

  times = 10

)

结果:

Unit: seconds
            expr       min        lq      mean    median        uq       max neval
          akrun1 19.684781 19.912789 21.084244 20.646490 21.606763 24.008420    10
          akrun2 30.393006 31.066965 32.590679 31.824528 33.567449 37.780535    10
          akrun3  6.378463  7.190472  7.379439  7.373730  7.704365  8.321929    10
              db  3.738271  3.785858  3.935769  3.911479  3.926385  4.523876    10
             M--  3.551592  3.648720  3.723315  3.741075  3.798664  3.915588    10
 ThomasIsCoding1  4.453528  4.498858  4.702160  4.613088  4.823517  5.379984    10
 ThomasIsCoding2  3.368358  3.532593  3.752111  3.610664  3.773345  4.969414    10
    arg0naut91_1  1.638212  1.683986  1.699327  1.704614  1.716077  1.759059    10
    arg0naut91_2  3.665604  3.739662  3.774542  3.750144  3.774753  4.071887    10

剧情:

检查哪个字符排在首位,并将所有出现的该字符替换为空字符串。如果什么都没有留下,那就意味着所有的字符都是一样的。

sapply(df$gradelist, function(x) {
    nchar(gsub(paste0(substring(x, 1, 1), "|;"), "", x)) == 0
}, USE.NAMES = FALSE)
#[1] FALSE  TRUE FALSE  TRUE  TRUE FALSE FALSE
df$same <- factor(unlist(lapply(strsplit(df$g, ";"), function(x) 
                                    length(unique(x))))==1, labels=c("No", "Yes"))

df
#>   id       gradelist same
#> 1  1           a;b;b   No
#> 2  2             c;c  Yes
#> 3  3         d;d;d;f   No
#> 4  4     f;f;f;f;f;f  Yes
#> 5  5         a;a;a;a  Yes
#> 6  6 f;b;b;b;b;b;b;b   No
#> 7  7     c;c;d;d;a;a   No

这里有一些基本的 R 解决方案。

  • 定义您的自定义函数 f,即
f <- Vectorize(function(x) ifelse(length(unique(unlist(strsplit(x,";"))))==1,"yes","no"))

然后您可以通过

添加列same
df$same <- f(df$gradelist)
  • 使用 regmatches + sapply
df <- within(df,same <- sapply(regmatches(gradelist,gregexpr("\w",gradelist)),function(x) ifelse(length(unique(x))==1,"yes","no")))

这样

> df
  id       gradelist same
1  1           a;b;b   no
2  2             c;c  yes
3  3         d;d;d;f   no
4  4     f;f;f;f;f;f  yes
5  5         a;a;a;a  yes
6  6 f;b;b;b;b;b;b;b   no
7  7     c;c;d;d;a;a   no