如何统计各列及其组合中的非空字符串数量

how to count strings across various columns

我有这样的数据

    # Example data: one identifier column (Accession) followed by three
    # condition columns in which "" marks a cell with no value.
    df <- data.frame(
      Accession = c("Washington", "DC", "NY", "TM", "VA", "UTSAW",
                    "UTDFS", "FLOR", "HYTAS", "HUTT", "ITA", "BELI"),
      FirstCon = c("UNS", "UNS", "UNS", "UNS", "UNS", "",
                   "UNS", "UNS", "UNS", "UNS", "UNS", "UNS"),
      SeconCon = c("", "", "", "UNS", "", "",
                   "", "", "", "UTN", "UTN", "UTN"),
      Upcond = c("", "", "", "UNS", "", "UNS",
                 "", "", "", "", "", "UTBTS"),
      stringsAsFactors = FALSE  # keep every column as plain character
    )

看起来像下面这样

   Accession FirstCon SeconCon Upcond
1  Washington      UNS                
2          DC      UNS                
3          NY      UNS                
4          TM      UNS      UNS    UNS
5          VA      UNS                
6       UTSAW                      UNS
7       UTDFS      UNS                
8        FLOR      UNS                
9       HYTAS      UNS                
10       HUTT      UNS      UTN       
11        ITA      UNS      UTN       
12       BELI      UNS      UTN  UTBTS

我想要这样的输出

FirstCon SeconCon Upcond   FirstConSeconCon  FirstConUpcond  SeconConUpcond
 11        4        3           4                  2              2
FirstConSeconConUpcond
2

基本上,它显示每一列中有多少个非空字符串,以及对于每一种列组合,有多少行在这些列中同时含有字符串

例如

Accession FirstCon SeconCon 
    4          TM      **UNS      UNS**    
    10       HUTT      **UNS      UTN**       
    11        ITA      **UNS      UTN**       
    12       BELI      **UNS      UTN**  

FirstConSeconCon 是 4,因为有四行在 FirstCon 中为 UNS,并且同一行的 SeconCon 中也有字符串(两列中的字符串不必相同,只要同一行的两列都有字符串,该行就会被计数)

按照我的理解,这里有一个解决方案:

# Example data: one identifier column (Accession) followed by three
# condition columns in which "" marks a cell with no value.
df <- data.frame(
  Accession = c("Washington", "DC", "NY", "TM", "VA", "UTSAW",
                "UTDFS", "FLOR", "HYTAS", "HUTT", "ITA", "BELI"),
  FirstCon = c("UNS", "UNS", "UNS", "UNS", "UNS", "",
               "UNS", "UNS", "UNS", "UNS", "UNS", "UNS"),
  SeconCon = c("", "", "", "UNS", "", "",
               "", "", "", "UTN", "UTN", "UTN"),
  Upcond = c("", "", "", "UNS", "", "UNS",
             "", "", "", "", "", "UTBTS"),
  stringsAsFactors = FALSE  # keep every column as plain character
)
# Count the rows of `df` in which every column holds a non-empty string.
#
# Args:
#   df: a data frame; each column is converted with as.character() before
#       it is checked.
#
# Returns:
#   An integer scalar: the number of rows where no cell is "" or NA.
#   A data frame with zero rows yields 0L.
occurence <- function(df) {
  # One logical vector per column: TRUE where the cell holds a real string.
  # NA is treated as empty so it cannot turn the row test into NA.
  filled <- lapply(df, function(column) {
    column <- as.character(column)
    !is.na(column) & nzchar(column)
  })

  # A row counts only if all of its cells are filled. Starting the fold from
  # an all-TRUE vector keeps the result length equal to nrow(df), including
  # for zero-row input.
  complete_rows <- Reduce(`&`, filled, rep(TRUE, nrow(df)))

  sum(complete_rows)
}

# Count, for every non-empty combination of the value columns (all columns
# except the first), the rows in which all columns of that combination are
# filled. Combinations are ordered by size: single columns first, then
# pairs, and so on.
value_cols <- names(df)[-1]

# Flat list of character vectors, one per column combination.
# simplify = FALSE makes combn() return a list instead of a matrix.
col_sets <- unlist(
  lapply(
    seq_along(value_cols),
    function(k) combn(value_cols, k, simplify = FALSE)
  ),
  recursive = FALSE
)

# drop = FALSE keeps a single-column selection as a data frame, which is
# what occurence() is called with.
res <- vapply(
  col_sets,
  function(cols) occurence(df[, cols, drop = FALSE]),
  integer(1)
)

# Name each count after the concatenated column names of its combination.
nm <- vapply(col_sets, paste0, character(1), collapse = "")
names(res) <- nm

res
#>               FirstCon               SeconCon                 Upcond 
#>                     11                      4                      3 
#>       FirstConSeconCon         FirstConUpcond         SeconConUpcond 
#>                      4                      2                      2 
#> FirstConSeconConUpcond 
#>                      2

reprex package (v2.0.1)

于 2022-06-03 创建

这是一种使用 base R 且不需要循环的方法:

# Recode empty strings as NA so that is.na() marks cells without a value.
df[df == ""] <- NA

# Value columns: every column except the first one.
cols <- names(df)[-1]

# All combinations of two or more value columns, as a flat list of character
# vectors. seq_along(cols)[-1] is 2, 3, ..., length(cols) and is empty when
# there is a single column, whereas 2:length(cols) would count down to
# 2, 1 and make combn() fail.
combins <- do.call(
  "c",
  lapply(seq_along(cols)[-1], function(x) combn(cols, x, FUN = list))
)

# For each combination, the number of rows with no NA in any of its columns.
combin_vals <- vapply(
  combins,
  function(x) sum(rowSums(is.na(df[, x, drop = FALSE])) == 0),
  integer(1)
)
names(combin_vals) <- vapply(combins, paste, character(1), collapse = "")

# Per-column counts of filled cells, followed by the combination counts.
# drop = FALSE keeps a data frame even when there is only one value column,
# which colSums() requires.
c(colSums(!is.na(df[, cols, drop = FALSE])), combin_vals)
          FirstCon               SeconCon                 Upcond       FirstConSeconCon 
                11                      4                      3                      4 
    FirstConUpcond         SeconConUpcond FirstConSeconConUpcond 
                 2                      2                      2