如何计算各个列中的字符串
how to count strings across various columns
我有这样的数据
# Example data: one row per accession. An empty string marks "no value" in
# the three condition columns (FirstCon, SeconCon, Upcond).
df <- data.frame(
  Accession = c(
    "Washington", "DC", "NY", "TM", "VA", "UTSAW",
    "UTDFS", "FLOR", "HYTAS", "HUTT", "ITA", "BELI"
  ),
  FirstCon = c(
    "UNS", "UNS", "UNS", "UNS", "UNS", "",
    "UNS", "UNS", "UNS", "UNS", "UNS", "UNS"
  ),
  SeconCon = c(
    "", "", "", "UNS", "", "",
    "", "", "", "UTN", "UTN", "UTN"
  ),
  Upcond = c(
    "", "", "", "UNS", "", "UNS",
    "", "", "", "", "", "UTBTS"
  ),
  stringsAsFactors = FALSE
)
看起来像下面这样
    Accession FirstCon SeconCon Upcond
1  Washington      UNS
2          DC      UNS
3          NY      UNS
4          TM      UNS      UNS    UNS
5          VA      UNS
6       UTSAW                      UNS
7       UTDFS      UNS
8        FLOR      UNS
9       HYTAS      UNS
10       HUTT      UNS      UTN
11        ITA      UNS      UTN
12       BELI      UNS      UTN  UTBTS
我想要这样的输出
FirstCon SeconCon Upcond FirstConSeconCon FirstConUpcond SeconConUpcond
11 4 3 4 2 2
FirstConSeconConUpcond
2
基本上,它显示每一列中有多少个非空字符串,以及在每种列组合中有多少行同时含有字符串
例如
Accession FirstCon SeconCon
4 TM **UNS UNS**
10 HUTT **UNS UTN**
11 ITA **UNS UTN**
12 BELI **UNS UTN**
FirstConSeconCon 是 4,因为有四行在 FirstCon 和 SeconCon 两列中
同时含有字符串(两列的字符串不必相同,只要同一行的这两列都非空,该行就会被计数)
据我所知,这是一个解决方案:
# Example data: one row per accession. An empty string marks "no value" in
# the three condition columns (FirstCon, SeconCon, Upcond).
df <- data.frame(
  Accession = c(
    "Washington", "DC", "NY", "TM", "VA", "UTSAW",
    "UTDFS", "FLOR", "HYTAS", "HUTT", "ITA", "BELI"
  ),
  FirstCon = c(
    "UNS", "UNS", "UNS", "UNS", "UNS", "",
    "UNS", "UNS", "UNS", "UNS", "UNS", "UNS"
  ),
  SeconCon = c(
    "", "", "", "UNS", "", "",
    "", "", "", "UTN", "UTN", "UTN"
  ),
  Upcond = c(
    "", "", "", "UNS", "", "UNS",
    "", "", "", "", "", "UTBTS"
  ),
  stringsAsFactors = FALSE
)
# Count the rows of `df` in which every column holds a non-empty string.
#
# @param df A data frame (typically a subset of columns) whose cells are
#   strings; "" and NA both count as "no value".
# @return A single integer: the number of rows with a value in every column.
#   A zero-row data frame yields 0L. A zero-column data frame yields
#   nrow(df), since "all of no columns" is vacuously true.
occurence <- function(df) {
  # Start from "every row qualifies" and AND in one column at a time. This
  # replaces the 1:nrow(df) loop, which iterates over c(1, 0) when `df` has
  # no rows and therefore miscounts.
  filled <- Reduce(
    function(acc, col) {
      col <- as.character(col)
      # nzchar(NA) is TRUE by default, so NA has to be excluded explicitly.
      acc & !is.na(col) & nzchar(col)
    },
    as.list(df),
    rep(TRUE, nrow(df))
  )
  sum(filled)
}
# Enumerate every non-empty combination of the value columns (everything
# after the first column of `df`) and count, via occurence(), the rows in
# which all columns of the combination hold a string. `res` ends up as a
# named integer vector; each name is the combination's column names pasted
# together (e.g. "FirstConSeconCon").
value_cols <- names(df)[-1]
# seq_along() yields no sizes at all when there are no value columns,
# whereas 1:length(...) would iterate over c(1, 0).
col_sets <- do.call(
  "c",
  lapply(
    seq_along(value_cols),
    function(k) combn(value_cols, k, simplify = FALSE)
  )
)
# Build names and counts in one pass each instead of growing two vectors
# with append() inside nested loops.
nm <- vapply(col_sets, paste, character(1), collapse = "")
res <- vapply(
  col_sets,
  # drop = FALSE keeps a one-column selection as a data frame.
  function(sel) occurence(df[, sel, drop = FALSE]),
  integer(1)
)
names(res) <- nm
res
#> FirstCon SeconCon Upcond
#> 11 4 3
#> FirstConSeconCon FirstConUpcond SeconConUpcond
#> 4 2 2
#> FirstConSeconConUpcond
#> 2
由 reprex package (v2.0.1)
于 2022-06-03 创建
这是一种使用 base R 且不使用循环的方法:
# Loop-free base R version: counts the non-missing values per column and,
# for every combination of two or more columns, the rows in which all of
# the selected columns are present.
# Note: this recodes empty strings in `df` to NA in place.
df[df == ""] <- NA
cols <- names(df)[-1]
# Logical matrix (rows x value columns): TRUE where a value is present.
# drop = FALSE keeps the 2-d shape even with a single value column.
present <- !is.na(df[, cols, drop = FALSE])
# Combination sizes 2..length(cols); empty (rather than c(2, 1)) when there
# are fewer than two value columns.
sizes <- seq_len(length(cols))[-1]
combins <- do.call(
  "c",
  lapply(sizes, function(k) combn(cols, k, simplify = FALSE))
)
# A row counts for a combination when every selected column is present.
combin_vals <- vapply(
  combins,
  function(sel) sum(rowSums(present[, sel, drop = FALSE]) == length(sel)),
  integer(1)
)
names(combin_vals) <- vapply(combins, paste, character(1), collapse = "")
c(colSums(present), combin_vals)
FirstCon SeconCon Upcond FirstConSeconCon
11 4 3 4
FirstConUpcond SeconConUpcond FirstConSeconConUpcond
2 2 2
我有这样的数据
# Example data: one row per accession. An empty string marks "no value" in
# the three condition columns (FirstCon, SeconCon, Upcond).
df <- data.frame(
  Accession = c(
    "Washington", "DC", "NY", "TM", "VA", "UTSAW",
    "UTDFS", "FLOR", "HYTAS", "HUTT", "ITA", "BELI"
  ),
  FirstCon = c(
    "UNS", "UNS", "UNS", "UNS", "UNS", "",
    "UNS", "UNS", "UNS", "UNS", "UNS", "UNS"
  ),
  SeconCon = c(
    "", "", "", "UNS", "", "",
    "", "", "", "UTN", "UTN", "UTN"
  ),
  Upcond = c(
    "", "", "", "UNS", "", "UNS",
    "", "", "", "", "", "UTBTS"
  ),
  stringsAsFactors = FALSE
)
看起来像下面这样
    Accession FirstCon SeconCon Upcond
1  Washington      UNS
2          DC      UNS
3          NY      UNS
4          TM      UNS      UNS    UNS
5          VA      UNS
6       UTSAW                      UNS
7       UTDFS      UNS
8        FLOR      UNS
9       HYTAS      UNS
10       HUTT      UNS      UTN
11        ITA      UNS      UTN
12       BELI      UNS      UTN  UTBTS
我想要这样的输出
FirstCon SeconCon Upcond FirstConSeconCon FirstConUpcond SeconConUpcond
11 4 3 4 2 2
FirstConSeconConUpcond
2
基本上,它显示每一列中有多少个非空字符串,以及在每种列组合中有多少行同时含有字符串
例如
Accession FirstCon SeconCon
4 TM **UNS UNS**
10 HUTT **UNS UTN**
11 ITA **UNS UTN**
12 BELI **UNS UTN**
FirstConSeconCon 是 4,因为有四行在 FirstCon 和 SeconCon 两列中
同时含有字符串(两列的字符串不必相同,只要同一行的这两列都非空,该行就会被计数)
据我所知,这是一个解决方案:
# Example data: one row per accession. An empty string marks "no value" in
# the three condition columns (FirstCon, SeconCon, Upcond).
df <- data.frame(
  Accession = c(
    "Washington", "DC", "NY", "TM", "VA", "UTSAW",
    "UTDFS", "FLOR", "HYTAS", "HUTT", "ITA", "BELI"
  ),
  FirstCon = c(
    "UNS", "UNS", "UNS", "UNS", "UNS", "",
    "UNS", "UNS", "UNS", "UNS", "UNS", "UNS"
  ),
  SeconCon = c(
    "", "", "", "UNS", "", "",
    "", "", "", "UTN", "UTN", "UTN"
  ),
  Upcond = c(
    "", "", "", "UNS", "", "UNS",
    "", "", "", "", "", "UTBTS"
  ),
  stringsAsFactors = FALSE
)
# Count the rows of `df` in which every column holds a non-empty string.
#
# @param df A data frame (typically a subset of columns) whose cells are
#   strings; "" and NA both count as "no value".
# @return A single integer: the number of rows with a value in every column.
#   A zero-row data frame yields 0L. A zero-column data frame yields
#   nrow(df), since "all of no columns" is vacuously true.
occurence <- function(df) {
  # Start from "every row qualifies" and AND in one column at a time. This
  # replaces the 1:nrow(df) loop, which iterates over c(1, 0) when `df` has
  # no rows and therefore miscounts.
  filled <- Reduce(
    function(acc, col) {
      col <- as.character(col)
      # nzchar(NA) is TRUE by default, so NA has to be excluded explicitly.
      acc & !is.na(col) & nzchar(col)
    },
    as.list(df),
    rep(TRUE, nrow(df))
  )
  sum(filled)
}
# Enumerate every non-empty combination of the value columns (everything
# after the first column of `df`) and count, via occurence(), the rows in
# which all columns of the combination hold a string. `res` ends up as a
# named integer vector; each name is the combination's column names pasted
# together (e.g. "FirstConSeconCon").
value_cols <- names(df)[-1]
# seq_along() yields no sizes at all when there are no value columns,
# whereas 1:length(...) would iterate over c(1, 0).
col_sets <- do.call(
  "c",
  lapply(
    seq_along(value_cols),
    function(k) combn(value_cols, k, simplify = FALSE)
  )
)
# Build names and counts in one pass each instead of growing two vectors
# with append() inside nested loops.
nm <- vapply(col_sets, paste, character(1), collapse = "")
res <- vapply(
  col_sets,
  # drop = FALSE keeps a one-column selection as a data frame.
  function(sel) occurence(df[, sel, drop = FALSE]),
  integer(1)
)
names(res) <- nm
res
#> FirstCon SeconCon Upcond
#> 11 4 3
#> FirstConSeconCon FirstConUpcond SeconConUpcond
#> 4 2 2
#> FirstConSeconConUpcond
#> 2
由 reprex package (v2.0.1)
于 2022-06-03 创建
这是一种使用 base R 且不使用循环的方法:
# Loop-free base R version: counts the non-missing values per column and,
# for every combination of two or more columns, the rows in which all of
# the selected columns are present.
# Note: this recodes empty strings in `df` to NA in place.
df[df == ""] <- NA
cols <- names(df)[-1]
# Logical matrix (rows x value columns): TRUE where a value is present.
# drop = FALSE keeps the 2-d shape even with a single value column.
present <- !is.na(df[, cols, drop = FALSE])
# Combination sizes 2..length(cols); empty (rather than c(2, 1)) when there
# are fewer than two value columns.
sizes <- seq_len(length(cols))[-1]
combins <- do.call(
  "c",
  lapply(sizes, function(k) combn(cols, k, simplify = FALSE))
)
# A row counts for a combination when every selected column is present.
combin_vals <- vapply(
  combins,
  function(sel) sum(rowSums(present[, sel, drop = FALSE]) == length(sel)),
  integer(1)
)
names(combin_vals) <- vapply(combins, paste, character(1), collapse = "")
c(colSums(present), combin_vals)
FirstCon SeconCon Upcond FirstConSeconCon
11 4 3 4
FirstConUpcond SeconConUpcond FirstConSeconConUpcond
2 2 2