按降序计算字母频率
Count letter frequencies in descending order
我有一个这样的数据
# Example input: a 13-row data frame with a single factor column `col`.
# Two rows hold "sp|..." header text; the remaining rows hold letter sequences.
# The row order is given as integer codes into `labels`.
df <- data.frame(
  col = factor(
    c(9L, 2L, 13L, 11L, 5L, 7L, 10L, 6L, 8L, 3L, 12L, 4L, 1L),
    levels = 1:13,
    labels = c(
      "HHRGGVCTS",
      "MGSSN",
      "MVKTTYYDVG",
      "RRHYNGAYDD",
      "RTSTN",
      "S",
      "SNCWC",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
      "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1",
      "THYDT",
      "TVHAV",
      "VCMCVVDDNR",
      "YATTA"
    )
  )
)
我正在尝试计算字母频率。我想在每一行中计算 20 个可能的字母。
例如,
- 第一行:以 `sp|` 开头,所以不计算字符频率,结果保留原始字符串
- 第二行:不以 `sp|` 开头,因此显示字符频率,例如:
MGSSN 2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
表示有 2 个 S、1 个 M、1 个 G、1 个 N,
其余字母的计数均为 0。
字符频率按降序排列。
最终输出如下所示
# Expected result: `col` repeats the input column; `Col2` holds, per row, either
# the unchanged "sp|..." text or the comma-joined letter counts in descending
# order (20 entries per row). Both columns are factors given as integer codes
# into their `labels`.
output <- data.frame(
  col = factor(
    c(9L, 2L, 13L, 11L, 5L, 7L, 10L, 6L, 8L, 3L, 12L, 4L, 1L),
    levels = 1:13,
    labels = c(
      "HHRGGVCTS",
      "MGSSN",
      "MVKTTYYDVG",
      "RRHYNGAYDD",
      "RTSTN",
      "S",
      "SNCWC",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
      "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1",
      "THYDT",
      "TVHAV",
      "VCMCVVDDNR",
      "YATTA"
    )
  ),
  Col2 = factor(
    c(8L, 2L, 3L, 2L, 2L, 2L, 2L, 1L, 7L, 5L, 6L, 5L, 4L),
    levels = 1:8,
    labels = c(
      "1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,2,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,2,2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
      "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1"
    )
  )
)
我们可以使用str_count
library(stringr)
# Build df$col2 as a character column: rows that start with the "sp|" prefix
# keep their original text; every other row is replaced by its per-letter
# counts, sorted in descending order and joined with commas (no spaces).

# The 20 letters counted per row (the standard one-letter amino-acid codes), so
# each result has exactly 20 entries instead of the 26 that LETTERS would give.
# Letters outside this set are not counted.
aa_letters <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
                "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")

# Work on plain strings: assigning factor elements into a character vector
# stores their integer codes rather than their labels.
col_chr <- as.character(df$col)

# TRUE for rows whose letters are counted. "|" is escaped because it is the
# regex alternation operator.
i1 <- !grepl("^sp\\|", col_chr)

# Start from the original text, then overwrite only the counted rows.
df$col2 <- col_chr
df$col2[i1] <- vapply(
  col_chr[i1],
  function(x) {
    # str_count() is vectorised over the pattern: one count per letter.
    paste(sort(str_count(x, aa_letters), decreasing = TRUE), collapse = ",")
  },
  character(1),
  USE.NAMES = FALSE
)
或者,不把结果保留为字符串,而是存成 list 列
library(tidyverse)
# List-column variant: `col2` holds, per row, either the original string (rows
# starting with "sp|") or an integer vector of letter counts sorted in
# descending order. The result is printed, not assigned back to `df`.

# The 20 letters counted per row (the standard one-letter amino-acid codes), so
# each count vector has 20 entries instead of the 26 that LETTERS would give.
aa_letters <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
                "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")

df %>%
  mutate(
    col = as.character(col),
    col2 = map(col, function(s) {
      # "|" is escaped because it is the regex alternation operator.
      if (str_detect(s, "^sp\\|")) {
        s
      } else {
        sort(str_count(s, aa_letters), decreasing = TRUE)
      }
    })
  )
我有一个这样的数据
# Example input: a 13-row data frame with a single factor column `col`.
# Two rows hold "sp|..." header text; the remaining rows hold letter sequences.
# The row order is given as integer codes into `labels`.
df <- data.frame(
  col = factor(
    c(9L, 2L, 13L, 11L, 5L, 7L, 10L, 6L, 8L, 3L, 12L, 4L, 1L),
    levels = 1:13,
    labels = c(
      "HHRGGVCTS",
      "MGSSN",
      "MVKTTYYDVG",
      "RRHYNGAYDD",
      "RTSTN",
      "S",
      "SNCWC",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
      "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1",
      "THYDT",
      "TVHAV",
      "VCMCVVDDNR",
      "YATTA"
    )
  )
)
我正在尝试计算字母频率。我想在每一行中计算 20 个可能的字母。
例如,
- 第一行:以 `sp|` 开头,所以不计算字符频率,结果保留原始字符串
- 第二行:不以 `sp|` 开头,因此显示字符频率,例如:
MGSSN 2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
表示有 2 个 S、1 个 M、1 个 G、1 个 N,
其余字母的计数均为 0。
字符频率按降序排列。
最终输出如下所示
# Expected result: `col` repeats the input column; `Col2` holds, per row, either
# the unchanged "sp|..." text or the comma-joined letter counts in descending
# order (20 entries per row). Both columns are factors given as integer codes
# into their `labels`.
output <- data.frame(
  col = factor(
    c(9L, 2L, 13L, 11L, 5L, 7L, 10L, 6L, 8L, 3L, 12L, 4L, 1L),
    levels = 1:13,
    labels = c(
      "HHRGGVCTS",
      "MGSSN",
      "MVKTTYYDVG",
      "RRHYNGAYDD",
      "RTSTN",
      "S",
      "SNCWC",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
      "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1",
      "THYDT",
      "TVHAV",
      "VCMCVVDDNR",
      "YATTA"
    )
  ),
  Col2 = factor(
    c(8L, 2L, 3L, 2L, 2L, 2L, 2L, 1L, 7L, 5L, 6L, 5L, 4L),
    levels = 1:8,
    labels = c(
      "1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,2,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "2,2,2,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "3,2,2,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0",
      "sp|P31689|DNJA1_HUMAN DnaJ homolog GN=DNAJA1 PE=1 SV=2 ",
      "sp|Q9H9K5|MER34_HUMAN Endogenous PE=1 SV=1"
    )
  )
)
我们可以使用str_count
library(stringr)
# Build df$col2 as a character column: rows that start with the "sp|" prefix
# keep their original text; every other row is replaced by its per-letter
# counts, sorted in descending order and joined with commas (no spaces).

# The 20 letters counted per row (the standard one-letter amino-acid codes), so
# each result has exactly 20 entries instead of the 26 that LETTERS would give.
# Letters outside this set are not counted.
aa_letters <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
                "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")

# Work on plain strings: assigning factor elements into a character vector
# stores their integer codes rather than their labels.
col_chr <- as.character(df$col)

# TRUE for rows whose letters are counted. "|" is escaped because it is the
# regex alternation operator.
i1 <- !grepl("^sp\\|", col_chr)

# Start from the original text, then overwrite only the counted rows.
df$col2 <- col_chr
df$col2[i1] <- vapply(
  col_chr[i1],
  function(x) {
    # str_count() is vectorised over the pattern: one count per letter.
    paste(sort(str_count(x, aa_letters), decreasing = TRUE), collapse = ",")
  },
  character(1),
  USE.NAMES = FALSE
)
或者,不把结果保留为字符串,而是存成 list 列
library(tidyverse)
# List-column variant: `col2` holds, per row, either the original string (rows
# starting with "sp|") or an integer vector of letter counts sorted in
# descending order. The result is printed, not assigned back to `df`.

# The 20 letters counted per row (the standard one-letter amino-acid codes), so
# each count vector has 20 entries instead of the 26 that LETTERS would give.
aa_letters <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L",
                "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y")

df %>%
  mutate(
    col = as.character(col),
    col2 = map(col, function(s) {
      # "|" is escaped because it is the regex alternation operator.
      if (str_detect(s, "^sp\\|")) {
        s
      } else {
        sort(str_count(s, aa_letters), decreasing = TRUE)
      }
    })
  )