匹配 DataFrame 列中的字符串
Match string from DataFrame columns
我有如下测试数据框(原始文件为 60 x 350)
Speci 45 46 47 48 49 50 51
PP A T A - G G A
JJ G T A - T A G
KK A T - A G - A
LL A C A A G G A
MM G C A A G G G
NN C - G T C C T
基本上我想做的是传递一个输入字符串和列名称/编号,如下所示
input string: CAAGGA
ColF (Column from): 46
ColT ( Column To ): 51
并精确匹配每个对应的指定列(From to )并按降序输出最佳匹配如下
Species Matchs
LL 6
MM 5
PP 4
我只用 R / Python 做过 ggplot 绘图和基本的文件处理,不确定如何遍历数据框并进行匹配。
在R中,我们可以写一个函数
# Rank the rows of `df` by how many positions of `input_string` they match.
#
# Arguments:
#   df           data frame with a `Speci` column and one column per position.
#   ColF, ColT   names (numbers or strings) of the first and last column of the
#                window to compare; both ends are inclusive.
#   input_string query string; its i-th character is compared with the i-th
#                column of the window.
#
# Returns a data frame with columns `Species` and `value` (number of exact
# matches), sorted by `value` in decreasing order.
#
# Stops with an error if either column name is missing from `df`, or if the
# query length differs from the window width (otherwise `==` would silently
# recycle the shorter operand and report misleading counts).
test_func <- function(df, ColF, ColT, input_string) {
  # Split the query into single characters.
  query <- strsplit(input_string, "")[[1]]

  # Translate the two column names into column positions.
  bounds <- match(as.character(c(ColF, ColT)), names(df))
  if (anyNA(bounds)) {
    stop("ColF and ColT must both be column names of df", call. = FALSE)
  }
  cols <- seq(bounds[1], bounds[2])

  if (length(query) != length(cols)) {
    stop(
      "input_string has ", length(query), " characters but the column range ",
      "spans ", length(cols), " columns",
      call. = FALSE
    )
  }

  # Compare the whole window at once: as.matrix() turns factor columns into
  # their labels, and the query is laid out row-wise so that column j of the
  # window is compared against character j.
  window <- as.matrix(df[cols])
  target <- matrix(query, nrow = nrow(window), ncol = length(query),
                   byrow = TRUE)
  vals <- as.integer(rowSums(window == target))

  # Best matches first.
  ord_vals <- order(vals, decreasing = TRUE)
  data.frame(Species = df$Speci[ord_vals], value = vals[ord_vals])
}
# Example query: compare "CAAGGA" against columns 46 through 51.
input_string <- "CAAGGA"
ColF <- 46
ColT <- 51
test_func(df = df, ColF = ColF, ColT = ColT, input_string = input_string)
# Species value
#1 LL 6
#2 MM 5
#3 PP 4
#4 KK 3
#5 JJ 1
#6 NN 0
数据
# Example data: one row per species label, one factor column per position,
# with the positions "45" to "51" used as column names. Factor levels are
# spelled out explicitly so they do not depend on the locale's sort order.
df <- data.frame(
  Speci = factor(c("PP", "JJ", "KK", "LL", "MM", "NN"),
                 levels = c("JJ", "KK", "LL", "MM", "NN", "PP")),
  `45` = factor(c("A", "G", "A", "A", "G", "C"), levels = c("A", "C", "G")),
  `46` = factor(c("T", "T", "T", "C", "C", "-"), levels = c("-", "C", "T")),
  `47` = factor(c("A", "A", "-", "A", "A", "G"), levels = c("-", "A", "G")),
  `48` = factor(c("-", "-", "A", "A", "A", "T"), levels = c("-", "A", "T")),
  `49` = factor(c("G", "T", "G", "G", "G", "C"), levels = c("C", "G", "T")),
  `50` = factor(c("G", "A", "-", "G", "G", "C"),
                levels = c("-", "A", "C", "G")),
  `51` = factor(c("A", "G", "A", "A", "G", "T"), levels = c("A", "G", "T")),
  check.names = FALSE
)
一个 pandas 解决方案:
# For every row, count how many of the columns "46".."51" equal the
# corresponding character of the query string. The 1-D query array is
# compared column-wise against the selected window, so no groupby/apply is
# needed, and the resulting Series keeps df's own index. That matters for the
# index-based merge below: resetting the index would misalign the counts
# whenever df does not have the default 0..n-1 index.
query = np.array(list("CAAGGA"))
s = (df.loc[:, "46":"51"] == query).sum(axis=1)

# Attach the counts as a "best match" column and list the best rows first.
print(
    df.merge(s.rename("best match"), left_index=True, right_index=True)
    .sort_values("best match", ascending=False)
)
#
Speci 45 46 47 48 49 50 51 best match
3 LL A C A A G G A 6
4 MM G C A A G G G 5
0 PP A T A - G G A 4
2 KK A T - A G - A 3
1 JJ G T A - T A G 1
5 NN C - G T C C T 0
或者使用列表推导式:
# Same count as a plain Python loop over the rows of the "46".."51" window:
# each row is compared element-wise with the query characters and the number
# of equal positions is stored in a new "match" column.
df["match"] = [
    (row == np.array(list("CAAGGA"))).sum()
    for row in df.loc[:, "46":"51"].values
]

# Show the rows with the most matching positions first.
ranked = df.sort_values("match", ascending=False)
print(ranked)
#
Speci 45 46 47 48 49 50 51 match
3 LL A C A A G G A 6
4 MM G C A A G G G 5
0 PP A T A - G G A 4
2 KK A T - A G - A 3
1 JJ G T A - T A G 1
5 NN C - G T C C T 0
# Rank the rows of `df` by how many positions of a query string they match.
#
# Arguments:
#   df      data frame with a `Speci` column and one column per position.
#   string  query string; when NULL (the default) it is read with readline().
#   ColF    name of the first column of the window; prompted for when NULL.
#   ColT    name of the last column of the window; prompted for when NULL.
#
# Returns a data frame with columns `Speci` and `Matches`, sorted by
# `Matches` in decreasing order.
#
# Column names are looked up with match(), i.e. exactly. A regex lookup with
# grep() would also hit every name that merely contains the input (typing "4"
# would select "45", "46", ...).
my_str <- function(df, string = NULL, ColF = NULL, ColT = NULL) {
  nm <- names(df)

  # READ THE DATA IN (only the values that were not passed as arguments)
  if (is.null(string)) {
    string <- readline("string: ")
  }
  if (is.null(ColF)) {
    ColF <- readline("ColF (Column from): ")
  }
  if (is.null(ColT)) {
    ColT <- readline("ColT ( Column To ): ")
  }

  query <- unlist(strsplit(string, ""))
  bounds <- match(trimws(as.character(c(ColF, ColT))), nm)
  if (anyNA(bounds)) {
    stop("ColF and ColT must both be column names of df", call. = FALSE)
  }
  cols <- seq(bounds[1], bounds[2])

  # Without this check `==` would silently recycle the shorter operand.
  if (length(query) != length(cols)) {
    stop(
      "the string has ", length(query), " characters but the column range ",
      "spans ", length(cols), " columns",
      call. = FALSE
    )
  }

  # COMPUTE THE MATCHES
  # After transposing, each column holds one row of df, so the query is
  # recycled down every column and colSums() counts the hits per row.
  A <- colSums(t(as.matrix(df[cols])) == query)
  B <- sort(setNames(A, as.character(df[["Speci"]])), decreasing = TRUE)
  data.frame(Speci = names(B), Matches = B, row.names = NULL)
}
现在运行 my_str(data),其中 data 是您指定的数据框:
my_str(a)
string: CAAGGA
ColF (Column from): 46
ColT ( Column To ): 51
Speci Matches
1 LL 6
2 MM 5
3 PP 4
4 KK 3
5 JJ 1
6 NN 0
在 Python 中:
def my_str(df):
    """Prompt for a query string and a column range, then rank rows by matches.

    Three values are read from standard input: the query string, the label of
    the first column and the label of the last column. The two labels are
    used as a label slice in ``df.loc``. The i-th character of the query is
    compared with the i-th column of that slice.

    Args:
        df: DataFrame with a ``'Speci'`` column and one column per position.

    Returns:
        DataFrame with columns ``'Speci'`` and ``'Matches'`` (number of equal
        positions per row), sorted by ``'Matches'`` in descending order.

    Raises:
        ValueError: if the query length differs from the number of selected
            columns.
    """
    # pd.np was deprecated in pandas 1.0 and later removed; use numpy itself.
    import numpy as np

    string = np.array(list(input("string: ")))
    ColF = input("ColF (Column from): ")
    ColT = input("ColT (Column to): ")

    window = df.loc[:, ColF:ColT]
    if window.shape[1] != string.size:
        raise ValueError(
            "the string has %d characters but the column range spans %d columns"
            % (string.size, window.shape[1])
        )

    # A 1-D array is compared column-wise against every row of the window.
    A = (window == string).sum(axis=1).sort_values(ascending=False)
    return pd.DataFrame({'Speci': df.loc[A.index, 'Speci'], 'Matches': A})
现在 运行 my_str(data)
其中数据是您指定的数据帧
my_str(df)
string: CAAGGA
ColF (Column from): 46
ColT (Column to): 51
Out[77]:
Speci Matches
3 LL 6
4 MM 5
0 PP 4
2 KK 3
1 JJ 1
5 NN 0
您可以在Python中使用以下解决方案:
# First and last column of the window (inclusive label slice) and the query.
col1, col2 = '46', '51'
inp = 'CAAGGA'

# Element-wise comparison of the window with the query characters, then the
# number of equal positions per row.
result = df.loc[:, col1:col2].eq(np.array(list(inp))).sum(axis=1)

# Label the counts by species and print the three best matches.
result.index = df['Speci']
print(result.sort_values(ascending=False).iloc[:3])
输出:
Speci
LL 6
MM 5
PP 4
dtype: int64
我有如下测试数据框(原始文件为 60 x 350)
Speci 45 46 47 48 49 50 51
PP A T A - G G A
JJ G T A - T A G
KK A T - A G - A
LL A C A A G G A
MM G C A A G G G
NN C - G T C C T
基本上我想做的是传递一个输入字符串和列名称/编号,如下所示
input string: CAAGGA
ColF (Column from): 46
ColT ( Column To ): 51
并精确匹配每个对应的指定列(From to )并按降序输出最佳匹配如下
Species Matchs
LL 6
MM 5
PP 4
我只用 R / Python 做过 ggplot 绘图和基本的文件处理,不确定如何遍历数据框并进行匹配。
在R中,我们可以写一个函数
# Rank the rows of `df` by how many positions of `input_string` they match.
#
# Arguments:
#   df           data frame with a `Speci` column and one column per position.
#   ColF, ColT   names (numbers or strings) of the first and last column of the
#                window to compare; both ends are inclusive.
#   input_string query string; its i-th character is compared with the i-th
#                column of the window.
#
# Returns a data frame with columns `Species` and `value` (number of exact
# matches), sorted by `value` in decreasing order.
#
# Stops with an error if either column name is missing from `df`, or if the
# query length differs from the window width (otherwise `==` would silently
# recycle the shorter operand and report misleading counts).
test_func <- function(df, ColF, ColT, input_string) {
  # Split the query into single characters.
  query <- strsplit(input_string, "")[[1]]

  # Translate the two column names into column positions.
  bounds <- match(as.character(c(ColF, ColT)), names(df))
  if (anyNA(bounds)) {
    stop("ColF and ColT must both be column names of df", call. = FALSE)
  }
  cols <- seq(bounds[1], bounds[2])

  if (length(query) != length(cols)) {
    stop(
      "input_string has ", length(query), " characters but the column range ",
      "spans ", length(cols), " columns",
      call. = FALSE
    )
  }

  # Compare the whole window at once: as.matrix() turns factor columns into
  # their labels, and the query is laid out row-wise so that column j of the
  # window is compared against character j.
  window <- as.matrix(df[cols])
  target <- matrix(query, nrow = nrow(window), ncol = length(query),
                   byrow = TRUE)
  vals <- as.integer(rowSums(window == target))

  # Best matches first.
  ord_vals <- order(vals, decreasing = TRUE)
  data.frame(Species = df$Speci[ord_vals], value = vals[ord_vals])
}
# Example query: compare "CAAGGA" against columns 46 through 51.
input_string <- "CAAGGA"
ColF <- 46
ColT <- 51
test_func(df = df, ColF = ColF, ColT = ColT, input_string = input_string)
# Species value
#1 LL 6
#2 MM 5
#3 PP 4
#4 KK 3
#5 JJ 1
#6 NN 0
数据
# Example data: one row per species label, one factor column per position,
# with the positions "45" to "51" used as column names. Factor levels are
# spelled out explicitly so they do not depend on the locale's sort order.
df <- data.frame(
  Speci = factor(c("PP", "JJ", "KK", "LL", "MM", "NN"),
                 levels = c("JJ", "KK", "LL", "MM", "NN", "PP")),
  `45` = factor(c("A", "G", "A", "A", "G", "C"), levels = c("A", "C", "G")),
  `46` = factor(c("T", "T", "T", "C", "C", "-"), levels = c("-", "C", "T")),
  `47` = factor(c("A", "A", "-", "A", "A", "G"), levels = c("-", "A", "G")),
  `48` = factor(c("-", "-", "A", "A", "A", "T"), levels = c("-", "A", "T")),
  `49` = factor(c("G", "T", "G", "G", "G", "C"), levels = c("C", "G", "T")),
  `50` = factor(c("G", "A", "-", "G", "G", "C"),
                levels = c("-", "A", "C", "G")),
  `51` = factor(c("A", "G", "A", "A", "G", "T"), levels = c("A", "G", "T")),
  check.names = FALSE
)
一个 pandas 解决方案:
# For every row, count how many of the columns "46".."51" equal the
# corresponding character of the query string. The 1-D query array is
# compared column-wise against the selected window, so no groupby/apply is
# needed, and the resulting Series keeps df's own index. That matters for the
# index-based merge below: resetting the index would misalign the counts
# whenever df does not have the default 0..n-1 index.
query = np.array(list("CAAGGA"))
s = (df.loc[:, "46":"51"] == query).sum(axis=1)

# Attach the counts as a "best match" column and list the best rows first.
print(
    df.merge(s.rename("best match"), left_index=True, right_index=True)
    .sort_values("best match", ascending=False)
)
#
Speci 45 46 47 48 49 50 51 best match
3 LL A C A A G G A 6
4 MM G C A A G G G 5
0 PP A T A - G G A 4
2 KK A T - A G - A 3
1 JJ G T A - T A G 1
5 NN C - G T C C T 0
或者使用列表推导式:
# Same count as a plain Python loop over the rows of the "46".."51" window:
# each row is compared element-wise with the query characters and the number
# of equal positions is stored in a new "match" column.
df["match"] = [
    (row == np.array(list("CAAGGA"))).sum()
    for row in df.loc[:, "46":"51"].values
]

# Show the rows with the most matching positions first.
ranked = df.sort_values("match", ascending=False)
print(ranked)
#
Speci 45 46 47 48 49 50 51 match
3 LL A C A A G G A 6
4 MM G C A A G G G 5
0 PP A T A - G G A 4
2 KK A T - A G - A 3
1 JJ G T A - T A G 1
5 NN C - G T C C T 0
# Rank the rows of `df` by how many positions of a query string they match.
#
# Arguments:
#   df      data frame with a `Speci` column and one column per position.
#   string  query string; when NULL (the default) it is read with readline().
#   ColF    name of the first column of the window; prompted for when NULL.
#   ColT    name of the last column of the window; prompted for when NULL.
#
# Returns a data frame with columns `Speci` and `Matches`, sorted by
# `Matches` in decreasing order.
#
# Column names are looked up with match(), i.e. exactly. A regex lookup with
# grep() would also hit every name that merely contains the input (typing "4"
# would select "45", "46", ...).
my_str <- function(df, string = NULL, ColF = NULL, ColT = NULL) {
  nm <- names(df)

  # READ THE DATA IN (only the values that were not passed as arguments)
  if (is.null(string)) {
    string <- readline("string: ")
  }
  if (is.null(ColF)) {
    ColF <- readline("ColF (Column from): ")
  }
  if (is.null(ColT)) {
    ColT <- readline("ColT ( Column To ): ")
  }

  query <- unlist(strsplit(string, ""))
  bounds <- match(trimws(as.character(c(ColF, ColT))), nm)
  if (anyNA(bounds)) {
    stop("ColF and ColT must both be column names of df", call. = FALSE)
  }
  cols <- seq(bounds[1], bounds[2])

  # Without this check `==` would silently recycle the shorter operand.
  if (length(query) != length(cols)) {
    stop(
      "the string has ", length(query), " characters but the column range ",
      "spans ", length(cols), " columns",
      call. = FALSE
    )
  }

  # COMPUTE THE MATCHES
  # After transposing, each column holds one row of df, so the query is
  # recycled down every column and colSums() counts the hits per row.
  A <- colSums(t(as.matrix(df[cols])) == query)
  B <- sort(setNames(A, as.character(df[["Speci"]])), decreasing = TRUE)
  data.frame(Speci = names(B), Matches = B, row.names = NULL)
}
现在运行 my_str(data),其中 data 是您指定的数据框:
my_str(a)
string: CAAGGA
ColF (Column from): 46
ColT ( Column To ): 51
Speci Matches
1 LL 6
2 MM 5
3 PP 4
4 KK 3
5 JJ 1
6 NN 0
在 Python 中:
def my_str(df):
    """Prompt for a query string and a column range, then rank rows by matches.

    Three values are read from standard input: the query string, the label of
    the first column and the label of the last column. The two labels are
    used as a label slice in ``df.loc``. The i-th character of the query is
    compared with the i-th column of that slice.

    Args:
        df: DataFrame with a ``'Speci'`` column and one column per position.

    Returns:
        DataFrame with columns ``'Speci'`` and ``'Matches'`` (number of equal
        positions per row), sorted by ``'Matches'`` in descending order.

    Raises:
        ValueError: if the query length differs from the number of selected
            columns.
    """
    # pd.np was deprecated in pandas 1.0 and later removed; use numpy itself.
    import numpy as np

    string = np.array(list(input("string: ")))
    ColF = input("ColF (Column from): ")
    ColT = input("ColT (Column to): ")

    window = df.loc[:, ColF:ColT]
    if window.shape[1] != string.size:
        raise ValueError(
            "the string has %d characters but the column range spans %d columns"
            % (string.size, window.shape[1])
        )

    # A 1-D array is compared column-wise against every row of the window.
    A = (window == string).sum(axis=1).sort_values(ascending=False)
    return pd.DataFrame({'Speci': df.loc[A.index, 'Speci'], 'Matches': A})
现在 运行 my_str(data)
其中数据是您指定的数据帧
my_str(df)
string: CAAGGA
ColF (Column from): 46
ColT (Column to): 51
Out[77]:
Speci Matches
3 LL 6
4 MM 5
0 PP 4
2 KK 3
1 JJ 1
5 NN 0
您可以在Python中使用以下解决方案:
# First and last column of the window (inclusive label slice) and the query.
col1, col2 = '46', '51'
inp = 'CAAGGA'

# Element-wise comparison of the window with the query characters, then the
# number of equal positions per row.
result = df.loc[:, col1:col2].eq(np.array(list(inp))).sum(axis=1)

# Label the counts by species and print the three best matches.
result.index = df['Speci']
print(result.sort_values(ascending=False).iloc[:3])
输出:
Speci
LL 6
MM 5
PP 4
dtype: int64