Python 使用集群数据框命名谱系
Python lineage naming with clustered dataframe
我有一个数据框
sample1 0 0 0 0 0 1 1 1 1 1 1 1 1 L1
sample2 0 0 0 0 0 1 1 1 1 1 0 0 0 L1-1
sample3 0 0 0 0 0 1 1 0 0 0 0 0 0 L1-1-1
sample4 0 0 0 0 0 1 0 0 0 0 0 0 0 L1-1-1-1
sample5 0 0 0 0 0 0 0 1 1 0 0 0 0 L1-1-2
sample6 0 0 0 0 0 0 0 1 0 0 0 0 0 L1-1-2-1
sample7 0 0 0 0 0 0 0 0 0 1 0 0 0 L1-1-3
sample8 0 0 0 0 0 0 0 0 0 0 1 1 1 L1-2
sample9 0 0 0 0 0 0 0 0 0 0 1 1 0 L1-2-1
sample10 0 0 0 0 0 0 0 0 0 0 0 0 1 L1-2-2
sample11 1 1 1 1 1 0 0 0 0 0 0 0 0 L2
sample12 1 1 1 0 0 0 0 0 0 0 0 0 0 L2-1
sample13 1 1 0 0 0 0 0 0 0 0 0 0 0 L2-1-1
sample14 1 0 0 0 0 0 0 0 0 0 0 0 0 L2-1-1-1
sample15 0 0 0 1 0 0 0 0 0 0 0 0 0 L2-2
sample16 0 0 0 0 1 0 0 0 0 0 0 0 0 L2-3
如您所见,每一行都是聚类的。
我想为每个样本命名“基于谱系”的标签。
例如,sample1 是 lin1,因为它最先出现,sample2 将是 lin1-1。
sample3 将是 lin1-1-1,sample4 将是 lin1-1-1-1。
接下来,sample5 将是 lin1-1-2,sample6 将是 lin1-1-2-1...
sample11 将是新谱系 lin2 的起点。
我最初的命名想法是:
"sample1 为 lin1;如果下一个 sample 包含在上一个 sample 中,则在其名称后追加 "-1";
如果不是,则将末位编号加 1,即 lin(1+1)。"
sample1 -> lin1
sample2 -> lin1-1(sample2包含在sample1中)
sample3 -> lin1-1-1(sample3包含在sample2中)
sample4 -> lin1-1-1-1(sample4包含在sample3中)
sample5 -> lin1-1-2(sample4不包含sample5)
....这样的逻辑。
我无法将此逻辑变成 python 脚本。
这可以分几步完成。
第一步.数据预处理
数据需要按降序排列并去除重复项,否则算法可能无法正常工作。这里假设这一步已经完成。
import numpy as np
data = '''sample1 0 0 0 0 0 1 1 1 1 1 1 1 1
sample2 0 0 0 0 0 1 1 1 1 1 0 0 0
sample3 0 0 0 0 0 1 1 0 0 0 0 0 0
sample4 0 0 0 0 0 1 0 0 0 0 0 0 0
sample5 0 0 0 0 0 0 0 1 1 0 0 0 0
sample6 0 0 0 0 0 0 0 1 0 0 0 0 0
sample7 0 0 0 0 0 0 0 0 0 1 0 0 0
sample8 0 0 0 0 0 0 0 0 0 0 1 1 1
sample9 0 0 0 0 0 0 0 0 0 0 1 1 0
sample10 0 0 0 0 0 0 0 0 0 0 0 0 1
sample11 1 1 1 1 1 0 0 0 0 0 0 0 0
sample12 1 1 1 0 0 0 0 0 0 0 0 0 0
sample13 1 1 0 0 0 0 0 0 0 0 0 0 0
sample14 1 0 0 0 0 0 0 0 0 0 0 0 0
sample15 0 0 0 1 0 0 0 0 0 0 0 0 0
sample16 0 0 0 0 1 0 0 0 0 0 0 0 0'''
# Drop the leading sample name from every whitespace-separated line and
# convert the remaining 0/1 tokens into an integer matrix (one row per sample).
data = np.array(
    [line.split()[1:] for line in data.split('\n')],
    dtype=int,
)
data
array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
第 2 步。将每个样本编码为其取值为 1 的列位置集合,每个样本对应一个 frozenset。
# Matrix dimensions: number of samples (rows) and number of 0/1 columns.
nrow, ncol = data.shape
def to_position(sample):
    """Encode a 0/1 row as the set of column indices whose value is 1.

    Args:
        sample: A sequence (list, tuple or 1-D NumPy array) of 0/1 flags.

    Returns:
        A ``frozenset`` holding every index ``i`` for which
        ``sample[i] == 1``. A frozenset is hashable, so it can serve as a
        dictionary key, and it supports the ``<`` proper-subset test.
    """
    # enumerate replaces the index loop and the local ``ncol`` that shadowed
    # the module-level variable of the same name.
    return frozenset(i for i, flag in enumerate(sample) if flag == 1)
# One frozenset of "1" column indices per sample, in row order.
# Iterating the matrix directly yields its rows, so no index loop is needed.
position = [to_position(row) for row in data]
# print(position)
第 3 步。将每个样本位置分配给一个集群,其中集群暂时表示为一个元组。
def assign_cluster(sample, clusters, parent):
    """Store ``sample`` at the first free cluster path reachable from ``parent``.

    A cluster path is a tuple of zero-based indices, e.g. ``(0, 1, 0)``.
    Starting at ``parent``, the search moves to the first child
    (``path + (0,)``) when ``sample`` is a proper subset of the set stored at
    the current path, and to the next sibling (last index + 1) otherwise.
    It stops at the first path that is not yet a key of ``clusters``.

    Args:
        sample: Set-like object supporting ``<`` (proper subset), e.g. a
            ``frozenset`` of column indices.
        clusters: Dict mapping path tuples to stored samples; mutated in place.
        parent: Non-empty path tuple at which the search starts.

    Returns:
        None. The result is the new ``clusters[path] = sample`` entry.
    """
    # Iterative walk instead of recursion: the recursive form used one stack
    # frame per sibling/child step and raised RecursionError on long chains.
    path = parent
    while path in clusters:
        if sample < clusters[path]:
            # Contained in the current cluster: descend to its first child.
            path = path + (0,)
        else:
            # Not contained: try the next sibling at the same depth.
            path = path[:-1] + (path[-1] + 1,)
    clusters[path] = sample
# Map from cluster path (tuple of zero-based indices) to the sample stored there.
clusters = {}
root = (0,)
# assign_cluster stores a sample at any path that is still free, so the very
# first sample lands on the root without special-casing position[0]; this
# also avoids an IndexError when there are no samples at all.
for sample in position:
    assign_cluster(sample, clusters, parent=root)
# print(clusters)
第 4 步。将簇转换为字符串并显示结果。
def cluster_to_string(c):
    """Format a cluster path tuple as a lineage label.

    Args:
        c: Iterable of zero-based integer indices, e.g. ``(0, 1, 0)``.

    Returns:
        The string ``'L'`` followed by the one-based indices joined with
        ``'-'``, e.g. ``'L1-2-1'``. An empty path yields ``'L'``.
    """
    # A named loop variable replaces ``_``, which conventionally marks an
    # unused value, and the parameter is no longer rebound to a list.
    return 'L' + '-'.join(str(index + 1) for index in c)
# Invert the mapping: position set -> cluster path. This relies on the
# position sets being unique (duplicates removed during preprocessing).
position_dict = {v: k for k, v in clusters.items()}
# Reuse the frozensets already computed in ``position`` instead of encoding
# every row of ``data`` a second time; labels are printed in sample order.
for sample in position:
    print(cluster_to_string(position_dict[sample]))
L1
L1-1
L1-1-1
L1-1-1-1
L1-1-2
L1-1-2-1
L1-1-3
L1-2
L1-2-1
L1-2-2
L2
L2-1
L2-1-1
L2-1-1-1
L2-2
L2-3
我有一个数据框
sample1 0 0 0 0 0 1 1 1 1 1 1 1 1 L1
sample2 0 0 0 0 0 1 1 1 1 1 0 0 0 L1-1
sample3 0 0 0 0 0 1 1 0 0 0 0 0 0 L1-1-1
sample4 0 0 0 0 0 1 0 0 0 0 0 0 0 L1-1-1-1
sample5 0 0 0 0 0 0 0 1 1 0 0 0 0 L1-1-2
sample6 0 0 0 0 0 0 0 1 0 0 0 0 0 L1-1-2-1
sample7 0 0 0 0 0 0 0 0 0 1 0 0 0 L1-1-3
sample8 0 0 0 0 0 0 0 0 0 0 1 1 1 L1-2
sample9 0 0 0 0 0 0 0 0 0 0 1 1 0 L1-2-1
sample10 0 0 0 0 0 0 0 0 0 0 0 0 1 L1-2-2
sample11 1 1 1 1 1 0 0 0 0 0 0 0 0 L2
sample12 1 1 1 0 0 0 0 0 0 0 0 0 0 L2-1
sample13 1 1 0 0 0 0 0 0 0 0 0 0 0 L2-1-1
sample14 1 0 0 0 0 0 0 0 0 0 0 0 0 L2-1-1-1
sample15 0 0 0 1 0 0 0 0 0 0 0 0 0 L2-2
sample16 0 0 0 0 1 0 0 0 0 0 0 0 0 L2-3
如您所见,每一行都是聚类的。
我想为每个样本命名“基于谱系”的标签。
例如,sample1 是 lin1,因为它最先出现,sample2 将是 lin1-1。
sample3 将是 lin1-1-1,sample4 将是 lin1-1-1-1。
接下来,sample5 将是 lin1-1-2,sample6 将是 lin1-1-2-1...
sample11 将是新谱系 lin2 的起点。
我最初的命名想法是:
"sample1 为 lin1;如果下一个 sample 包含在上一个 sample 中,则在其名称后追加 "-1";如果不是,则将末位编号加 1,即 lin(1+1)。"
sample1 -> lin1
sample2 -> lin1-1(sample2包含在sample1中)
sample3 -> lin1-1-1(sample3包含在sample2中)
sample4 -> lin1-1-1-1(sample4包含在sample3中)
sample5 -> lin1-1-2(sample4不包含sample5) ....这样的逻辑。
我无法将此逻辑变成 python 脚本。
这可以分几步完成。
第一步.数据预处理
数据需要按降序排列并去除重复项,否则算法可能无法正常工作。这里假设这一步已经完成。
import numpy as np
data = '''sample1 0 0 0 0 0 1 1 1 1 1 1 1 1
sample2 0 0 0 0 0 1 1 1 1 1 0 0 0
sample3 0 0 0 0 0 1 1 0 0 0 0 0 0
sample4 0 0 0 0 0 1 0 0 0 0 0 0 0
sample5 0 0 0 0 0 0 0 1 1 0 0 0 0
sample6 0 0 0 0 0 0 0 1 0 0 0 0 0
sample7 0 0 0 0 0 0 0 0 0 1 0 0 0
sample8 0 0 0 0 0 0 0 0 0 0 1 1 1
sample9 0 0 0 0 0 0 0 0 0 0 1 1 0
sample10 0 0 0 0 0 0 0 0 0 0 0 0 1
sample11 1 1 1 1 1 0 0 0 0 0 0 0 0
sample12 1 1 1 0 0 0 0 0 0 0 0 0 0
sample13 1 1 0 0 0 0 0 0 0 0 0 0 0
sample14 1 0 0 0 0 0 0 0 0 0 0 0 0
sample15 0 0 0 1 0 0 0 0 0 0 0 0 0
sample16 0 0 0 0 1 0 0 0 0 0 0 0 0'''
# Drop the leading sample name from every whitespace-separated line and
# convert the remaining 0/1 tokens into an integer matrix (one row per sample).
data = np.array(
    [line.split()[1:] for line in data.split('\n')],
    dtype=int,
)
data
array([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1],
[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]])
第 2 步。将每个样本编码为其取值为 1 的列位置集合,每个样本对应一个 frozenset。
# Matrix dimensions: number of samples (rows) and number of 0/1 columns.
nrow, ncol = data.shape
def to_position(sample):
    """Encode a 0/1 row as the set of column indices whose value is 1.

    Args:
        sample: A sequence (list, tuple or 1-D NumPy array) of 0/1 flags.

    Returns:
        A ``frozenset`` holding every index ``i`` for which
        ``sample[i] == 1``. A frozenset is hashable, so it can serve as a
        dictionary key, and it supports the ``<`` proper-subset test.
    """
    # enumerate replaces the index loop and the local ``ncol`` that shadowed
    # the module-level variable of the same name.
    return frozenset(i for i, flag in enumerate(sample) if flag == 1)
# One frozenset of "1" column indices per sample, in row order.
# Iterating the matrix directly yields its rows, so no index loop is needed.
position = [to_position(row) for row in data]
# print(position)
第 3 步。将每个样本位置分配给一个集群,其中集群暂时表示为一个元组。
def assign_cluster(sample, clusters, parent):
    """Store ``sample`` at the first free cluster path reachable from ``parent``.

    A cluster path is a tuple of zero-based indices, e.g. ``(0, 1, 0)``.
    Starting at ``parent``, the search moves to the first child
    (``path + (0,)``) when ``sample`` is a proper subset of the set stored at
    the current path, and to the next sibling (last index + 1) otherwise.
    It stops at the first path that is not yet a key of ``clusters``.

    Args:
        sample: Set-like object supporting ``<`` (proper subset), e.g. a
            ``frozenset`` of column indices.
        clusters: Dict mapping path tuples to stored samples; mutated in place.
        parent: Non-empty path tuple at which the search starts.

    Returns:
        None. The result is the new ``clusters[path] = sample`` entry.
    """
    # Iterative walk instead of recursion: the recursive form used one stack
    # frame per sibling/child step and raised RecursionError on long chains.
    path = parent
    while path in clusters:
        if sample < clusters[path]:
            # Contained in the current cluster: descend to its first child.
            path = path + (0,)
        else:
            # Not contained: try the next sibling at the same depth.
            path = path[:-1] + (path[-1] + 1,)
    clusters[path] = sample
# Map from cluster path (tuple of zero-based indices) to the sample stored there.
clusters = {}
root = (0,)
# assign_cluster stores a sample at any path that is still free, so the very
# first sample lands on the root without special-casing position[0]; this
# also avoids an IndexError when there are no samples at all.
for sample in position:
    assign_cluster(sample, clusters, parent=root)
# print(clusters)
第 4 步。将簇转换为字符串并显示结果。
def cluster_to_string(c):
    """Format a cluster path tuple as a lineage label.

    Args:
        c: Iterable of zero-based integer indices, e.g. ``(0, 1, 0)``.

    Returns:
        The string ``'L'`` followed by the one-based indices joined with
        ``'-'``, e.g. ``'L1-2-1'``. An empty path yields ``'L'``.
    """
    # A named loop variable replaces ``_``, which conventionally marks an
    # unused value, and the parameter is no longer rebound to a list.
    return 'L' + '-'.join(str(index + 1) for index in c)
# Invert the mapping: position set -> cluster path. This relies on the
# position sets being unique (duplicates removed during preprocessing).
position_dict = {v: k for k, v in clusters.items()}
# Reuse the frozensets already computed in ``position`` instead of encoding
# every row of ``data`` a second time; labels are printed in sample order.
for sample in position:
    print(cluster_to_string(position_dict[sample]))
L1
L1-1
L1-1-1
L1-1-1-1
L1-1-2
L1-1-2-1
L1-1-3
L1-2
L1-2-1
L1-2-2
L2
L2-1
L2-1-1
L2-1-1-1
L2-2
L2-3