如何根据列值创建和弦图矩阵:
How to create the matrix for a chord diagram based on column values:
假设我有一个数据框,其中包含以下格式的数据。
UID | Name | ID
----------------
1 | ABC | IM-1
2 | XYZ | IM-2
3 | XYZ | IM-2
4 | PQR | IM-3
5 | PQR | IM-4
6 | PQR | IM-5
7 | XYZ | IM-5
8 | ABC | IM-5
我需要创建一个输入和弦图代码的矩阵。需要以下格式的输出:
(array([[0,1,1,1],
[1,1,1,0],
[1,1,0,2]]),['ABC','XYZ','PQR'])
注意:在这个例子中,
- "Name" 在列表中是有限的(即 ABC、XYZ 或 PQR)
- "ID" 在记录之间共享
- 第四列是独立记录的数量(例如 ABC 只属于单个记录 IM-1,而 PQR 出现两次:IM-4 和 IM-5)
- 矩阵的其他成员是基于ID的名称之间的联系(例如IM-5,增加PQR-XYZ的值, XYZ-PQR, PQR-ABC,ABC-PQR,XYZ-ABC & ABC-XYZ)
- 目标是为 "Name" 字段之间的连接创建一个和弦图
我知道这篇内容读起来有些长。预先感谢您的帮助。
更新了我的答案,但方法基本相同:先将数据解析为数据框,在 ID 上执行 inner join,得到通过共享同一 ID 相连的名称对;然后把这个边列表转换成邻接矩阵。最后,为了得到 "dangling"(悬空)边(更新后的答案中新增的部分),找出只出现一次的 ID,并按对应的 Name 分组统计其数量。
#!/usr/bin/env python
"""
Create adjacency matrix from a dataframe, where edges are implicitly defined by shared attributes.
Answer to:
"""
import numpy as np
import pandas as pd
from collections import Counter
def parse_data_format(file_path):
    """Parse a pipe-delimited text table into a whitespace-free DataFrame.

    The expected layout is a header row, one separator row (skipped), and
    then the data rows, e.g.::

        UID | Name | ID
        ----------------
        1 | ABC | IM-1

    Parameters
    ----------
    file_path : str, path object or file-like object
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with surrounding whitespace stripped from the
        column names and from the values of every text column.
    """
    # Row index 1 is the "-----" rule under the header, not data.
    df = pd.read_csv(file_path, sep='|', skiprows=[1])
    # The padding around each '|' ends up in the column names ...
    df = df.rename(columns=lambda x: x.strip())
    # ... and in the text values. Check for dedicated string dtypes as well
    # as plain object columns: selecting on `object` alone leaves
    # string-dtype columns padded, which would break matching on 'ID'.
    for column in df.columns:
        values = df[column]
        if (pd.api.types.is_object_dtype(values)
                or pd.api.types.is_string_dtype(values)):
            df[column] = values.str.strip()
    return df
def get_edges(df):
    """Get all ordered pairs of 'Name' whose rows share an 'ID' value.

    The pairs are produced by an inner self-join on 'ID'. A row is never
    paired with itself, but two *different* rows carrying the same name are
    paired (yielding an edge from a name to itself), and every connection
    appears in both directions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ID' and 'Name'.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_edges, 2)`` holding ``(source, target)`` names.
    """
    # Tag each row with its position so self-pairs can be recognised without
    # relying on a separate, unique 'UID' column being present.
    tagged = df[['ID', 'Name']].assign(_row=range(len(df)))
    inner_self_join = tagged.merge(tagged, how='inner', on='ID')
    # Drop the trivial matches of a row with itself.
    is_distinct_row = inner_self_join['_row_x'] != inner_self_join['_row_y']
    edges = inner_self_join.loc[is_distinct_row, ['Name_x', 'Name_y']].values
    return edges
def get_adjacency(edges, order=None):
    """Convert a list of (source, target) pairs into an adjacency matrix.

    Parameters
    ----------
    edges : iterable of 2-sequences
        Each item names the source and the target of one connection.
        Repeated pairs are counted, so the matrix holds multiplicities.
    order : sequence, optional
        Node labels in the desired row/column order. Supplying it makes it
        possible to include nodes that take part in no edge at all. When
        omitted, the sorted unique labels found in ``edges`` are used.

    Returns
    -------
    adjacency : numpy.ndarray
        Float array of shape ``(n, n)``; entry ``[i, j]`` is the number of
        edges from ``order[i]`` to ``order[j]``.
    order : numpy.ndarray
        The node labels corresponding to the rows/columns of ``adjacency``.

    Raises
    ------
    KeyError
        If an explicit ``order`` is given and an edge mentions a label that
        is not part of it.
    """
    if order is None:
        order = np.unique(edges)
    else:
        order = np.asarray(order)
    total_names = len(order)
    name_to_idx = {name: idx for idx, name in enumerate(order)}
    adjacency = np.zeros((total_names, total_names))
    for source, target in edges:
        adjacency[name_to_idx[source], name_to_idx[target]] += 1
    return adjacency, order
def get_dangling_edge_counts(df):
    """Count, per 'Name', the rows whose 'ID' is not shared with any other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ID' and 'Name'.

    Returns
    -------
    collections.Counter
        Maps each name to the number of its rows that carry an 'ID' occurring
        exactly once in ``df``. Names without such rows are absent.
    """
    # keep=False discards *every* row of a repeated ID, so only rows with a
    # one-off ID survive. This is a single pass instead of re-filtering the
    # whole frame once per singleton ID.
    unshared = df.drop_duplicates(subset='ID', keep=False)
    return Counter(unshared['Name'].tolist())
if __name__ == '__main__':
    # Demo: build the chord-diagram input matrix for the example table.
    # Here we read in the data as a file buffer;
    # however, normally we would hand a file path to parse_data_format instead.
    import sys
    if sys.version_info[0] < 3:
        from StringIO import StringIO
    else:
        from io import StringIO

    data = StringIO(
"""UID | Name | ID
----------------
1 | ABC | IM-1
2 | XYZ | IM-2
3 | XYZ | IM-2
4 | PQR | IM-3
5 | PQR | IM-4
6 | PQR | IM-5
7 | XYZ | IM-5
8 | ABC | IM-5
"""
    )

    df = parse_data_format(data)
    edges = get_edges(df)
    adjacency, order = get_adjacency(edges)

    print(adjacency)
    # The two XYZ rows sharing IM-2 give the XYZ-XYZ entry a value of 2.
    # [[0. 1. 1.]
    #  [1. 0. 1.]
    #  [1. 1. 2.]]
    print(order)
    # ['ABC' 'PQR' 'XYZ']

    dangling_edge_counts = get_dangling_edge_counts(df)
    print(dangling_edge_counts)
    # Counter({'PQR': 2, 'ABC': 1})

    # One dangling count per name, aligned with `order`. A Counter yields 0
    # for names it does not contain. The builtin `int` is used as dtype
    # because the `np.int` alias has been removed from NumPy.
    last_column = np.array(
        [dangling_edge_counts[name] for name in order], dtype=int
    )

    # Append the dangling counts as an extra (fourth) column.
    combined = np.concatenate([adjacency, last_column[:, np.newaxis]], axis=-1)
    print(combined)
    # [[0. 1. 1. 1.]
    #  [1. 0. 1. 2.]
    #  [1. 1. 2. 0.]]
假设我有一个数据框,其中包含以下格式的数据。
UID | Name | ID
----------------
1 | ABC | IM-1
2 | XYZ | IM-2
3 | XYZ | IM-2
4 | PQR | IM-3
5 | PQR | IM-4
6 | PQR | IM-5
7 | XYZ | IM-5
8 | ABC | IM-5
我需要创建一个输入和弦图代码的矩阵。需要以下格式的输出:
(array([[0,1,1,1],
[1,1,1,0],
[1,1,0,2]]),['ABC','XYZ','PQR'])
注意:在这个例子中,
- "Name" 在列表中是有限的(即 ABC、XYZ 或 PQR)
- "ID" 在记录之间共享
- 第四列是独立记录的数量(例如 ABC 只属于单个记录 IM-1,而 PQR 出现两次:IM-4 和 IM-5)
- 矩阵的其他成员是基于ID的名称之间的联系(例如IM-5,增加PQR-XYZ的值, XYZ-PQR, PQR-ABC,ABC-PQR,XYZ-ABC & ABC-XYZ)
- 目标是为 "Name" 字段之间的连接创建一个和弦图
我知道这篇内容读起来有些长。预先感谢您的帮助。
更新了我的答案,但方法基本相同:先将数据解析为数据框,在 ID 上执行 inner join,得到通过共享同一 ID 相连的名称对;然后把这个边列表转换成邻接矩阵。最后,为了得到 "dangling"(悬空)边(更新后的答案中新增的部分),找出只出现一次的 ID,并按对应的 Name 分组统计其数量。
#!/usr/bin/env python
"""
Create adjacency matrix from a dataframe, where edges are implicitly defined by shared attributes.
Answer to:
"""
import numpy as np
import pandas as pd
from collections import Counter
def parse_data_format(file_path):
    """Parse a pipe-delimited text table into a whitespace-free DataFrame.

    The expected layout is a header row, one separator row (skipped), and
    then the data rows, e.g.::

        UID | Name | ID
        ----------------
        1 | ABC | IM-1

    Parameters
    ----------
    file_path : str, path object or file-like object
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with surrounding whitespace stripped from the
        column names and from the values of every text column.
    """
    # Row index 1 is the "-----" rule under the header, not data.
    df = pd.read_csv(file_path, sep='|', skiprows=[1])
    # The padding around each '|' ends up in the column names ...
    df = df.rename(columns=lambda x: x.strip())
    # ... and in the text values. Check for dedicated string dtypes as well
    # as plain object columns: selecting on `object` alone leaves
    # string-dtype columns padded, which would break matching on 'ID'.
    for column in df.columns:
        values = df[column]
        if (pd.api.types.is_object_dtype(values)
                or pd.api.types.is_string_dtype(values)):
            df[column] = values.str.strip()
    return df
def get_edges(df):
    """Get all ordered pairs of 'Name' whose rows share an 'ID' value.

    The pairs are produced by an inner self-join on 'ID'. A row is never
    paired with itself, but two *different* rows carrying the same name are
    paired (yielding an edge from a name to itself), and every connection
    appears in both directions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ID' and 'Name'.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_edges, 2)`` holding ``(source, target)`` names.
    """
    # Tag each row with its position so self-pairs can be recognised without
    # relying on a separate, unique 'UID' column being present.
    tagged = df[['ID', 'Name']].assign(_row=range(len(df)))
    inner_self_join = tagged.merge(tagged, how='inner', on='ID')
    # Drop the trivial matches of a row with itself.
    is_distinct_row = inner_self_join['_row_x'] != inner_self_join['_row_y']
    edges = inner_self_join.loc[is_distinct_row, ['Name_x', 'Name_y']].values
    return edges
def get_adjacency(edges, order=None):
    """Convert a list of (source, target) pairs into an adjacency matrix.

    Parameters
    ----------
    edges : iterable of 2-sequences
        Each item names the source and the target of one connection.
        Repeated pairs are counted, so the matrix holds multiplicities.
    order : sequence, optional
        Node labels in the desired row/column order. Supplying it makes it
        possible to include nodes that take part in no edge at all. When
        omitted, the sorted unique labels found in ``edges`` are used.

    Returns
    -------
    adjacency : numpy.ndarray
        Float array of shape ``(n, n)``; entry ``[i, j]`` is the number of
        edges from ``order[i]`` to ``order[j]``.
    order : numpy.ndarray
        The node labels corresponding to the rows/columns of ``adjacency``.

    Raises
    ------
    KeyError
        If an explicit ``order`` is given and an edge mentions a label that
        is not part of it.
    """
    if order is None:
        order = np.unique(edges)
    else:
        order = np.asarray(order)
    total_names = len(order)
    name_to_idx = {name: idx for idx, name in enumerate(order)}
    adjacency = np.zeros((total_names, total_names))
    for source, target in edges:
        adjacency[name_to_idx[source], name_to_idx[target]] += 1
    return adjacency, order
def get_dangling_edge_counts(df):
    """Count, per 'Name', the rows whose 'ID' is not shared with any other row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'ID' and 'Name'.

    Returns
    -------
    collections.Counter
        Maps each name to the number of its rows that carry an 'ID' occurring
        exactly once in ``df``. Names without such rows are absent.
    """
    # keep=False discards *every* row of a repeated ID, so only rows with a
    # one-off ID survive. This is a single pass instead of re-filtering the
    # whole frame once per singleton ID.
    unshared = df.drop_duplicates(subset='ID', keep=False)
    return Counter(unshared['Name'].tolist())
if __name__ == '__main__':
    # Demo: build the chord-diagram input matrix for the example table.
    # Here we read in the data as a file buffer;
    # however, normally we would hand a file path to parse_data_format instead.
    import sys
    if sys.version_info[0] < 3:
        from StringIO import StringIO
    else:
        from io import StringIO

    data = StringIO(
"""UID | Name | ID
----------------
1 | ABC | IM-1
2 | XYZ | IM-2
3 | XYZ | IM-2
4 | PQR | IM-3
5 | PQR | IM-4
6 | PQR | IM-5
7 | XYZ | IM-5
8 | ABC | IM-5
"""
    )

    df = parse_data_format(data)
    edges = get_edges(df)
    adjacency, order = get_adjacency(edges)

    print(adjacency)
    # The two XYZ rows sharing IM-2 give the XYZ-XYZ entry a value of 2.
    # [[0. 1. 1.]
    #  [1. 0. 1.]
    #  [1. 1. 2.]]
    print(order)
    # ['ABC' 'PQR' 'XYZ']

    dangling_edge_counts = get_dangling_edge_counts(df)
    print(dangling_edge_counts)
    # Counter({'PQR': 2, 'ABC': 1})

    # One dangling count per name, aligned with `order`. A Counter yields 0
    # for names it does not contain. The builtin `int` is used as dtype
    # because the `np.int` alias has been removed from NumPy.
    last_column = np.array(
        [dangling_edge_counts[name] for name in order], dtype=int
    )

    # Append the dangling counts as an extra (fourth) column.
    combined = np.concatenate([adjacency, last_column[:, np.newaxis]], axis=-1)
    print(combined)
    # [[0. 1. 1. 1.]
    #  [1. 0. 1. 2.]
    #  [1. 1. 2. 0.]]