如何用 Pandas 数据框绘制散点图,将值列表按组聚类
Pandas data frame how to make a scatter plot for clustering a list of values into a set of groups
我有一个包含 2 列的 pandas DataFrame:
target drugs
0 ACE2 gene [angiotensin II,rosiglitazone,irbesartan,valsar]
1 Elastases [heparin,prednisolone,montelukast,formoterol]
2 MAPK14 protein [oxaprozin,nilotinib,imatinib,tocilizumab]
3 TMPRSS2 gene [enzalutamide,camostat]
4 Toll-like receptors [rituximab,atorvastatin,artesunate,tazarotene]
5 Ubiquitin [sunitinib,lapatinib,atorvastatin,edaravone]
6 ezrin [erlotinib,crizotinib,sorafenib,everolimus]
我想绘制一张按目标(target)对药物进行聚类的图,因此会有 7 个聚类(对应 7 个目标),但我不知道该怎么做。
这是 df:
import pandas as pd
# Drugs associated with each target, keyed by target name.  Dict insertion
# order becomes the row order of the resulting frame.
drugs_by_target = {
    'ACE2 gene': ['angiotensin II', 'rosiglitazone', 'irbesartan'],
    'Elastases': ['heparin', 'prednisolone', 'montelukast', 'formoterol'],
    'MAPK14 protein, human': ['oxaprozin', 'nilotinib', 'imatinib', 'tocilizumab'],
    'TMPRSS2 gene': ['enzalutamide', 'camostat'],
    'Toll-like receptors': ['rituximab', 'atorvastatin', 'artesunate', 'tazarotene'],
    'Ubiquitin': ['sunitinib', 'lapatinib', 'atorvastatin', 'edaravone'],
    'ezrin': ['erlotinib', 'crizotinib', 'sorafenib', 'everolimus'],
}
# Two parallel columns: the target names and, per target, its list of drugs.
data = {
    'target': list(drugs_by_target),
    'drugs': list(drugs_by_target.values()),
}
df = pd.DataFrame(data)
IIUC(如果我理解正确),您可以像下面这样使用 networkX:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from itertools import chain
from networkx.drawing.nx_pydot import graphviz_layout
# Wide-form input: one row per target, with that target's drugs held in a list.
df = pd.DataFrame(
    data={
        'target': [
            'ACE2 gene',
            'Elastases',
            'MAPK14 protein, human',
            'TMPRSS2 gene',
            'Toll-like receptors',
            'Ubiquitin',
            'ezrin',
        ],
        'drugs': [
            ['angiotensin II', 'rosiglitazone', 'irbesartan'],
            ['heparin', 'prednisolone', 'montelukast', 'formoterol'],
            ['oxaprozin', 'nilotinib', 'imatinib', 'tocilizumab'],
            ['enzalutamide', 'camostat'],
            ['rituximab', 'atorvastatin', 'artesunate', 'tazarotene'],
            ['sunitinib', 'lapatinib', 'atorvastatin', 'edaravone'],
            ['erlotinib', 'crizotinib', 'sorafenib', 'everolimus'],
        ],
    }
)
# Reshape to long form: one row per (target, drug) pair, columns
# ['target', 'drug'].  explode() expands each row's own list, so the result
# stays correct even if a target name appears on more than one row.
df = (
    df.explode('drugs')
    .rename(columns={'drugs': 'drug'})
    # explode() turns an empty list into a single NaN row; drop those so a
    # target without drugs contributes no rows.
    .dropna(subset=['drug'])
    .reset_index(drop=True)
)
# Directed graph with one edge from each target to each of its drugs.
g = nx.DiGraph()
# add_edges_from() creates any endpoint node that is not in the graph yet, so
# no separate node-adding pass is needed.  A drug listed under several targets
# becomes a single node with several incoming edges.
g.add_edges_from(df[['target', 'drug']].itertuples(index=False, name=None))

# Node positions come from Graphviz's "dot" program (needs Graphviz and pydot
# installed).
pos = graphviz_layout(g, prog="dot")
fig, axe = plt.subplots(figsize=(40, 10))
nx.draw(g, pos, with_labels=True, node_shape='s', ax=axe)
plt.show()
输出:
你也可以像下面这样用 seaborn 绘制散点图(scatterplot):
(因为你在另一个答案的评论中说使用那个答案时遇到了问题,所以我另外发一个采用不同方法的答案。)
import matplotlib.pyplot as plt
import pandas as pd
from itertools import chain
import seaborn as sns
# Wide-form input: one row per target, with that target's drugs held in a list.
df = pd.DataFrame(
    data={
        'target': [
            'ACE2 gene',
            'Elastases',
            'MAPK14 protein, human',
            'TMPRSS2 gene',
            'Toll-like receptors',
            'Ubiquitin',
            'ezrin',
        ],
        'drugs': [
            ['angiotensin II', 'rosiglitazone', 'irbesartan'],
            ['heparin', 'prednisolone', 'montelukast', 'formoterol'],
            ['oxaprozin', 'nilotinib', 'imatinib', 'tocilizumab'],
            ['enzalutamide', 'camostat'],
            ['rituximab', 'atorvastatin', 'artesunate', 'tazarotene'],
            ['sunitinib', 'lapatinib', 'atorvastatin', 'edaravone'],
            ['erlotinib', 'crizotinib', 'sorafenib', 'everolimus'],
        ],
    }
)
# Reshape to long form: one row per (target, drug) pair, columns
# ['target', 'drug'].  explode() expands each row's own list, so the result
# stays correct even if a target name appears on more than one row.
df = (
    df.explode('drugs')
    .rename(columns={'drugs': 'drug'})
    # explode() turns an empty list into a single NaN row; drop those so a
    # target without drugs contributes no rows.
    .dropna(subset=['drug'])
    .reset_index(drop=True)
)
# 1-based running number per row; used as the x coordinate so every drug gets
# its own horizontal slot in the scatter plot.
df['unq_id'] = df.index + 1
fig, axe = plt.subplots(figsize=(20, 10))
# Hide the axes; the legend and the per-point labels carry the information.
axe.axis('off')
# One marker per drug: x is the running id, y and colour are the target, so
# each target's drugs line up in a single coloured row.
sns.scatterplot(data=df, x="unq_id", y="target", hue="target", ax=axe, s=1000)
# Write each drug name next to its marker, nudged slightly to the left.
for x, target, drug in zip(df['unq_id'], df['target'], df['drug']):
    axe.text(x - 0.2, target, drug, rotation=45, size=18)
legend = axe.get_legend()
plt.setp(legend.get_texts(), fontsize='22')  # for legend text
plt.setp(legend.get_title(), fontsize='32')  # for legend title
plt.show()
输出:
我有一个包含 2 列的 pandas DataFrame:
target drugs
0 ACE2 gene [angiotensin II,rosiglitazone,irbesartan,valsar]
1 Elastases [heparin,prednisolone,montelukast,formoterol]
2 MAPK14 protein [oxaprozin,nilotinib,imatinib,tocilizumab]
3 TMPRSS2 gene [enzalutamide,camostat]
4 Toll-like receptors [rituximab,atorvastatin,artesunate,tazarotene]
5 Ubiquitin [sunitinib,lapatinib,atorvastatin,edaravone]
6 ezrin [erlotinib,crizotinib,sorafenib,everolimus]
我想绘制一张按目标(target)对药物进行聚类的图,因此会有 7 个聚类(对应 7 个目标),但我不知道该怎么做。
这是 df:
import pandas as pd
# Drugs associated with each target, keyed by target name.  Dict insertion
# order becomes the row order of the resulting frame.
drugs_by_target = {
    'ACE2 gene': ['angiotensin II', 'rosiglitazone', 'irbesartan'],
    'Elastases': ['heparin', 'prednisolone', 'montelukast', 'formoterol'],
    'MAPK14 protein, human': ['oxaprozin', 'nilotinib', 'imatinib', 'tocilizumab'],
    'TMPRSS2 gene': ['enzalutamide', 'camostat'],
    'Toll-like receptors': ['rituximab', 'atorvastatin', 'artesunate', 'tazarotene'],
    'Ubiquitin': ['sunitinib', 'lapatinib', 'atorvastatin', 'edaravone'],
    'ezrin': ['erlotinib', 'crizotinib', 'sorafenib', 'everolimus'],
}
# Two parallel columns: the target names and, per target, its list of drugs.
data = {
    'target': list(drugs_by_target),
    'drugs': list(drugs_by_target.values()),
}
df = pd.DataFrame(data)
IIUC(如果我理解正确),您可以像下面这样使用 networkX:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from itertools import chain
from networkx.drawing.nx_pydot import graphviz_layout
# Wide-form input: one row per target, with that target's drugs held in a list.
df = pd.DataFrame(
    data={
        'target': [
            'ACE2 gene',
            'Elastases',
            'MAPK14 protein, human',
            'TMPRSS2 gene',
            'Toll-like receptors',
            'Ubiquitin',
            'ezrin',
        ],
        'drugs': [
            ['angiotensin II', 'rosiglitazone', 'irbesartan'],
            ['heparin', 'prednisolone', 'montelukast', 'formoterol'],
            ['oxaprozin', 'nilotinib', 'imatinib', 'tocilizumab'],
            ['enzalutamide', 'camostat'],
            ['rituximab', 'atorvastatin', 'artesunate', 'tazarotene'],
            ['sunitinib', 'lapatinib', 'atorvastatin', 'edaravone'],
            ['erlotinib', 'crizotinib', 'sorafenib', 'everolimus'],
        ],
    }
)
# Reshape to long form: one row per (target, drug) pair, columns
# ['target', 'drug'].  explode() expands each row's own list, so the result
# stays correct even if a target name appears on more than one row.
df = (
    df.explode('drugs')
    .rename(columns={'drugs': 'drug'})
    # explode() turns an empty list into a single NaN row; drop those so a
    # target without drugs contributes no rows.
    .dropna(subset=['drug'])
    .reset_index(drop=True)
)
# Directed graph with one edge from each target to each of its drugs.
g = nx.DiGraph()
# add_edges_from() creates any endpoint node that is not in the graph yet, so
# no separate node-adding pass is needed.  A drug listed under several targets
# becomes a single node with several incoming edges.
g.add_edges_from(df[['target', 'drug']].itertuples(index=False, name=None))

# Node positions come from Graphviz's "dot" program (needs Graphviz and pydot
# installed).
pos = graphviz_layout(g, prog="dot")
fig, axe = plt.subplots(figsize=(40, 10))
nx.draw(g, pos, with_labels=True, node_shape='s', ax=axe)
plt.show()
输出:
你也可以像下面这样用 seaborn 绘制散点图(scatterplot):
(因为你在另一个答案的评论中说使用那个答案时遇到了问题,所以我另外发一个采用不同方法的答案。)
import matplotlib.pyplot as plt
import pandas as pd
from itertools import chain
import seaborn as sns
# Wide-form input: one row per target, with that target's drugs held in a list.
df = pd.DataFrame(
    data={
        'target': [
            'ACE2 gene',
            'Elastases',
            'MAPK14 protein, human',
            'TMPRSS2 gene',
            'Toll-like receptors',
            'Ubiquitin',
            'ezrin',
        ],
        'drugs': [
            ['angiotensin II', 'rosiglitazone', 'irbesartan'],
            ['heparin', 'prednisolone', 'montelukast', 'formoterol'],
            ['oxaprozin', 'nilotinib', 'imatinib', 'tocilizumab'],
            ['enzalutamide', 'camostat'],
            ['rituximab', 'atorvastatin', 'artesunate', 'tazarotene'],
            ['sunitinib', 'lapatinib', 'atorvastatin', 'edaravone'],
            ['erlotinib', 'crizotinib', 'sorafenib', 'everolimus'],
        ],
    }
)
# Reshape to long form: one row per (target, drug) pair, columns
# ['target', 'drug'].  explode() expands each row's own list, so the result
# stays correct even if a target name appears on more than one row.
df = (
    df.explode('drugs')
    .rename(columns={'drugs': 'drug'})
    # explode() turns an empty list into a single NaN row; drop those so a
    # target without drugs contributes no rows.
    .dropna(subset=['drug'])
    .reset_index(drop=True)
)
# 1-based running number per row; used as the x coordinate so every drug gets
# its own horizontal slot in the scatter plot.
df['unq_id'] = df.index + 1
fig, axe = plt.subplots(figsize=(20, 10))
# Hide the axes; the legend and the per-point labels carry the information.
axe.axis('off')
# One marker per drug: x is the running id, y and colour are the target, so
# each target's drugs line up in a single coloured row.
sns.scatterplot(data=df, x="unq_id", y="target", hue="target", ax=axe, s=1000)
# Write each drug name next to its marker, nudged slightly to the left.
for x, target, drug in zip(df['unq_id'], df['target'], df['drug']):
    axe.text(x - 0.2, target, drug, rotation=45, size=18)
legend = axe.get_legend()
plt.setp(legend.get_texts(), fontsize='22')  # for legend text
plt.setp(legend.get_title(), fontsize='32')  # for legend title
plt.show()
输出: