如何根据列 dtype 创建不同的图组
How to create groups of different plots based on column dtype
我有一个数据框
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
# Sample data: object-dtype columns ('Gen', 'Site', 'Type', 'color'), an
# integer 'UID' column and four float 'Time*' columns, 16 rows each.
df = pd.DataFrame(
    {
        'Gen': ['M','M','M','M','F','F','F','F','M','M','M','M','F','F','F','F'],
        'Site': ['FRX','FX','FRX','FRX','FRX','FX','FRX','FX','FX','FX','FX','FRX','FRX','FRX','FRX','FRX'],
        'Type': ['L','L','L','L','L','L','L','L','R','R','R','R','R','R','R','R'],
        'UID': [1001,1002,1003,1004,1001,1002,1003,1004,1001,1002,1003,1004,1001,1002,1003,1004],
        'color': ['R','R','G','G','B','G','B','B','R','G','R','G','B','B','R','G'],
        'Time2': [150.78,162.34,188.53,197.69,208.07,217.76,229.48,139.51,146.87,182.54,189.57,199.97,229.28,244.73,269.91,249.19],
        'Time3': [250.78,262.34,288.53,297.69,308.07,317.7,329.81,339.15,346.87,382.54,369.59,399.97,329.28,347.73,369.91,349.12],
        'Time4': [240.18,232.14,258.53,276.69,338.07,307.74,359.16,339.25,365.87,392.48,399.97,410.75,429.08,448.39,465.15,469.33],
        'Time5': [270.84,282.14,298.53,306.69,318.73,327.47,369.63,389.59,398.75,432.18,449.78,473.55,494.85,509.39,515.52,539.23],
    },
    # Explicit column order for the resulting frame.
    columns=['Gen', 'Site', 'Type', 'UID', 'color', 'Time2', 'Time3', 'Time4', 'Time5'],
)
# Print the per-column dtype / non-null summary.
df.info()
我想编写一个接受 dataframe
并执行以下操作的函数:
- `countplots`:用于具有 `object` dtype 的列(`Gen`、`Site`、`Type` 和 `color` 列,共 4 个计数图)
- `boxplot`:用于具有 `float` dtype 的列(`Time2`、……、`Time5` 列,共 4 个箱线图)
将图表导出为 pdf 文件 - 每页两张图表
我的尝试:
# I am open to other approaches
def data_explorer(data, plots_per_page=2, filename='data_exploration.pdf'):
    """Plot the object and float columns of ``data`` into one multi-page PDF.

    Object-dtype columns are drawn as count plots and float-dtype columns as
    box plots. Columns of any other dtype are skipped with a printed notice.
    Plots are grouped by kind and laid out ``plots_per_page`` to a page, so
    every plot ends up in the PDF instead of all being drawn on one axes.

    Args:
        data: pandas DataFrame to explore.
        plots_per_page: Number of stacked plots on each PDF page.
        filename: Path of the PDF file to write.

    Raises:
        ValueError: If ``plots_per_page`` is smaller than 1.
    """
    if plots_per_page < 1:
        raise ValueError("plots_per_page must be at least 1")

    count_cols = []
    box_cols = []
    for col in data.columns:
        # 1. countplots for columns with the object dtype
        if data[col].dtype == 'object':
            count_cols.append(col)
        # 2. boxplots for columns with the float dtype
        elif data[col].dtype == 'float':
            box_cols.append(col)
        else:
            print("skip integer dtype")

    # 3. save the graphs as pdf - one page per group of plots_per_page plots
    with PdfPages(filename) as pdf:
        for plot, columns in ((sns.countplot, count_cols), (sns.boxplot, box_cols)):
            for start in range(0, len(columns), plots_per_page):
                page_cols = columns[start:start + plots_per_page]
                # squeeze=False keeps `axes` a 2-D array even for one plot.
                fig, axes = plt.subplots(
                    plots_per_page, 1, figsize=(15, 30), squeeze=False
                )
                axes = axes.ravel()
                for col, ax in zip(page_cols, axes):
                    plot(x=col, data=data, ax=ax)
                # Hide the axes left over when the last page is not full.
                for ax in axes[len(page_cols):]:
                    ax.set_visible(False)
                pdf.savefig(fig)
                # Release the figure so many pages do not pile up in memory.
                plt.close(fig)
请注意:最终输出一共需要8张图
- 主要问题是图应该作为一组保存在图中,而不是每列单独保存。
- 根据需要调整
figsize=(15, 30)
。
选项 1:共 4 个图形(figure),每个图形(每页)包含 2 个子图
- 使用 `.select_dtypes` 按 dtype 选择数据框的列
- 使用列表推导式,按每页的绘图数量将列分成若干块;可根据需要调整块大小 `n`。
- 遍历每组列
- 创建一个行数等于每页图数的图
- 将绘图添加到图中并保存图
def data_explorer(df, n=2):
    """Save count plots and box plots of ``df`` as PDFs, ``n`` plots per file.

    Object-dtype columns are drawn with ``sns.countplot`` and float-dtype
    columns with ``sns.boxplot``. The columns of each kind are split into
    groups of ``n``; each group becomes one figure with ``n`` stacked axes,
    saved as ``data_exploration_<col1>_<col2>....pdf``.

    Args:
        df: pandas DataFrame to explore.
        n: Number of plots per figure (page).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # get object and float data
    dobj = df.select_dtypes(include=['object'])
    dflo = df.select_dtypes(include=['float'])

    for frame, plot in ((dobj, sns.countplot), (dflo, sns.boxplot)):
        # split columns into groups of n; n being the plots per page
        groups = [frame.columns[i:i + n] for i in range(0, len(frame.columns), n)]
        for cols in groups:  # iterate through each group
            # squeeze=False keeps `axes` an array even when n == 1.
            fig, axes = plt.subplots(n, 1, figsize=(15, 30), squeeze=False)
            axes = axes.ravel()
            for col, ax in zip(cols, axes):
                plot(data=frame[[col]], x=col, ax=ax)
            # Hide axes left empty when the last group has fewer than n columns.
            for ax in axes[len(cols):]:
                ax.set_visible(False)
            # map(str, ...) keeps the file name valid for non-string labels.
            fig.savefig(f'data_exploration_{"_".join(map(str, cols))}.pdf')
            # Close the figure so repeated calls do not accumulate open figures.
            plt.close(fig)


data_explorer(df)
选项 2:共 2 个图形(figure),每个图形(每页)包含 4 个子图
- 使用 `.select_dtypes` 按 dtype 选择数据框的列
- 创建一个图表以匹配每页的绘图数,等于每组的总列数。
- 将每组列添加到图表中,并保存图表。
def data_explorer(df):
    """Save one PDF of count plots and one PDF of box plots for ``df``.

    Object-dtype columns are drawn with ``sns.countplot`` and float-dtype
    columns with ``sns.boxplot``. Each kind gets a single figure with a
    two-column grid holding one axes per column, saved as
    ``data_exploration_<col1>_<col2>....pdf``. A kind with no columns is
    skipped and produces no file.

    Args:
        df: pandas DataFrame to explore.
    """
    # get object and float data
    dobj = df.select_dtypes(include=['object'])
    dflo = df.select_dtypes(include=['float'])

    ncols = 2
    for frame, plot in ((dobj, sns.countplot), (dflo, sns.boxplot)):
        n_plots = len(frame.columns)
        if n_plots == 0:
            continue
        # Ceiling division: enough rows so no column is silently dropped.
        nrows = -(-n_plots // ncols)
        # 15 inches of height per row reproduces (20, 30) for a 2x2 grid.
        fig, axes = plt.subplots(
            nrows, ncols, figsize=(20, 15 * nrows), squeeze=False
        )
        axes = axes.ravel()
        for col, ax in zip(frame.columns, axes):
            plot(data=frame[[col]], x=col, ax=ax)
        # Hide the spare axes when the column count is odd.
        for ax in axes[n_plots:]:
            ax.set_visible(False)
        # map(str, ...) keeps the file name valid for non-string labels.
        fig.savefig(f'data_exploration_{"_".join(map(str, frame.columns))}.pdf')
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


data_explorer(df)
我有一个数据框
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
# Sample data: object-dtype columns ('Gen', 'Site', 'Type', 'color'), an
# integer 'UID' column and four float 'Time*' columns, 16 rows each.
df = pd.DataFrame(
    {
        'Gen': ['M','M','M','M','F','F','F','F','M','M','M','M','F','F','F','F'],
        'Site': ['FRX','FX','FRX','FRX','FRX','FX','FRX','FX','FX','FX','FX','FRX','FRX','FRX','FRX','FRX'],
        'Type': ['L','L','L','L','L','L','L','L','R','R','R','R','R','R','R','R'],
        'UID': [1001,1002,1003,1004,1001,1002,1003,1004,1001,1002,1003,1004,1001,1002,1003,1004],
        'color': ['R','R','G','G','B','G','B','B','R','G','R','G','B','B','R','G'],
        'Time2': [150.78,162.34,188.53,197.69,208.07,217.76,229.48,139.51,146.87,182.54,189.57,199.97,229.28,244.73,269.91,249.19],
        'Time3': [250.78,262.34,288.53,297.69,308.07,317.7,329.81,339.15,346.87,382.54,369.59,399.97,329.28,347.73,369.91,349.12],
        'Time4': [240.18,232.14,258.53,276.69,338.07,307.74,359.16,339.25,365.87,392.48,399.97,410.75,429.08,448.39,465.15,469.33],
        'Time5': [270.84,282.14,298.53,306.69,318.73,327.47,369.63,389.59,398.75,432.18,449.78,473.55,494.85,509.39,515.52,539.23],
    },
    # Explicit column order for the resulting frame.
    columns=['Gen', 'Site', 'Type', 'UID', 'color', 'Time2', 'Time3', 'Time4', 'Time5'],
)
# Print the per-column dtype / non-null summary.
df.info()
我想编写一个接受 dataframe
并执行以下操作的函数:
- `countplots`:用于具有 `object` dtype 的列(`Gen`、`Site`、`Type` 和 `color` 列,共 4 个计数图)
- `boxplot`:用于具有 `float` dtype 的列(`Time2`、……、`Time5` 列,共 4 个箱线图)
- 将图表导出为 pdf 文件 - 每页两张图表
我的尝试:
# I am open to other approaches
def data_explorer(data, plots_per_page=2, filename='data_exploration.pdf'):
    """Plot the object and float columns of ``data`` into one multi-page PDF.

    Object-dtype columns are drawn as count plots and float-dtype columns as
    box plots. Columns of any other dtype are skipped with a printed notice.
    Plots are grouped by kind and laid out ``plots_per_page`` to a page, so
    every plot ends up in the PDF instead of all being drawn on one axes.

    Args:
        data: pandas DataFrame to explore.
        plots_per_page: Number of stacked plots on each PDF page.
        filename: Path of the PDF file to write.

    Raises:
        ValueError: If ``plots_per_page`` is smaller than 1.
    """
    if plots_per_page < 1:
        raise ValueError("plots_per_page must be at least 1")

    count_cols = []
    box_cols = []
    for col in data.columns:
        # 1. countplots for columns with the object dtype
        if data[col].dtype == 'object':
            count_cols.append(col)
        # 2. boxplots for columns with the float dtype
        elif data[col].dtype == 'float':
            box_cols.append(col)
        else:
            print("skip integer dtype")

    # 3. save the graphs as pdf - one page per group of plots_per_page plots
    with PdfPages(filename) as pdf:
        for plot, columns in ((sns.countplot, count_cols), (sns.boxplot, box_cols)):
            for start in range(0, len(columns), plots_per_page):
                page_cols = columns[start:start + plots_per_page]
                # squeeze=False keeps `axes` a 2-D array even for one plot.
                fig, axes = plt.subplots(
                    plots_per_page, 1, figsize=(15, 30), squeeze=False
                )
                axes = axes.ravel()
                for col, ax in zip(page_cols, axes):
                    plot(x=col, data=data, ax=ax)
                # Hide the axes left over when the last page is not full.
                for ax in axes[len(page_cols):]:
                    ax.set_visible(False)
                pdf.savefig(fig)
                # Release the figure so many pages do not pile up in memory.
                plt.close(fig)
请注意:最终输出一共需要8张图
- 主要问题是图应该作为一组保存在图中,而不是每列单独保存。
- 根据需要调整
figsize=(15, 30)
。
选项 1:共 4 个图形(figure),每个图形(每页)包含 2 个子图
- 使用 `.select_dtypes` 按 dtype 选择数据框的列
- 使用列表推导式,按每页的绘图数量将列分成若干块;可根据需要调整块大小 `n`。
- 遍历每组列
- 创建一个行数等于每页图数的图
- 将绘图添加到图中并保存图
def data_explorer(df, n=2):
    """Save count plots and box plots of ``df`` as PDFs, ``n`` plots per file.

    Object-dtype columns are drawn with ``sns.countplot`` and float-dtype
    columns with ``sns.boxplot``. The columns of each kind are split into
    groups of ``n``; each group becomes one figure with ``n`` stacked axes,
    saved as ``data_exploration_<col1>_<col2>....pdf``.

    Args:
        df: pandas DataFrame to explore.
        n: Number of plots per figure (page).

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")

    # get object and float data
    dobj = df.select_dtypes(include=['object'])
    dflo = df.select_dtypes(include=['float'])

    for frame, plot in ((dobj, sns.countplot), (dflo, sns.boxplot)):
        # split columns into groups of n; n being the plots per page
        groups = [frame.columns[i:i + n] for i in range(0, len(frame.columns), n)]
        for cols in groups:  # iterate through each group
            # squeeze=False keeps `axes` an array even when n == 1.
            fig, axes = plt.subplots(n, 1, figsize=(15, 30), squeeze=False)
            axes = axes.ravel()
            for col, ax in zip(cols, axes):
                plot(data=frame[[col]], x=col, ax=ax)
            # Hide axes left empty when the last group has fewer than n columns.
            for ax in axes[len(cols):]:
                ax.set_visible(False)
            # map(str, ...) keeps the file name valid for non-string labels.
            fig.savefig(f'data_exploration_{"_".join(map(str, cols))}.pdf')
            # Close the figure so repeated calls do not accumulate open figures.
            plt.close(fig)


data_explorer(df)
选项 2:共 2 个图形(figure),每个图形(每页)包含 4 个子图
- 使用 `.select_dtypes` 按 dtype 选择数据框的列
- 创建一个图表以匹配每页的绘图数,等于每组的总列数。
- 将每组列添加到图表中,并保存图表。
def data_explorer(df):
    """Save one PDF of count plots and one PDF of box plots for ``df``.

    Object-dtype columns are drawn with ``sns.countplot`` and float-dtype
    columns with ``sns.boxplot``. Each kind gets a single figure with a
    two-column grid holding one axes per column, saved as
    ``data_exploration_<col1>_<col2>....pdf``. A kind with no columns is
    skipped and produces no file.

    Args:
        df: pandas DataFrame to explore.
    """
    # get object and float data
    dobj = df.select_dtypes(include=['object'])
    dflo = df.select_dtypes(include=['float'])

    ncols = 2
    for frame, plot in ((dobj, sns.countplot), (dflo, sns.boxplot)):
        n_plots = len(frame.columns)
        if n_plots == 0:
            continue
        # Ceiling division: enough rows so no column is silently dropped.
        nrows = -(-n_plots // ncols)
        # 15 inches of height per row reproduces (20, 30) for a 2x2 grid.
        fig, axes = plt.subplots(
            nrows, ncols, figsize=(20, 15 * nrows), squeeze=False
        )
        axes = axes.ravel()
        for col, ax in zip(frame.columns, axes):
            plot(data=frame[[col]], x=col, ax=ax)
        # Hide the spare axes when the column count is odd.
        for ax in axes[n_plots:]:
            ax.set_visible(False)
        # map(str, ...) keeps the file name valid for non-string labels.
        fig.savefig(f'data_exploration_{"_".join(map(str, frame.columns))}.pdf')
        # Close the figure so repeated calls do not accumulate open figures.
        plt.close(fig)


data_explorer(df)