如何根据列 dtype 创建不同的图组

How to create groups of different plots based on column dtype

我有一个数据框

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np
# Sample data: three categorical (object) columns plus `color`, one integer
# id column (`UID`) and four float measurement columns (`Time2`..`Time5`).
df = pd.DataFrame(
    {
        'Gen': ['M'] * 4 + ['F'] * 4 + ['M'] * 4 + ['F'] * 4,
        'Site': [
            'FRX', 'FX', 'FRX', 'FRX', 'FRX', 'FX', 'FRX', 'FX',
            'FX', 'FX', 'FX', 'FRX', 'FRX', 'FRX', 'FRX', 'FRX',
        ],
        'Type': ['L'] * 8 + ['R'] * 8,
        'UID': [1001, 1002, 1003, 1004] * 4,
        'color': [
            'R', 'R', 'G', 'G', 'B', 'G', 'B', 'B',
            'R', 'G', 'R', 'G', 'B', 'B', 'R', 'G',
        ],
        'Time2': [
            150.78, 162.34, 188.53, 197.69, 208.07, 217.76, 229.48, 139.51,
            146.87, 182.54, 189.57, 199.97, 229.28, 244.73, 269.91, 249.19,
        ],
        'Time3': [
            250.78, 262.34, 288.53, 297.69, 308.07, 317.7, 329.81, 339.15,
            346.87, 382.54, 369.59, 399.97, 329.28, 347.73, 369.91, 349.12,
        ],
        'Time4': [
            240.18, 232.14, 258.53, 276.69, 338.07, 307.74, 359.16, 339.25,
            365.87, 392.48, 399.97, 410.75, 429.08, 448.39, 465.15, 469.33,
        ],
        'Time5': [
            270.84, 282.14, 298.53, 306.69, 318.73, 327.47, 369.63, 389.59,
            398.75, 432.18, 449.78, 473.55, 494.85, 509.39, 515.52, 539.23,
        ],
    },
    # explicit column order, identical to the key order above
    columns=[
        'Gen', 'Site', 'Type', 'UID', 'color',
        'Time2', 'Time3', 'Time4', 'Time5',
    ],
)
# print the per-column dtype / non-null summary
df.info()

我想编写一个接受 dataframe 并执行以下操作的函数:

  1. 对具有 object dtype 的列绘制 countplot(Gen、Site、Type 和 color 列,共 4 个计数图)

  2. boxplot 用于具有 float dtype 的列(4 个箱线图用于 Time2,....,Time5 列)

  3. 将图表导出为 pdf 文件 - 每页两张图表

我的尝试:

# I am open to other approaches
def data_explorer(data):
    """Draw one seaborn plot per column of ``data``, chosen by column dtype.

    Columns whose dtype compares equal to ``'object'`` are passed to
    ``sns.countplot``; columns whose dtype compares equal to ``'float'`` are
    passed to ``sns.boxplot``; every other column only prints a message.

    NOTE(review): no figure or axes is created per column, and
    ``plt.savefig`` runs on every loop iteration with the same file name.
    Presumably each save overwrites the previous one and the plots pile up
    on whatever figure is current -- confirm against the intended
    "several plots per page" output.
    """
    for col in data.columns:
        # 1. count plot for columns with the object dtype
        if data[col].dtype == 'object':
            sns.countplot(x = col, data = data)
        # 2. box plot for columns with the float dtype
        elif data[col].dtype == 'float':
            sns.boxplot(data[col])
            
        else:
            # reached for any dtype that is neither object nor float,
            # although the message only mentions integers
            print("skip integer dtype")
        # 3. intended: save the graphs as pdf, 4 graphs per page.
        # As written this is a single savefig call to a fixed path,
        # executed once per column (it sits inside the for loop).
       
        plt.savefig('data_exploration.pdf')


请注意:最终输出一共需要8张图

  • 主要问题是这些绘图应该成组地保存在同一个 figure 中,而不是每列单独保存。
  • 根据需要调整figsize=(15, 30)

选项 1:4 个 figure,每页 2 个绘图

  1. 使用 .select_dtypes 按 dtype 选择数据框的列
  2. 使用列表推导式,根据每页的绘图数量将列分成若干块。根据需要调整块大小 n
  3. 遍历每组列
  4. 创建一个行数等于每页图数的图
  5. 将绘图添加到图中并保存图
def data_explorer(df, n=2):
    """Save count plots and box plots of ``df`` as PDF files, ``n`` plots per page.

    Columns with the ``object`` dtype are drawn with ``sns.countplot`` and
    columns with the ``float`` dtype with ``sns.boxplot``. Columns of any
    other dtype are ignored. Within each dtype group the columns are split
    into chunks of ``n``; every chunk becomes one single-page figure with
    ``n`` rows, saved as ``data_exploration_<col1>_<col2>....pdf`` in the
    current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to explore.
    n : int, default 2
        Number of plots (subplot rows) per page. Must be at least 1.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")

    # (columns of one dtype, seaborn function used to draw them)
    groups = (
        (df.select_dtypes(include=['object']), sns.countplot),
        (df.select_dtypes(include=['float']), sns.boxplot),
    )

    for frame, plot in groups:
        # split the column labels into chunks of n; n is the plots per page
        chunks = [frame.columns[i:i + n] for i in range(0, len(frame.columns), n)]

        for cols in chunks:
            # squeeze=False always yields a 2-D array of axes, so n == 1
            # works too; the height scales with n (30 for the default n=2)
            fig, axes = plt.subplots(n, 1, figsize=(15, 15 * n), squeeze=False)
            axes = axes.ravel()

            for col, ax in zip(cols, axes):
                plot(data=frame[[col]], x=col, ax=ax)

            # a short final chunk leaves trailing axes unused; hide them
            # instead of printing empty frames on the page
            for ax in axes[len(cols):]:
                ax.set_visible(False)

            # str() keeps the file name valid for non-string column labels
            fig.savefig(f'data_exploration_{"_".join(map(str, cols))}.pdf')
            # pyplot keeps every figure alive until it is closed explicitly
            plt.close(fig)


data_explorer(df)

选项 2:2 个 figure,每页 4 个绘图

  1. 使用 .select_dtypes 按 dtype 选择数据框的列
  2. 创建一个图表以匹配每页的绘图数,等于每组的总列数。
  3. 将每组列添加到图表中,并保存图表。
def data_explorer(df):
    """Save one PDF page of count plots and one PDF page of box plots for ``df``.

    All ``object`` dtype columns are drawn with ``sns.countplot`` on one
    figure and all ``float`` dtype columns with ``sns.boxplot`` on another.
    Columns of any other dtype are ignored. Each figure is a two-column grid
    with as many rows as needed to hold every column of its group (2 x 2 for
    four columns), saved as ``data_exploration_<col1>_<col2>....pdf`` in the
    current working directory. A group with no columns produces no file.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to explore.
    """
    grid_cols = 2

    # (columns of one dtype, seaborn function used to draw them)
    groups = (
        (df.select_dtypes(include=['object']), sns.countplot),
        (df.select_dtypes(include=['float']), sns.boxplot),
    )

    for frame, plot in groups:
        names = frame.columns
        if len(names) == 0:
            # nothing to draw for this dtype; a zero-row grid is not valid
            continue

        # ceiling division: enough rows to hold every column of the group
        grid_rows = -(-len(names) // grid_cols)

        # squeeze=False always yields a 2-D array of axes, even for one row;
        # 15 units of height per row gives the (20, 30) size for a 2 x 2 grid
        fig, axes = plt.subplots(
            grid_rows, grid_cols, figsize=(20, 15 * grid_rows), squeeze=False
        )
        axes = axes.ravel()

        for col, ax in zip(names, axes):
            plot(data=frame[[col]], x=col, ax=ax)

        # an odd number of columns leaves the last cell unused; hide it
        for ax in axes[len(names):]:
            ax.set_visible(False)

        # str() keeps the file name valid for non-string column labels
        fig.savefig(f'data_exploration_{"_".join(map(str, names))}.pdf')
        # pyplot keeps every figure alive until it is closed explicitly
        plt.close(fig)


data_explorer(df)