如何在 Python pandas 代码中按数值区间对散点图矩阵进行颜色编码?

How to use color coding for intervals for scatter plot matrix in a python pandas code?

import numpy as np
import matplotlib.pyplot as plt
import pandas
# Four columns ('a'..'d') of standard-normal noise, 1000 samples each.
samples = np.random.randn(1000, 4)
df = pandas.DataFrame(samples, columns=list('abcd'))
# Draw the pairwise scatter-plot matrix with translucent points, then show it.
pandas.tools.plotting.scatter_matrix(df, alpha=0.2)
plt.show()

是否可以查看上述结果的颜色编码形式,以便进一步分析,例如对于a列,0-50之间的值可以编码为红色,50-100绿色等等?

乍看之下,我认为这并不容易做到。

scatter_matrix方法是一种方便的方法。如果深入研究它,您会发现它允许传递一些可以轻松更改某些颜色的参数。例如,试试这个:

# Plain keywords such as c='red' are forwarded to the scatter plots, while
# hist_kwds carries the keywords meant for the diagonal histograms.
pandas.tools.plotting.scatter_matrix(df, alpha=0.2, 
c='red', hist_kwds={'color':['burlywood']})

查看 pandas.tools.plotting 中 scatter_matrix 的定义(和代码)可知,普通关键字参数会传递给散点图,而 hist_kwds 参数用于封装传递给直方图的参数。

但是,我看不到仅使用传递给 hist 的参数来实现您想要的效果的方法。

我在这个链接(this link)中找到了一个例子,它可以实现你想要的效果,并且可以很容易地修改成这样:

counts, _edges, patches = ax.hist(values, **hist_kwds)  # keep the returned patches
# Recolour each bar by its sample count; the bin edges are not needed for that.
for count, patch in zip(counts, patches):
    if count > 200:  # crowded bin
        patch.set_facecolor("green")
        patch.set_label("max")
    elif count < 50:  # sparse bin
        patch.set_facecolor("red")
        patch.set_label("min")

然而,关键是你需要在用 ax.hist 绘制之后拿到它返回的 patches 对象(即上面代码第一行的做法)。而 pandas.tools.plotting.scatter_matrix 方法并不会把这些对象返回给您。

取决于您希望解决方案有多正式、多可复用,有一种方法可以实现您想要的效果:定义您自己的 custom_scatter_matrix 方法。

您可以复用大部分现有代码,然后在中间出现下面这一行的位置:

if diagonal == 'hist':

您将单个 ax.hist() 调用替换为我上面显示的大约 8 行代码(并根据需要进一步自定义逻辑和颜色)。现在这是您自己的方法,所以如果您希望范围或颜色是动态的而不是静态的,可以为此添加自己的参数。您必须添加几个导入,并为若干名称显式加上命名空间前缀,但这样您将拥有更多控制权。

这是我 5 分钟的努力,以证明它有效,首先是结果,然后是代码:

import numpy as np
import matplotlib.pyplot as plt
import pandas
import pandas.tools.plotting
from pandas.compat import range, lrange, lmap, map, zip, string_types


def main():
    """Plot the colour-coded scatter matrix for random demo data and show it."""
    samples = np.random.randn(1000, 4)
    demo_frame = pandas.DataFrame(samples, columns=list('abcd'))
    # The stock pandas call that custom_scatter_matrix stands in for:
    #     pandas.tools.plotting.scatter_matrix(
    #         df, alpha=0.2, c='red', hist_kwds={'color': ['burlywood']})
    custom_scatter_matrix(demo_frame, alpha=0.2, c='red')
    plt.show()


def custom_scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                   diagonal='hist', marker='.', density_kwds=None,
                   hist_kwds=None, range_padding=0.05, high_count=200,
                   low_count=50, high_color="green", low_color="red",
                   **kwds):
    """
    Draw a matrix of scatter plots whose diagonal histograms are colour-coded.

    Off-diagonal cells hold column-vs-column scatter plots.  Diagonal cells
    hold either a histogram, whose bars are recoloured according to how many
    samples they contain, or a kernel density estimate.

    Parameters
    ----------
    frame : DataFrame
        only its numeric columns are plotted
    alpha : float, optional
        amount of transparency applied to the scatter points
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
        forwarded to the subplot-creation helper
    grid : bool, optional
        accepted but currently not used by this implementation
    diagonal : {'hist', 'kde', 'density'}
        'hist' draws a histogram in the diagonal, 'kde' / 'density' a
        Kernel Density Estimation; any other value draws nothing there
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    high_count : number, optional
        histogram bars holding more than this many samples are painted
        with `high_color` and labelled "max", default 200
    low_count : number, optional
        histogram bars holding fewer than this many samples (and not
        already matched by `high_count`) are painted with `low_color`
        and labelled "min", default 50
    high_color : Matplotlib color, optional
        face colour for the crowded bars, default "green"
    low_color : Matplotlib color, optional
        face colour for the sparse bars, default "red"
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    axes : 2-D array of Matplotlib axes, indexed as ``axes[row, column]``

    Examples
    --------
    >>> df = pandas.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> axes = custom_scatter_matrix(df, alpha=0.2)
    """
    import matplotlib.pyplot as plt
    from matplotlib.artist import setp

    df = frame._get_numeric_data()
    columns = df.columns
    n = columns.size

    # NOTE(review): _subplots, _get_marker_compat and _label_axis are private
    # helpers of pandas.tools.plotting -- confirm they exist in the pandas
    # version in use.
    fig, axes = pandas.tools.plotting._subplots(naxes=n * n, figsize=figsize,
                                                ax=ax, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = pandas.notnull(df)

    marker = pandas.tools.plotting._get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Non-null values of every column, extracted once and reused for both
    # the axis limits and the diagonal plots.
    column_values = [df[col].values[mask[col].values] for col in columns]

    # Axis limits per column: the data range widened by range_padding
    # (half of the padding on each side).
    boundaries_list = []
    for values in column_values:
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(columns):
        for j, b in enumerate(columns):
            cell = axes[i, j]

            if i == j:
                values = column_values[i]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    # Keep the returned patches so every bar can be
                    # recoloured by its count; the bin edges are not needed.
                    counts, _edges, patches = cell.hist(values, **hist_kwds)
                    for count, patch in zip(counts, patches):
                        if count > high_count:
                            patch.set_facecolor(high_color)
                            patch.set_label("max")
                        elif count < low_count:
                            patch.set_facecolor(low_color)
                            patch.set_label("min")

                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    gkde = gaussian_kde(values)
                    ind = np.linspace(values.min(), values.max(), 1000)
                    cell.plot(ind, gkde.evaluate(ind), **density_kwds)

                cell.set_xlim(boundaries_list[i])

            else:
                # Only rows where both columns are non-null can be plotted.
                common = (mask[a] & mask[b]).values

                cell.scatter(df[b][common], df[a][common],
                             marker=marker, alpha=alpha, **kwds)

                cell.set_xlim(boundaries_list[j])
                cell.set_ylim(boundaries_list[i])

            cell.set_xlabel('')
            cell.set_ylabel('')

            pandas.tools.plotting._label_axis(cell, kind='x', label=b,
                                              position='bottom', rotate=True)

            pandas.tools.plotting._label_axis(cell, kind='y', label=a,
                                              position='left')

            # Show y axes only in the first column and x axes only in the
            # last row.
            if j != 0:
                cell.yaxis.set_visible(False)
            if i != n - 1:
                cell.xaxis.set_visible(False)

    for cell in axes.flat:
        setp(cell.get_xticklabels(), fontsize=8)
        setp(cell.get_yticklabels(), fontsize=8)

    return axes

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

不完全清楚您希望得到什么,但我将您的问题理解为“我想根据 a 列的取值对散点进行颜色编码,然后查看这些颜色在其他变量中的分布”。如果这正是您想要的,用 seaborn 很容易做到:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Four columns ('a'..'d') of standard-normal noise, 1000 rows.
df = pd.DataFrame(np.random.randn(1000, 4), columns=list("abcd"))
# Bin column 'a' into seven equal-width intervals between -3.5 and 3.5.
bin_edges = np.linspace(-3.5, 3.5, 8)
df["a_cat"] = pd.cut(df["a"], bins=bin_edges)
# Colour every panel of the pair plot by the interval that 'a' falls into,
# keeping the intervals in their natural order.
interval_order = df["a_cat"].cat.categories
pair_grid = sns.pairplot(df, hue="a_cat", hue_order=interval_order,
                         palette="YlGnBu")
pair_grid.savefig("pairplot.png")