如何在 Python pandas 代码中对散点图矩阵按数值区间进行颜色编码？

Question

import numpy as np
import matplotlib.pyplot as plt
import pandas
# Four columns ('a'..'d') of 1000 standard-normal samples each.
samples = np.random.randn(1000, 4)
df = pandas.DataFrame(samples, columns=['a', 'b', 'c', 'd'])

# Draw the pairwise scatter-plot matrix with mostly transparent markers,
# then hand control to matplotlib to display the figure.
pandas.tools.plotting.scatter_matrix(df, alpha=0.2)
plt.show()

是否可以查看上述结果的颜色编码形式，以便进一步分析，例如对于a列，0-50之间的值可以编码为红色，50-100绿色等等？

Answer 1

乍看之下，我认为这并不容易做到。

scatter_matrix 是一个便捷方法。如果深入研究它，您会发现它允许传入一些参数，从而轻松更改某些颜色。例如，试试这个：

# `c` is forwarded to the scatter plots, while `hist_kwds` carries the
# keyword arguments meant for the diagonal histograms.
pandas.tools.plotting.scatter_matrix(
    df,
    alpha=0.2,
    c='red',
    hist_kwds={'color': ['burlywood']},
)

查看 pandas.tools.plotting 中 scatter_matrix 的定义（和代码）可以发现：普通的关键字参数会传给散点图，而 hist_kwds 参数用于封装要传给直方图的参数。

但是，我看不到仅使用传递给 hist 的参数来实现您想要的效果的方法。

我在这个链接中找到了一个例子，它能实现您想要的效果，并且很容易改成这样：

# ax.hist returns the per-bin counts, the bin edges and the drawn patches;
# the patches are what has to be kept in order to recolor individual bars.
counts, bin_edges, patches = ax.hist(values, **hist_kwds)
for count, patch in zip(counts, patches):
    if count > 200:
        # Heavily populated bin: paint it green and tag it "max".
        patch.set_facecolor("green")
        patch.set_label("max")
    elif count < 50:
        # Sparsely populated bin: paint it red and tag it "min".
        patch.set_facecolor("red")
        patch.set_label("min")

然而，关键在于您需要在调用 ax.hist 绘图之后拿到它返回的 patches（即直方图中的各个矩形，见上面代码的第一行）。而 pandas.tools.plotting.scatter_matrix 方法并不会把这些对象返回给您。

取决于您希望解决方案有多正式、多可复用，有一种方法可以实现您想要的效果：定义您自己的 custom_scatter_matrix 方法。

您可以复用大部分现有代码，然后在中间的这一行处：

if diagonal == 'hist':

您将单个 ax.hist() 调用替换为我上面展示的那 8 行左右的代码（并根据需要进一步自定义逻辑和颜色）。现在这是您自己的方法了，所以如果您希望范围或颜色是动态的而不是写死的，可以为此添加自己的参数。您需要补充几条 import 语句，并为若干名称显式加上所属模块的前缀，但这样您就拥有了更多的控制权。

这是我 5 分钟的努力，以证明它有效，首先是结果，然后是代码：

import numpy as np
import matplotlib.pyplot as plt
import pandas
import pandas.tools.plotting
from pandas.compat import range, lrange, lmap, map, zip, string_types


def main():
    """Plot the custom scatter matrix for four columns of random data."""
    column_names = ['a', 'b', 'c', 'd']
    noise = np.random.randn(1000, len(column_names))
    df = pandas.DataFrame(noise, columns=column_names)

    # For comparison, the stock call would be
    # pandas.tools.plotting.scatter_matrix(df, alpha=0.2, c='red',
    # hist_kwds={'color': ['burlywood']}); the custom variant is used
    # here instead because it recolors individual histogram bars.
    custom_scatter_matrix(df, alpha=0.2, c='red')
    plt.show()


def custom_scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                          diagonal='hist', marker='.', density_kwds=None,
                          hist_kwds=None, range_padding=0.05,
                          high_count=200, low_count=50,
                          high_color="green", low_color="red", **kwds):
    """
    Draw a matrix of scatter plots with count-coded diagonal histograms.

    Off-diagonal cells hold the pairwise scatter plots of the columns
    selected by ``frame._get_numeric_data()``.  With ``diagonal='hist'``
    every diagonal cell holds a histogram whose bars are recolored by the
    number of samples in the bin: bars with more than ``high_count``
    samples get ``high_color`` and the label "max", bars with fewer than
    ``low_count`` samples get ``low_color`` and the label "min".  All other
    bars keep whatever color ``hist_kwds`` produced.

    Parameters
    ----------
    frame : DataFrame
    alpha : float, optional
        amount of transparency applied to the scatter markers
    figsize : (float,float), optional
        a tuple (width, height) in inches
    ax : Matplotlib axis object, optional
        forwarded to the pandas ``_subplots`` helper
    grid : bool, optional
        setting this to True will show the grid in every cell
    diagonal : {'hist', 'kde', 'density'}
        pick between 'kde' (alias 'density') and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the diagonal; any other value leaves the diagonal empty
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    high_count : number, optional
        histogram bars holding more samples than this are recolored with
        ``high_color``, default 200
    low_count : number, optional
        histogram bars holding fewer samples than this are recolored with
        ``low_color``, default 50
    high_color : Matplotlib color, optional
        face color for the heavily populated bars, default "green"
    low_color : Matplotlib color, optional
        face color for the sparsely populated bars, default "red"
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    axes : the axes container produced by the pandas ``_subplots`` helper,
        indexed as ``axes[row, column]``

    Examples
    --------
    >>> df = pandas.DataFrame(np.random.randn(1000, 4),
    ...                       columns=['A', 'B', 'C', 'D'])
    >>> axes = custom_scatter_matrix(df, alpha=0.2, high_count=250)
    """
    import matplotlib.pyplot as plt
    from matplotlib.artist import setp

    df = frame._get_numeric_data()
    n = df.columns.size
    naxes = n * n
    # NOTE(review): _subplots, _get_marker_compat and _label_axis are
    # underscore-prefixed (private) helpers of the legacy
    # pandas.tools.plotting module -- confirm they exist in the pandas
    # version in use.
    fig, axes = pandas.tools.plotting._subplots(naxes=naxes, figsize=figsize,
                                                ax=ax, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = pandas.tools.plotting.com.notnull(df)

    marker = pandas.tools.plotting._get_marker_compat(marker)

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Per column: the non-null values (reused for the diagonal plots) and
    # the axis limits, i.e. the data range widened by range_padding / 2 of
    # its width on each side.
    column_values = []
    boundaries_list = []
    for a in df.columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        column_values.append(values)
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(df.columns):
        for j, b in enumerate(df.columns):
            cell = axes[i, j]

            if i == j:
                values = column_values[i]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    # The patches returned by hist are the individual bars;
                    # keeping them is what makes per-bar recoloring possible.
                    counts, _edges, patches = cell.hist(values, **hist_kwds)
                    for count, patch in zip(counts, patches):
                        if count > high_count:
                            patch.set_facecolor(high_color)
                            patch.set_label("max")
                        elif count < low_count:
                            patch.set_facecolor(low_color)
                            patch.set_label("min")

                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    cell.plot(ind, gkde.evaluate(ind), **density_kwds)

                cell.set_xlim(boundaries_list[i])

            else:
                # Only plot rows where both columns are non-null.
                common = (mask[a] & mask[b]).values

                cell.scatter(df[b][common], df[a][common],
                             marker=marker, alpha=alpha, **kwds)

                cell.set_xlim(boundaries_list[j])
                cell.set_ylim(boundaries_list[i])

            # Honor the documented `grid` flag; it used to be accepted but
            # silently ignored.  Nothing is touched when it is False.
            if grid:
                cell.grid(True)

            cell.set_xlabel('')
            cell.set_ylabel('')

            pandas.tools.plotting._label_axis(cell, kind='x', label=b,
                                              position='bottom', rotate=True)

            pandas.tools.plotting._label_axis(cell, kind='y', label=a,
                                              position='left')

            # Show the y axis only on the first column and the x axis only
            # on the last row of the matrix.
            if j != 0:
                cell.yaxis.set_visible(False)
            if i != n - 1:
                cell.xaxis.set_visible(False)

    for cell in axes.flat:
        setp(cell.get_xticklabels(), fontsize=8)
        setp(cell.get_yticklabels(), fontsize=8)

    return axes

# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

Answer 2

不完全清楚您希望得到什么，但我将您的问题解释为 "I want to color code the scatter points by the value in a and then see how those colors are distributed in the other variables"。如果这是你想要的，用 seaborn 很容易做到：

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Four columns ('a'..'d') of 1000 standard-normal samples each.
df = pd.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])

# Bucket column "a" into 7 equal-width intervals between -3.5 and 3.5
# (8 edges) and keep the interval of each row as a categorical column.
bin_edges = np.linspace(-3.5, 3.5, 8)
df["a_cat"] = pd.cut(df.a, bins=bin_edges)

# Color every panel of the pair plot by the interval of "a"; the interval
# categories are passed as the hue order.
g = sns.pairplot(
    df,
    hue="a_cat",
    hue_order=df.a_cat.cat.categories,
    palette="YlGnBu",
)
g.savefig("pairplot.png")

如何在 Python pandas 代码中对散点图矩阵按数值区间进行颜色编码？

How to use color coding for intervals for scatter plot matrix in a python pandas code?

python

matplotlib

pandas