如何在 Python pandas 代码中按数值区间对散点图矩阵进行颜色编码?
How to use color coding for intervals for scatter plot matrix in a python pandas code?
import numpy as np
import matplotlib.pyplot as plt
import pandas
# 1000 rows of standard-normal samples in four columns.
df = pandas.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
# scatter_matrix is exposed as pandas.plotting.scatter_matrix (pandas >= 0.20);
# the old pandas.tools.plotting module no longer exists in current pandas.
pandas.plotting.scatter_matrix(df, alpha=0.2)
plt.show()
是否可以查看上述结果的颜色编码形式,以便进一步分析,例如对于a列,0-50之间的值可以编码为红色,50-100绿色等等?
乍看之下,我认为这并不容易做到。
scatter_matrix方法是一种方便的方法。如果深入研究它,您会发现它允许传递一些可以轻松更改某些颜色的参数。例如,试试这个:
# Plain keywords (c=...) are forwarded to the scatter plots, while hist_kwds
# is forwarded to the diagonal histograms.  pandas.plotting replaces the
# removed pandas.tools.plotting module (pandas >= 0.20).
pandas.plotting.scatter_matrix(df, alpha=0.2,
                               c='red', hist_kwds={'color': ['burlywood']})
查看 pandas.tools.plotting(新版 pandas 中为 pandas.plotting)里 scatter_matrix 的定义(和代码)可以发现:普通的关键字参数会传递给散点图,而 hist_kwds 参数用于封装要传递给直方图的参数。
但是,我看不到仅使用传递给 hist 的参数来实现您想要的效果的方法。
我在这个链接(原文链接已丢失)中找到了一个例子,它可以实现您想要的效果,并且很容易修改成下面这样:
counts, _bin_edges, patches = ax.hist(values, **hist_kwds)
# ax.hist returns one patch per bar; recolour each bar according to its count.
for count, patch in zip(counts, patches):
    if count > 200:
        patch.set_facecolor("green")
        patch.set_label("max")
    elif count < 50:
        patch.set_facecolor("red")
        patch.set_label("min")
然而,关键在于您需要在 ax.hist 绘制完成后拿到它返回的 patches(即各个柱形对象,见上面代码的第一行)。而 pandas.tools.plotting.scatter_matrix 方法不会把这些对象返回给您。
根据您希望解决方案有多正式、多可复用,有一种方法可以得到您想要的效果:定义您自己的 custom_scatter_matrix 方法。
您可以复用大部分现有代码,然后找到其中出现下面这一行的位置:
if diagonal == 'hist':
在这一行所在的位置,把原来那一行 ax.hist() 调用替换为我上面给出的约 8 行代码(并可根据需要进一步自定义判断逻辑和颜色)。由于这已经是您自己的方法,如果希望阈值范围或颜色是动态的而不是写死的,可以为它添加自己的参数。您需要补充几个 import,并为若干名称显式加上命名空间前缀,但这样您就能获得更多的控制权。
下面是我花 5 分钟做的示例,用来证明这个思路可行:先是结果(图略),然后是代码:
import numpy as np
import matplotlib.pyplot as plt
import pandas
import pandas.tools.plotting
from pandas.compat import range, lrange, lmap, map, zip, string_types
def main():
    """Show the colour-coded scatter matrix for 1000x4 random normal samples."""
    df = pandas.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
    # Stock pandas call, kept for comparison with the custom version below:
    # pandas.plotting.scatter_matrix(df, alpha=0.2,
    #                                c='red', hist_kwds={'color': ['burlywood']})
    custom_scatter_matrix(df, alpha=0.2, c='red')
    plt.show()
def custom_scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                          diagonal='hist', marker='.', density_kwds=None,
                          hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots with colour-coded diagonal histograms.

    Off-diagonal cells are scatter plots of one numeric column against
    another.  Diagonal cells show the distribution of a single column; in
    ``'hist'`` mode every bar whose height exceeds 200 is painted green
    (label ``"max"``) and every bar whose height is below 50 is painted red
    (label ``"min"``).

    Parameters
    ----------
    frame : DataFrame
        Only its numeric columns are plotted.
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches; used only when a new figure
        is created (``ax`` is None)
    ax : Matplotlib axis object, optional
        If given, the figure owning this axis is cleared and reused.
    grid : bool, optional
        setting this to True will show the grid
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the diagonal ('density' is accepted as an alias of 'kde')
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    numpy.ndarray
        2-D array of matplotlib axes, one row and one column per numeric
        column of ``frame``.

    Raises
    ------
    ValueError
        If ``frame`` has no numeric columns.

    Examples
    --------
    >>> df = pandas.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> custom_scatter_matrix(df, alpha=0.2)  # doctest: +SKIP
    """
    import matplotlib.lines as mlines
    import matplotlib.pyplot as plt
    from matplotlib.artist import setp

    # Bar heights above/below these values get the "max"/"min" colouring.
    high_count, low_count = 200, 50

    df = frame._get_numeric_data()
    columns = list(df.columns)
    n = len(columns)
    if n == 0:
        raise ValueError("frame has no numeric columns to plot")

    # Build the n x n grid with public matplotlib calls instead of the
    # private pandas `_subplots` helper.
    if ax is None:
        fig = plt.figure(figsize=figsize)
    else:
        fig = ax.get_figure()
        fig.clear()
    axes = fig.subplots(n, n, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = df.notnull()

    # Fall back to a circle when matplotlib does not know the marker.
    if marker not in mlines.lineMarkers:
        marker = 'o'

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Padded (min, max) axis limits for every column, computed on non-null data.
    boundaries_list = []
    for a in columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(columns):
        for j, b in enumerate(columns):
            cell = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    counts, _bin_edges, patches = cell.hist(values, **hist_kwds)
                    for count, patch in zip(counts, patches):
                        if count > high_count:
                            patch.set_facecolor("green")
                            patch.set_label("max")
                        elif count < low_count:
                            patch.set_facecolor("red")
                            patch.set_label("min")
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    cell.plot(ind, gkde.evaluate(ind), **density_kwds)

                cell.set_xlim(boundaries_list[i])
            else:
                # Plot only the rows where both columns are non-null.
                common = (mask[a] & mask[b]).values

                cell.scatter(df[b][common], df[a][common],
                             marker=marker, alpha=alpha, **kwds)

                cell.set_xlim(boundaries_list[j])
                cell.set_ylim(boundaries_list[i])

            if grid:
                cell.grid(True)

            # Label every cell; x tick labels are rotated so they do not
            # overlap in the narrow cells.
            cell.set_xlabel(b)
            cell.xaxis.set_ticks_position('bottom')
            cell.xaxis.set_label_position('bottom')
            setp(cell.get_xticklabels(), rotation=90)

            cell.set_ylabel(a)
            cell.yaxis.set_ticks_position('left')
            cell.yaxis.set_label_position('left')

            # Only the left column and the bottom row keep a visible axis.
            cell.yaxis.set_visible(j == 0)
            cell.xaxis.set_visible(i == n - 1)

    for cell in axes.flat:
        setp(cell.get_xticklabels(), fontsize=8)
        setp(cell.get_yticklabels(), fontsize=8)

    return axes
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
我不完全清楚您想要的结果,但我把您的问题理解为:“我想根据 a 列的取值对散点进行颜色编码,
然后观察这些颜色在其他变量中的分布情况”。如果这正是您想要的,用 seaborn 很容易做到:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# 1000 rows of standard-normal samples in four columns.
samples = np.random.randn(1000, 4)
df = pd.DataFrame(samples, columns=list("abcd"))

# Bucket column "a" into seven equal-width bins between -3.5 and 3.5.
interval_edges = np.linspace(-3.5, 3.5, 8)
df["a_cat"] = pd.cut(df["a"], bins=interval_edges)

# Colour every panel by the bin of "a", with hues in bin order.
pairplot_options = {
    "hue": "a_cat",
    "hue_order": df["a_cat"].cat.categories,
    "palette": "YlGnBu",
}
g = sns.pairplot(df, **pairplot_options)
g.savefig("pairplot.png")
import numpy as np
import matplotlib.pyplot as plt
import pandas
# 1000 rows of standard-normal samples in four columns.
df = pandas.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
# scatter_matrix is exposed as pandas.plotting.scatter_matrix (pandas >= 0.20);
# the old pandas.tools.plotting module no longer exists in current pandas.
pandas.plotting.scatter_matrix(df, alpha=0.2)
plt.show()
是否可以查看上述结果的颜色编码形式,以便进一步分析,例如对于a列,0-50之间的值可以编码为红色,50-100绿色等等?
乍看之下,我认为这并不容易做到。
scatter_matrix方法是一种方便的方法。如果深入研究它,您会发现它允许传递一些可以轻松更改某些颜色的参数。例如,试试这个:
# Plain keywords (c=...) are forwarded to the scatter plots, while hist_kwds
# is forwarded to the diagonal histograms.  pandas.plotting replaces the
# removed pandas.tools.plotting module (pandas >= 0.20).
pandas.plotting.scatter_matrix(df, alpha=0.2,
                               c='red', hist_kwds={'color': ['burlywood']})
查看 pandas.tools.plotting(新版 pandas 中为 pandas.plotting)里 scatter_matrix 的定义(和代码)可以发现:普通的关键字参数会传递给散点图,而 hist_kwds 参数用于封装要传递给直方图的参数。
但是,我看不到仅使用传递给 hist 的参数来实现您想要的效果的方法。
我在这个链接(原文链接已丢失)中找到了一个例子,它可以实现您想要的效果,并且很容易修改成下面这样:
counts, _bin_edges, patches = ax.hist(values, **hist_kwds)
# ax.hist returns one patch per bar; recolour each bar according to its count.
for count, patch in zip(counts, patches):
    if count > 200:
        patch.set_facecolor("green")
        patch.set_label("max")
    elif count < 50:
        patch.set_facecolor("red")
        patch.set_label("min")
然而,关键在于您需要在 ax.hist 绘制完成后拿到它返回的 patches(即各个柱形对象,见上面代码的第一行)。而 pandas.tools.plotting.scatter_matrix 方法不会把这些对象返回给您。
根据您希望解决方案有多正式、多可复用,有一种方法可以得到您想要的效果:定义您自己的 custom_scatter_matrix 方法。
您可以复用大部分现有代码,然后找到其中出现下面这一行的位置:
if diagonal == 'hist':
在这一行所在的位置,把原来那一行 ax.hist() 调用替换为我上面给出的约 8 行代码(并可根据需要进一步自定义判断逻辑和颜色)。由于这已经是您自己的方法,如果希望阈值范围或颜色是动态的而不是写死的,可以为它添加自己的参数。您需要补充几个 import,并为若干名称显式加上命名空间前缀,但这样您就能获得更多的控制权。
下面是我花 5 分钟做的示例,用来证明这个思路可行:先是结果(图略),然后是代码:
import numpy as np
import matplotlib.pyplot as plt
import pandas
import pandas.tools.plotting
from pandas.compat import range, lrange, lmap, map, zip, string_types
def main():
    """Show the colour-coded scatter matrix for 1000x4 random normal samples."""
    df = pandas.DataFrame(np.random.randn(1000, 4), columns=['a', 'b', 'c', 'd'])
    # Stock pandas call, kept for comparison with the custom version below:
    # pandas.plotting.scatter_matrix(df, alpha=0.2,
    #                                c='red', hist_kwds={'color': ['burlywood']})
    custom_scatter_matrix(df, alpha=0.2, c='red')
    plt.show()
def custom_scatter_matrix(frame, alpha=0.5, figsize=None, ax=None, grid=False,
                          diagonal='hist', marker='.', density_kwds=None,
                          hist_kwds=None, range_padding=0.05, **kwds):
    """
    Draw a matrix of scatter plots with colour-coded diagonal histograms.

    Off-diagonal cells are scatter plots of one numeric column against
    another.  Diagonal cells show the distribution of a single column; in
    ``'hist'`` mode every bar whose height exceeds 200 is painted green
    (label ``"max"``) and every bar whose height is below 50 is painted red
    (label ``"min"``).

    Parameters
    ----------
    frame : DataFrame
        Only its numeric columns are plotted.
    alpha : float, optional
        amount of transparency applied
    figsize : (float,float), optional
        a tuple (width, height) in inches; used only when a new figure
        is created (``ax`` is None)
    ax : Matplotlib axis object, optional
        If given, the figure owning this axis is cleared and reused.
    grid : bool, optional
        setting this to True will show the grid
    diagonal : {'hist', 'kde'}
        pick between 'kde' and 'hist' for
        either Kernel Density Estimation or Histogram
        plot in the diagonal ('density' is accepted as an alias of 'kde')
    marker : str, optional
        Matplotlib marker type, default '.'
    hist_kwds : other plotting keyword arguments
        To be passed to hist function
    density_kwds : other plotting keyword arguments
        To be passed to kernel density estimate plot
    range_padding : float, optional
        relative extension of axis range in x and y
        with respect to (x_max - x_min) or (y_max - y_min),
        default 0.05
    kwds : other plotting keyword arguments
        To be passed to scatter function

    Returns
    -------
    numpy.ndarray
        2-D array of matplotlib axes, one row and one column per numeric
        column of ``frame``.

    Raises
    ------
    ValueError
        If ``frame`` has no numeric columns.

    Examples
    --------
    >>> df = pandas.DataFrame(np.random.randn(1000, 4), columns=['A','B','C','D'])
    >>> custom_scatter_matrix(df, alpha=0.2)  # doctest: +SKIP
    """
    import matplotlib.lines as mlines
    import matplotlib.pyplot as plt
    from matplotlib.artist import setp

    # Bar heights above/below these values get the "max"/"min" colouring.
    high_count, low_count = 200, 50

    df = frame._get_numeric_data()
    columns = list(df.columns)
    n = len(columns)
    if n == 0:
        raise ValueError("frame has no numeric columns to plot")

    # Build the n x n grid with public matplotlib calls instead of the
    # private pandas `_subplots` helper.
    if ax is None:
        fig = plt.figure(figsize=figsize)
    else:
        fig = ax.get_figure()
        fig.clear()
    axes = fig.subplots(n, n, squeeze=False)

    # no gaps between subplots
    fig.subplots_adjust(wspace=0, hspace=0)

    mask = df.notnull()

    # Fall back to a circle when matplotlib does not know the marker.
    if marker not in mlines.lineMarkers:
        marker = 'o'

    hist_kwds = hist_kwds or {}
    density_kwds = density_kwds or {}

    # workaround because `c='b'` is hardcoded in matplotlibs scatter method
    kwds.setdefault('c', plt.rcParams['patch.facecolor'])

    # Padded (min, max) axis limits for every column, computed on non-null data.
    boundaries_list = []
    for a in columns:
        values = df[a].values[mask[a].values]
        rmin_, rmax_ = np.min(values), np.max(values)
        rdelta_ext = (rmax_ - rmin_) * range_padding / 2.
        boundaries_list.append((rmin_ - rdelta_ext, rmax_ + rdelta_ext))

    for i, a in enumerate(columns):
        for j, b in enumerate(columns):
            cell = axes[i, j]

            if i == j:
                values = df[a].values[mask[a].values]

                # Deal with the diagonal by drawing a histogram there.
                if diagonal == 'hist':
                    counts, _bin_edges, patches = cell.hist(values, **hist_kwds)
                    for count, patch in zip(counts, patches):
                        if count > high_count:
                            patch.set_facecolor("green")
                            patch.set_label("max")
                        elif count < low_count:
                            patch.set_facecolor("red")
                            patch.set_label("min")
                elif diagonal in ('kde', 'density'):
                    from scipy.stats import gaussian_kde
                    y = values
                    gkde = gaussian_kde(y)
                    ind = np.linspace(y.min(), y.max(), 1000)
                    cell.plot(ind, gkde.evaluate(ind), **density_kwds)

                cell.set_xlim(boundaries_list[i])
            else:
                # Plot only the rows where both columns are non-null.
                common = (mask[a] & mask[b]).values

                cell.scatter(df[b][common], df[a][common],
                             marker=marker, alpha=alpha, **kwds)

                cell.set_xlim(boundaries_list[j])
                cell.set_ylim(boundaries_list[i])

            if grid:
                cell.grid(True)

            # Label every cell; x tick labels are rotated so they do not
            # overlap in the narrow cells.
            cell.set_xlabel(b)
            cell.xaxis.set_ticks_position('bottom')
            cell.xaxis.set_label_position('bottom')
            setp(cell.get_xticklabels(), rotation=90)

            cell.set_ylabel(a)
            cell.yaxis.set_ticks_position('left')
            cell.yaxis.set_label_position('left')

            # Only the left column and the bottom row keep a visible axis.
            cell.yaxis.set_visible(j == 0)
            cell.xaxis.set_visible(i == n - 1)

    for cell in axes.flat:
        setp(cell.get_xticklabels(), fontsize=8)
        setp(cell.get_yticklabels(), fontsize=8)

    return axes
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
我不完全清楚您想要的结果,但我把您的问题理解为:“我想根据 a 列的取值对散点进行颜色编码,
然后观察这些颜色在其他变量中的分布情况”。如果这正是您想要的,用 seaborn 很容易做到:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# 1000 rows of standard-normal samples in four columns.
samples = np.random.randn(1000, 4)
df = pd.DataFrame(samples, columns=list("abcd"))

# Bucket column "a" into seven equal-width bins between -3.5 and 3.5.
interval_edges = np.linspace(-3.5, 3.5, 8)
df["a_cat"] = pd.cut(df["a"], bins=interval_edges)

# Colour every panel by the bin of "a", with hues in bin order.
pairplot_options = {
    "hue": "a_cat",
    "hue_order": df["a_cat"].cat.categories,
    "palette": "YlGnBu",
}
g = sns.pairplot(df, **pairplot_options)
g.savefig("pairplot.png")