如何在 python 中制作帕累托图?
How to make Pareto Chart in python?
帕累托图(Pareto chart)是 Excel 和 Tableau 中非常流行的图表。在 Excel 中我们可以很容易地画出帕累托图,但我发现在 Python 中没有简单的方法来绘制它。
我有一个这样的 pandas 数据框:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Sample data: one count per country, labelled by country name.
country_names = ['USA', 'Canada', 'Russia', 'UK', 'Belgium', 'Mexico', 'Germany', 'Denmark']
country_counts = [177.0, 7.0, 4.0, 2.0, 2.0, 1.0, 1.0, 1.0]
df = pd.DataFrame({'country': country_counts}, index=country_names)
print(df)
country
USA 177.0
Canada 7.0
Russia 4.0
UK 2.0
Belgium 2.0
Mexico 1.0
Germany 1.0
Denmark 1.0
如何绘制帕累托图?
可能使用 pandas、seaborn、matplotlib 等?
到目前为止,我已经能够制作降序条形图。
但它仍然需要将累积总和线图放在它们之上。
我的尝试:
# Attempt so far: bars only, ordered from the largest to the smallest count.
df_sorted = df.sort_values(by='country', ascending=False)
df_sorted.plot.bar()
期望得到的图表:
您可能想要创建一个包含百分比的新列,并将一列绘制为条形图,另一列绘制为双轴中的折线图。
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Sample counts keyed by country, ordered from largest to smallest.
counts = pd.Series(
    [177.0, 7.0, 4.0, 2.0, 2.0, 1.0, 1.0, 1.0],
    index=['USA', 'Canada', 'Russia', 'UK', 'Belgium', 'Mexico', 'Germany', 'Denmark'],
    name='country',
)
df = counts.to_frame().sort_values(by='country', ascending=False)

# Running share of the total, in percent, for the line overlay.
df["cumpercentage"] = df["country"].cumsum() / df["country"].sum() * 100

# Bars on the left y-axis, cumulative line on a twin right y-axis.
fig, ax = plt.subplots()
ax.bar(df.index, df["country"], color="C0")
ax2 = ax.twinx()
ax2.plot(df.index, df["cumpercentage"], color="C1", marker="D", ms=7)
ax2.yaxis.set_major_formatter(PercentFormatter())

# Colour each y-axis' ticks like the series it belongs to.
for axis, colour in ((ax, "C0"), (ax2, "C1")):
    axis.tick_params(axis="y", colors=colour)
plt.show()
ImportanceOfBeingErnest 代码的更通用版本:
def create_pareto_chart(df, by_variable, quant_variable):
    """Draw a Pareto chart: descending bars plus a cumulative-percentage line.

    Args:
        df: Kept so existing callers keep working. It is neither read nor
            modified; earlier versions overwrote its index and added a
            ``cumpercentage`` column to it.
        by_variable: Category labels, one per bar.
        quant_variable: Numeric values, matched to ``by_variable`` by position.
    """
    # Pair labels and values by position in a private frame. Going through
    # plain lists avoids pandas aligning on mismatched indexes (which yields
    # NaN) and leaves the caller's data untouched.
    pareto = pd.DataFrame({
        "label": list(by_variable),
        "value": list(quant_variable),
    })
    # A Pareto chart needs the bars ranked from largest to smallest before
    # the running total is taken; mergesort keeps ties in input order.
    pareto = pareto.sort_values(by="value", ascending=False, kind="mergesort")
    pareto["cumpercentage"] = pareto["value"].cumsum() / pareto["value"].sum() * 100

    fig, ax = plt.subplots()
    ax.bar(pareto["label"], pareto["value"], color="C0")
    # Second y-axis (percent scale) sharing the same x-axis.
    ax2 = ax.twinx()
    ax2.plot(pareto["label"], pareto["cumpercentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    # Colour each y-axis' ticks like the series it belongs to.
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    plt.show()
这个也包括根据阈值分组的帕累托。
例如:如果将阈值设置为 70,则累计百分比达到或超过 70% 的那些较小类别会被合并为一组,称为 "OTHERS"。
def create_pareto_chart(by_variable, quant_variable, threshold):
    """Draw a Pareto chart whose tail categories are merged into "OTHERS".

    Categories are ranked from largest to smallest value. Those whose
    cumulative percentage stays below ``threshold`` get their own bar; all
    remaining categories are summed into one trailing "OTHERS" bar.

    Args:
        by_variable: Category labels.
        quant_variable: Numeric values, matched to ``by_variable`` by position.
        threshold: Cut-off on the cumulative percentage (0-100 scale).
    """
    ranked = pd.DataFrame({
        'by_var': list(by_variable),
        'quant_var': list(quant_variable),
    })
    # Rank first, then accumulate: the running share is only meaningful on
    # data ordered from largest to smallest. mergesort keeps ties stable.
    ranked = ranked.sort_values(by='quant_var', ascending=False, kind='mergesort')
    ranked['cumpercentage'] = (
        ranked['quant_var'].cumsum() / ranked['quant_var'].sum() * 100
    )

    is_kept = ranked['cumpercentage'] < threshold
    df = ranked[is_kept]
    grouped = ranked[~is_kept]
    if not grouped.empty:
        # The merged bar closes the chart, so its cumulative share is 100 %.
        rest = pd.DataFrame({
            'by_var': ['OTHERS'],
            'quant_var': [grouped['quant_var'].sum()],
            'cumpercentage': [100.0],
        })
        # DataFrame.append was removed in pandas 2.0; concat is the
        # replacement. Skip concat when nothing was kept so that no empty
        # frame takes part in it.
        df = rest if df.empty else pd.concat([df, rest], ignore_index=True)

    fig, ax = plt.subplots()
    ax.bar(df['by_var'], df['quant_var'], color="C0")
    # Second y-axis (percent scale) sharing the same x-axis.
    ax2 = ax.twinx()
    ax2.plot(df['by_var'], df['cumpercentage'], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    ax.tick_params(axis="x", colors="C0", labelrotation=70)
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    plt.show()
另一种方法是使用 secondary_y 参数,而不使用 twinx():
# Cumulative share of the total in percent, drawn against a secondary y-axis.
df['pareto'] = 100 * df['country'].cumsum() / df['country'].sum()
fig, axes = plt.subplots()
# Both plots target the same Axes; the line gets its own right-hand scale.
ax1 = df.plot.bar(use_index=True, y='country', ax=axes)
ax2 = df.plot.line(use_index=True, y='pareto', marker='D', color="C1",
                   ax=axes, secondary_y=True)
# Leave headroom above 100 % so the last marker is not clipped.
ax2.set_ylim(0, 110)
参数 use_index=True 是必需的,因为在这种情况下,index 就是 x 轴。否则可以使用 x='x_Variable' 来指定 x 轴所用的列。
适用于 pandas.DataFrame 的帕累托图:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
def _plot_pareto_by(df_, group_by, column):
    """Show a Pareto chart of ``column`` summed per ``group_by`` value.

    Args:
        df_: Source frame; it is grouped, not modified.
        group_by: Column whose values become the bars.
        column: Numeric column that is summed within each group.
    """
    # One row per group, largest total first.
    totals = (
        df_.groupby(group_by)[column]
        .sum()
        .reset_index()
        .sort_values(by=column, ascending=False)
    )
    totals["cumpercentage"] = totals[column].cumsum() / totals[column].sum() * 100

    fig, ax = plt.subplots(figsize=(20, 5))
    ax.bar(totals[group_by], totals[column], color="C0")
    # Cumulative line on a twin y-axis formatted as percent.
    ax2 = ax.twinx()
    ax2.plot(totals[group_by], totals["cumpercentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    # Tilt the category labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()
这是我使用 pandas 和 plotly 绘制的帕累托图版本。您可以使用任何包含未分组数据的集合。
让我们从这个例子的数据开始:
import numpy as np
# 500 random, ungrouped country labels drawn with fixed probabilities.
countries = ['USA', 'Canada', 'Russia', 'UK', 'Belgium',
             'Mexico', 'Germany', 'Denmark']
weights = [0.43, 0.14, 0.23, 0.07, 0.04, 0.01, 0.03, 0.05]
data = np.random.choice(countries, size=500, p=weights)
图表创建:
import pandas as pd
import plotly.graph_objects as go
def pareto_chart(collection):
    """Show a plotly Pareto chart for a collection of ungrouped values.

    Each distinct value becomes a bar with its number of occurrences; a line
    on a second y-axis shows the cumulative ratio.

    Args:
        collection: Anything ``pd.Series`` accepts; one entry per observation.
    """
    values = pd.Series(collection)
    frequency = values.value_counts().to_frame('counts')
    cumulative = values.value_counts(normalize=True).cumsum().to_frame('ratio')
    counts = frequency.join(cumulative)

    bars = go.Bar(x=counts.index, y=counts['counts'], yaxis='y1', name='count')
    line = go.Scatter(x=counts.index, y=counts['ratio'], yaxis='y2',
                      name='cumulative ratio', hovertemplate='%{y:.1%}',
                      marker={'color': '#000000'})
    fig = go.Figure([bars, line])

    # Right-hand axis: ratios 0..1 labelled as 0 %..100 % in steps of 20.
    ratio_axis = {
        'rangemode': "tozero",
        'overlaying': 'y',
        'position': 1,
        'side': 'right',
        'title': 'ratio',
        'tickvals': np.arange(0, 1.1, .2),
        'tickmode': 'array',
        'ticktext': [str(pct) + '%' for pct in range(0, 101, 20)],
    }
    fig.update_layout(
        template='plotly_white',
        showlegend=False,
        hovermode='x',
        bargap=.3,
        title={'text': 'Pareto Chart', 'x': .5},
        yaxis={'title': 'count'},
        yaxis2=ratio_axis,
    )
    fig.show()
结果:
帕累托图(Pareto chart)是 Excel 和 Tableau 中非常流行的图表。在 Excel 中我们可以很容易地画出帕累托图,但我发现在 Python 中没有简单的方法来绘制它。
我有一个这样的 pandas 数据框:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Sample data: one count per country, labelled by country name.
country_names = ['USA', 'Canada', 'Russia', 'UK', 'Belgium', 'Mexico', 'Germany', 'Denmark']
country_counts = [177.0, 7.0, 4.0, 2.0, 2.0, 1.0, 1.0, 1.0]
df = pd.DataFrame({'country': country_counts}, index=country_names)
print(df)
country
USA 177.0
Canada 7.0
Russia 4.0
UK 2.0
Belgium 2.0
Mexico 1.0
Germany 1.0
Denmark 1.0
如何绘制帕累托图? 可能使用 pandas、seaborn、matplotlib 等?
到目前为止,我已经能够制作降序条形图。 但它仍然需要将累积总和线图放在它们之上。
我的尝试:
# Attempt so far: bars only, ordered from the largest to the smallest count.
df_sorted = df.sort_values(by='country', ascending=False)
df_sorted.plot.bar()
期望得到的图表:
您可能想要创建一个包含百分比的新列,并将一列绘制为条形图,另一列绘制为双轴中的折线图。
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
# Sample counts keyed by country, ordered from largest to smallest.
counts = pd.Series(
    [177.0, 7.0, 4.0, 2.0, 2.0, 1.0, 1.0, 1.0],
    index=['USA', 'Canada', 'Russia', 'UK', 'Belgium', 'Mexico', 'Germany', 'Denmark'],
    name='country',
)
df = counts.to_frame().sort_values(by='country', ascending=False)

# Running share of the total, in percent, for the line overlay.
df["cumpercentage"] = df["country"].cumsum() / df["country"].sum() * 100

# Bars on the left y-axis, cumulative line on a twin right y-axis.
fig, ax = plt.subplots()
ax.bar(df.index, df["country"], color="C0")
ax2 = ax.twinx()
ax2.plot(df.index, df["cumpercentage"], color="C1", marker="D", ms=7)
ax2.yaxis.set_major_formatter(PercentFormatter())

# Colour each y-axis' ticks like the series it belongs to.
for axis, colour in ((ax, "C0"), (ax2, "C1")):
    axis.tick_params(axis="y", colors=colour)
plt.show()
ImportanceOfBeingErnest 代码的更通用版本:
def create_pareto_chart(df, by_variable, quant_variable):
    """Draw a Pareto chart: descending bars plus a cumulative-percentage line.

    Args:
        df: Kept so existing callers keep working. It is neither read nor
            modified; earlier versions overwrote its index and added a
            ``cumpercentage`` column to it.
        by_variable: Category labels, one per bar.
        quant_variable: Numeric values, matched to ``by_variable`` by position.
    """
    # Pair labels and values by position in a private frame. Going through
    # plain lists avoids pandas aligning on mismatched indexes (which yields
    # NaN) and leaves the caller's data untouched.
    pareto = pd.DataFrame({
        "label": list(by_variable),
        "value": list(quant_variable),
    })
    # A Pareto chart needs the bars ranked from largest to smallest before
    # the running total is taken; mergesort keeps ties in input order.
    pareto = pareto.sort_values(by="value", ascending=False, kind="mergesort")
    pareto["cumpercentage"] = pareto["value"].cumsum() / pareto["value"].sum() * 100

    fig, ax = plt.subplots()
    ax.bar(pareto["label"], pareto["value"], color="C0")
    # Second y-axis (percent scale) sharing the same x-axis.
    ax2 = ax.twinx()
    ax2.plot(pareto["label"], pareto["cumpercentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    # Colour each y-axis' ticks like the series it belongs to.
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    plt.show()
这个版本还支持按阈值分组的帕累托图。例如:如果将阈值设置为 70,则累计百分比达到或超过 70% 的那些较小类别会被合并为一组,称为 "OTHERS"。
def create_pareto_chart(by_variable, quant_variable, threshold):
    """Draw a Pareto chart whose tail categories are merged into "OTHERS".

    Categories are ranked from largest to smallest value. Those whose
    cumulative percentage stays below ``threshold`` get their own bar; all
    remaining categories are summed into one trailing "OTHERS" bar.

    Args:
        by_variable: Category labels.
        quant_variable: Numeric values, matched to ``by_variable`` by position.
        threshold: Cut-off on the cumulative percentage (0-100 scale).
    """
    ranked = pd.DataFrame({
        'by_var': list(by_variable),
        'quant_var': list(quant_variable),
    })
    # Rank first, then accumulate: the running share is only meaningful on
    # data ordered from largest to smallest. mergesort keeps ties stable.
    ranked = ranked.sort_values(by='quant_var', ascending=False, kind='mergesort')
    ranked['cumpercentage'] = (
        ranked['quant_var'].cumsum() / ranked['quant_var'].sum() * 100
    )

    is_kept = ranked['cumpercentage'] < threshold
    df = ranked[is_kept]
    grouped = ranked[~is_kept]
    if not grouped.empty:
        # The merged bar closes the chart, so its cumulative share is 100 %.
        rest = pd.DataFrame({
            'by_var': ['OTHERS'],
            'quant_var': [grouped['quant_var'].sum()],
            'cumpercentage': [100.0],
        })
        # DataFrame.append was removed in pandas 2.0; concat is the
        # replacement. Skip concat when nothing was kept so that no empty
        # frame takes part in it.
        df = rest if df.empty else pd.concat([df, rest], ignore_index=True)

    fig, ax = plt.subplots()
    ax.bar(df['by_var'], df['quant_var'], color="C0")
    # Second y-axis (percent scale) sharing the same x-axis.
    ax2 = ax.twinx()
    ax2.plot(df['by_var'], df['cumpercentage'], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    ax.tick_params(axis="x", colors="C0", labelrotation=70)
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    plt.show()
另一种方法是使用 secondary_y 参数,而不使用 twinx():
# Cumulative share of the total in percent, drawn against a secondary y-axis.
df['pareto'] = 100 * df['country'].cumsum() / df['country'].sum()
fig, axes = plt.subplots()
# Both plots target the same Axes; the line gets its own right-hand scale.
ax1 = df.plot.bar(use_index=True, y='country', ax=axes)
ax2 = df.plot.line(use_index=True, y='pareto', marker='D', color="C1",
                   ax=axes, secondary_y=True)
# Leave headroom above 100 % so the last marker is not clipped.
ax2.set_ylim(0, 110)
参数 use_index=True 是必需的,因为在这种情况下,index 就是 x 轴。否则可以使用 x='x_Variable' 来指定 x 轴所用的列。
适用于 pandas.DataFrame 的帕累托图:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
def _plot_pareto_by(df_, group_by, column):
    """Show a Pareto chart of ``column`` summed per ``group_by`` value.

    Args:
        df_: Source frame; it is grouped, not modified.
        group_by: Column whose values become the bars.
        column: Numeric column that is summed within each group.
    """
    # One row per group, largest total first.
    totals = (
        df_.groupby(group_by)[column]
        .sum()
        .reset_index()
        .sort_values(by=column, ascending=False)
    )
    totals["cumpercentage"] = totals[column].cumsum() / totals[column].sum() * 100

    fig, ax = plt.subplots(figsize=(20, 5))
    ax.bar(totals[group_by], totals[column], color="C0")
    # Cumulative line on a twin y-axis formatted as percent.
    ax2 = ax.twinx()
    ax2.plot(totals[group_by], totals["cumpercentage"], color="C1", marker="D", ms=7)
    ax2.yaxis.set_major_formatter(PercentFormatter())
    ax.tick_params(axis="y", colors="C0")
    ax2.tick_params(axis="y", colors="C1")
    # Tilt the category labels so long names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.show()
这是我使用 pandas 和 plotly 绘制的帕累托图版本。您可以使用任何包含未分组数据的集合。 让我们从这个例子的数据开始:
import numpy as np
# 500 random, ungrouped country labels drawn with fixed probabilities.
countries = ['USA', 'Canada', 'Russia', 'UK', 'Belgium',
             'Mexico', 'Germany', 'Denmark']
weights = [0.43, 0.14, 0.23, 0.07, 0.04, 0.01, 0.03, 0.05]
data = np.random.choice(countries, size=500, p=weights)
图表创建:
import pandas as pd
import plotly.graph_objects as go
def pareto_chart(collection):
    """Show a plotly Pareto chart for a collection of ungrouped values.

    Each distinct value becomes a bar with its number of occurrences; a line
    on a second y-axis shows the cumulative ratio.

    Args:
        collection: Anything ``pd.Series`` accepts; one entry per observation.
    """
    values = pd.Series(collection)
    frequency = values.value_counts().to_frame('counts')
    cumulative = values.value_counts(normalize=True).cumsum().to_frame('ratio')
    counts = frequency.join(cumulative)

    bars = go.Bar(x=counts.index, y=counts['counts'], yaxis='y1', name='count')
    line = go.Scatter(x=counts.index, y=counts['ratio'], yaxis='y2',
                      name='cumulative ratio', hovertemplate='%{y:.1%}',
                      marker={'color': '#000000'})
    fig = go.Figure([bars, line])

    # Right-hand axis: ratios 0..1 labelled as 0 %..100 % in steps of 20.
    ratio_axis = {
        'rangemode': "tozero",
        'overlaying': 'y',
        'position': 1,
        'side': 'right',
        'title': 'ratio',
        'tickvals': np.arange(0, 1.1, .2),
        'tickmode': 'array',
        'ticktext': [str(pct) + '%' for pct in range(0, 101, 20)],
    }
    fig.update_layout(
        template='plotly_white',
        showlegend=False,
        hovermode='x',
        bargap=.3,
        title={'text': 'Pareto Chart', 'x': .5},
        yaxis={'title': 'count'},
        yaxis2=ratio_axis,
    )
    fig.show()
结果: