如何计算并绘制两列之间的准确率

How to calculate and plot accuracy between two columns

我想使用 matplotlib 创建一个显示每个字母准确率的条形图。

示例数据集

# sample data: actual vs. predicted letters; both columns hold strings
data = {'Actual Letter': ['U', 'A', 'X', 'P', 'C', 'R', 'C', 'U', 'J', 'D'], 'Predicted Letter': ['U', 'A', 'X', 'P', 'C', 'R', 'C', 'U', 'J', 'D']}

# build the frame with explicit integer row labels
df = pd.DataFrame(data, index=[10113, 19164, 12798, 12034, 17719, 17886, 4624, 6047, 15608, 11815])

      Actual Letter Predicted Letter
10113             U                U
19164             A                A
12798             X                X
12034             P                P
17719             C                C
17886             R                R
4624              C                C
6047              U                U
15608             J                J
11815             D                D

# raises "TypeError: no numeric data to plot" because neither column is numeric
df.plot(kind='bar')
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-14-a5f21be4f14b> in <module>
      3 df = pd.DataFrame(data, index=[10113, 19164, 12798, 12034, 17719, 17886, 4624, 6047, 15608, 11815])
      4 
----> 5 df.plot(kind='bar')

e:\Anaconda3\lib\site-packages\pandas\plotting\_core.py in __call__(self, *args, **kwargs)
    970                     data.columns = label_name
    971 
--> 972         return plot_backend.plot(data, kind=kind, **kwargs)
    973 
    974     __call__.__doc__ = __doc__

e:\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\__init__.py in plot(data, kind, **kwargs)
     69             kwargs["ax"] = getattr(ax, "left_ax", ax)
     70     plot_obj = PLOT_CLASSES[kind](data, **kwargs)
---> 71     plot_obj.generate()
     72     plot_obj.draw()
     73     return plot_obj.result

e:\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in generate(self)
    284     def generate(self):
    285         self._args_adjust()
--> 286         self._compute_plot_data()
    287         self._setup_subplots()
    288         self._make_plot()

e:\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\core.py in _compute_plot_data(self)
    451         # no non-numeric frames or series allowed
    452         if is_empty:
--> 453             raise TypeError("no numeric data to plot")
    454 
    455         self.data = numeric_data.apply(self._convert_to_ndarray)

TypeError: no numeric data to plot

我想要一个像这样的条形图。但是我不知道该怎么做。

导入和示例 DataFrame

import pandas as pd
import numpy as np  # for sample data only
import string  # for sample data only

# build a reproducible sample frame: two columns of random uppercase letters
np.random.seed(365)
rows = 1100
letters = list(string.ascii_uppercase)
data = {
    'Actual': np.random.choice(letters, size=rows),
    'Predicted': np.random.choice(letters, size=rows),
}
df = pd.DataFrame(data)

计算和绘图

已更新

  • 下面的实现比较简洁;删除了不必要的步骤。
  1. 根据 'Predicted' 与 'Actual' 之间是否匹配,创建一个布尔值 'Match'
  2. 按 'Actual' 进行 .groupby,用 .mean() 聚合,乘以 100 并四舍五入,得到百分比。
    • 每个字母的组将对布尔值求和并除以计数。以 'A' 为例,总和为 1(因为只有 1 个 True),除以该组的总数 33。因此,1/33 = 0.030303030303030304
  3. 使用 pandas.DataFrame.plot 绘制所选数据的条形图
  • 请注意,步骤 (1) 和 (2) 可以简化并合并为以下内容:
    • dfa = df.Predicted.eq(df.Actual).groupby(df.Actual).mean().mul(100).round(2)
# boolean flag per row: True where the prediction equals the actual letter
df['Match'] = df['Predicted'] == df['Actual']

# display(df.head())
  Actual Predicted  Match
0      S         Z  False
1      U         J  False
2      B         L  False
3      M         V  False
4      F         C  False

# mean of the boolean Match per Actual letter, scaled to a percentage with two decimals
match_rate = df.groupby('Actual')['Match'].mean()
dfa = (match_rate * 100).round(2)

# display(dfa.head())
Actual
A    3.03
B    2.63
C    4.44
D    6.82
E    5.77
Name: Match, dtype: float64

# plot the per-letter accuracy as a bar chart.
# dfa is a Series indexed by 'Actual': its index supplies the x-axis and its
# values the bar heights, so the DataFrame-only x=/y= arguments are not passed.
ax = dfa.plot(kind='bar', rot=0, legend=False, grid=True, figsize=(8, 5),
              ylabel='Percent %', xlabel='Letter', title='Accuracy Rate % per letter')

原代码

  • 这也有效
# flag matches as integers: 1 where Predicted equals Actual, otherwise 0
df['Match'] = (df['Predicted'] == df['Actual']).astype(int)

# share of each Match value within every Actual letter, as a percentage with two decimals
match_share = df.groupby('Actual')['Match'].value_counts(normalize=True)
dfg = (match_share * 100).round(2).reset_index(name='%')

# keep only the Match == 1 rows, i.e. the accuracy per letter
# (a letter with no correct prediction has no such row and is therefore absent)
df_accuracy = dfg[dfg['Match'] == 1]

# display(df_accuracy.head())
  Actual  Match     %
1      A      1  3.03
3      B      1  2.63
5      C      1  4.44
7      D      1  6.82
9      E      1  5.77

# draw the accuracy percentage of each letter as a bar chart
ax = df_accuracy.plot(
    x='Actual',
    y='%',
    kind='bar',
    rot=0,
    grid=True,
    legend=False,
    figsize=(8, 5),
    title='Accuracy Rate % per letter',
    xlabel='Letter',
    ylabel='Percent %',
)

  • 按照您的描述模拟了数据
  • 如果先计算百分比,图表会非常简单
import numpy as np
import pandas as pd

# simulate some data: 200 random actual letters, each paired with a prediction
# that is drawn from [actual letter] + the full alphabet with weights 0.95 / 0.05/26 each
alphabet = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
weights = [0.95] + [0.05 / 26] * 26


def _predict(letter):
    """Draw one predicted letter for the given actual letter."""
    return np.random.choice([letter] + alphabet, 1, p=weights)[0]


df = pd.DataFrame({"Actual Letter": np.random.choice(alphabet, 200)})
df["Predicted Letter"] = df["Actual Letter"].apply(_predict)

# fraction (0-1) of rows per actual letter where the prediction matches, drawn as a bar chart.
# The mean of a boolean Series equals matches / group size, so this gives the same
# values as summing and dividing inside groupby.apply, without a per-group Python lambda.
is_match = df["Actual Letter"].eq(df["Predicted Letter"])
is_match.groupby(df["Actual Letter"]).mean().plot(kind="bar")