如何缩放数据以使图表下方的面积等于 1

How to scale data to make area under the graph equal to 1

我写了一个函数,可以在不到 2 秒的时间内绘制大型数组(10**8 个元素)的统计分布图。如何缩放 Y 轴,使图形下方的面积等于 1?

def dis(inp):
  """Plot a 64-bin histogram of ``inp`` with mean/std guide lines and print summary stats.

  The Y axis shows raw per-bin counts; no area normalization is applied.

  Args:
    inp: Array-like data of any shape; it is flattened to 1-D. Objects that
      expose a ``numpy()`` method are converted through that method first.

  Returns:
    None. Shows a matplotlib figure and prints the element count (preceded by
    the shape when ``inp`` has more than one dimension), mean, std, min and max.
  """
  import numpy as np
  import vaex
  import matplotlib.pyplot as plt

  # Objects with a ``numpy()`` method (presumably framework tensors) are
  # converted to a NumPy array before flattening.
  if getattr(inp, "numpy", None) is not None:
    inp1d = np.reshape(inp.numpy(), [-1])
  else:
    inp1d = np.reshape(inp, [-1])

  bin_count = 64
  df = vaex.from_arrays(x=inp1d)
  x_min, x_max = df.minmax(df.x)
  # NOTE(review): relies on vaex's edges=True layout -- presumably two leading
  # slots (NaN/missing, below-min) and one trailing overflow slot around the
  # bin_count regular bins. Confirm against the installed vaex version.
  bins = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
  bins[-2] += bins[-1]  # fold the trailing slot into the last regular bin
  bins[-1] = bins[-2]  # repeat the last bin so where='post' can draw its full width
  bins = bins[2:]  # drop the two leading slots so bins pairs up with `edges` in plt.step
  hist_height = np.max(bins)
  edges = np.linspace(x_min, x_max, bin_count+1)
  mean = df.mean(df.x)
  std = df.std(df.x)

  # Vertical guide lines at mean + k*std for k = -3..3; index 3 is k = 0 (the
  # mean), which is drawn thicker and slightly taller than the others.
  for i, v in enumerate([x * std + mean for x in range(-3, 4)]):
    if i == 3:
      plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
    else:
      plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

  plt.step(edges, bins, where='post', color='#4285F4', linewidth=1)
  plt.show()
  # The shape prefix is printed only when `inp` has a `shape` attribute and ndim > 1.
  print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')

# Example input: 10**8 samples drawn from a normal distribution (loc=0, scale=1).
# NOTE(review): in the visible code `np` is imported only inside `dis`; this
# line needs a module-level `import numpy as np` -- confirm it exists elsewhere.
x = np.random.normal(0, 1, (10**8, ))

以下是完整的答案,供想了解如何绘制大数据统计图的人参考:

def dis(inp):
  """Plot a unit-area histogram of ``inp`` with mean/std guide lines and print summary stats.

  The data is flattened, counted into 64 equal-width bins spanning
  [min, max] with vaex, and the counts are rescaled so that the area under
  the plotted outline equals 1.

  Args:
    inp: Array-like data of any shape; it is flattened to 1-D. Objects that
      expose a ``numpy()`` method are converted through that method first.

  Returns:
    None. Shows a matplotlib figure and prints the element count (preceded by
    the shape when ``inp`` has more than one dimension), mean, std, min and max.
  """
  import numpy as np
  import vaex
  import matplotlib.pyplot as plt

  # Objects with a ``numpy()`` method (presumably framework tensors) are
  # converted to a NumPy array before flattening.
  if getattr(inp, "numpy", None) is not None:
    inp1d = np.reshape(inp.numpy(), [-1])
  else:
    inp1d = np.reshape(inp, [-1])

  bin_count = 64
  df = vaex.from_arrays(x=inp1d)
  x_min, x_max = df.minmax(df.x)
  # NOTE(review): relies on vaex's edges=True layout -- presumably two leading
  # slots (NaN/missing, below-min) and one trailing overflow slot around the
  # bin_count regular bins. Confirm against the installed vaex version.
  bins = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
  bins[-2] += bins[-1]  # fold the trailing slot into the last regular bin
  bins = bins[2:-1]  # keep only the regular bins
  mean = df.mean(df.x)
  std = df.std(df.x)

  # Scale AUC to 1. This must use the per-bin counts *before* they are
  # duplicated for plotting below: summing the duplicated array would double
  # the population and leave an area of 0.5 instead of 1.
  step = (x_max-x_min)/bin_count
  population = np.sum(bins)
  surface = population*step
  bins = bins/surface
  hist_height = np.max(bins)

  # Build an explicit outline: every bin contributes its left and right edge
  # at the same height, so a plain plt.plot draws flat-topped bars.
  edges = np.linspace(x_min, x_max, bin_count+1)
  left, right = edges[:-1], edges[1:]
  edges = np.reshape(np.array([left,right]).T, [-1])
  bins = np.reshape(np.array([bins,bins]).T, [-1])

  # Vertical guide lines at mean + k*std for k = -3..3; index 3 is k = 0 (the
  # mean), which is drawn thicker and slightly taller than the others.
  for i, v in enumerate([x * std + mean for x in range(-3, 4)]):
    if i == 3:
      plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
    else:
      plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

  plt.fill_between(edges, bins, step="pre", alpha=0.3)
  plt.plot(edges, bins, color='#4285F4', linewidth=1)
  plt.show()
  # The shape prefix is printed only when `inp` has a `shape` attribute and ndim > 1.
  print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')

致版主:这个网站不允许我只发布代码,即使代码本身就是答案,并提示:"看起来你的帖子主要是代码;请添加更多详细信息。"

思路是对数据进行归一化,即将每个柱的高度除以直方图的 AUC(曲线下面积)。

在 "plt.step(...)" 之前(最好放在计算 hist_height 之前,这样辅助线的高度也会随之缩放)写:

# Normalize the bin counts so the area under the step plot equals 1.
step = (x_max-x_min)/bin_count  # width of one bin
# In the question's code the last entry of `bins` is a copy of the final bin
# (added so that where='post' can draw it); leave it out of the total so that
# bin is not counted twice.
population = np.sum(bins[:-1])
surface = population*step  # area under the unscaled histogram
bins = bins/surface

希望能有所帮助