如何缩放数据以使图表下方的面积等于 1
How to scale data to make area under the graph equal to 1
我编写了一个函数,可以在不到 2 秒的时间内绘制大型数组(10**8 个元素)的统计信息。
如何缩放 Y 轴,使图表下方的面积等于 1?
def dis(inp):
    """Plot a 64-bin histogram of ``inp`` and print summary statistics.

    The input is flattened to one dimension (going through ``inp.numpy()``
    when the object provides that method), binned with vaex between its
    minimum and maximum, and drawn as a step curve of raw counts. Vertical
    guide lines mark the mean and one, two and three standard deviations on
    either side of it. Nothing is returned; the figure is shown and the
    statistics are printed.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    to_numpy = getattr(inp, "numpy", None)
    inp1d = np.reshape(inp if to_numpy is None else to_numpy(), [-1])

    bin_count = 64
    df = vaex.from_arrays(x=inp1d)
    x_min, x_max = df.minmax(df.x)

    # edges=True returns extra slots around the regular bins (presumably
    # NaN / below-range in front and above-range at the end -- confirm in the
    # vaex docs). The trailing slot is folded into the last regular bin, and
    # that bin is repeated once so the counts line up with the
    # bin_count + 1 edges used by the 'post' step plot.
    counts = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
    counts[-2] += counts[-1]
    counts[-1] = counts[-2]
    counts = counts[2:]

    peak = np.max(counts)
    edges = np.linspace(x_min, x_max, bin_count + 1)
    mean = df.mean(df.x)
    std = df.std(df.x)

    guide_colour = '#34A853'
    for sigmas in range(-3, 4):
        v = sigmas * std + mean
        if sigmas == 0:
            # The mean line is drawn slightly taller and thicker.
            plt.plot([v, v], [0, peak * 1.02], color=guide_colour, linewidth=1)
        else:
            plt.plot([v, v], [0, peak * 0.97], color=guide_colour, linewidth=0.5)

    plt.step(edges, counts, where='post', color='#4285F4', linewidth=1)
    plt.show()
    print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
# Example input: 10**8 samples drawn from a standard normal distribution.
# numpy is imported here because `dis` only imports it inside its own scope,
# so `np` would otherwise be undefined at module level.
import numpy as np

x = np.random.normal(0, 1, (10**8, ))
完整的答案,如果有人现在想知道如何绘制大数据统计:
def dis(inp):
    """Plot a unit-area histogram of ``inp`` and print summary statistics.

    The input is flattened to one dimension (via ``inp.numpy()`` when the
    object provides that method), binned into 64 equal-width bins between its
    minimum and maximum with vaex, and rescaled so that the area under the
    plotted curve equals 1. Vertical guide lines mark the mean and one, two
    and three standard deviations on either side of it.

    Args:
        inp: Anything ``np.reshape`` accepts, or an object exposing
            ``.numpy()``.

    Returns:
        None. The figure is displayed with ``plt.show()`` and the element
        count, mean, std, min and max are printed.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    if getattr(inp, "numpy", None) is not None:
        inp1d = np.reshape(inp.numpy(), [-1])
    else:
        inp1d = np.reshape(inp, [-1])

    bin_count = 64
    df = vaex.from_arrays(x=inp1d)
    x_min, x_max = df.minmax(df.x)

    # edges=True returns extra slots around the regular bins (presumably
    # NaN / below-range in front and above-range at the end -- confirm in the
    # vaex docs). Fold the trailing slot into the last regular bin, then keep
    # only the bin_count regular bins.
    bins = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
    bins[-2] += bins[-1]
    bins = bins[2:-1]

    # Scale AUC to 1. This has to happen BEFORE the counts are duplicated for
    # the outline below: summing the duplicated array would double the
    # population and leave the plotted area at 0.5 instead of 1.
    step = (x_max - x_min) / bin_count
    population = np.sum(bins)
    surface = population * step
    bins = bins / surface

    # Build an explicit step outline: every bin contributes its left and
    # right edge, both at the bin's height.
    edges = np.linspace(x_min, x_max, bin_count + 1)
    left, right = edges[:-1], edges[1:]
    edges = np.reshape(np.array([left, right]).T, [-1])
    bins = np.reshape(np.array([bins, bins]).T, [-1])

    mean = df.mean(df.x)
    std = df.std(df.x)
    hist_height = np.max(bins)

    for i, v in enumerate([x * std + mean for x in range(-3, 4)]):
        if i == 3:
            # Index 3 is the mean itself: draw it taller and thicker.
            plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
        else:
            plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

    plt.fill_between(edges, bins, step="pre", alpha=0.3)
    plt.plot(edges, bins, color='#4285F4', linewidth=1)
    plt.show()
    print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
致版主:这个网站不允许我只发布代码,即使代码本身就是答案,并提示:"看起来你的帖子主要是代码;请添加更多详细信息。"
思路是对数据集进行归一化,即将每个柱的高度除以直方图的 AUC(曲线下面积)。
在"plt.step(...)"之前(最好在计算 hist_height 之前,这样参考线的高度也会随之缩放)写:
# Rescale the bin counts so the area under the step curve equals 1.
step = (x_max - x_min) / bin_count  # width of one bin
# In the question's code the last entry of `bins` is a copy of the one before
# it (added only so the 'post' step plot has bin_count + 1 points), so it is
# left out of the total to avoid counting the last bin twice.
population = np.sum(bins[:-1])
surface = population * step  # area under the un-normalised histogram
bins = bins / surface
希望能有所帮助
我编写了一个函数,可以在不到 2 秒的时间内绘制大型数组(10**8 个元素)的统计信息。
如何缩放 Y 轴,使图表下方的面积等于 1?
def dis(inp):
    """Plot a 64-bin histogram of ``inp`` and print summary statistics.

    The input is flattened to one dimension (going through ``inp.numpy()``
    when the object provides that method), binned with vaex between its
    minimum and maximum, and drawn as a step curve of raw counts. Vertical
    guide lines mark the mean and one, two and three standard deviations on
    either side of it. Nothing is returned; the figure is shown and the
    statistics are printed.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    to_numpy = getattr(inp, "numpy", None)
    inp1d = np.reshape(inp if to_numpy is None else to_numpy(), [-1])

    bin_count = 64
    df = vaex.from_arrays(x=inp1d)
    x_min, x_max = df.minmax(df.x)

    # edges=True returns extra slots around the regular bins (presumably
    # NaN / below-range in front and above-range at the end -- confirm in the
    # vaex docs). The trailing slot is folded into the last regular bin, and
    # that bin is repeated once so the counts line up with the
    # bin_count + 1 edges used by the 'post' step plot.
    counts = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
    counts[-2] += counts[-1]
    counts[-1] = counts[-2]
    counts = counts[2:]

    peak = np.max(counts)
    edges = np.linspace(x_min, x_max, bin_count + 1)
    mean = df.mean(df.x)
    std = df.std(df.x)

    guide_colour = '#34A853'
    for sigmas in range(-3, 4):
        v = sigmas * std + mean
        if sigmas == 0:
            # The mean line is drawn slightly taller and thicker.
            plt.plot([v, v], [0, peak * 1.02], color=guide_colour, linewidth=1)
        else:
            plt.plot([v, v], [0, peak * 0.97], color=guide_colour, linewidth=0.5)

    plt.step(edges, counts, where='post', color='#4285F4', linewidth=1)
    plt.show()
    print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
# Example input: 10**8 samples drawn from a standard normal distribution.
# numpy is imported here because `dis` only imports it inside its own scope,
# so `np` would otherwise be undefined at module level.
import numpy as np

x = np.random.normal(0, 1, (10**8, ))
完整的答案,如果有人现在想知道如何绘制大数据统计:
def dis(inp):
    """Plot a unit-area histogram of ``inp`` and print summary statistics.

    The input is flattened to one dimension (via ``inp.numpy()`` when the
    object provides that method), binned into 64 equal-width bins between its
    minimum and maximum with vaex, and rescaled so that the area under the
    plotted curve equals 1. Vertical guide lines mark the mean and one, two
    and three standard deviations on either side of it.

    Args:
        inp: Anything ``np.reshape`` accepts, or an object exposing
            ``.numpy()``.

    Returns:
        None. The figure is displayed with ``plt.show()`` and the element
        count, mean, std, min and max are printed.
    """
    import numpy as np
    import vaex
    import matplotlib.pyplot as plt

    if getattr(inp, "numpy", None) is not None:
        inp1d = np.reshape(inp.numpy(), [-1])
    else:
        inp1d = np.reshape(inp, [-1])

    bin_count = 64
    df = vaex.from_arrays(x=inp1d)
    x_min, x_max = df.minmax(df.x)

    # edges=True returns extra slots around the regular bins (presumably
    # NaN / below-range in front and above-range at the end -- confirm in the
    # vaex docs). Fold the trailing slot into the last regular bin, then keep
    # only the bin_count regular bins.
    bins = df.count(binby=df.x, shape=bin_count, limits='minmax', edges=True)
    bins[-2] += bins[-1]
    bins = bins[2:-1]

    # Scale AUC to 1. This has to happen BEFORE the counts are duplicated for
    # the outline below: summing the duplicated array would double the
    # population and leave the plotted area at 0.5 instead of 1.
    step = (x_max - x_min) / bin_count
    population = np.sum(bins)
    surface = population * step
    bins = bins / surface

    # Build an explicit step outline: every bin contributes its left and
    # right edge, both at the bin's height.
    edges = np.linspace(x_min, x_max, bin_count + 1)
    left, right = edges[:-1], edges[1:]
    edges = np.reshape(np.array([left, right]).T, [-1])
    bins = np.reshape(np.array([bins, bins]).T, [-1])

    mean = df.mean(df.x)
    std = df.std(df.x)
    hist_height = np.max(bins)

    for i, v in enumerate([x * std + mean for x in range(-3, 4)]):
        if i == 3:
            # Index 3 is the mean itself: draw it taller and thicker.
            plt.plot([v, v], [0, hist_height * 1.02], color='#34A853', linewidth=1)
        else:
            plt.plot([v, v], [0, hist_height * 0.97], color='#34A853', linewidth=0.5)

    plt.fill_between(edges, bins, step="pre", alpha=0.3)
    plt.plot(edges, bins, color='#4285F4', linewidth=1)
    plt.show()
    print(f'{str(inp.shape) + " " if getattr(inp, "shape", None) is not None and inp.ndim > 1 else ""}{len(inp1d):,}\nmean: {mean}\nstd: {std}\nmin: {x_min}\nmax: {x_max}')
致版主:这个网站不允许我只发布代码,即使代码本身就是答案,并提示:"看起来你的帖子主要是代码;请添加更多详细信息。"
思路是对数据集进行归一化,即将每个柱的高度除以直方图的 AUC(曲线下面积)。
在"plt.step(...)"之前(最好在计算 hist_height 之前,这样参考线的高度也会随之缩放)写:
# Rescale the bin counts so the area under the step curve equals 1.
step = (x_max - x_min) / bin_count  # width of one bin
# In the question's code the last entry of `bins` is a copy of the one before
# it (added only so the 'post' step plot has bin_count + 1 points), so it is
# left out of the total to avoid counting the last bin twice.
population = np.sum(bins[:-1])
surface = population * step  # area under the un-normalised histogram
bins = bins / surface
希望能有所帮助