我怎样才能 bootstrap 一个numpy数组的最里面的数组?
How can I bootstrap the innermost array of a numpy array?
我有一个这些维度的 numpy 数组
data.shape (categories, models, types, events): (10, 11, 50, 100)
现在我只想在最里面的数组 (100) 中执行 sample with replacement
。对于像这样的单个数组:
data[0][0][0]
array([ 40.448624 , 39.459843 , 33.76762 , 38.944622 , 21.407362 ,
35.55499 , 68.5111 , 16.512974 , 21.118315 , 18.447166 ,
16.026619 , 21.596252 , 41.798622 , 63.01645 , 46.886642 ,
68.874756 , 17.472408 , 53.015724 , 85.41213 , 59.388977 ,
17.352108 , 61.161705 , 23.430847 , 20.203123 , 22.73194 ,
77.40547 , 43.02974 , 29.745787 , 21.50163 , 13.820962 ,
46.91466 , 41.43656 , 18.008326 , 13.122162 , 59.79936 ,
94.555305 , 24.798452 , 30.362497 , 13.629236 , 10.792178 ,
35.298515 , 20.904285 , 15.409604 , 20.567234 , 46.376335 ,
13.82727 , 17.970661 , 18.408686 , 21.987917 , 21.30094 ,
24.26776 , 27.399046 , 49.16879 , 21.831453 , 66.577 ,
15.524615 , 18.091696 , 24.346598 , 24.709772 , 19.068447 ,
24.221592 , 25.244864 , 52.865868 , 22.860783 , 23.586731 ,
18.928782 , 21.960285 , 74.77856 , 15.176119 , 20.795431 ,
14.3638935, 35.937237 , 29.993324 , 30.848495 , 48.145336 ,
38.02541 , 101.15249 , 49.801117 , 38.123184 , 12.041505 ,
18.788296 , 20.53382 , 31.20367 , 19.76104 , 92.56279 ,
41.62944 , 23.53344 , 18.967432 , 14.781404 , 20.02018 ,
27.736559 , 16.108913 , 44.935062 , 12.629299 , 34.65672 ,
20.60169 , 21.779675 , 31.585844 , 23.768578 , 92.463196 ],
dtype=float32)
我可以 sample with replacement
使用这个:np.random.choice(data[0][0][0], 100)
,我会做数千次。
array([ 13.629236, 92.56279 , 21.960285, 20.567234, 21.50163 ,
16.026619, 20.203123, 23.430847, 16.512974, 15.524615,
18.967432, 22.860783, 85.41213 , 21.779675, 23.586731,
24.26776 , 66.577 , 20.904285, 19.068447, 21.960285,
68.874756, 31.585844, 23.586731, 61.161705, 101.15249 ,
59.79936 , 16.512974, 43.02974 , 16.108913, 24.26776 ,
23.430847, 14.781404, 40.448624, 13.629236, 24.26776 ,
19.068447, 16.026619, 16.512974, 16.108913, 77.40547 ,
12.629299, 31.585844, 24.798452, 18.967432, 14.781404,
23.430847, 49.16879 , 18.408686, 22.73194 , 10.792178,
16.108913, 18.967432, 12.041505, 85.41213 , 41.62944 ,
31.20367 , 17.970661, 29.745787, 39.459843, 10.792178,
43.02974 , 21.831453, 21.50163 , 24.798452, 30.362497,
21.50163 , 18.788296, 20.904285, 17.352108, 41.798622,
18.447166, 16.108913, 19.068447, 61.161705, 52.865868,
20.795431, 85.41213 , 49.801117, 13.82727 , 18.928782,
41.43656 , 46.886642, 92.56279 , 41.62944 , 18.091696,
20.60169 , 48.145336, 20.53382 , 40.448624, 20.60169 ,
23.586731, 22.73194 , 92.56279 , 94.555305, 22.73194 ,
17.352108, 46.886642, 27.399046, 18.008326, 15.176119],
dtype=float32)
但是由于 np.random.choice 中没有 axis
,我该如何对所有数组(即(类别、模型、类型))执行此操作?还是遍历它是唯一的选择?
databoot = []
for i in range(5):
idx = np.random.choice(100, 100)
databoot.append(data[:,:,:,idx])
databoot
的形状 -> (5, 10, 11, 50, 100)
data
的形状 -> (10, 11, 50, 100)
fastest/simplest 答案原来是基于对数组的扁平化版本进行索引:
def resampFlat(arr, reps):
n = arr.shape[-1]
# create an array to shift random indexes as needed
shift = np.repeat(np.arange(0, arr.size, n), n).reshape(arr.shape)
# get a flat view of the array
arrflat = arr.ravel()
# sample the array by generating random ints and shifting them appropriately
return np.array([arrflat[np.random.randint(0, n, arr.shape) + shift]
for i in range(reps)])
计时确认这是最快的答案。
时间
我测试了上面的 resampFlat
函数以及一个更简单的 for
基于循环的解决方案:
def resampFor(arr, reps):
# store the shape for the return value
shape = arr.shape
# flatten all dimensions of arr except the last
arr = arr.reshape(-1, arr.shape[-1])
# preallocate the return value
ret = np.empty((reps, *arr.shape), dtype=arr.dtype)
# generate the indices of the resampled values
idxs = np.random.randint(0, arr.shape[-1], (reps, *arr.shape))
for rep,idx in zip(ret, idxs):
# iterate over the resampled replicates
for row,rowrep,i in zip(arr, rep, idx):
# iterate over the event arrays within a replicate
rowrep[...] = row[i]
# give the return value the appropriate shape
return ret.reshape((reps, *shape))
以及基于 Paul Panzer 奇特的索引方法的解决方案:
def resampFancyIdx(arr, reps):
idx = np.random.randint(0, arr.shape[-1], (reps, *data.shape))
_, I, J, K, _ = np.ogrid[tuple(map(slice, (0, *arr.shape[:-1], 0)))]
return arr[I, J, K, idx]
我测试了以下数据:
shape = ((10, 11, 50, 100))
data = np.arange(np.prod(shape)).reshape(shape)
这是数组展平方法的结果:
%%timeit
resampFlat(data, 100)
1.25 s ± 9.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
for
循环方法的结果:
%%timeit
resampFor(data, 100)
1.66 s ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
来自 Paul 的精美索引:
%%timeit
resampFancyIdx(data, 100)
1.42 s ± 16.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
出乎我的意料,resampFancyIdx
击败了 resampFor
,实际上我不得不相当努力地工作才能想出更好的东西。在这一点上,我真的很想更好地解释花式索引在 C 级是如何工作的,以及它为什么如此高效。
您可以绘制样本的索引,然后应用花哨的索引:
>>> import numpy as np
>>>
>>> (categories, models, types, events) = (10, 11, 50, 100)
>>> data = np.random.random((categories, models, types, events))
>>> N_samples = 1000
>>>
>>> idx = np.random.randint(0, events, (categories, models, types, N_samples))
>>> I, J, K, _ = np.ogrid[:categories, :models, :types, :0]
>>>
>>> resampled = data[I, J, K, idx]
一个具体的小例子。这些字段标有 "category"(A 或 B)、"model"(a 或 b)和 "type"(1 或 2),以便于验证抽样确实保留了这些。
>>> I, J, K, L = np.ix_(*(np.array(list(x), 'O') for x in ('AB', 'ab', '12', 'xyzw')))
>>> data = I+J+K+L
>>> data
array([[[['Aa1x', 'Aa1y', 'Aa1z', 'Aa1w'],
['Aa2x', 'Aa2y', 'Aa2z', 'Aa2w']],
[['Ab1x', 'Ab1y', 'Ab1z', 'Ab1w'],
['Ab2x', 'Ab2y', 'Ab2z', 'Ab2w']]],
[[['Ba1x', 'Ba1y', 'Ba1z', 'Ba1w'],
['Ba2x', 'Ba2y', 'Ba2z', 'Ba2w']],
[['Bb1x', 'Bb1y', 'Bb1z', 'Bb1w'],
['Bb2x', 'Bb2y', 'Bb2z', 'Bb2w']]]], dtype=object)
>>> N_samples = 3
>>>
>>> idx = np.random.randint(0, data.shape[-1], (N_samples, *data.shape))
>>> _, I, J, K, _ = np.ogrid[tuple(map(slice, (0, *data.shape[:-1], 0)))]
>>>
>>> resampled = data[I, J, K, idx]
>>> res
ResourceWarning resampled
>>> resampled
array([[[[['Aa1z', 'Aa1y', 'Aa1y', 'Aa1x'],
['Aa2y', 'Aa2z', 'Aa2z', 'Aa2z']],
[['Ab1w', 'Ab1z', 'Ab1y', 'Ab1x'],
['Ab2y', 'Ab2w', 'Ab2y', 'Ab2w']]],
[[['Ba1z', 'Ba1y', 'Ba1y', 'Ba1x'],
['Ba2x', 'Ba2x', 'Ba2z', 'Ba2x']],
[['Bb1x', 'Bb1x', 'Bb1y', 'Bb1z'],
['Bb2y', 'Bb2w', 'Bb2y', 'Bb2z']]]],
[[[['Aa1x', 'Aa1w', 'Aa1x', 'Aa1z'],
['Aa2y', 'Aa2y', 'Aa2x', 'Aa2z']],
[['Ab1y', 'Ab1x', 'Ab1w', 'Ab1z'],
['Ab2w', 'Ab2x', 'Ab2w', 'Ab2w']]],
[[['Ba1x', 'Ba1z', 'Ba1x', 'Ba1z'],
['Ba2x', 'Ba2y', 'Ba2y', 'Ba2w']],
[['Bb1z', 'Bb1w', 'Bb1y', 'Bb1w'],
['Bb2w', 'Bb2x', 'Bb2w', 'Bb2z']]]],
[[[['Aa1w', 'Aa1w', 'Aa1w', 'Aa1y'],
['Aa2z', 'Aa2x', 'Aa2y', 'Aa2x']],
[['Ab1z', 'Ab1z', 'Ab1x', 'Ab1y'],
['Ab2w', 'Ab2x', 'Ab2x', 'Ab2y']]],
[[['Ba1w', 'Ba1x', 'Ba1y', 'Ba1y'],
['Ba2z', 'Ba2x', 'Ba2x', 'Ba2x']],
[['Bb1z', 'Bb1w', 'Bb1x', 'Bb1x'],
['Bb2z', 'Bb2x', 'Bb2w', 'Bb2z']]]]], dtype=object)
我有一个这些维度的 numpy 数组
data.shape (categories, models, types, events): (10, 11, 50, 100)
现在我只想在最里面的数组 (100) 中执行 sample with replacement
。对于像这样的单个数组:
data[0][0][0]
array([ 40.448624 , 39.459843 , 33.76762 , 38.944622 , 21.407362 ,
35.55499 , 68.5111 , 16.512974 , 21.118315 , 18.447166 ,
16.026619 , 21.596252 , 41.798622 , 63.01645 , 46.886642 ,
68.874756 , 17.472408 , 53.015724 , 85.41213 , 59.388977 ,
17.352108 , 61.161705 , 23.430847 , 20.203123 , 22.73194 ,
77.40547 , 43.02974 , 29.745787 , 21.50163 , 13.820962 ,
46.91466 , 41.43656 , 18.008326 , 13.122162 , 59.79936 ,
94.555305 , 24.798452 , 30.362497 , 13.629236 , 10.792178 ,
35.298515 , 20.904285 , 15.409604 , 20.567234 , 46.376335 ,
13.82727 , 17.970661 , 18.408686 , 21.987917 , 21.30094 ,
24.26776 , 27.399046 , 49.16879 , 21.831453 , 66.577 ,
15.524615 , 18.091696 , 24.346598 , 24.709772 , 19.068447 ,
24.221592 , 25.244864 , 52.865868 , 22.860783 , 23.586731 ,
18.928782 , 21.960285 , 74.77856 , 15.176119 , 20.795431 ,
14.3638935, 35.937237 , 29.993324 , 30.848495 , 48.145336 ,
38.02541 , 101.15249 , 49.801117 , 38.123184 , 12.041505 ,
18.788296 , 20.53382 , 31.20367 , 19.76104 , 92.56279 ,
41.62944 , 23.53344 , 18.967432 , 14.781404 , 20.02018 ,
27.736559 , 16.108913 , 44.935062 , 12.629299 , 34.65672 ,
20.60169 , 21.779675 , 31.585844 , 23.768578 , 92.463196 ],
dtype=float32)
我可以 sample with replacement
使用这个:np.random.choice(data[0][0][0], 100)
,我会做数千次。
array([ 13.629236, 92.56279 , 21.960285, 20.567234, 21.50163 ,
16.026619, 20.203123, 23.430847, 16.512974, 15.524615,
18.967432, 22.860783, 85.41213 , 21.779675, 23.586731,
24.26776 , 66.577 , 20.904285, 19.068447, 21.960285,
68.874756, 31.585844, 23.586731, 61.161705, 101.15249 ,
59.79936 , 16.512974, 43.02974 , 16.108913, 24.26776 ,
23.430847, 14.781404, 40.448624, 13.629236, 24.26776 ,
19.068447, 16.026619, 16.512974, 16.108913, 77.40547 ,
12.629299, 31.585844, 24.798452, 18.967432, 14.781404,
23.430847, 49.16879 , 18.408686, 22.73194 , 10.792178,
16.108913, 18.967432, 12.041505, 85.41213 , 41.62944 ,
31.20367 , 17.970661, 29.745787, 39.459843, 10.792178,
43.02974 , 21.831453, 21.50163 , 24.798452, 30.362497,
21.50163 , 18.788296, 20.904285, 17.352108, 41.798622,
18.447166, 16.108913, 19.068447, 61.161705, 52.865868,
20.795431, 85.41213 , 49.801117, 13.82727 , 18.928782,
41.43656 , 46.886642, 92.56279 , 41.62944 , 18.091696,
20.60169 , 48.145336, 20.53382 , 40.448624, 20.60169 ,
23.586731, 22.73194 , 92.56279 , 94.555305, 22.73194 ,
17.352108, 46.886642, 27.399046, 18.008326, 15.176119],
dtype=float32)
但是由于 np.random.choice 中没有 axis
,我该如何对所有数组(即(类别、模型、类型))执行此操作?还是遍历它是唯一的选择?
databoot = []
for i in range(5):
idx = np.random.choice(100, 100)
databoot.append(data[:,:,:,idx])
databoot
的形状 -> (5, 10, 11, 50, 100)data
的形状 -> (10, 11, 50, 100)
fastest/simplest 答案原来是基于对数组的扁平化版本进行索引:
def resampFlat(arr, reps):
n = arr.shape[-1]
# create an array to shift random indexes as needed
shift = np.repeat(np.arange(0, arr.size, n), n).reshape(arr.shape)
# get a flat view of the array
arrflat = arr.ravel()
# sample the array by generating random ints and shifting them appropriately
return np.array([arrflat[np.random.randint(0, n, arr.shape) + shift]
for i in range(reps)])
计时确认这是最快的答案。
时间
我测试了上面的 resampFlat
函数以及一个更简单的 for
基于循环的解决方案:
def resampFor(arr, reps):
# store the shape for the return value
shape = arr.shape
# flatten all dimensions of arr except the last
arr = arr.reshape(-1, arr.shape[-1])
# preallocate the return value
ret = np.empty((reps, *arr.shape), dtype=arr.dtype)
# generate the indices of the resampled values
idxs = np.random.randint(0, arr.shape[-1], (reps, *arr.shape))
for rep,idx in zip(ret, idxs):
# iterate over the resampled replicates
for row,rowrep,i in zip(arr, rep, idx):
# iterate over the event arrays within a replicate
rowrep[...] = row[i]
# give the return value the appropriate shape
return ret.reshape((reps, *shape))
以及基于 Paul Panzer 奇特的索引方法的解决方案:
def resampFancyIdx(arr, reps):
idx = np.random.randint(0, arr.shape[-1], (reps, *data.shape))
_, I, J, K, _ = np.ogrid[tuple(map(slice, (0, *arr.shape[:-1], 0)))]
return arr[I, J, K, idx]
我测试了以下数据:
shape = ((10, 11, 50, 100))
data = np.arange(np.prod(shape)).reshape(shape)
这是数组展平方法的结果:
%%timeit
resampFlat(data, 100)
1.25 s ± 9.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
for
循环方法的结果:
%%timeit
resampFor(data, 100)
1.66 s ± 16.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
来自 Paul 的精美索引:
%%timeit
resampFancyIdx(data, 100)
1.42 s ± 16.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
出乎我的意料,resampFancyIdx
击败了 resampFor
,实际上我不得不相当努力地工作才能想出更好的东西。在这一点上,我真的很想更好地解释花式索引在 C 级是如何工作的,以及它为什么如此高效。
您可以绘制样本的索引,然后应用花哨的索引:
>>> import numpy as np
>>>
>>> (categories, models, types, events) = (10, 11, 50, 100)
>>> data = np.random.random((categories, models, types, events))
>>> N_samples = 1000
>>>
>>> idx = np.random.randint(0, events, (categories, models, types, N_samples))
>>> I, J, K, _ = np.ogrid[:categories, :models, :types, :0]
>>>
>>> resampled = data[I, J, K, idx]
一个具体的小例子。这些字段标有 "category"(A 或 B)、"model"(a 或 b)和 "type"(1 或 2),以便于验证抽样确实保留了这些。
>>> I, J, K, L = np.ix_(*(np.array(list(x), 'O') for x in ('AB', 'ab', '12', 'xyzw')))
>>> data = I+J+K+L
>>> data
array([[[['Aa1x', 'Aa1y', 'Aa1z', 'Aa1w'],
['Aa2x', 'Aa2y', 'Aa2z', 'Aa2w']],
[['Ab1x', 'Ab1y', 'Ab1z', 'Ab1w'],
['Ab2x', 'Ab2y', 'Ab2z', 'Ab2w']]],
[[['Ba1x', 'Ba1y', 'Ba1z', 'Ba1w'],
['Ba2x', 'Ba2y', 'Ba2z', 'Ba2w']],
[['Bb1x', 'Bb1y', 'Bb1z', 'Bb1w'],
['Bb2x', 'Bb2y', 'Bb2z', 'Bb2w']]]], dtype=object)
>>> N_samples = 3
>>>
>>> idx = np.random.randint(0, data.shape[-1], (N_samples, *data.shape))
>>> _, I, J, K, _ = np.ogrid[tuple(map(slice, (0, *data.shape[:-1], 0)))]
>>>
>>> resampled = data[I, J, K, idx]
>>> res
ResourceWarning resampled
>>> resampled
array([[[[['Aa1z', 'Aa1y', 'Aa1y', 'Aa1x'],
['Aa2y', 'Aa2z', 'Aa2z', 'Aa2z']],
[['Ab1w', 'Ab1z', 'Ab1y', 'Ab1x'],
['Ab2y', 'Ab2w', 'Ab2y', 'Ab2w']]],
[[['Ba1z', 'Ba1y', 'Ba1y', 'Ba1x'],
['Ba2x', 'Ba2x', 'Ba2z', 'Ba2x']],
[['Bb1x', 'Bb1x', 'Bb1y', 'Bb1z'],
['Bb2y', 'Bb2w', 'Bb2y', 'Bb2z']]]],
[[[['Aa1x', 'Aa1w', 'Aa1x', 'Aa1z'],
['Aa2y', 'Aa2y', 'Aa2x', 'Aa2z']],
[['Ab1y', 'Ab1x', 'Ab1w', 'Ab1z'],
['Ab2w', 'Ab2x', 'Ab2w', 'Ab2w']]],
[[['Ba1x', 'Ba1z', 'Ba1x', 'Ba1z'],
['Ba2x', 'Ba2y', 'Ba2y', 'Ba2w']],
[['Bb1z', 'Bb1w', 'Bb1y', 'Bb1w'],
['Bb2w', 'Bb2x', 'Bb2w', 'Bb2z']]]],
[[[['Aa1w', 'Aa1w', 'Aa1w', 'Aa1y'],
['Aa2z', 'Aa2x', 'Aa2y', 'Aa2x']],
[['Ab1z', 'Ab1z', 'Ab1x', 'Ab1y'],
['Ab2w', 'Ab2x', 'Ab2x', 'Ab2y']]],
[[['Ba1w', 'Ba1x', 'Ba1y', 'Ba1y'],
['Ba2z', 'Ba2x', 'Ba2x', 'Ba2x']],
[['Bb1z', 'Bb1w', 'Bb1x', 'Bb1x'],
['Bb2z', 'Bb2x', 'Bb2w', 'Bb2z']]]]], dtype=object)