不同大小的分层抽样
Stratified Sampling with different sizes
我正在尝试编写一个分层抽样函数,它接收一个用 faker 模块生成的数据框,以及分层变量(层)、样本大小和随机种子。对于样本大小,我希望每个层的样本数量可以由用户分别指定。下面是我生成数据的代码:
import pandas as pd
import numpy as np
import random as rn#generating random numbers
from faker import Faker
fake = Faker()

# Build 100 synthetic enumeration-area records; each row is one dict of
# randomly generated identifiers and counts.
frame_fake = pd.DataFrame(
    [
        {
            "region": fake.random_number(1, fix_len=True),
            "district": fake.random_number(2, fix_len=True),
            "enum_area": fake.random_number(5, fix_len=True),
            "hhs": fake.random_number(3),
            "pop": fake.random_number(4),
            "area": fake.random_number(1),
        }
        for x in range(100)
    ]
)

# enum_area is meant to be unique: flag every repeated value except its last
# occurrence so the duplicates can be inspected (e.g. print(duplicates)).
mask = frame_fake.duplicated('enum_area', keep='last')
duplicates = frame_fake[mask]

# Keep only the last occurrence of each enum_area, order the rows by
# enum_area, and renumber the index 0..n-1 without keeping the old index.
frame_fake = (
    frame_fake
    .drop_duplicates('enum_area', keep='last')
    .sort_values(by='enum_area', ascending=True)
    .reset_index(drop=True)
)
frame_fake
这是采样代码:
# Stratified sampling: intended to draw, from each stratum of `data` (groups of
# the `strata` column), the number of rows given for that stratum in
# `sample_size`, and to attach each row's inclusion probability.
# NOTE(review): this is the failing version shown in the question; the code is
# left exactly as posted (indentation was lost and two statements are wrapped
# across lines). See the notes below for the likely cause of the error.
def stratified_custom(data,strata,sample_size, seed=None):
# Per-stratum sample sizes come from `sample_size` (looked up by stratum key).
# we groupby strata and use the transform method with 'count' parameter
# to get strata sizes
# NOTE(review): this assigns a new column on the caller's DataFrame (no copy).
data['strat_size'] = data.groupby(strata)[strata].transform('count')
# map input sample size to each strata
data['strat_sample_size'] = data[strata].map(sample_size)
# groupby strata, get sample size per stratum, cast to int and reset
# index.
# NOTE(review): the next two lines look like one statement wrapped by the post
# formatting. Because of the trailing reset_index(), smp_size ends up being a
# table (one row per stratum), not a single integer.
smp_size = data.groupby(strata)
['strat_sample_size'].unique().astype(int).reset_index()
# groupby strata and select sample per stratum based on the sample size
# for that strata
# NOTE(review): the whole smp_size table is passed as the sample count for
# every group instead of that group's own size; this is presumably what raises
# "The truth value of a DataFrame is ambiguous" -- confirm against traceback.
sample = (data.groupby(strata, group_keys=False)
.apply(lambda x: x.sample(smp_size,random_state=seed)))
# probability of inclusion
# (the assignment below is also wrapped across two lines in the post)
sample['inclusion_prob'] =
sample['strat_sample_size']/sample['strat_size']
return sample
# Requested sample size per stratum, passed as a dict: region code -> number
# of rows to draw from that region.
s_size = {
    1: 7,
    2: 5,
    3: 5,
    4: 5,
    5: 5,
    6: 5,
    7: 5,
    8: 5,
    9: 8,
}

# Draw the sample with a fixed seed and show it ordered by region, then by
# enumeration area.
(
    stratified_custom(
        data=frame_fake,
        strata='region',
        sample_size=s_size,
        seed=99,
    )
    .sort_values(by=['region', 'enum_area'], ascending=True)
)
但是我收到这个错误:
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty,
a.bool(), a.item(), a.any() or a.all().
我不明白这个错误在说什么。感谢任何帮助。
经过大量研究,我偶然发现了一篇帖子,并把其中的做法用到了我的代码里:同一个函数既可以按每层不同的样本大小抽样,也可以按固定的样本大小抽样。下面是我生成数据的代码:
import pandas as pd
import numpy as np
import random as rn
from faker import Faker
# Seed every random source used below so the generated frame is reproducible.
# Faker.seed() seeds Faker's generator; the "area" column is drawn from the
# standard `random` module instead, so that module is seeded separately.
Faker.seed(99)
rn.seed(99)
fake = Faker()

# Build 100 synthetic enumeration-area records, one dict per row.
frame_fake = pd.DataFrame(
    [
        {
            "region": fake.random_number(1, fix_len=True),
            "district": fake.random_number(2, fix_len=True),
            "enum_area": fake.random_number(5, fix_len=True),
            "hhs": fake.random_number(3),
            "pop": fake.random_number(4),
            "area": rn.randint(1, 2),
        }
        for x in range(100)
    ]
)

# enum_area should be unique: keep the last occurrence of each value, order
# the rows by enum_area, and renumber the index without keeping the old one.
frame_fake = (
    frame_fake
    .drop_duplicates('enum_area', keep='last')
    .sort_values(by='enum_area', ascending=True)
    .reset_index(drop=True)
)
这是分层抽样的更新代码,现在可以使用了。
def stratified_custom(data, strata, sample_size, seed=None):
    """Draw a stratified random sample from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Sampling frame. It is copied, so the caller's frame is not modified.
    strata : str
        Name of the column whose values define the strata.
    sample_size : int or mapping
        Either one integer (the same number of rows is drawn from every
        stratum) or a mapping ``{stratum value: number of rows}`` (anything
        accepted by ``Series.map``) giving a size per stratum.
    seed : int, optional
        Passed as ``random_state`` to every per-stratum draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (original index kept, strata in sorted order) with
        three extra columns: ``strat_size`` (rows in the stratum),
        ``strat_sample_size`` (rows requested from it) and ``inclusion_prob``
        (``strat_sample_size / strat_size``).

    Raises
    ------
    ValueError
        If a stratum present in ``data`` has no entry in a per-stratum
        ``sample_size``, or (raised by ``DataFrame.sample``) if more rows are
        requested from a stratum than it contains.
    """
    data = data.copy()
    data['strat_size'] = data.groupby(strata)[strata].transform('count')

    # Decide explicitly between "one size for all" and "size per stratum"
    # instead of relying on an exception to tell the two apart.
    if isinstance(sample_size, (int, np.integer)):
        data['strat_sample_size'] = sample_size
    else:
        data['strat_sample_size'] = data[strata].map(sample_size)
        # Rows whose stratum key is itself missing are skipped by groupby, so
        # only complain about real strata that have no requested size.
        unmapped = data['strat_sample_size'].isna() & data[strata].notna()
        if unmapped.any():
            missing = data.loc[unmapped, strata].unique().tolist()
            raise ValueError(
                "sample_size has no entry for strata: {}".format(missing)
            )

    # Sample each stratum on its own and stitch the pieces together. Iterating
    # the groups (rather than groupby.apply) keeps the strata column in every
    # piece regardless of how apply treats grouping columns.
    pieces = [
        group.sample(n=group['strat_sample_size'].iloc[0], random_state=seed)
        for _, group in data.groupby(strata)
    ]
    # pd.concat rejects an empty list, so an empty frame yields an empty sample.
    strat2_sample = pd.concat(pieces) if pieces else data.iloc[:0].copy()

    strat2_sample['inclusion_prob'] = (
        strat2_sample['strat_sample_size'] / strat2_sample['strat_size']
    )
    return strat2_sample
# Per-region sample sizes for the variable-size run: region code -> rows.
s_size = {
    1: 3,
    2: 9,
    3: 5,
    4: 5,
    5: 5,
    6: 5,
    7: 5,
    8: 5,
    9: 8,
}

# Variable sample size: each region contributes the number of rows listed in
# s_size. Only the first rows of the sorted result are kept for display.
variablesize = stratified_custom(
    data=frame_fake, strata='region', sample_size=s_size, seed=99
)
variablesize = variablesize.sort_values(
    by=['region', 'enum_area'], ascending=True
).head()
variablesize

# Fixed sample size: every region contributes 3 rows.
fixedsize = stratified_custom(
    data=frame_fake, strata='region', sample_size=3, seed=99
)
fixedsize = fixedsize.sort_values(
    by=['region', 'enum_area'], ascending=True
).head()
fixedsize
可变样本大小的输出:
region district enum_area ... strat_size strat_sample_size inclusion_prob
5 1 60 14737 ... 5 3 0.6
26 1 42 34017 ... 5 3 0.6
68 1 31 72092 ... 5 3 0.6
0 2 65 10566 ... 10 9 0.9
15 2 22 25560 ... 10 9 0.9
固定样本量的输出:
region district enum_area ... strat_size strat_sample_size inclusion_prob
5 1 60 14737 ... 5 3 0.6
26 1 42 34017 ... 5 3 0.6
68 1 31 72092 ... 5 3 0.6
38 2 74 48408 ... 10 3 0.3
43 2 15 56365 ... 10 3 0.3
但是我想知道是否有更好的方法来实现这个目标?
我正在尝试编写一个分层抽样函数,它接收一个用 faker 模块生成的数据框,以及分层变量(层)、样本大小和随机种子。对于样本大小,我希望每个层的样本数量可以由用户分别指定。下面是我生成数据的代码:
import pandas as pd
import numpy as np
import random as rn#generating random numbers
from faker import Faker
fake = Faker()

# Build 100 synthetic enumeration-area records; each row is one dict of
# randomly generated identifiers and counts.
frame_fake = pd.DataFrame(
    [
        {
            "region": fake.random_number(1, fix_len=True),
            "district": fake.random_number(2, fix_len=True),
            "enum_area": fake.random_number(5, fix_len=True),
            "hhs": fake.random_number(3),
            "pop": fake.random_number(4),
            "area": fake.random_number(1),
        }
        for x in range(100)
    ]
)

# enum_area is meant to be unique: flag every repeated value except its last
# occurrence so the duplicates can be inspected (e.g. print(duplicates)).
mask = frame_fake.duplicated('enum_area', keep='last')
duplicates = frame_fake[mask]

# Keep only the last occurrence of each enum_area, order the rows by
# enum_area, and renumber the index 0..n-1 without keeping the old index.
frame_fake = (
    frame_fake
    .drop_duplicates('enum_area', keep='last')
    .sort_values(by='enum_area', ascending=True)
    .reset_index(drop=True)
)
frame_fake
这是采样代码:
# Stratified sampling: intended to draw, from each stratum of `data` (groups of
# the `strata` column), the number of rows given for that stratum in
# `sample_size`, and to attach each row's inclusion probability.
# NOTE(review): this is the failing version shown in the question; the code is
# left exactly as posted (indentation was lost and two statements are wrapped
# across lines). See the notes below for the likely cause of the error.
def stratified_custom(data,strata,sample_size, seed=None):
# Per-stratum sample sizes come from `sample_size` (looked up by stratum key).
# we groupby strata and use the transform method with 'count' parameter
# to get strata sizes
# NOTE(review): this assigns a new column on the caller's DataFrame (no copy).
data['strat_size'] = data.groupby(strata)[strata].transform('count')
# map input sample size to each strata
data['strat_sample_size'] = data[strata].map(sample_size)
# groupby strata, get sample size per stratum, cast to int and reset
# index.
# NOTE(review): the next two lines look like one statement wrapped by the post
# formatting. Because of the trailing reset_index(), smp_size ends up being a
# table (one row per stratum), not a single integer.
smp_size = data.groupby(strata)
['strat_sample_size'].unique().astype(int).reset_index()
# groupby strata and select sample per stratum based on the sample size
# for that strata
# NOTE(review): the whole smp_size table is passed as the sample count for
# every group instead of that group's own size; this is presumably what raises
# "The truth value of a DataFrame is ambiguous" -- confirm against traceback.
sample = (data.groupby(strata, group_keys=False)
.apply(lambda x: x.sample(smp_size,random_state=seed)))
# probability of inclusion
# (the assignment below is also wrapped across two lines in the post)
sample['inclusion_prob'] =
sample['strat_sample_size']/sample['strat_size']
return sample
# Requested sample size per stratum, passed as a dict: region code -> number
# of rows to draw from that region.
s_size = {
    1: 7,
    2: 5,
    3: 5,
    4: 5,
    5: 5,
    6: 5,
    7: 5,
    8: 5,
    9: 8,
}

# Draw the sample with a fixed seed and show it ordered by region, then by
# enumeration area.
(
    stratified_custom(
        data=frame_fake,
        strata='region',
        sample_size=s_size,
        seed=99,
    )
    .sort_values(by=['region', 'enum_area'], ascending=True)
)
但是我收到这个错误:
ValueError: The truth value of a DataFrame is ambiguous. Use a.empty,
a.bool(), a.item(), a.any() or a.all().
我不明白这个错误在说什么。感谢任何帮助。
经过大量研究,我偶然发现了一篇帖子,并把其中的做法用到了我的代码里。下面是我生成数据的代码:
import pandas as pd
import numpy as np
import random as rn
from faker import Faker
# Seed every random source used below so the generated frame is reproducible.
# Faker.seed() seeds Faker's generator; the "area" column is drawn from the
# standard `random` module instead, so that module is seeded separately.
Faker.seed(99)
rn.seed(99)
fake = Faker()

# Build 100 synthetic enumeration-area records, one dict per row.
frame_fake = pd.DataFrame(
    [
        {
            "region": fake.random_number(1, fix_len=True),
            "district": fake.random_number(2, fix_len=True),
            "enum_area": fake.random_number(5, fix_len=True),
            "hhs": fake.random_number(3),
            "pop": fake.random_number(4),
            "area": rn.randint(1, 2),
        }
        for x in range(100)
    ]
)

# enum_area should be unique: keep the last occurrence of each value, order
# the rows by enum_area, and renumber the index without keeping the old one.
frame_fake = (
    frame_fake
    .drop_duplicates('enum_area', keep='last')
    .sort_values(by='enum_area', ascending=True)
    .reset_index(drop=True)
)
这是分层抽样的更新代码,现在可以使用了。
def stratified_custom(data, strata, sample_size, seed=None):
    """Draw a stratified random sample from ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Sampling frame. It is copied, so the caller's frame is not modified.
    strata : str
        Name of the column whose values define the strata.
    sample_size : int or mapping
        Either one integer (the same number of rows is drawn from every
        stratum) or a mapping ``{stratum value: number of rows}`` (anything
        accepted by ``Series.map``) giving a size per stratum.
    seed : int, optional
        Passed as ``random_state`` to every per-stratum draw.

    Returns
    -------
    pandas.DataFrame
        The sampled rows (original index kept, strata in sorted order) with
        three extra columns: ``strat_size`` (rows in the stratum),
        ``strat_sample_size`` (rows requested from it) and ``inclusion_prob``
        (``strat_sample_size / strat_size``).

    Raises
    ------
    ValueError
        If a stratum present in ``data`` has no entry in a per-stratum
        ``sample_size``, or (raised by ``DataFrame.sample``) if more rows are
        requested from a stratum than it contains.
    """
    data = data.copy()
    data['strat_size'] = data.groupby(strata)[strata].transform('count')

    # Decide explicitly between "one size for all" and "size per stratum"
    # instead of relying on an exception to tell the two apart.
    if isinstance(sample_size, (int, np.integer)):
        data['strat_sample_size'] = sample_size
    else:
        data['strat_sample_size'] = data[strata].map(sample_size)
        # Rows whose stratum key is itself missing are skipped by groupby, so
        # only complain about real strata that have no requested size.
        unmapped = data['strat_sample_size'].isna() & data[strata].notna()
        if unmapped.any():
            missing = data.loc[unmapped, strata].unique().tolist()
            raise ValueError(
                "sample_size has no entry for strata: {}".format(missing)
            )

    # Sample each stratum on its own and stitch the pieces together. Iterating
    # the groups (rather than groupby.apply) keeps the strata column in every
    # piece regardless of how apply treats grouping columns.
    pieces = [
        group.sample(n=group['strat_sample_size'].iloc[0], random_state=seed)
        for _, group in data.groupby(strata)
    ]
    # pd.concat rejects an empty list, so an empty frame yields an empty sample.
    strat2_sample = pd.concat(pieces) if pieces else data.iloc[:0].copy()

    strat2_sample['inclusion_prob'] = (
        strat2_sample['strat_sample_size'] / strat2_sample['strat_size']
    )
    return strat2_sample
# Per-region sample sizes for the variable-size run: region code -> rows.
s_size = {
    1: 3,
    2: 9,
    3: 5,
    4: 5,
    5: 5,
    6: 5,
    7: 5,
    8: 5,
    9: 8,
}

# Variable sample size: each region contributes the number of rows listed in
# s_size. Only the first rows of the sorted result are kept for display.
variablesize = stratified_custom(
    data=frame_fake, strata='region', sample_size=s_size, seed=99
)
variablesize = variablesize.sort_values(
    by=['region', 'enum_area'], ascending=True
).head()
variablesize

# Fixed sample size: every region contributes 3 rows.
fixedsize = stratified_custom(
    data=frame_fake, strata='region', sample_size=3, seed=99
)
fixedsize = fixedsize.sort_values(
    by=['region', 'enum_area'], ascending=True
).head()
fixedsize
可变样本大小的输出:
region district enum_area ... strat_size strat_sample_size inclusion_prob
5 1 60 14737 ... 5 3 0.6
26 1 42 34017 ... 5 3 0.6
68 1 31 72092 ... 5 3 0.6
0 2 65 10566 ... 10 9 0.9
15 2 22 25560 ... 10 9 0.9
固定样本量的输出:
region district enum_area ... strat_size strat_sample_size inclusion_prob
5 1 60 14737 ... 5 3 0.6
26 1 42 34017 ... 5 3 0.6
68 1 31 72092 ... 5 3 0.6
38 2 74 48408 ... 10 3 0.3
43 2 15 56365 ... 10 3 0.3
但是我想知道是否有更好的方法来实现这个目标?