通过循环遍历 python 中的列来汇总几列

Summarize several columns with looping through columns in python

我有一个结构非常奇怪的调查数据,如下例所示。在调查期间,先收集了每个家庭的智能手机数量,然后收集了有多少人将每台设备用于特定活动(activity)的信息。

示例:列名格式为 F3_{智能手机编号}_{HH_member_id},所以 F3_1_4 表示 F3 & {家中第一部智能手机}=1 & {使用/共享此设备的家庭成员(Household_member)= 4}

或者,如果家庭中有 3 名成员共用一台设备,则 F3_1_1、F3_1_2、F3_1_3 都将是 1。

我想把每台设备单独取出来,统计有多少部手机被用于该活动(activity),以及每部手机分别由多少人使用。这是我的尝试:

# Accumulator for the per-phone results; one batch of rows is appended to it
# for every value of h in the outer loop below.
df_ph = pd.DataFrame()

   
# h is the smartphone number in the f3_{h}_{i} column names.
# range(1,5) covers 1..4 (the sample data also contains f3_5_* columns).
for h in range(1,5):

  # Long-format (respid, Smartph) rows for phone h; rebuilt from scratch for each h.
  df_shared_ph = pd.DataFrame(None)

  # i is the household-member index in f3_{h}_{i}.
  # range(1,15) covers 1..14 (the sample data also contains f3_*_15 columns).
  for i in range(1,15):
    
    # Take the single f3_{h}_{i} column together with respid and give it the
    # common name "Smartph" so the pieces can be stacked row-wise.
    df_temp_ph = df[["respid", "f3_" + str(h) + "_" + str(i)]].copy()
    df_temp_ph.rename(columns = {"f3_" + str(h) + "_" + str(i): "Smartph"}, inplace = True)
    # Append below the rows collected so far and drop rows whose Smartph is NaN.
    df_shared_ph = pd.concat([df_shared_ph, df_temp_ph], axis=0).dropna(subset=["Smartph"])

  # Collapse to one row per respid for this phone: Smartph summed over the member columns.
  df_shared_ph = df_shared_ph.groupby(['respid']).agg({'Smartph': 'sum'}).reset_index()
  # The rows for phone h are stacked under those of the earlier phones, so a respid
  # that has non-NaN values for several phones appears once per phone in df_ph.
  # This row-wise concat across h is where repeated respid values come from.
  df_ph = pd.concat([df_ph, df_shared_ph], axis=0)

  # Executed on every pass of the outer loop, so each printout counts the rows
  # accumulated for all phones processed so far.
  print("used for X and by how many:\n" + str(df_ph['Smartph'].value_counts()))

我的代码片段可以正常运行,但由于某种原因,结果中有许多行/id(respid)相对于原始数据被重复了,我无法弄清楚原因。我是不是漏掉了什么?有没有其他方法可以做到这一点?

# Evaluates to True only when no respid value is repeated in df_ph.
df_ph.duplicated(['respid']).sum() == 0
False

示例数据:

# output to a dict
# the dict can be converted to a dataframe with 
# df = pd.DataFrame.from_dict(d, orient='index')  # d is the name of the dict


 {0: {'f3_1_1': 1.0, 'f3_1_10': nan,  'f3_1_11': nan,  'f3_1_12': nan,  'f3_1_13': nan,'f3_1_14': nan,   'f3_1_15': nan,  'f3_1_2': 0.0,
    'f3_1_3': 0.0,'f3_1_4': 0.0,'f3_1_5': nan,'f3_1_6': nan,  'f3_1_7': nan,  'f3_1_8': nan,  'f3_1_9': nan,  'f3_2_1': 0.0,  'f3_2_10': nan,
    'f3_2_11': nan,  'f3_2_12': nan,  'f3_2_13': nan,   'f3_2_14': nan,   'f3_2_15': nan,   'f3_2_2': 1.0,  'f3_2_3': 0.0,  'f3_2_4': 0.0,
    'f3_2_5': nan,  'f3_2_6': nan,  'f3_2_7': nan,  'f3_2_8': nan,  'f3_2_9': nan,  'f3_3_1': 0.0,  'f3_3_10': nan,   'f3_3_11': nan,
    'f3_3_12': nan,    'f3_3_13': nan,    'f3_3_14': nan,    'f3_3_15': nan,    'f3_3_2': 0.0,    'f3_3_3': 1.0,    'f3_3_4': 0.0,
    'f3_3_5': nan,    'f3_3_6': nan,    'f3_3_7': nan,    'f3_3_8': nan,    'f3_3_9': nan,    'f3_4_1': 0.0,    'f3_4_10': nan,
    'f3_4_11': nan,    'f3_4_12': nan,    'f3_4_13': nan,    'f3_4_14': nan,    'f3_4_15': nan,    'f3_4_2': 0.0,    'f3_4_3': 0.0,
    'f3_4_4': 1.0,    'f3_4_5': nan,    'f3_4_6': nan,    'f3_4_7': nan,    'f3_4_8': nan,    'f3_4_9': nan,    'f3_5_1': nan,
    'f3_5_10': nan,    'f3_5_11': nan,    'f3_5_12': nan,    'f3_5_13': nan,    'f3_5_14': nan,    'f3_5_15': nan,    'f3_5_2': nan,
    'f3_5_3': nan,    'f3_5_4': nan,    'f3_5_5': nan,    'f3_5_6': nan,    'f3_5_7': nan,    'f3_5_8': nan,    'f3_5_9': nan,
    'respid': 13766.0},
    1: {'f3_1_1': nan,   'f3_1_10': nan,  'f3_1_11': nan, 'f3_1_12': nan, 'f3_1_13': nan, 'f3_1_14': nan,  'f3_1_15': nan,  'f3_1_2': nan,
      'f3_1_3': nan,    'f3_1_4': nan,    'f3_1_5': nan,  'f3_1_6': nan,  'f3_1_7': nan,  'f3_1_8': nan,  'f3_1_9': nan,    'f3_2_1': nan,
      'f3_2_10': nan,      'f3_2_11': nan,      'f3_2_12': nan,      'f3_2_13': nan,      'f3_2_14': nan,      'f3_2_15': nan,      'f3_2_2': nan,
      'f3_2_3': nan,      'f3_2_4': nan,      'f3_2_5': nan,      'f3_2_6': nan,      'f3_2_7': nan,      'f3_2_8': nan,
      'f3_2_9': nan,      'f3_3_1': nan,      'f3_3_10': nan,      'f3_3_11': nan,      'f3_3_12': nan,      'f3_3_13': nan,
      'f3_3_14': nan,      'f3_3_15': nan,      'f3_3_2': nan,      'f3_3_3': nan,      'f3_3_4': nan,      'f3_3_5': nan,
      'f3_3_6': nan,      'f3_3_7': nan,      'f3_3_8': nan,      'f3_3_9': nan,      'f3_4_1': nan,      'f3_4_10': nan,      'f3_4_11': nan,
      'f3_4_12': nan,      'f3_4_13': nan,      'f3_4_14': nan,      'f3_4_15': nan,      'f3_4_2': nan,      'f3_4_3': nan,
      'f3_4_4': nan,      'f3_4_5': nan,      'f3_4_6': nan,      'f3_4_7': nan,      'f3_4_8': nan,      'f3_4_9': nan,      'f3_5_1': nan,
      'f3_5_10': nan,      'f3_5_11': nan,      'f3_5_12': nan,      'f3_5_13': nan,      'f3_5_14': nan,      'f3_5_15': nan,      'f3_5_2': nan,
      'f3_5_3': nan,      'f3_5_4': nan,      'f3_5_5': nan,      'f3_5_6': nan,      'f3_5_7': nan,      'f3_5_8': nan,      'f3_5_9': nan,
      'respid': 16346.0},
    2: {'f3_1_1': 1.0,      'f3_1_10': nan,      'f3_1_11': nan,      'f3_1_12': nan,      'f3_1_13': nan,      'f3_1_14': nan,      'f3_1_15': nan,
      'f3_1_2': 0.0,      'f3_1_3': nan,      'f3_1_4': nan,      'f3_1_5': nan,      'f3_1_6': nan,      'f3_1_7': nan,      'f3_1_8': nan,
      'f3_1_9': nan,      'f3_2_1': 0.0,      'f3_2_10': nan,      'f3_2_11': nan,      'f3_2_12': nan,      'f3_2_13': nan,
      'f3_2_14': nan,      'f3_2_15': nan,      'f3_2_2': 1.0,      'f3_2_3': nan,      'f3_2_4': nan,      'f3_2_5': nan,      'f3_2_6': nan,
      'f3_2_7': nan,      'f3_2_8': nan,      'f3_2_9': nan,      'f3_3_1': nan,      'f3_3_10': nan,      'f3_3_11': nan,      'f3_3_12': nan,
      'f3_3_13': nan,      'f3_3_14': nan,      'f3_3_15': nan,      'f3_3_2': nan,      'f3_3_3': nan,      'f3_3_4': nan,      'f3_3_5': nan,
      'f3_3_6': nan,      'f3_3_7': nan,      'f3_3_8': nan,      'f3_3_9': nan,      'f3_4_1': nan,      'f3_4_10': nan,      'f3_4_11': nan,
      'f3_4_12': nan,      'f3_4_13': nan,      'f3_4_14': nan,      'f3_4_15': nan,      'f3_4_2': nan,      'f3_4_3': nan,      'f3_4_4': nan,
      'f3_4_5': nan,      'f3_4_6': nan,      'f3_4_7': nan,      'f3_4_8': nan,      'f3_4_9': nan,      'f3_5_1': nan,      'f3_5_10': nan,
      'f3_5_11': nan,      'f3_5_12': nan,      'f3_5_13': nan,      'f3_5_14': nan,      'f3_5_15': nan,      'f3_5_2': nan,      'f3_5_3': nan,
      'f3_5_4': nan,      'f3_5_5': nan,      'f3_5_6': nan,      'f3_5_7': nan,      'f3_5_8': nan,      'f3_5_9': nan,      'respid': 11293.0},
    3: {'f3_1_1': nan,
      'f3_1_10': nan,      'f3_1_11': nan,      'f3_1_12': nan,      'f3_1_13': nan,      'f3_1_14': nan,      'f3_1_15': nan,      'f3_1_2': nan,
      'f3_1_3': nan,      'f3_1_4': nan,      'f3_1_5': nan,      'f3_1_6': nan,      'f3_1_7': nan,      'f3_1_8': nan,      'f3_1_9': nan,      'f3_2_1': nan,
      'f3_2_10': nan,      'f3_2_11': nan,      'f3_2_12': nan,      'f3_2_13': nan,      'f3_2_14': nan,      'f3_2_15': nan,      'f3_2_2': nan,
      'f3_2_3': nan,      'f3_2_4': nan,      'f3_2_5': nan,      'f3_2_6': nan,      'f3_2_7': nan,      'f3_2_8': nan,  'f3_2_9': nan,  'f3_3_1': nan,
      'f3_3_10': nan,      'f3_3_11': nan,      'f3_3_12': nan,      'f3_3_13': nan,      'f3_3_14': nan,      'f3_3_15': nan,      'f3_3_2': nan,
      'f3_3_3': nan,      'f3_3_4': nan,      'f3_3_5': nan,      'f3_3_6': nan,      'f3_3_7': nan,      'f3_3_8': nan,      'f3_3_9': nan,
      'f3_4_1': nan,      'f3_4_10': nan,      'f3_4_11': nan,      'f3_4_12': nan,      'f3_4_13': nan,      'f3_4_14': nan,      'f3_4_15': nan,
      'f3_4_2': nan,      'f3_4_3': nan,      'f3_4_4': nan,      'f3_4_5': nan,      'f3_4_6': nan,      'f3_4_7': nan,      'f3_4_8': nan,
      'f3_4_9': nan,      'f3_5_1': nan,      'f3_5_10': nan,      'f3_5_11': nan,      'f3_5_12': nan,      'f3_5_13': nan,      'f3_5_14': nan,
      'f3_5_15': nan,      'f3_5_2': nan,      'f3_5_3': nan,      'f3_5_4': nan,      'f3_5_5': nan,      'f3_5_6': nan,      'f3_5_7': nan,
      'f3_5_8': nan,      'f3_5_9': nan,      'respid': 15965.0},
    4: {'f3_1_1': 1.0,      'f3_1_10': nan,      'f3_1_11': nan,      'f3_1_12': nan,      'f3_1_13': nan,      'f3_1_14': nan,
      'f3_1_15': nan,      'f3_1_2': 0.0,      'f3_1_3': 0.0,      'f3_1_4': nan,      'f3_1_5': nan,      'f3_1_6': nan,      'f3_1_7': nan,
      'f3_1_8': nan,      'f3_1_9': nan,      'f3_2_1': 0.0,      'f3_2_10': nan,      'f3_2_11': nan,      'f3_2_12': nan,      'f3_2_13': nan,
      'f3_2_14': nan,      'f3_2_15': nan,      'f3_2_2': 1.0,      'f3_2_3': 0.0,      'f3_2_4': nan,      'f3_2_5': nan,      'f3_2_6': nan,
      'f3_2_7': nan,      'f3_2_8': nan,      'f3_2_9': nan,      'f3_3_1': 0.0,      'f3_3_10': nan,      'f3_3_11': nan,      'f3_3_12': nan,
      'f3_3_13': nan,      'f3_3_14': nan,      'f3_3_15': nan,      'f3_3_2': 0.0,      'f3_3_3': 1.0,      'f3_3_4': nan,      'f3_3_5': nan,
      'f3_3_6': nan,      'f3_3_7': nan,      'f3_3_8': nan,      'f3_3_9': nan,      'f3_4_1': nan,      'f3_4_10': nan,      'f3_4_11': nan,
      'f3_4_12': nan,      'f3_4_13': nan,      'f3_4_14': nan,      'f3_4_15': nan,      'f3_4_2': nan,      'f3_4_3': nan,      'f3_4_4': nan,
      'f3_4_5': nan,      'f3_4_6': nan,      'f3_4_7': nan,      'f3_4_8': nan,      'f3_4_9': nan,      'f3_5_1': nan,      'f3_5_10': nan,
      'f3_5_11': nan,      'f3_5_12': nan,      'f3_5_13': nan,      'f3_5_14': nan,      'f3_5_15': nan,      'f3_5_2': nan,      'f3_5_3': nan,
      'f3_5_4': nan,      'f3_5_5': nan,      'f3_5_6': nan,      'f3_5_7': nan,      'f3_5_8': nan,      'f3_5_9': nan,      'respid': 7110.0}}

很明显,这些列名中编码了一个多级索引(MultiIndex)。您可以按如下方式将其解码。

# Build the frame from the dict `d` (one row per dict key), index it by
# respondent id, and strip the leading "f3_" from every column name that has it.
df = (
    pd.DataFrame.from_dict(d, orient='index')
    .set_index("respid")
    .rename(columns=lambda label: label[3:] if label.startswith("f3_") else label)
)

# The remaining names look like "{smartphone number}_{HH_member_id}".
# Split each one on "_" and use the parts as a two-level column MultiIndex.
df.columns = pd.MultiIndex.from_tuples(
    [tuple(label.split("_")) for label in df.columns],
    names=["smartphone_no", "household_id"],
)

# stack() moves the innermost column level (household_id) into the row index,
# leaving smartphone_no as the columns.
df.stack()

输出

smartphone_no           1    2    3    4   5
respid  household_id                        
13766.0 1             1.0  0.0  0.0  0.0 NaN
        2             0.0  1.0  0.0  0.0 NaN
        3             0.0  0.0  1.0  0.0 NaN
        4             0.0  0.0  0.0  1.0 NaN
11293.0 1             1.0  0.0  NaN  NaN NaN
        2             0.0  1.0  NaN  NaN NaN
7110.0  1             1.0  0.0  0.0  NaN NaN
        2             0.0  1.0  0.0  NaN NaN
        3             0.0  0.0  1.0  NaN NaN