pandas 中拆分应用组合期间省略的列
A column that's omitted during split-apply-combie in pandas
我正在执行拆分应用组合来查找每个成员的总数量。我需要的数据框应该有 14 列:MemberID, DSFS_0_1, DSFS_1_2, DSFS_2_3, DSFS_3_4, DSFS_4_5, DSFS_5_6, DSFS_6_7, DSFS_7_8, DSFS_8_9, DSFS_9_10, DSFS_10_11, DSFS_11_12, DrugCount
。但是,我没有得到第 14 个 (DrugCount
),知道为什么吗?变量joined
输出全部14,但是joined_grouped_add
,我做聚合的函数,只有returns13。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
# this function takes the drugcount dataframe as input and output a tuple of 3 data frames: DrugCount_Y1,DrugCount_Y2,DrugCount_Y3
def process_DrugCount(drugcount):
dc = pd.read_csv("DrugCount.csv")
sub_map = {'1' : 1, '2':2, '3':3, '4':4, '5':5, '6':6, '7+' : 7}
dc['DrugCount'] = dc.DrugCount.map(sub_map)
dc['DrugCount'] = dc.DrugCount.astype(int)
dc_grouped = dc.groupby(dc.Year, as_index=False)
DrugCount_Y1 = dc_grouped.get_group('Y1')
DrugCount_Y2 = dc_grouped.get_group('Y2')
DrugCount_Y3 = dc_grouped.get_group('Y3')
DrugCount_Y1.drop('Year', axis=1, inplace=True)
DrugCount_Y2.drop('Year', axis=1, inplace=True)
DrugCount_Y3.drop('Year', axis=1, inplace=True)
return (DrugCount_Y1,DrugCount_Y2,DrugCount_Y3)
# this function converts strings such as "1- 2 month" to "1_2"
def replaceMonth(string):
replace_map = {'0- 1 month' : "0_1", "1- 2 months": "1_2", "2- 3 months": "2_3", "3- 4 months": '3_4', "4- 5 months": "4_5", "5- 6 months": "5_6", "6- 7 months": "6_7", \
"7- 8 months" : "7_8", "8- 9 months": "8_9", "9-10 months": "9_10", "10-11 months": "10_11", "11-12 months": "11_12"}
a_new_string = string.map(replace_map)
return a_new_string
# this function processes a yearly drug count data
def process_yearly_DrugCount(aframe):
processed_frame = None
aframe.drop("Year", axis = 1, inplace = True)
reformed = aframe[['DSFS']].apply(replaceMonth)
gd = pd.get_dummies(reformed)
joined = pd.concat([aframe, gd], axis = 1)
joined.drop("DSFS", axis = 1, inplace = True)
joined_grouped = joined.groupby("MemberID", as_index = False)
joined_grouped_agg = joined_grouped.agg(np.sum)
print joined_grouped_agg
return processed_frame
def main():
pd.options.mode.chained_assignment = None
daysinhospital = pd.read_csv('DaysInHospital_Y2.csv')
drugcount = pd.read_csv('DrugCount.csv')
process_DrugCount(drugcount)
process_yearly_DrugCount(drugcount)
replaceMonth(drugcount['DSFS'])
if __name__ == '__main__':
main()
简单地说,直接从 csv 中提取的 DrugCount
不是作为数字字段读入的 (int/float)。否则它将保留在 .agg(np.sum)
处理中。在聚合之前检查 dtype 并查看它是否是 object
类型(即字符串列):
print joined['DrugCount'].dtype
事实上,在您的 process_DrugCount()
函数中,您使用 astype 显式地将 DrugCount 列转换为整数,但在 process_yearly_DrugCount()
函数中没有这样做。 运行后一个函数中的同一行和DrugCount在聚合处理中应该保留:
aframe['DrugCount'] = aframe['DrugCount'].astype(int)
或者更好的是,在 main()
中避免在后面的函数中进行两次转换:
drugcount['DrugCount'] = drugcount['DrugCount'].astype(int)
此外,请注意,read_csv() 允许使用其 dtype 参数显式指定列类型:
drugcount = pd.read_csv('DrugCount.csv', dtype={'DrugCount': np.int64})
我正在执行拆分应用组合来查找每个成员的总数量。我需要的数据框应该有 14 列:MemberID, DSFS_0_1, DSFS_1_2, DSFS_2_3, DSFS_3_4, DSFS_4_5, DSFS_5_6, DSFS_6_7, DSFS_7_8, DSFS_8_9, DSFS_9_10, DSFS_10_11, DSFS_11_12, DrugCount
。但是,我没有得到第 14 个 (DrugCount
),知道为什么吗?变量joined
输出全部14,但是joined_grouped_add
,我做聚合的函数,只有returns13。
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
# this function takes the drugcount dataframe as input and output a tuple of 3 data frames: DrugCount_Y1,DrugCount_Y2,DrugCount_Y3
def process_DrugCount(drugcount):
dc = pd.read_csv("DrugCount.csv")
sub_map = {'1' : 1, '2':2, '3':3, '4':4, '5':5, '6':6, '7+' : 7}
dc['DrugCount'] = dc.DrugCount.map(sub_map)
dc['DrugCount'] = dc.DrugCount.astype(int)
dc_grouped = dc.groupby(dc.Year, as_index=False)
DrugCount_Y1 = dc_grouped.get_group('Y1')
DrugCount_Y2 = dc_grouped.get_group('Y2')
DrugCount_Y3 = dc_grouped.get_group('Y3')
DrugCount_Y1.drop('Year', axis=1, inplace=True)
DrugCount_Y2.drop('Year', axis=1, inplace=True)
DrugCount_Y3.drop('Year', axis=1, inplace=True)
return (DrugCount_Y1,DrugCount_Y2,DrugCount_Y3)
# this function converts strings such as "1- 2 month" to "1_2"
def replaceMonth(string):
replace_map = {'0- 1 month' : "0_1", "1- 2 months": "1_2", "2- 3 months": "2_3", "3- 4 months": '3_4', "4- 5 months": "4_5", "5- 6 months": "5_6", "6- 7 months": "6_7", \
"7- 8 months" : "7_8", "8- 9 months": "8_9", "9-10 months": "9_10", "10-11 months": "10_11", "11-12 months": "11_12"}
a_new_string = string.map(replace_map)
return a_new_string
# this function processes a yearly drug count data
def process_yearly_DrugCount(aframe):
processed_frame = None
aframe.drop("Year", axis = 1, inplace = True)
reformed = aframe[['DSFS']].apply(replaceMonth)
gd = pd.get_dummies(reformed)
joined = pd.concat([aframe, gd], axis = 1)
joined.drop("DSFS", axis = 1, inplace = True)
joined_grouped = joined.groupby("MemberID", as_index = False)
joined_grouped_agg = joined_grouped.agg(np.sum)
print joined_grouped_agg
return processed_frame
def main():
pd.options.mode.chained_assignment = None
daysinhospital = pd.read_csv('DaysInHospital_Y2.csv')
drugcount = pd.read_csv('DrugCount.csv')
process_DrugCount(drugcount)
process_yearly_DrugCount(drugcount)
replaceMonth(drugcount['DSFS'])
if __name__ == '__main__':
main()
简单地说,直接从 csv 中提取的 DrugCount
不是作为数字字段读入的 (int/float)。否则它将保留在 .agg(np.sum)
处理中。在聚合之前检查 dtype 并查看它是否是 object
类型(即字符串列):
print joined['DrugCount'].dtype
事实上,在您的 process_DrugCount()
函数中,您使用 astype 显式地将 DrugCount 列转换为整数,但在 process_yearly_DrugCount()
函数中没有这样做。 运行后一个函数中的同一行和DrugCount在聚合处理中应该保留:
aframe['DrugCount'] = aframe['DrugCount'].astype(int)
或者更好的是,在 main()
中避免在后面的函数中进行两次转换:
drugcount['DrugCount'] = drugcount['DrugCount'].astype(int)
此外,请注意,read_csv() 允许使用其 dtype 参数显式指定列类型:
drugcount = pd.read_csv('DrugCount.csv', dtype={'DrugCount': np.int64})