每小时填充 activity 时间 - Python
Filling in hourly buckets of activity time - Python
我有一个设备列表及其 activity 时间(开始时间和结束时间)。一台设备可以有一个或多个 activity 日志。我想要做的是为每个设备创建一个设备活动时间的分布。
我当前的数据框看起来像这样:
device_id start_time end_time
1 03:53 10:54
1 06:00 14:00
2 20:29 06:17
要为每个设备创建 activity 时间分布,我想我会创建每小时桶(对应于从 00 到 23 的小时数)并填充设备处于活动状态的桶。例如,对于设备 1,第一行是
[0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0]
和第二行
[0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0]
将它们加起来为设备 1 创建 activity 的分布将得到:
[0,0,0,1,1,1,2,2,2,2,2,1,1,1,1,0,0,0,0,0,0,0,0,0]
我尝试用下面的代码创建所需的列表,但它仅在结束时间大于开始时间时(例如上面示例数据中的前两行)有效,而在开始时间大于结束时间的情况下(例如上面示例数据中的第 3 行)无效。
# Asker's original attempt, kept as posted (Python 2 syntax).
# Iterate over every (start, end) pair in which neither value is null.
for start, end in zip(df[df['start_time'].notnull() & df['end_time'].notnull()]['start_time'],df[df['start_time'].notnull() & df['end_time'].notnull()]['end_time']) :
# Parse the "HH:MM" strings into timestamps.
start_time = pd.to_datetime(start, format ='%H:%M')
end_time = pd.to_datetime(end, format ='%H:%M')
# One bucket per hour of the day, all initially 0.
activity = [0]*24
# Round the start to the nearest hour: minutes > 30 move to the next hour,
# otherwise the minutes are dropped (`minute // 60` always evaluates to 0).
i = (start_time + dt.timedelta(minutes=((start_time.minute // 60 + (1 if start_time.minute>30 else 0) ) * 60) - start_time.minute)).hour
# Round the end time with the same rule.
rounded_end_time = (end_time + dt.timedelta(minutes=((end_time.minute // 60 + (1 if end_time.minute>30 else 0) ) * 60) - end_time.minute)).hour
# Mark buckets from the rounded start hour up to, but not including, the
# rounded end hour.
# NOTE: when the rounded end hour is not greater than the rounded start hour
# (an interval crossing midnight) this condition is false from the start and
# no bucket is filled.
while i < rounded_end_time:
activity[i] = 1
i = i + 1
print activity
有任何修复建议吗? (或者首先以更聪明的方式完成任务?)
您可以只使用 pandas 来完成,如下所示:
# Build the sample frame: one row per activity log of a device.
x = pd.DataFrame(
    [[1, '03:53', '10:54'], [1, '06:00', '14:00'], [2, '20:29', '06:17']],
    columns=['device_id', 'start_time', 'end_time'],
)
x['start_time'] = pd.to_datetime(x['start_time'], format='%H:%M')
x['end_time'] = pd.to_datetime(x['end_time'], format='%H:%M')

# 'type' is 1 when the interval wraps past midnight (end is not after start)
# and 0 otherwise.  The difference is compared with a Timedelta because
# current pandas refuses to compare a timedelta column with the integer 0.
x['type'] = (~(x['end_time'] - x['start_time'] > pd.Timedelta(0))).astype(int)
x['min'] = x[['start_time', 'end_time']].min(axis=1)
x['max'] = x[['start_time', 'end_time']].max(axis=1)

hours = list(range(24))
for i in hours:
    hour_start = pd.to_datetime('{:02d}:00'.format(i), format='%H:%M')
    hour_end = pd.to_datetime('{:02d}:59'.format(i), format='%H:%M')
    # True when hour i intersects the span [min, max].
    inside = (x['min'] <= hour_end) & (x['max'] >= hour_start)
    # A regular interval is active inside that span; an interval that wraps
    # past midnight is active outside it, hence the XOR with the wrap flag.
    x[i] = (inside != x['type'].astype(bool)).astype(int)

# Per device, an hour counts as active if any of its logs covers it.
x = x.drop(['start_time', 'end_time'], axis=1).groupby('device_id').agg('max')

# Number of devices active in each hour.  Only the 24 hour columns are summed:
# the helper columns 'min'/'max' are datetimes, which cannot be summed.
hourly_totals = x[hours].sum()
解决了!我发布带有评论的答案以防万一有人需要它:
def round_to_hour(value):
    """Return the hour (0-23) nearest to an 'HH:MM' string.

    Minutes above 30 round up to the next hour (23:45 becomes hour 0);
    30 minutes or fewer round down.
    """
    stamp = pd.to_datetime(value, format='%H:%M')
    return (stamp.hour + (1 if stamp.minute > 30 else 0)) % 24


def activity_buckets(start, end):
    """Return a 24-item list with 1 in every hourly bucket that is active.

    Buckets are filled from the rounded start hour up to, but not including,
    the rounded end hour.  Intervals that cross midnight wrap around to
    bucket 0.
    """
    first_hour = round_to_hour(start)
    # Modulo 24 keeps the duration non-negative when the end hour is smaller
    # than the start hour (interval crossing midnight).
    duration = (round_to_hour(end) - first_hour) % 24
    buckets = [0] * 24
    for offset in range(duration):
        buckets[(first_hour + offset) % 24] = 1
    return buckets


# Process every pair of start and end time in which neither value is null.
valid_rows = df.dropna(subset=['start_time', 'end_time'])
for start, end in zip(valid_rows['start_time'], valid_rows['end_time']):
    activity = activity_buckets(start, end)
    print(activity)
获取 start/end 时间行并将它们分箱到时间桶中(本例中的总持续时间)
注意:并未涵盖所有边缘情况,但如果您觉得有用,可以扩展代码
#your imports
import numpy as np
import pandas as pd
from pandas.tseries.offsets import Hour, Minute
#optional
from IPython.core.debugger import set_trace
# Sample raw data: two activity intervals, labelled "Timeframe 0" and
# "Timeframe 1", with their start and end timestamps.
start_times = ['2000-01-01 09:00', '2000-01-01 10:00']
end_times = ['2000-01-01 17:00', '2000-01-01 18:00']
index = ['Timeframe ' + str(position) for position, _ in enumerate(start_times)]
df = pd.DataFrame(
    {
        'Start Time': pd.to_datetime(start_times),
        'End Time': pd.to_datetime(end_times),
    },
    index=index,
)
数据框 df 看起来类似于下面
End Time Start Time
Timeframe 0 2000-01-01 17:00:00 2000-01-01 09:00:00
Timeframe 1 2000-01-01 18:00:00 2000-01-01 10:00:00
# Construct the time-bucket frame: nine consecutive hourly buckets starting at
# 2000-01-01 09:00, each initialised to 0.0 minutes.
# An explicit Hour offset is used instead of the string alias 'H', which
# pandas deprecated in version 2.2.
rng = pd.date_range('2000-01-01 09:00', periods=9, freq=pd.offsets.Hour())
ts = pd.DataFrame(0, index=rng, columns=['minutes'], dtype='float')
数据框 ts 看起来类似于下面
minutes
2000-01-01 09:00:00 0.0
2000-01-01 10:00:00 0.0
2000-01-01 11:00:00 0.0
2000-01-01 12:00:00 0.0
2000-01-01 13:00:00 0.0
2000-01-01 14:00:00 0.0
2000-01-01 15:00:00 0.0
2000-01-01 16:00:00 0.0
2000-01-01 17:00:00 0.0
# For every hourly bucket in ts, total the minutes during which the raw
# intervals in df overlap that bucket.
for index in ts.index:
    start_boundary = index
    end_boundary = index + Hour()
    time_count = pd.Timedelta(0)
    for _, raw_data in df.iterrows():
        start_time = raw_data['Start Time']
        end_time = raw_data['End Time']
        # The overlap of [start_time, end_time] with the bucket is the later
        # of the two starts up to the earlier of the two ends; it only counts
        # when that span is non-empty.
        overlap_start = max(start_time, start_boundary)
        overlap_end = min(end_time, end_boundary)
        if overlap_end > overlap_start:
            time_count = time_count + (overlap_end - overlap_start)
    # total_seconds() keeps whole days; .seconds would silently drop them
    # once the summed overlap reaches 24 hours.
    ts.at[index, 'minutes'] = time_count.total_seconds() / 60
运行上面的代码后,您的 ts 数据框(见下文)应该包含根据 df 数据框中的原始数据计算出的、以分钟为单位的总持续时间:
minutes
2000-01-01 09:00:00 60.0
2000-01-01 10:00:00 120.0
2000-01-01 11:00:00 120.0
2000-01-01 12:00:00 120.0
2000-01-01 13:00:00 120.0
2000-01-01 14:00:00 120.0
2000-01-01 15:00:00 120.0
2000-01-01 16:00:00 120.0
2000-01-01 17:00:00 60.0
我有一个设备列表及其 activity 时间(开始时间和结束时间)。一台设备可以有一个或多个 activity 日志。我想要做的是为每个设备创建一个设备活动时间的分布。
我当前的数据框看起来像这样:
device_id start_time end_time
1 03:53 10:54
1 06:00 14:00
2 20:29 06:17
要为每个设备创建 activity 时间分布,我想我会创建每小时桶(对应于从 00 到 23 的小时数)并填充设备处于活动状态的桶。例如,对于设备 1,第一行是
[0,0,0,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0]
和第二行
[0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0]
将它们加起来为设备 1 创建 activity 的分布将得到:
[0,0,0,1,1,1,2,2,2,2,2,1,1,1,1,0,0,0,0,0,0,0,0,0]
我尝试用下面的代码创建所需的列表,但它仅在结束时间大于开始时间时(例如上面示例数据中的前两行)有效,而在开始时间大于结束时间的情况下(例如上面示例数据中的第 3 行)无效。
# Asker's original attempt, kept as posted (Python 2 syntax).
# Iterate over every (start, end) pair in which neither value is null.
for start, end in zip(df[df['start_time'].notnull() & df['end_time'].notnull()]['start_time'],df[df['start_time'].notnull() & df['end_time'].notnull()]['end_time']) :
# Parse the "HH:MM" strings into timestamps.
start_time = pd.to_datetime(start, format ='%H:%M')
end_time = pd.to_datetime(end, format ='%H:%M')
# One bucket per hour of the day, all initially 0.
activity = [0]*24
# Round the start to the nearest hour: minutes > 30 move to the next hour,
# otherwise the minutes are dropped (`minute // 60` always evaluates to 0).
i = (start_time + dt.timedelta(minutes=((start_time.minute // 60 + (1 if start_time.minute>30 else 0) ) * 60) - start_time.minute)).hour
# Round the end time with the same rule.
rounded_end_time = (end_time + dt.timedelta(minutes=((end_time.minute // 60 + (1 if end_time.minute>30 else 0) ) * 60) - end_time.minute)).hour
# Mark buckets from the rounded start hour up to, but not including, the
# rounded end hour.
# NOTE: when the rounded end hour is not greater than the rounded start hour
# (an interval crossing midnight) this condition is false from the start and
# no bucket is filled.
while i < rounded_end_time:
activity[i] = 1
i = i + 1
print activity
有任何修复建议吗? (或者首先以更聪明的方式完成任务?)
您可以只使用 pandas 来完成,如下所示:
# Build the sample frame: one row per activity log of a device.
x = pd.DataFrame(
    [[1, '03:53', '10:54'], [1, '06:00', '14:00'], [2, '20:29', '06:17']],
    columns=['device_id', 'start_time', 'end_time'],
)
x['start_time'] = pd.to_datetime(x['start_time'], format='%H:%M')
x['end_time'] = pd.to_datetime(x['end_time'], format='%H:%M')

# 'type' is 1 when the interval wraps past midnight (end is not after start)
# and 0 otherwise.  The difference is compared with a Timedelta because
# current pandas refuses to compare a timedelta column with the integer 0.
x['type'] = (~(x['end_time'] - x['start_time'] > pd.Timedelta(0))).astype(int)
x['min'] = x[['start_time', 'end_time']].min(axis=1)
x['max'] = x[['start_time', 'end_time']].max(axis=1)

hours = list(range(24))
for i in hours:
    hour_start = pd.to_datetime('{:02d}:00'.format(i), format='%H:%M')
    hour_end = pd.to_datetime('{:02d}:59'.format(i), format='%H:%M')
    # True when hour i intersects the span [min, max].
    inside = (x['min'] <= hour_end) & (x['max'] >= hour_start)
    # A regular interval is active inside that span; an interval that wraps
    # past midnight is active outside it, hence the XOR with the wrap flag.
    x[i] = (inside != x['type'].astype(bool)).astype(int)

# Per device, an hour counts as active if any of its logs covers it.
x = x.drop(['start_time', 'end_time'], axis=1).groupby('device_id').agg('max')

# Number of devices active in each hour.  Only the 24 hour columns are summed:
# the helper columns 'min'/'max' are datetimes, which cannot be summed.
hourly_totals = x[hours].sum()
解决了!我发布带有评论的答案以防万一有人需要它:
def round_to_hour(value):
    """Return the hour (0-23) nearest to an 'HH:MM' string.

    Minutes above 30 round up to the next hour (23:45 becomes hour 0);
    30 minutes or fewer round down.
    """
    stamp = pd.to_datetime(value, format='%H:%M')
    return (stamp.hour + (1 if stamp.minute > 30 else 0)) % 24


def activity_buckets(start, end):
    """Return a 24-item list with 1 in every hourly bucket that is active.

    Buckets are filled from the rounded start hour up to, but not including,
    the rounded end hour.  Intervals that cross midnight wrap around to
    bucket 0.
    """
    first_hour = round_to_hour(start)
    # Modulo 24 keeps the duration non-negative when the end hour is smaller
    # than the start hour (interval crossing midnight).
    duration = (round_to_hour(end) - first_hour) % 24
    buckets = [0] * 24
    for offset in range(duration):
        buckets[(first_hour + offset) % 24] = 1
    return buckets


# Process every pair of start and end time in which neither value is null.
valid_rows = df.dropna(subset=['start_time', 'end_time'])
for start, end in zip(valid_rows['start_time'], valid_rows['end_time']):
    activity = activity_buckets(start, end)
    print(activity)
获取 start/end 时间行并将它们分箱到时间桶中(本例中的总持续时间)
注意:并未涵盖所有边缘情况,但如果您觉得有用,可以扩展代码
#your imports
import numpy as np
import pandas as pd
from pandas.tseries.offsets import Hour, Minute
#optional
from IPython.core.debugger import set_trace
# Sample raw data: two activity intervals, labelled "Timeframe 0" and
# "Timeframe 1", with their start and end timestamps.
start_times = ['2000-01-01 09:00', '2000-01-01 10:00']
end_times = ['2000-01-01 17:00', '2000-01-01 18:00']
index = ['Timeframe ' + str(position) for position, _ in enumerate(start_times)]
df = pd.DataFrame(
    {
        'Start Time': pd.to_datetime(start_times),
        'End Time': pd.to_datetime(end_times),
    },
    index=index,
)
数据框 df 看起来类似于下面
End Time Start Time
Timeframe 0 2000-01-01 17:00:00 2000-01-01 09:00:00
Timeframe 1 2000-01-01 18:00:00 2000-01-01 10:00:00
# Construct the time-bucket frame: nine consecutive hourly buckets starting at
# 2000-01-01 09:00, each initialised to 0.0 minutes.
# An explicit Hour offset is used instead of the string alias 'H', which
# pandas deprecated in version 2.2.
rng = pd.date_range('2000-01-01 09:00', periods=9, freq=pd.offsets.Hour())
ts = pd.DataFrame(0, index=rng, columns=['minutes'], dtype='float')
数据框 ts 看起来类似于下面
minutes
2000-01-01 09:00:00 0.0
2000-01-01 10:00:00 0.0
2000-01-01 11:00:00 0.0
2000-01-01 12:00:00 0.0
2000-01-01 13:00:00 0.0
2000-01-01 14:00:00 0.0
2000-01-01 15:00:00 0.0
2000-01-01 16:00:00 0.0
2000-01-01 17:00:00 0.0
# For every hourly bucket in ts, total the minutes during which the raw
# intervals in df overlap that bucket.
for index in ts.index:
    start_boundary = index
    end_boundary = index + Hour()
    time_count = pd.Timedelta(0)
    for _, raw_data in df.iterrows():
        start_time = raw_data['Start Time']
        end_time = raw_data['End Time']
        # The overlap of [start_time, end_time] with the bucket is the later
        # of the two starts up to the earlier of the two ends; it only counts
        # when that span is non-empty.
        overlap_start = max(start_time, start_boundary)
        overlap_end = min(end_time, end_boundary)
        if overlap_end > overlap_start:
            time_count = time_count + (overlap_end - overlap_start)
    # total_seconds() keeps whole days; .seconds would silently drop them
    # once the summed overlap reaches 24 hours.
    ts.at[index, 'minutes'] = time_count.total_seconds() / 60
运行上面的代码后,您的 ts 数据框(见下文)应该包含根据 df 数据框中的原始数据计算出的、以分钟为单位的总持续时间:
minutes
2000-01-01 09:00:00 60.0
2000-01-01 10:00:00 120.0
2000-01-01 11:00:00 120.0
2000-01-01 12:00:00 120.0
2000-01-01 13:00:00 120.0
2000-01-01 14:00:00 120.0
2000-01-01 15:00:00 120.0
2000-01-01 16:00:00 120.0
2000-01-01 17:00:00 60.0