按时间增量减少日期时间列表
reduce datetime list by timedelta
在 Python 中,如何按 timedelta 邻域来缩减(归并)一个日期时间列表?
如果我有
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8)
]
和时间增量
delta = dt.timedelta(minutes=2)
我怎样才能得到这个?
expected = [
dt.datetime(1970, 1, 1, 0, 2, 30),
dt.datetime(1970, 1, 1, 0, 7, 30)
]
编辑
有数字的例子,如果我有这个数字列表
numbers = [1,2,6,7]
delta = 1
我尝试对附近的值进行分组,并得到该组的特征值(中心值)。增量是值之间的最大距离。
对于数字,特征值为
[1.5, 6.5]
因为数值被分组在[1,2]和[6,7]中并计算平均值。
import datetime as dt
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 12),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 9),
dt.datetime(1970, 1, 1, 0, 13)
]
def group_dates(dates, delta):
    """Split ``dates`` into runs of neighbouring datetimes.

    Two consecutive items end up in the same run when the absolute gap
    between them is at most ``delta``. Input order is preserved; nothing is
    sorted.

    Args:
        dates: iterable of ``datetime`` objects.
        delta: ``timedelta`` giving the largest gap allowed inside one run.

    Returns:
        A list of lists of datetimes, or ``[]`` when ``dates`` is empty.
    """
    it = iter(dates)
    try:
        prev = next(it)
    except StopIteration:
        # Empty input: there is nothing to group.
        return []
    grouped = [[prev]]
    for dte in it:
        # abs() keeps an out-of-order (earlier) value from being glued onto
        # the previous run merely because the signed difference is negative.
        if abs(dte - prev) <= delta:
            grouped[-1].append(dte)
        else:
            grouped.append([dte])
        prev = dte
    return grouped
def td(l):
    """Return the arithmetic mean of the naive datetimes in ``l``.

    Args:
        l: sized collection (list or tuple) of naive ``datetime`` objects.

    Returns:
        A naive ``datetime`` equal to the average of the inputs.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    epoch = dt.datetime(1970, 1, 1)
    # Average the offsets from the epoch with timedelta arithmetic. This
    # avoids the float round trip through datetime.utcfromtimestamp(), which
    # is deprecated and can fail for pre-1970 values on some platforms.
    total = sum((d - epoch for d in l), dt.timedelta())
    return epoch + total / len(l)
from pprint import pprint as pp
pp([td(sub) for sub in group_dates(dates,dt.timedelta(minutes=2))])
为避免不必要的函数调用,请检查 len:
pp([td(sub) if len(sub) > 1 else sub[0] for sub in group_dates(dates,dt.timedelta(minutes=2))])
[datetime.datetime(1970, 1, 1, 0, 2, 30),
datetime.datetime(1970, 1, 1, 0, 12),
datetime.datetime(1970, 1, 1, 0, 8),
datetime.datetime(1970, 1, 1, 0, 13)]
或者改用生成器,在遍历的同时逐个产出(yield)结果:
def group_dates(dates, delta):
    """Yield the average datetime of each run of neighbouring datetimes.

    Generator variant: consecutive items are collected into a run while the
    absolute gap to the previous item is at most ``delta``. When a run ends,
    its average (computed by ``td``) is yielded. Input order is preserved.

    Args:
        dates: iterable of ``datetime`` objects.
        delta: ``timedelta`` giving the largest gap allowed inside one run.

    Yields:
        One averaged ``datetime`` per run; nothing for empty input.
    """
    it = iter(dates)
    try:
        prev = next(it)
    except StopIteration:
        # A StopIteration escaping a generator body is turned into
        # RuntimeError (PEP 479), so finish cleanly on empty input instead.
        return
    # A list is appended to in place; re-building a tuple on every step
    # would copy the whole run each time.
    grouped = [prev]
    for dte in it:
        # abs() keeps an out-of-order (earlier) value from joining the
        # previous run merely because the signed difference is negative.
        if abs(dte - prev) <= delta:
            grouped.append(dte)
        else:
            yield td(grouped)
            grouped = [dte]
        prev = dte
    yield td(grouped)
pp(list(group_dates(dates, delta=dt.timedelta(minutes=2))))
[datetime.datetime(1970, 1, 1, 0, 2, 30),
datetime.datetime(1970, 1, 1, 0, 12),
datetime.datetime(1970, 1, 1, 0, 8),
datetime.datetime(1970, 1, 1, 0, 13)]
一些计时结果:
In [28]: dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 4),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 9),
dt.datetime(1970, 1, 1, 0, 15),
dt.datetime(1970, 1, 1, 0, 22),
dt.datetime(1970, 1, 1, 0, 24),
dt.datetime(1970, 1, 1, 0, 27)
]
In [41]: for i in range(10000):
dates.append(dates[-1]+dt.timedelta(minutes=choice([1,2,3,4])))
....:
In [42]: timeit [td(sub) if len(sub) > 1 else sub[0] for sub in group_dates(dates,dt.timedelta(minutes=2))]
100 loops, best of 3: 15.8 ms per loop
In [43]: timeit reduce_datetime_list_by_delta(dates, delta)
100 loops, best of 3: 16.9 ms per loop
In [44]: timeit timestamps = map(avgtm, groupby(dates, key=grouper(delta)))
10 loops, best of 3: 18.8 ms per loop
In [45]: timeit (list(group_dates_iter(dates, delta = dt.timedelta(minutes=2))))
10 loops, best of 3: 18.4 ms per loop
import datetime as dt
def datetime_to_epoch(dtime):
    """Return the seconds elapsed between 1970-01-01 00:00 and ``dtime``.

    Args:
        dtime: naive ``datetime``; it is subtracted from a naive epoch.

    Returns:
        The offset in seconds as a float (negative before 1970).
    """
    epoch = dt.datetime(1970, 1, 1)
    return (dtime - epoch).total_seconds()
def datetime_sublists(datetime_list, time_delta=dt.timedelta(days=1)):
    """Split ``datetime_list`` into sublists of neighbouring datetimes.

    A value stays in the current sublist when it is at most ``time_delta``
    later than the value directly before it; otherwise it starts a new
    sublist. Input order is preserved and nothing is sorted, so a value that
    is earlier than its predecessor also stays in the current sublist.

    Args:
        datetime_list: sequence (indexable, sliceable) of ``datetime``.
        time_delta: largest allowed gap between neighbours; one day by default.

    Returns:
        A list of lists of datetimes, or ``[]`` when the input is empty.
    """
    if not datetime_list:
        # Nothing to split; avoids indexing into an empty sequence.
        return []
    temp = [datetime_list[0]]
    sublists = []
    # Walk over (previous, current) neighbour pairs.
    for prev_date, current_date in zip(datetime_list, datetime_list[1:]):
        if current_date - prev_date <= time_delta:
            temp.append(current_date)
        else:
            sublists.append(temp)
            temp = [current_date]
    # The run still being built when the loop ends is the final sublist.
    sublists.append(temp)
    return sublists
def reduce_datetime_list_by_delta(date_list, delta):
    """Replace each group of neighbouring datetimes by its average.

    Groups come from ``datetime_sublists(date_list, delta)``; each group is
    averaged through its epoch offsets (``datetime_to_epoch``).

    Args:
        date_list: sequence of naive ``datetime`` objects.
        delta: ``timedelta`` passed on as the largest gap inside a group.

    Returns:
        A list with one naive ``datetime`` per group.
    """
    epoch = dt.datetime(1970, 1, 1)
    reduced = []
    for group in datetime_sublists(date_list, delta):
        epochs = [datetime_to_epoch(moment) for moment in group]
        epoch_average = sum(epochs) / len(epochs)
        # epoch + timedelta gives the same naive value as
        # datetime.utcfromtimestamp() without relying on that deprecated,
        # platform-dependent call.
        reduced.append(epoch + dt.timedelta(seconds=epoch_average))
    return reduced
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 12)
]
delta = dt.timedelta(minutes=2)
# Parenthesised form: valid as a Python 3 print() call and, with a single
# argument, still prints the same text under Python 2.
print(reduce_datetime_list_by_delta(dates, delta))
问题描述本身已经给出了提示:您需要使用 itertools 中的 groupby() 函数。
所需要的只是一个稍微聪明一点的 key 函数:它会记住上一个状态,只要连续的时间戳之间的间隔不超过 delta,就持续返回相同的 key 值。
分组后,将每个组转换为平均时间,并处理只含单个时间戳的组(示例中包含这种情况)。
import datetime as dt
from itertools import groupby
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 13)
]
delta = dt.timedelta(minutes=2)
class grouper:
    """Stateful key function for ``itertools.groupby`` over time stamps.

    Every call returns the first time stamp of the current group (the
    anchor). The anchor is replaced by ``tm`` on the first call and whenever
    ``tm`` lies more than ``delta`` after the anchor. The comparison is
    against the group's first time stamp, not the one directly before ``tm``.

    The instance keeps state between calls, so use a fresh instance for each
    ``groupby`` pass.
    """

    def __init__(self, delta):
        # Largest distance from the anchor that still counts as the same group.
        self.delta = delta
        # Anchor of the current group; None until the first call.
        self.last = None

    def __call__(self, tm):
        """Return the anchor of the group that ``tm`` belongs to."""
        if self.last is None or (tm - self.last) > self.delta:
            # First value ever, or too far past the anchor: start a new group.
            self.last = tm
        return self.last
# Turn one (key, group) pair produced by groupby() into a single time stamp.
def avgtm(item):
    """Return the midpoint between the first and last time stamp of a group.

    Only the two end points are used, so for groups of three or more values
    this is the centre of the covered span, not the arithmetic mean.

    Args:
        item: ``(key, time_stamps)`` pair as yielded by ``itertools.groupby``;
            the key is ignored.

    Returns:
        The single time stamp of a one-element group, otherwise
        ``first + (last - first) / 2``.
    """
    _key, group = item
    # groupby() hands out a one-shot iterator; a list allows indexing.
    tms = list(group)
    first, last = tms[0], tms[-1]
    if len(tms) == 1:
        return first
    return first + (last - first) / 2
# Build a real list: on Python 3 map() is lazy and would print as a map
# object instead of the computed time stamps.
timestamps = [avgtm(item) for item in groupby(dates, key=grouper(delta))]
print("Time stamps: ", timestamps)
输出结果:
Time stamps: [datetime.datetime(1970, 1, 1, 0, 2, 30),
datetime.datetime(1970, 1, 1, 0, 7, 30),
datetime.datetime(1970, 1, 1, 0, 13)]
在 Python 中,如何按 timedelta 邻域来缩减(归并)一个日期时间列表?
如果我有
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8)
]
和时间增量
delta = dt.timedelta(minutes=2)
我怎样才能得到这个?
expected = [
dt.datetime(1970, 1, 1, 0, 2, 30),
dt.datetime(1970, 1, 1, 0, 7, 30)
]
编辑
有数字的例子,如果我有这个数字列表
numbers = [1,2,6,7]
delta = 1
我尝试对附近的值进行分组,并得到该组的特征值(中心值)。增量是值之间的最大距离。
对于数字,特征值为
[1.5, 6.5]
因为数值被分组在[1,2]和[6,7]中并计算平均值。
import datetime as dt
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 12),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 9),
dt.datetime(1970, 1, 1, 0, 13)
]
def group_dates(dates, delta):
    """Split ``dates`` into runs of neighbouring datetimes.

    Two consecutive items end up in the same run when the absolute gap
    between them is at most ``delta``. Input order is preserved; nothing is
    sorted.

    Args:
        dates: iterable of ``datetime`` objects.
        delta: ``timedelta`` giving the largest gap allowed inside one run.

    Returns:
        A list of lists of datetimes, or ``[]`` when ``dates`` is empty.
    """
    it = iter(dates)
    try:
        prev = next(it)
    except StopIteration:
        # Empty input: there is nothing to group.
        return []
    grouped = [[prev]]
    for dte in it:
        # abs() keeps an out-of-order (earlier) value from being glued onto
        # the previous run merely because the signed difference is negative.
        if abs(dte - prev) <= delta:
            grouped[-1].append(dte)
        else:
            grouped.append([dte])
        prev = dte
    return grouped
def td(l):
    """Return the arithmetic mean of the naive datetimes in ``l``.

    Args:
        l: sized collection (list or tuple) of naive ``datetime`` objects.

    Returns:
        A naive ``datetime`` equal to the average of the inputs.

    Raises:
        ZeroDivisionError: if ``l`` is empty.
    """
    epoch = dt.datetime(1970, 1, 1)
    # Average the offsets from the epoch with timedelta arithmetic. This
    # avoids the float round trip through datetime.utcfromtimestamp(), which
    # is deprecated and can fail for pre-1970 values on some platforms.
    total = sum((d - epoch for d in l), dt.timedelta())
    return epoch + total / len(l)
from pprint import pprint as pp
pp([td(sub) for sub in group_dates(dates,dt.timedelta(minutes=2))])
为避免不必要的函数调用,请检查 len:
pp([td(sub) if len(sub) > 1 else sub[0] for sub in group_dates(dates,dt.timedelta(minutes=2))])
[datetime.datetime(1970, 1, 1, 0, 2, 30),
datetime.datetime(1970, 1, 1, 0, 12),
datetime.datetime(1970, 1, 1, 0, 8),
datetime.datetime(1970, 1, 1, 0, 13)]
或者改用生成器,在遍历的同时逐个产出(yield)结果:
def group_dates(dates, delta):
    """Yield the average datetime of each run of neighbouring datetimes.

    Generator variant: consecutive items are collected into a run while the
    absolute gap to the previous item is at most ``delta``. When a run ends,
    its average (computed by ``td``) is yielded. Input order is preserved.

    Args:
        dates: iterable of ``datetime`` objects.
        delta: ``timedelta`` giving the largest gap allowed inside one run.

    Yields:
        One averaged ``datetime`` per run; nothing for empty input.
    """
    it = iter(dates)
    try:
        prev = next(it)
    except StopIteration:
        # A StopIteration escaping a generator body is turned into
        # RuntimeError (PEP 479), so finish cleanly on empty input instead.
        return
    # A list is appended to in place; re-building a tuple on every step
    # would copy the whole run each time.
    grouped = [prev]
    for dte in it:
        # abs() keeps an out-of-order (earlier) value from joining the
        # previous run merely because the signed difference is negative.
        if abs(dte - prev) <= delta:
            grouped.append(dte)
        else:
            yield td(grouped)
            grouped = [dte]
        prev = dte
    yield td(grouped)
pp(list(group_dates(dates, delta=dt.timedelta(minutes=2))))
[datetime.datetime(1970, 1, 1, 0, 2, 30),
datetime.datetime(1970, 1, 1, 0, 12),
datetime.datetime(1970, 1, 1, 0, 8),
datetime.datetime(1970, 1, 1, 0, 13)]
一些计时结果:
In [28]: dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 4),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 9),
dt.datetime(1970, 1, 1, 0, 15),
dt.datetime(1970, 1, 1, 0, 22),
dt.datetime(1970, 1, 1, 0, 24),
dt.datetime(1970, 1, 1, 0, 27)
]
In [41]: for i in range(10000):
dates.append(dates[-1]+dt.timedelta(minutes=choice([1,2,3,4])))
....:
In [42]: timeit [td(sub) if len(sub) > 1 else sub[0] for sub in group_dates(dates,dt.timedelta(minutes=2))]
100 loops, best of 3: 15.8 ms per loop
In [43]: timeit reduce_datetime_list_by_delta(dates, delta)
100 loops, best of 3: 16.9 ms per loop
In [44]: timeit timestamps = map(avgtm, groupby(dates, key=grouper(delta)))
10 loops, best of 3: 18.8 ms per loop
In [45]: timeit (list(group_dates_iter(dates, delta = dt.timedelta(minutes=2))))
10 loops, best of 3: 18.4 ms per loop
import datetime as dt
def datetime_to_epoch(dtime):
    """Return the seconds elapsed between 1970-01-01 00:00 and ``dtime``.

    Args:
        dtime: naive ``datetime``; it is subtracted from a naive epoch.

    Returns:
        The offset in seconds as a float (negative before 1970).
    """
    epoch = dt.datetime(1970, 1, 1)
    return (dtime - epoch).total_seconds()
def datetime_sublists(datetime_list, time_delta=dt.timedelta(days=1)):
    """Split ``datetime_list`` into sublists of neighbouring datetimes.

    A value stays in the current sublist when it is at most ``time_delta``
    later than the value directly before it; otherwise it starts a new
    sublist. Input order is preserved and nothing is sorted, so a value that
    is earlier than its predecessor also stays in the current sublist.

    Args:
        datetime_list: sequence (indexable, sliceable) of ``datetime``.
        time_delta: largest allowed gap between neighbours; one day by default.

    Returns:
        A list of lists of datetimes, or ``[]`` when the input is empty.
    """
    if not datetime_list:
        # Nothing to split; avoids indexing into an empty sequence.
        return []
    temp = [datetime_list[0]]
    sublists = []
    # Walk over (previous, current) neighbour pairs.
    for prev_date, current_date in zip(datetime_list, datetime_list[1:]):
        if current_date - prev_date <= time_delta:
            temp.append(current_date)
        else:
            sublists.append(temp)
            temp = [current_date]
    # The run still being built when the loop ends is the final sublist.
    sublists.append(temp)
    return sublists
def reduce_datetime_list_by_delta(date_list, delta):
    """Replace each group of neighbouring datetimes by its average.

    Groups come from ``datetime_sublists(date_list, delta)``; each group is
    averaged through its epoch offsets (``datetime_to_epoch``).

    Args:
        date_list: sequence of naive ``datetime`` objects.
        delta: ``timedelta`` passed on as the largest gap inside a group.

    Returns:
        A list with one naive ``datetime`` per group.
    """
    epoch = dt.datetime(1970, 1, 1)
    reduced = []
    for group in datetime_sublists(date_list, delta):
        epochs = [datetime_to_epoch(moment) for moment in group]
        epoch_average = sum(epochs) / len(epochs)
        # epoch + timedelta gives the same naive value as
        # datetime.utcfromtimestamp() without relying on that deprecated,
        # platform-dependent call.
        reduced.append(epoch + dt.timedelta(seconds=epoch_average))
    return reduced
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 12)
]
delta = dt.timedelta(minutes=2)
# Parenthesised form: valid as a Python 3 print() call and, with a single
# argument, still prints the same text under Python 2.
print(reduce_datetime_list_by_delta(dates, delta))
问题描述本身已经给出了提示:您需要使用 itertools 中的 groupby() 函数。
所需要的只是一个稍微聪明一点的 key 函数:它会记住上一个状态,只要连续的时间戳之间的间隔不超过 delta,就持续返回相同的 key 值。
分组后,将每个组转换为平均时间,并处理只含单个时间戳的组(示例中包含这种情况)。
import datetime as dt
from itertools import groupby
dates = [
dt.datetime(1970, 1, 1, 0, 2),
dt.datetime(1970, 1, 1, 0, 3),
dt.datetime(1970, 1, 1, 0, 7),
dt.datetime(1970, 1, 1, 0, 8),
dt.datetime(1970, 1, 1, 0, 13)
]
delta = dt.timedelta(minutes=2)
class grouper:
    """Stateful key function for ``itertools.groupby`` over time stamps.

    Every call returns the first time stamp of the current group (the
    anchor). The anchor is replaced by ``tm`` on the first call and whenever
    ``tm`` lies more than ``delta`` after the anchor. The comparison is
    against the group's first time stamp, not the one directly before ``tm``.

    The instance keeps state between calls, so use a fresh instance for each
    ``groupby`` pass.
    """

    def __init__(self, delta):
        # Largest distance from the anchor that still counts as the same group.
        self.delta = delta
        # Anchor of the current group; None until the first call.
        self.last = None

    def __call__(self, tm):
        """Return the anchor of the group that ``tm`` belongs to."""
        if self.last is None or (tm - self.last) > self.delta:
            # First value ever, or too far past the anchor: start a new group.
            self.last = tm
        return self.last
# Turn one (key, group) pair produced by groupby() into a single time stamp.
def avgtm(item):
    """Return the midpoint between the first and last time stamp of a group.

    Only the two end points are used, so for groups of three or more values
    this is the centre of the covered span, not the arithmetic mean.

    Args:
        item: ``(key, time_stamps)`` pair as yielded by ``itertools.groupby``;
            the key is ignored.

    Returns:
        The single time stamp of a one-element group, otherwise
        ``first + (last - first) / 2``.
    """
    _key, group = item
    # groupby() hands out a one-shot iterator; a list allows indexing.
    tms = list(group)
    first, last = tms[0], tms[-1]
    if len(tms) == 1:
        return first
    return first + (last - first) / 2
# Build a real list: on Python 3 map() is lazy and would print as a map
# object instead of the computed time stamps.
timestamps = [avgtm(item) for item in groupby(dates, key=grouper(delta))]
print("Time stamps: ", timestamps)
输出结果:
Time stamps: [datetime.datetime(1970, 1, 1, 0, 2, 30),
datetime.datetime(1970, 1, 1, 0, 7, 30),
datetime.datetime(1970, 1, 1, 0, 13)]