过滤具有重叠日期的数据的 Pythonic 方法

Pythonic way to filter data with overlapping dates

我有这个数据结构,每个团队都有 start/end 日期的问题列表。

对于每个团队,我想合并具有相同关键日期和重叠日期的问题,在结果问题中,开始日期会更小,结束日期会更大。

我正在尝试用几个 for 循环来完成它,但我想知道什么是最好的 Pythonic 方法来做到这一点。

更新

I want to merge only issues with same key within same team and with overlapping dates.

Issues are not in chronological order.

输入:

{
    'Team A': [{
        'start': '11/Jul/13 1:49 PM',
        'end': '10/Oct/13 5:16 PM',
        'issue': 'KEY-12678'
    }, {
        'start': '3/Oct/13 10:40 AM',
        'end': '11/Nov/13 1:02 PM',
        'issue': 'KEY-12678'
    }],

    'Team B': [{
        'start': '5/Sep/13 3:35 PM',
        'end': '08/Nov/13 3:35 PM',
        'issue': 'KEY-12679'
    }, {
        'start': '19/Aug/13 5:05 PM',
        'end': '10/Sep/13 5:16 PM',
        'issue': 'KEY-12679'
    }, {
        'start': '09/Jul/13 9:15 AM',
        'end': '29/Jul/13 9:15 AM',
        'issue': 'KEY-12680'
    }]
}

输出:

{
    'Team A': [{
        'start': '11/Jul/13 1:49 PM',
        'end': '11/Nov/13 1:02 PM',
        'issue': 'KEY-12678'
    }],
    'Team B': [{
        'start': '19/Aug/13 5:05 PM',
        'end': '08/Nov/13 3:35 PM',
        'issue': 'KEY-12679'
    }, {
        'start': '09/Jul/13 9:15 AM',
        'end': '29/Jul/13 9:15 AM',
        'issue': 'KEY-12680'
    }]
}

要解析日期,这里是日期格式(为您节省几分钟):

date_format = "%d/%b/%y %H:%M %p"

更新,新测试数据

输入

d = {
"N/A": [
  {'start': '23/Jun/14 8:48 PM', 'end': '01/Aug/14 11:00 PM', 'issue': 'KEY-12157'}
  ,{'start': '09/Jul/13 1:57 PM',  'end': '29/Jul/13 1:57 PM', 'issue': 'KEY-12173'}
  ,{'start': '21/Aug/13 12:29 PM', 'end': '02/Dec/13 6:06 PM', 'issue': 'KEY-12173'}
  ,{'start': '17/Feb/14 3:17 PM', 'end': '18/Feb/14 5:51 PM', 'issue': 'KEY-12173'}
  ,{'start': '12/May/14 4:42 PM', 'end': '02/Jun/14 4:42 PM', 'issue': 'KEY-12173'}
  ,{'start': '24/Jun/14 11:33 AM',  'end': '01/Aug/14 11:49 AM', 'issue': 'KEY-12173'}
  ,{'start': '07/Oct/14 1:17 PM',  'end': '17/Nov/14 10:30 AM', 'issue': 'KEY-12173'}
  ,{'start': '31/Mar/15 1:58 PM', 'end': '12/May/15 4:26 PM', 'issue': 'KEY-12173'}
  ,{'start': '15/Jul/14 10:06 AM',  'end': '15/Sep/14 5:25 PM', 'issue': 'KEY-12173'}
  ,{'start': '06/Jan/15 10:46 AM',  'end': '26/Jan/15 10:46 AM', 'issue': 'KEY-20628'}
  ,{'start': '18/Nov/14 5:08 PM',  'end': '16/Feb/15 1:31 PM', 'issue': 'KEY-20628'}
  ,{'start': '02/Oct/13 12:32 PM', 'end': '21/Oct/13 5:32 PM', 'issue': 'KEY-12146'}
  ,{'start': '11/Mar/14 12:08 PM', 'end': '31/Mar/14 12:08 PM', 'issue': 'KEY-12681'}
  ]}

输出

{'start': '18/Nov/14 05:08 AM', 'issue': 'KEY-20628', 'end': '16/Feb/15 01:31 AM'}
{'start': '09/Jul/13 1:57 PM', 'issue': 'KEY-12173', 'end': '29/Jul/13 1:57 PM'}
{'start': '21/Aug/13 12:29 PM', 'issue': 'KEY-12173', 'end': '02/Dec/13 6:06 PM'}
{'start': '17/Feb/14 3:17 PM', 'issue': 'KEY-12173', 'end': '18/Feb/14 5:51 PM'}
{'start': '12/May/14 4:42 PM', 'issue': 'KEY-12173', 'end': '02/Jun/14 4:42 PM'}
{'start': '24/Jun/14 11:33 AM', 'issue': 'KEY-12173', 'end': '15/Sep/14 05:25 AM'}
{'start': '07/Oct/14 1:17 PM', 'issue': 'KEY-12173', 'end': '17/Nov/14 10:30 AM'}
{'start': '31/Mar/15 1:58 PM', 'issue': 'KEY-12173', 'end': '12/May/15 4:26 PM'}
{'start': '11/Mar/14 12:08 PM', 'issue': 'KEY-12681', 'end': '31/Mar/14 12:08 PM'}
{'start': '23/Jun/14 8:48 PM', 'issue': 'KEY-12157', 'end': '01/Aug/14 11:00 PM'}
{'start': '02/Oct/13 12:32 PM', 'issue': 'KEY-12146', 'end': '21/Oct/13 5:32 PM'}

您可以将字符串日期转换为具有 '%d/%b/%y %H:%M %p' 格式和 datetime.strftime 函数的有效 python 日期时间对象,并使用 itertools.groupby 根据 issue 键,您可以遍历压缩组并使用 maxmin 函数提取最大值和最小值,并使用适当的键函数:

from datetime import datetime
from itertools import groupby
from operator import itemgetter
new={}

for key in d:
  for dic in [ zip(*[i.items() for i in g]) for _,g in groupby(d[key],itemgetter('issue'))] :
    temp={}
    for p,t in [zip(*tup) for tup in dic]:
      val=p[0]
      if val=='start':
        temp[val]=min(t,key=lambda x:datetime.strptime(x,'%d/%b/%y %H:%M %p'))
      elif val=='end':
        temp[val]=max(t,key=lambda x:datetime.strptime(x,'%d/%b/%y %H:%M %p'))
      else:
        temp[val]=t[0]
    new.setdefault(key,[]).append(temp) 


print new

结果:

{'Team A': [{'start': '11/Jul/13 1:49 PM', 'end': '11/Nov/13 1:02 PM', 'issue': 'KEY-12678'}], 
 'Team B': [{'start': '19/Aug/13 5:05 PM', 'end': '08/Nov/13 3:35 PM', 'issue': 'KEY-12679'}, 
            {'start': '09/Jul/13 9:15 AM', 'end': '29/Jul/13 9:15 AM', 'issue': 'KEY-12680'}]}

这是我当前的代码,它似乎可以正常工作(检查起来有点棘手)。

在我的代码中,我使用的名称为 epicmr,其中示例数据中的每一行都是 epic,但问题键指的是 mr.

from datetime import datetime
date_format = "%d/%b/%y %H:%M %p"

d = {"team" : [... sample data ...]}

def get_list_of_mrs(epics):
    mrs = set()
    for epic in epics:
        mrs.add(epic['issue'])
    return mrs

def is_overlap(epic1, epic2):
    start1 = datetime.strptime(epic1['start'], date_format)
    end1 = datetime.strptime(epic1['end'], date_format)
    start2 = datetime.strptime(epic2['start'], date_format)
    end2 = datetime.strptime(epic2['end'], date_format)
    return ((start1 <= end2) and (end1 >= start2))

def get_overlapping_dates(epic1, epic2):
    start1 = datetime.strptime(epic1['start'], date_format)
    end1 = datetime.strptime(epic1['end'], date_format)
    start2 = datetime.strptime(epic2['start'], date_format)
    end2 = datetime.strptime(epic2['end'], date_format)
    return (min(start1, start2), max(end1, end2))

def remove_overlaps(epics):
    filtered_epics = []
    for epic in epics:
        for temp_epic in epics:
            if temp_epic == epic:
                continue
            if epic.has_key('overlap'):
                continue
            if is_overlap(epic, temp_epic):
                temp_epic['overlap'] = True
                new_start, new_end = get_overlapping_dates(epic, temp_epic)
                epic['start'] = new_start.strftime(date_format)
                epic['end'] = new_end.strftime(date_format)

        filtered_epics.append(epic)

    filtered_epics = filter(lambda x: not x.has_key('overlap'), filtered_epics)

    return filtered_epics

for team in d:
    epics = d[team]
    epics.sort(key=lambda x: datetime.strptime(x['start'], date_format))
    uniq_mrs_in_team = get_list_of_mrs(epics)

    filtered_mrs = []
    for mr in uniq_mrs_in_team:
        mr_epics = filter(lambda x: x['issue'] == mr, epics)
        filtered = remove_overlaps(mr_epics)
        #print team, mr, len(mr_epics), len(filtered)
        for x_mr in mr_epics:
            #print " -",x_mr
            pass
        for x_mr in filtered:
            #print " +",x_mr
            pass
        filtered_mrs.extend(filtered)
    d[team] = filtered_mrs

我提出了一个 pandas 解决方案,正如 aquavitae 在评论中所暗示的那样,其中包含以下步骤:

  • 从字典 d 中读取您提供到 DataFrame 中的数据。
  • 将开始和结束列转换为日期时间对象。
  • 按键和开始日期对数据进行排序并重置索引
  • 遍历数据框(效率低下,但直到现在我想不出更好的方法)并将当前行的结束时间与下一行的开始时间进行比较,以及如果钥匙是平等的。
  • 查询 pandas 数据框以获取重叠的线
  • 遍历待删除的行并将数据合并到相应的重叠行中。
  • 删除这些行。
  • 转换回字典格式。

这看起来像:

import pandas as pd
import numpy as np
df = pd.DataFrame(d['N/A'])
df['end'] = pd.to_datetime(df['end'])
df['start'] = pd.to_datetime(df['start'])
df.sort(['issue', 'start'], inplace=True)
df.index = range(len(df))
time_overlaps = df[:-1]['end'] > df[1:]['start']
same_issue = df[:-1]['issue'] == df[1:]['issue']
rows_to_drop = np.logical_and(time_overlaps, same_issue)
rows_to_drop_indices = [i+1 for i, j in enumerate(rows_to_drop) if j]
for i in rows_to_drop_indices:
    df.loc[i-1, 'end'] = df.loc[i, 'end']
df.drop(rows_to_drop_indices, inplace=True)

如果您不想保留 DataFrame 对象并按照您在问题中指定的格式进行进一步计算,请执行以下操作:

df.to_dict('records')

编辑:找到了一种有效的方法!