
Transform list of strings of commit-details to structured dictionary applying grouping by name and date

根据我的数据,我想按如下形式展示:其中 commit 键对应一个数组,包含在特定日期完成的所有提交。这是我期望的输出:

    "Dan Ab": [
            "2014-05-2": {
                "commit_count": "1",
                "commit": [{ 'commit_hash': {'lines_added': 10, 'lines_removed': 4 }}]
            },
            "2014-05-3": {
                "commit_count": "2",
                "commit": [
                    { 'commit_hash': {'lines_added': 10, 'lines_removed': 4 }},
                    { 'commit_hash': {'lines_added': 14, 'lines_removed': 0 }},
                ]
            }
    ],
    "John": [
        "2020-10-14": {
            "commit_count": "1",
            "commit": [{ 'commit_hash': {'lines_added': 1740, 'lines_removed': 10 }}]
        }
    ]



import re

# Sample input: one record per commit, formatted as
# "<hash>;<author>;<date>;<diff summary>".
merged_result = [
    "43f4cc160;Dan Ab;2021-06-17; 1 file changed, 10 insertions(+), 19 deletions(-)",
    "6cbf2a8b3;Dan Ab;2021-06-15; 1 file changed, 14303 insertions(+)",
    "c0a77029c;Dan Ab;2021-06-15; 1 file changed, 1 insertion(+), 1 deletion(-)",
    "f283d7524;Dan Ab;2021-06-15; 1 file changed, 5260 deletions(-)",
    "03c5314b4;Dan Ab;2021-06-15; 5 files changed, 5265 insertions(+), 12690 deletions(-)",
    "daf38ecdf;Dan Ab;2020-12-11; 1 file changed, 8 insertions(+)",
    "b5eabd543;Dan Ab;2020-10-14; 1 file changed, 17 insertions(+)",
    "6d50a9d09;Dan Ab;2020-10-14; 43 files changed, 15740 insertions(+), 1 deletion(-)",
    "7d59n9d09;John;2020-10-14; 4 files changed, 1740 insertions(+), 10 deletion(-)",
]

# Commits grouped per author; starts empty.
coding_days = {}

# Running totals over all commits; start at zero.
total_lines = 0
total_lines_added = 0
total_lines_removed = 0
total_files_changed = 0

def getstatsummarycounts(line):
    """Extract the counts from a diff summary line.

    Example: ``"1 file changed, 5 insertions(+), 1 deletion(-)"`` returns
    ``['1', '5', '1']``.

    The result is ordered ``[files_changed, insertions, deletions]``. Counts
    found in the text are returned as strings; a count that is absent from
    the line is filled in with the integer ``0``. Lines that do not contain
    one to three numbers in a recognised layout are returned unpadded, so
    callers should check that the result has exactly three entries.

    Args:
        line: The summary text to parse.

    Returns:
        A list of the numbers found in ``line``, padded as described above.
    """
    numbers = re.findall(r"\d+", line)
    if len(numbers) == 1:
        # neither insertions nor deletions: may probably only happen
        # for "0 files changed"
        numbers.extend([0, 0])
    elif len(numbers) == 2 and "(+)" in line:
        numbers.append(0)  # only insertions were printed on line
    elif len(numbers) == 2 and "(-)" in line:
        numbers.insert(1, 0)  # only deletions were printed on line
    return numbers

# Group every commit under its author and commit date while accumulating
# the overall totals. Resulting shape of coding_days:
#   {author: {date: {"commit_count": int, "commit": [{hash: {...}}, ...]}}}
for result in merged_result:
    # maxsplit=3 keeps any ";" inside the trailing summary text intact.
    commit_hash, author, commit_date, logs = result.split(";", 3)
    numbers = getstatsummarycounts(logs)
    if len(numbers) != 3:
        # Summary could not be reduced to files/insertions/deletions;
        # leave this record out of both the totals and the grouping.
        continue

    files_changed, inserted, deleted = map(int, numbers)
    total_lines += inserted - deleted
    total_lines_added += inserted
    total_lines_removed += deleted
    total_files_changed += files_changed

    # setdefault creates the author and date buckets on first sight, so
    # the first commit of a day is recorded just like the later ones.
    day = coding_days.setdefault(author, {}).setdefault(
        commit_date, {"commit_count": 0, "commit": []}
    )
    day["commit_count"] += 1
    day["commit"].append(
        {
            commit_hash: {
                "lines_added": inserted,
                "lines_deleted": deleted,
            }
        }
    )




# Sample input: one record per commit, formatted as
# "<hash>;<author>;<date>;<diff summary>". The last record is deliberately
# malformed to exercise the skip path below.
merged_result = [
    "43f4cc160;Dan Ab;2021-06-17; 1 file changed, 10 insertions(+), 19 deletions(-)",
    "6cbf2a8b3;Dan Ab;2021-06-15; 1 file changed, 14303 insertions(+)",
    "c0a77029c;Dan Ab;2021-06-15; 1 file changed, 1 insertion(+), 1 deletion(-)",
    "f283d7524;Dan Ab;2021-06-15; 1 file changed, 5260 deletions(-)",
    "03c5314b4;Dan Ab;2021-06-15; 5 files changed, 5265 insertions(+), 12690 deletions(-)",
    "daf38ecdf;Dan Ab;2020-12-11; 1 file changed, 8 insertions(+)",
    "b5eabd543;Dan Ab;2020-10-14; 1 file changed, 17 insertions(+)",
    "6d50a9d09;Dan Ab;2020-10-14; 43 files changed, 15740 insertions(+), 1 deletion(-)",
    "6d50a9d09;Dan Ab;2020-10-14; Steak and Fries, no Salad",
]


import re

# Result shape: {name: {date: {"commit_count": int, "commit": [...]}}}
grouped = {}
# Captures the files count, then the optional insertions and deletions counts.
pattern = r"(\d+) file[^,]*(?:\, (\d+) ins[^,]+)?(?:\, (\d+) del.+)?$"

for line in merged_result:
    # maxsplit=3 keeps any ";" inside the trailing summary text intact.
    tag, name, date, changes = line.split(";", 3)
    try:
        # this will throw "NoneType" has no .groups() if not matched
        files, inserts, deletes = re.search(pattern, changes).groups()
        # Optional groups that did not take part in the match are None.
        inserts, deletes = inserts or "0", deletes or "0"
    except AttributeError as a:
        print("Skipping: '",line, 
              "': cannot match data by regex to get changed/inserted/deleted\n", a)
        # Without this the record would be stored using the values left
        # over from the previous iteration.
        continue
    nameDict = grouped.setdefault(name, {})
    dateDict = nameDict.setdefault(date, {})
    dateDict.setdefault("commit_count", 0)
    dateDict["commit_count"] += 1
    commList = dateDict.setdefault("commit", [])
    commList.append({"commit_hash": {"tag": tag, "files": files, 
                     "lines_added": inserts, "lines_removed": deletes}})



Skipping: ' 6d50a9d09;Dan Ab;2020-10-14; Steak and Fries, no Salad ': cannot match data by regex to get changed/inserted/deleted

{'Dan Ab': {'2021-06-17': {'commit_count': 1, 'commit': [{'commit_hash': {'tag': '43f4cc160', 'files': '1', 'lines_added': '10', 'lines_removed': '19'}}]}, '2021-06-15': {'commit_count': 4, 'commit': [{'commit_hash': {'tag': '6cbf2a8b3', 'files': '1', 'lines_added': '14303', 'lines_removed': '0'}}, {'commit_hash': {'tag': 'c0a77029c', 'files': '1', 'lines_added': '1', 'lines_removed': '1'}}, {'commit_hash': {'tag': 'f283d7524', 'files': '1', 'lines_added': '0', 'lines_removed': '5260'}}, {'commit_hash': {'tag': '03c5314b4', 'files': '5', 'lines_added': '5265', 'lines_removed': '12690'}}]}, '2020-12-11': {'commit_count': 1, 'commit': [{'commit_hash': {'tag': 'daf38ecdf', 'files': '1', 'lines_added': '8', 'lines_removed': '0'}}]}, '2020-10-14': {'commit_count': 2, 'commit': [{'commit_hash': {'tag': 'b5eabd543', 'files': '1', 'lines_added': '17', 'lines_removed': '0'}}, {'commit_hash': {'tag': '6d50a9d09', 'files': '43', 'lines_added': '15740', 'lines_removed': '1'}}]}}}

将结果字典重新格式化后(使用 Google 搜索到的第一个格式化工具):

    "Dan Ab":{

r"(\d+) file[^,]*(?:\, (\d+) ins[^,]+)?(?:\, (\d+) del.+)?$" 的解释:

我使用的是 re.search,因此匹配不必从字符串的开头开始。我要匹配的是:

(\d+) file[^,]*         a number followed by " file", then consuming
                        anything up to (but excluding) the next ",";
                        the number is captured in a group

(?:\, (\d+) ins[^,]+)?
(?:\, (\d+) del.+)?     are similar: 0 or 1 occurrence of ", "
                        followed by a captured number
                        followed by a space and some text;
                        after "ins" we consume anything up to (but
                        excluding) the next ",";
                        after "del" we simply consume anything

$                       followed by end of string

如果可选组没有参与匹配,其捕获结果为 None,因此使用 inserts, deletes = inserts or "0", deletes or "0"

将它们转换为 "0"。

如果您需要更快的速度,您可以使用defaultdict(),但dict.setdefault 也可以。