以更优雅的方式解析数据

Parsing data in more elegant way

我有一个日志文件,其中包含用户完成的一些活动,该文件是以下格式的 csv:

DATE,SCORE,STATUS,ACTIVITY_ID

我正在解析 csv,然后在三个相似的视图(每日、每周、每年)中显示它。

我设法正确显示了它,但我的代码看起来丑陋且重复,非常不符合 Python 规范。

这是我的:

import datetime

def update_views(log_file):
    log_day = {}
    log_week = {}
    log_month = {}
    day = 0
    cur_day = None
    week = 1
    for line in log_file:
        data = line.strip().split(",")

        year, month, _day = data[0].split("-")

        if cur_day != _day:
            cur_day = _day
            day += 1
            if day % 7 == 0:
                week += 1

        month_long = datetime.date(int(year), int(month), int(_day)).strftime("%B")

        if month_long not in log_month:
            log_month[month_long] = {"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": 0}
        if "Week %i" % week not in log_week:
            log_week["Week %i" % week] = {"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": 0}
        if "Day %i" % day not in log_day:
            log_day["Day %i" % day] = {"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": 0}

        current_score = data[1]
        status = data[2]
        item_name = data[3]

        try:
            log_day["Day %i" % day][status][item_name] += 1
        except KeyError:
            log_day["Day %i" % day][status][item_name] = 1

        try:
            log_week["Week %i" % week][status][item_name] += 1
        except KeyError:
            log_week["Week %i" % week][status][item_name] = 1

        try:
            log_month[month_long][status][item_name] += 1
        except KeyError:
            log_month[month_long][status][item_name] = 1

        log_day["Day %i" % day]["Score"] = int(current_score)
        log_week["Week %i" % week]["Score"] = int(current_score)
        log_month[month_long]["Score"] = int(current_score)

log_file =   """2015-01-1,0,Add,DW_05
                2015-01-2,-1,Post,CR_02
                2015-01-3,-1,Comp,DIY_01
                2015-01-3,-1,Post,CD_01
                2015-01-4,-1,Miss,D_03
                2015-01-4,0,Miss,D_03
                2015-01-4,-1,Miss,CD_01
                2015-01-4,0,Miss,LS_04
                2015-01-5,1,Comp,DW_05
                2015-01-6,1,Comp,ANI_06
                2015-01-6,1,Comp,LS_04
                2015-01-7,1,Comp,NMW_07
                2015-01-7,1,Post,DW_05
                2015-01-7,1,Miss,LP_08
                2015-01-8,2,Post,CR_02
                2015-01-8,2,Miss,SEV_09
                2015-01-10,3,Comp,M_10
                2015-01-10,3,Add,NID_11
                2015-01-11,2,Add,ANI_06
                2015-01-12,1,Add,VF_12
                2015-01-12,0,Miss,DIY_01
                2015-01-12,1,Add,NID_11
                2015-01-12,0,Miss,D_03
                2015-01-13,1,Miss,SEV_09
                2015-01-13,2,Add,DW_05
                2015-01-13,1,Comp,NMW_07
                2015-01-13,1,Add,CPC_12""".splitlines()

update_views(log_file)

我需要帮助将其分解为更清晰的代码,我不喜欢使用那么多变量(天、周、cur_day)和 try/except 重复。

如果您的环境中有 Pandas,那么解析 CSV 的最紧凑和面向未来的方法就是 read_csv。结果是 pandas DataFrame,可以查询、转换、旋转,最后以多种格式写入,包括 HTML.

代码可以像

一样简约
import pandas as pd
df = pd.import_csv('file.csv', sep=r"\t+")

Python 有一个 csv 模块。

import csv
with csv.reader('path/to/file',csv.excel_tab) as d:
    pass
    #d is a list with values

然后

import datetime
def parse_date(strdt):
    return datetime.datetime.strptime(strdt, '%Y-%m-%d)

最后,查看 https://docs.python.org/2/library/collections.html#collections.Counter

在 codereview 的帮助下,我做了这个 class:

class TreeData:
    """Set the data structure to be used for the QTreeViews."""

    def __init__(self, name):
        self.name = name
        self.data = {}

    def add_item(self, key, status, item_name, score):
        """
        Sets the structure
                Which consists of a dict with nested defaultdict(int)
                for completed/missed/postponed/added activities and Score
        """
        if self.name != "Month":
            key = '%s %i' % (self.name, key)

        if key not in self.data:
            self.data[key] = {"Comp": defaultdict(int),
                              "Miss": defaultdict(int),
                              "Post": defaultdict(int),
                              "Add": defaultdict(int),
                              "Score": 0}

        self.data[key][status][item_name] += 1
        self.data[key]["Score"] += int(score)

    @classmethod
    def setup(cls, main_window):
        """Main method of the class, is used to read and parse the file and set the structure for the QTrees"""
        day_n = 0
        cur_day = None
        week_n = 1

        cls.day = TreeData("Day")
        cls.week = TreeData("Week")
        cls.month = TreeData("Month")

        try:
            with open("log_file.txt") as log_file:
                for line in log_file:
                    # Splits the data into a meaningful way
                    date, score_change, status, item_name = line.strip().split("\t")
                    year, month, day = map(int, date.split("-"))
                    month_name = datetime.date(year, month, day).strftime("%B")

                    # sets the day/week numbers
                    if cur_day != day:
                        cur_day = day
                        day_n += 1
                        if day_n % 7 == 0:
                            week_n += 1

                    # structure the QTrees
                    cls.day.add_item(day_n, status, item_name, score_change)
                    cls.week.add_item(week_n, status, item_name, score_change)
                    cls.month.add_item(month_name, status, item_name, score_change)