以更优雅的方式解析数据
Parsing data in more elegant way
我有一个日志文件,其中包含用户完成的一些活动,该文件是以下格式的 csv:
DATE,SCORE,STATUS,ACTIVITY_ID
我正在解析 csv,然后在三个相似的视图(每日、每周、每月)中显示它。
我设法正确显示了它,但我的代码看起来丑陋且重复,非常不符合 Python 规范。
这是我的:
import datetime
def update_views(log_file):
    """Aggregate activity log lines into per-day, per-week and per-month views.

    Args:
        log_file: iterable of text lines, each formatted as
            ``DATE,SCORE,STATUS,ACTIVITY_ID`` with ``DATE`` as
            ``YYYY-MM-DD``. Blank lines are skipped.

    Returns:
        A ``(log_day, log_week, log_month)`` tuple of dicts. Keys are
        ``"Day N"``, ``"Week N"`` and the month name respectively. Each value
        is ``{"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": int}``
        where the status dicts count occurrences per activity id and
        ``"Score"`` is the score of the last line seen for that period.

    Raises:
        KeyError: if a line carries a status other than the four above.
        ValueError: if the date or the score cannot be parsed.
    """

    def _bucket(view, key):
        # Create the record for a period the first time it is seen.
        if key not in view:
            view[key] = {"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": 0}
        return view[key]

    log_day = {}
    log_week = {}
    log_month = {}
    day = 0
    week = 1
    cur_date = None
    for line in log_file:
        record = line.strip()
        if not record:
            continue
        data = record.split(",")
        year, month, day_of_month = data[0].split("-")
        date = datetime.date(int(year), int(month), int(day_of_month))
        # Compare whole dates: comparing only the day-of-month would merge
        # e.g. January 13th and February 13th into a single "Day".
        if date != cur_date:
            cur_date = date
            day += 1
            # NOTE(review): the week advances when the running day count hits
            # a multiple of 7, so "Week 1" only holds days 1-6. Kept as in the
            # original; confirm whether 7-day weeks were intended.
            if day % 7 == 0:
                week += 1
        score = int(data[1])
        status = data[2]
        item_name = data[3]
        buckets = (
            _bucket(log_day, "Day %i" % day),
            _bucket(log_week, "Week %i" % week),
            # Keyed by month name only, so the same month of different years
            # shares one record.
            _bucket(log_month, date.strftime("%B")),
        )
        for bucket in buckets:
            counts = bucket[status]
            counts[item_name] = counts.get(item_name, 0) + 1
            bucket["Score"] = score
    return log_day, log_week, log_month
# Sample log records in DATE,SCORE,STATUS,ACTIVITY_ID form, one string per line.
log_file = [
    "2015-01-1,0,Add,DW_05",
    "2015-01-2,-1,Post,CR_02",
    "2015-01-3,-1,Comp,DIY_01",
    "2015-01-3,-1,Post,CD_01",
    "2015-01-4,-1,Miss,D_03",
    "2015-01-4,0,Miss,D_03",
    "2015-01-4,-1,Miss,CD_01",
    "2015-01-4,0,Miss,LS_04",
    "2015-01-5,1,Comp,DW_05",
    "2015-01-6,1,Comp,ANI_06",
    "2015-01-6,1,Comp,LS_04",
    "2015-01-7,1,Comp,NMW_07",
    "2015-01-7,1,Post,DW_05",
    "2015-01-7,1,Miss,LP_08",
    "2015-01-8,2,Post,CR_02",
    "2015-01-8,2,Miss,SEV_09",
    "2015-01-10,3,Comp,M_10",
    "2015-01-10,3,Add,NID_11",
    "2015-01-11,2,Add,ANI_06",
    "2015-01-12,1,Add,VF_12",
    "2015-01-12,0,Miss,DIY_01",
    "2015-01-12,1,Add,NID_11",
    "2015-01-12,0,Miss,D_03",
    "2015-01-13,1,Miss,SEV_09",
    "2015-01-13,2,Add,DW_05",
    "2015-01-13,1,Comp,NMW_07",
    "2015-01-13,1,Add,CPC_12",
]
# Feed the sample records through the aggregation.
update_views(log_file)
我需要帮助将其重构为更清晰的代码,我不喜欢使用那么多变量(day、week、cur_day)以及重复的 try/except。
如果您的环境中有 Pandas,那么解析 CSV 的最紧凑和面向未来的方法就是 read_csv。结果是 pandas DataFrame,可以查询、转换、透视(pivot),最后以多种格式写出,包括 HTML。
代码可以像下面这样简洁:
import pandas as pd

# read_csv is the pandas CSV entry point (there is no pd.import_csv).
# A regex separator such as r"\t+" is only handled by the Python parsing
# engine, so it is requested explicitly instead of relying on the fallback.
df = pd.read_csv('file.csv', sep=r"\t+", engine="python")
Python 有一个 csv 模块。
import csv

# csv.reader wraps an iterable of lines (such as an open file); it neither
# opens a path itself nor acts as a context manager, so the file is opened
# separately. newline='' is what the csv documentation asks for on Python 3.
with open('path/to/file', newline='') as csv_file:
    d = list(csv.reader(csv_file, csv.excel_tab))
# d is a list of rows, each row being a list of string values
然后
import datetime
def parse_date(strdt):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    Raises:
        ValueError: if *strdt* does not match the ``%Y-%m-%d`` format.
    """
    return datetime.datetime.strptime(strdt, '%Y-%m-%d')
最后,查看 https://docs.python.org/2/library/collections.html#collections.Counter
在 codereview 的帮助下,我做了这个 class:
class TreeData:
    """Set the data structure to be used for the QTreeViews."""

    def __init__(self, name):
        # name is the view label ("Day", "Week" or "Month"); data maps a
        # period label to its counters.
        self.name = name
        self.data = {}

    def add_item(self, key, status, item_name, score):
        """Record one activity under the period identified by *key*.

        For every view except "Month" the stored label is the view name
        followed by the numeric *key* (e.g. ``"Day 3"``); the "Month" view
        uses *key* unchanged. Each period holds one ``defaultdict(int)`` per
        status (Comp/Miss/Post/Add) counting activities by name, plus a
        running "Score" to which ``int(score)`` is added.
        """
        label = key if self.name == "Month" else '%s %i' % (self.name, key)
        entry = self.data.get(label)
        if entry is None:
            # First record for this period: build the empty counters.
            entry = {
                status_name: defaultdict(int)
                for status_name in ("Comp", "Miss", "Post", "Add")
            }
            entry["Score"] = 0
            self.data[label] = entry
        entry[status][item_name] += 1
        entry["Score"] += int(score)
@classmethod
def setup(cls, main_window):
"""Main method of the class, is used to read and parse the file and set the structure for the QTrees"""
day_n = 0
cur_day = None
week_n = 1
cls.day = TreeData("Day")
cls.week = TreeData("Week")
cls.month = TreeData("Month")
try:
with open("log_file.txt") as log_file:
for line in log_file:
# Splits the data into a meaningful way
date, score_change, status, item_name = line.strip().split("\t")
year, month, day = map(int, date.split("-"))
month_name = datetime.date(year, month, day).strftime("%B")
# sets the day/week numbers
if cur_day != day:
cur_day = day
day_n += 1
if day_n % 7 == 0:
week_n += 1
# structure the QTrees
cls.day.add_item(day_n, status, item_name, score_change)
cls.week.add_item(week_n, status, item_name, score_change)
cls.month.add_item(month_name, status, item_name, score_change)
我有一个日志文件,其中包含用户完成的一些活动,该文件是以下格式的 csv:
DATE,SCORE,STATUS,ACTIVITY_ID
我正在解析 csv,然后在三个相似的视图(每日、每周、每月)中显示它。
我设法正确显示了它,但我的代码看起来丑陋且重复,非常不符合 Python 规范。
这是我的:
import datetime
def update_views(log_file):
    """Aggregate activity log lines into per-day, per-week and per-month views.

    Args:
        log_file: iterable of text lines, each formatted as
            ``DATE,SCORE,STATUS,ACTIVITY_ID`` with ``DATE`` as
            ``YYYY-MM-DD``. Blank lines are skipped.

    Returns:
        A ``(log_day, log_week, log_month)`` tuple of dicts. Keys are
        ``"Day N"``, ``"Week N"`` and the month name respectively. Each value
        is ``{"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": int}``
        where the status dicts count occurrences per activity id and
        ``"Score"`` is the score of the last line seen for that period.

    Raises:
        KeyError: if a line carries a status other than the four above.
        ValueError: if the date or the score cannot be parsed.
    """

    def _bucket(view, key):
        # Create the record for a period the first time it is seen.
        if key not in view:
            view[key] = {"Comp": {}, "Miss": {}, "Post": {}, "Add": {}, "Score": 0}
        return view[key]

    log_day = {}
    log_week = {}
    log_month = {}
    day = 0
    week = 1
    cur_date = None
    for line in log_file:
        record = line.strip()
        if not record:
            continue
        data = record.split(",")
        year, month, day_of_month = data[0].split("-")
        date = datetime.date(int(year), int(month), int(day_of_month))
        # Compare whole dates: comparing only the day-of-month would merge
        # e.g. January 13th and February 13th into a single "Day".
        if date != cur_date:
            cur_date = date
            day += 1
            # NOTE(review): the week advances when the running day count hits
            # a multiple of 7, so "Week 1" only holds days 1-6. Kept as in the
            # original; confirm whether 7-day weeks were intended.
            if day % 7 == 0:
                week += 1
        score = int(data[1])
        status = data[2]
        item_name = data[3]
        buckets = (
            _bucket(log_day, "Day %i" % day),
            _bucket(log_week, "Week %i" % week),
            # Keyed by month name only, so the same month of different years
            # shares one record.
            _bucket(log_month, date.strftime("%B")),
        )
        for bucket in buckets:
            counts = bucket[status]
            counts[item_name] = counts.get(item_name, 0) + 1
            bucket["Score"] = score
    return log_day, log_week, log_month
# Sample log records in DATE,SCORE,STATUS,ACTIVITY_ID form, one string per line.
log_file = [
    "2015-01-1,0,Add,DW_05",
    "2015-01-2,-1,Post,CR_02",
    "2015-01-3,-1,Comp,DIY_01",
    "2015-01-3,-1,Post,CD_01",
    "2015-01-4,-1,Miss,D_03",
    "2015-01-4,0,Miss,D_03",
    "2015-01-4,-1,Miss,CD_01",
    "2015-01-4,0,Miss,LS_04",
    "2015-01-5,1,Comp,DW_05",
    "2015-01-6,1,Comp,ANI_06",
    "2015-01-6,1,Comp,LS_04",
    "2015-01-7,1,Comp,NMW_07",
    "2015-01-7,1,Post,DW_05",
    "2015-01-7,1,Miss,LP_08",
    "2015-01-8,2,Post,CR_02",
    "2015-01-8,2,Miss,SEV_09",
    "2015-01-10,3,Comp,M_10",
    "2015-01-10,3,Add,NID_11",
    "2015-01-11,2,Add,ANI_06",
    "2015-01-12,1,Add,VF_12",
    "2015-01-12,0,Miss,DIY_01",
    "2015-01-12,1,Add,NID_11",
    "2015-01-12,0,Miss,D_03",
    "2015-01-13,1,Miss,SEV_09",
    "2015-01-13,2,Add,DW_05",
    "2015-01-13,1,Comp,NMW_07",
    "2015-01-13,1,Add,CPC_12",
]
# Feed the sample records through the aggregation.
update_views(log_file)
我需要帮助将其重构为更清晰的代码,我不喜欢使用那么多变量(day、week、cur_day)以及重复的 try/except。
如果您的环境中有 Pandas,那么解析 CSV 的最紧凑和面向未来的方法就是 read_csv。结果是 pandas DataFrame,可以查询、转换、透视(pivot),最后以多种格式写出,包括 HTML。
代码可以像下面这样简洁:
import pandas as pd
# read_csv is the pandas CSV entry point (there is no pd.import_csv).
# A regex separator such as r"\t+" is only handled by the Python parsing
# engine, so it is requested explicitly instead of relying on the fallback.
df = pd.read_csv('file.csv', sep=r"\t+", engine="python")
Python 有一个 csv 模块。
import csv

# csv.reader wraps an iterable of lines (such as an open file); it neither
# opens a path itself nor acts as a context manager, so the file is opened
# separately. newline='' is what the csv documentation asks for on Python 3.
with open('path/to/file', newline='') as csv_file:
    d = list(csv.reader(csv_file, csv.excel_tab))
# d is a list of rows, each row being a list of string values
然后
import datetime
def parse_date(strdt):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    Raises:
        ValueError: if *strdt* does not match the ``%Y-%m-%d`` format.
    """
    return datetime.datetime.strptime(strdt, '%Y-%m-%d')
最后,查看 https://docs.python.org/2/library/collections.html#collections.Counter
在 codereview 的帮助下,我做了这个 class:
class TreeData:
    """Set the data structure to be used for the QTreeViews."""

    def __init__(self, name):
        # name is the view label ("Day", "Week" or "Month"); data maps a
        # period label to its counters.
        self.name = name
        self.data = {}

    def add_item(self, key, status, item_name, score):
        """Record one activity under the period identified by *key*.

        For every view except "Month" the stored label is the view name
        followed by the numeric *key* (e.g. ``"Day 3"``); the "Month" view
        uses *key* unchanged. Each period holds one ``defaultdict(int)`` per
        status (Comp/Miss/Post/Add) counting activities by name, plus a
        running "Score" to which ``int(score)`` is added.
        """
        label = key if self.name == "Month" else '%s %i' % (self.name, key)
        entry = self.data.get(label)
        if entry is None:
            # First record for this period: build the empty counters.
            entry = {
                status_name: defaultdict(int)
                for status_name in ("Comp", "Miss", "Post", "Add")
            }
            entry["Score"] = 0
            self.data[label] = entry
        entry[status][item_name] += 1
        entry["Score"] += int(score)
@classmethod
def setup(cls, main_window):
"""Main method of the class, is used to read and parse the file and set the structure for the QTrees"""
day_n = 0
cur_day = None
week_n = 1
cls.day = TreeData("Day")
cls.week = TreeData("Week")
cls.month = TreeData("Month")
try:
with open("log_file.txt") as log_file:
for line in log_file:
# Splits the data into a meaningful way
date, score_change, status, item_name = line.strip().split("\t")
year, month, day = map(int, date.split("-"))
month_name = datetime.date(year, month, day).strftime("%B")
# sets the day/week numbers
if cur_day != day:
cur_day = day
day_n += 1
if day_n % 7 == 0:
week_n += 1
# structure the QTrees
cls.day.add_item(day_n, status, item_name, score_change)
cls.week.add_item(week_n, status, item_name, score_change)
cls.month.add_item(month_name, status, item_name, score_change)