使用 Python 解析日志文件
Parsing log file using Python
我有以下日志文件,我想使用 Python 3.4
将其拆分并放入有序数据结构(类似于列表的列表)中。
文件遵循以下结构:
Month #1
1465465464555
345646546454
442343423433
724342342655
34324233454
24543534533
***Day # 1
5465465465465455
644654654654454
4435423534833
***Day #2
24876867655
74654654454
643876867433
***Day #3
445543534655
344876867854
64365465433
Month #2
7454353455
84756756454
64563453433
***Day # 1
44756756655
34453453454
243867867433
***Day #2
64465465455
74454353454
34878733
***Day #3
1449898955
643434354
843090909888433
目标是能够按月份循环,并且能够分别处理每一天的数据。
我应该可以做这样的事情:
# Desired usage sketch: walk the months, then the days, then each number.
# NOTE(review): `months` and `days` are not defined in this snippet; `days`
# is presumably the days of the current `month` - confirm with the author.
for month in months:
    for day in days:
        for number in day:
            print(number)
我采用的从文件中提取月份的解决方案如下,但这不是一个明智的解决方案。我需要更好的东西
# Split log.txt into one file per month ("1.txt", "2.txt", ...) and then
# print the lines of each generated file.
with open("log.txt", "r") as in_file:
    righe = in_file.readlines()

# Index of every line that contains a "Month" header.
lista = [i for i, riga in enumerate(righe) if "Month" in riga]
# End sentinel. It is used as an *exclusive* slice bound below, so it must be
# len(righe); len(righe) - 1 would drop the last line of the last month.
lista.append(len(righe))

counter = 1
for i in range(len(lista) - 1):
    # Lines from this month's header up to (not including) the next header.
    with open(str(counter) + ".txt", "w") as out_file:
        out_file.writelines(righe[lista[i]:lista[i + 1]])
    counter = counter + 1

for i in range(1, counter):
    print("Month: ", i)
    with open(str(i) + ".txt", "r") as mano:
        righe = mano.readlines()
    print(righe)
标准库中的 itertools.groupby
是处理此类工作的强大工具。下面的代码先按月查找行组,再在每个月内按天分组,构建嵌套数据结构。完成后,您可以按月迭代该结构,并在每个月内按天迭代。
from itertools import groupby

# Sample log contents, one entry per list element.
data = """\
Month #1
1465465464555
345646546454
442343423433
724342342655
34324233454
24543534533
***Day # 1
5465465465465455
644654654654454
4435423534833
***Day #2
24876867655
74654654454
643876867433
***Day #3
445543534655
344876867854
64365465433
Month #2
7454353455
84756756454
64563453433
***Day # 1
44756756655
34453453454
243867867433
***Day #2
64465465455
74454353454
34878733
***Day #3
1449898955
643434354
843090909888433""".splitlines()
# or data = open(data_file).read().splitlines()


# some simple boolean functions to detect Month and Day marker lines
def is_month_line(s):
    """Return True if line *s* starts with the 'Month' marker."""
    return s.startswith("Month")


def is_day_line(s):
    """Return True if line *s* starts with the '***Day' marker."""
    return s.startswith("***Day")


# Resulting structure:
#   [[month_label, [[day_label, [value, ...]], ...]], ...]
grouped_data = []
for is_month, month_lines in groupby(data, key=is_month_line):
    if is_month:
        # groupby puts consecutive marker lines into a single run, so register
        # every marker in the run instead of only the first one.
        for current_month in month_lines:
            current_month_data = []
            grouped_data.append([current_month, current_month_data])
            # set up blank day for month-level data lines
            current_day = ''
            current_day_data = []
            current_month_data.append([current_day, current_day_data])
    else:
        # found group of non-'Month' lines, group by days
        for is_day, day_lines in groupby(month_lines, key=is_day_line):
            if is_day:
                # day marker(s) detected; drop the leading '***' from the label
                for day_line in day_lines:
                    current_day = day_line[3:]
                    current_day_data = []
                    current_month_data.append([current_day, current_day_data])
            else:
                # all non-day lines, add to current day's data
                current_day_data.extend(day_lines)
使用 pprint
输出嵌套列表:
from pprint import pprint
# Pretty-print the nested month/day list, passing width=120 to pprint.
pprint(grouped_data, width=120)
给出:
[['Month #1',
[['', ['1465465464555', '345646546454', '442343423433', '724342342655', '34324233454', '24543534533']],
['Day # 1', ['5465465465465455', '644654654654454', '4435423534833']],
['Day #2', ['24876867655', '74654654454', '643876867433']],
['Day #3', ['445543534655', '344876867854', '64365465433']]]],
['Month #2',
[['', ['7454353455', '84756756454', '64563453433']],
['Day # 1', ['44756756655', '34453453454', '243867867433']],
['Day #2', ['64465465455', '74454353454', '34878733']],
['Day #3', ['1449898955', '643434354', '843090909888433']]]]]
如果你想沿着嵌套的字典路线走:
# Build nested dicts: log[month_number][day_number] -> list of entries.
# NOTE: an entry line that appears before the first 'Month' line raises
# KeyError, because no month dict exists yet.
month, day = 0, 0
log = {}
with open("log.txt") as f:
    for line in f:
        if 'Month' in line:
            # New month: restart day numbering. Day 0 collects the entries
            # that come before the month's first Day marker.
            month += 1
            day = 0
            log[month] = {0: []}
        elif 'Day' in line:
            day += 1
            log[month][day] = []
        else:
            log[month][day].append(line.strip())
请注意,我假设紧跟在月份行之后的条目是第 0 天。结构现在如下所示:
>>> from pprint import pprint
>>> pprint(log)
{1: {0: ['1465465464555',
'345646546454',
'442343423433',
'724342342655',
'34324233454',
'24543534533'],
1: ['5465465465465455', '644654654654454', '4435423534833'],
2: ['24876867655', '74654654454', '643876867433'],
3: ['445543534655', '344876867854', '64365465433']},
2: {0: ['7454353455', '84756756454', '64563453433'],
1: ['44756756655', '34453453454', '243867867433'],
2: ['64465465455', '74454353454', '34878733'],
3: ['1449898955', '643434354', '843090909888433']}}
您可以通过以下方式对其进行迭代:
# Walk the nested dict in ascending month and day order and print each entry.
for month_index in sorted(log):
    month = log[month_index]
    for day_index in sorted(month):
        day = month[day_index]
        for number in day:
            print(number)
嗯,对于这个问题我们几乎没有答案。
这是我的贡献,我使用一些递归解决方案解决了这个问题。所以,换个思路:
def loop(stopParam, startArr, resultArr=None):
    """Recursively collect leading items of startArr until a stop marker.

    Args:
        stopParam: substring that marks the item at which to stop.
        startArr: list of strings still to be examined.
        resultArr: items collected so far; a new list is used when omitted.

    Returns:
        A tuple (collected, remaining): `collected` holds resultArr plus every
        leading item of startArr that does not contain stopParam; `remaining`
        is the rest of startArr, starting at the first item containing
        stopParam (empty if there is none).

    Note:
        One recursive call is made per collected item, so very long runs can
        hit Python's recursion limit.
    """
    if resultArr is None:
        # A fresh list per call: the base case returns this object to the
        # caller, so a shared default list could be polluted between calls.
        resultArr = []
    if not startArr or stopParam in startArr[0]:
        return (resultArr, startArr)
    return loop(stopParam, startArr[1:], resultArr + [startArr[0]])
def buildList(arr, testVal=None):
    """Recursively map each 'Month' header line to the lines that follow it.

    Args:
        arr: list of log lines; processing continues only while the first
            remaining line contains 'Month'.
        testVal: dict to add the results to; a new dict is used when omitted.

    Returns:
        The dict mapping each month header line (exactly as it appears in
        arr) to the list of lines up to the next line containing 'Month'.
    """
    if testVal is None:
        # A fresh dict per call: the original shared default dict was mutated,
        # so results accumulated across separate calls.
        testVal = {}
    if arr and 'Month' in arr[0]:
        # res[0]: lines belonging to this month; res[1]: remaining lines.
        res = loop('Month', arr[1:])
        testVal[arr[0]] = res[0]
        return buildList(res[1], testVal)
    return testVal
# Read every line of test.txt and print the month -> lines mapping.
with open("test.txt", "r") as in_file:
    righe = in_file.readlines()
# print() function form, since the question targets Python 3.
print(buildList(righe))
这是一个解决方案。
我有以下日志文件,我想使用 Python 3.4
将其拆分并放入有序数据结构(类似于列表的列表)中。文件遵循以下结构:
Month #1
1465465464555
345646546454
442343423433
724342342655
34324233454
24543534533
***Day # 1
5465465465465455
644654654654454
4435423534833
***Day #2
24876867655
74654654454
643876867433
***Day #3
445543534655
344876867854
64365465433
Month #2
7454353455
84756756454
64563453433
***Day # 1
44756756655
34453453454
243867867433
***Day #2
64465465455
74454353454
34878733
***Day #3
1449898955
643434354
843090909888433
目标是能够按月份循环,并且能够分别处理每一天的数据。我应该可以做这样的事情:
# Desired usage sketch: walk the months, then the days, then each number.
# NOTE(review): `months` and `days` are not defined in this snippet; `days`
# is presumably the days of the current `month` - confirm with the author.
for month in months:
    for day in days:
        for number in day:
            print(number)
我采用的从文件中提取月份的解决方案如下,但这不是一个明智的解决方案。我需要更好的东西
# Split log.txt into one file per month ("1.txt", "2.txt", ...) and then
# print the lines of each generated file.
with open("log.txt", "r") as in_file:
    righe = in_file.readlines()

# Index of every line that contains a "Month" header.
lista = [i for i, riga in enumerate(righe) if "Month" in riga]
# End sentinel. It is used as an *exclusive* slice bound below, so it must be
# len(righe); len(righe) - 1 would drop the last line of the last month.
lista.append(len(righe))

counter = 1
for i in range(len(lista) - 1):
    # Lines from this month's header up to (not including) the next header.
    with open(str(counter) + ".txt", "w") as out_file:
        out_file.writelines(righe[lista[i]:lista[i + 1]])
    counter = counter + 1

for i in range(1, counter):
    print("Month: ", i)
    with open(str(i) + ".txt", "r") as mano:
        righe = mano.readlines()
    print(righe)
itertools.groupby
是处理此类工作的强大工具。下面的代码先按月查找行组,再在每个月内按天分组,构建嵌套数据结构。完成后,您可以按月迭代该结构,并在每个月内按天迭代。
from itertools import groupby

# Sample log contents, one entry per list element.
data = """\
Month #1
1465465464555
345646546454
442343423433
724342342655
34324233454
24543534533
***Day # 1
5465465465465455
644654654654454
4435423534833
***Day #2
24876867655
74654654454
643876867433
***Day #3
445543534655
344876867854
64365465433
Month #2
7454353455
84756756454
64563453433
***Day # 1
44756756655
34453453454
243867867433
***Day #2
64465465455
74454353454
34878733
***Day #3
1449898955
643434354
843090909888433""".splitlines()
# or data = open(data_file).read().splitlines()


# some simple boolean functions to detect Month and Day marker lines
def is_month_line(s):
    """Return True if line *s* starts with the 'Month' marker."""
    return s.startswith("Month")


def is_day_line(s):
    """Return True if line *s* starts with the '***Day' marker."""
    return s.startswith("***Day")


# Resulting structure:
#   [[month_label, [[day_label, [value, ...]], ...]], ...]
grouped_data = []
for is_month, month_lines in groupby(data, key=is_month_line):
    if is_month:
        # groupby puts consecutive marker lines into a single run, so register
        # every marker in the run instead of only the first one.
        for current_month in month_lines:
            current_month_data = []
            grouped_data.append([current_month, current_month_data])
            # set up blank day for month-level data lines
            current_day = ''
            current_day_data = []
            current_month_data.append([current_day, current_day_data])
    else:
        # found group of non-'Month' lines, group by days
        for is_day, day_lines in groupby(month_lines, key=is_day_line):
            if is_day:
                # day marker(s) detected; drop the leading '***' from the label
                for day_line in day_lines:
                    current_day = day_line[3:]
                    current_day_data = []
                    current_month_data.append([current_day, current_day_data])
            else:
                # all non-day lines, add to current day's data
                current_day_data.extend(day_lines)
使用 pprint
输出嵌套列表:
from pprint import pprint
# Pretty-print the nested month/day list, passing width=120 to pprint.
pprint(grouped_data, width=120)
给出:
[['Month #1',
[['', ['1465465464555', '345646546454', '442343423433', '724342342655', '34324233454', '24543534533']],
['Day # 1', ['5465465465465455', '644654654654454', '4435423534833']],
['Day #2', ['24876867655', '74654654454', '643876867433']],
['Day #3', ['445543534655', '344876867854', '64365465433']]]],
['Month #2',
[['', ['7454353455', '84756756454', '64563453433']],
['Day # 1', ['44756756655', '34453453454', '243867867433']],
['Day #2', ['64465465455', '74454353454', '34878733']],
['Day #3', ['1449898955', '643434354', '843090909888433']]]]]
如果你想沿着嵌套的字典路线走:
# Build nested dicts: log[month_number][day_number] -> list of entries.
# NOTE: an entry line that appears before the first 'Month' line raises
# KeyError, because no month dict exists yet.
month, day = 0, 0
log = {}
with open("log.txt") as f:
    for line in f:
        if 'Month' in line:
            # New month: restart day numbering. Day 0 collects the entries
            # that come before the month's first Day marker.
            month += 1
            day = 0
            log[month] = {0: []}
        elif 'Day' in line:
            day += 1
            log[month][day] = []
        else:
            log[month][day].append(line.strip())
请注意,我假设紧跟在月份行之后的条目是第 0 天。结构现在如下所示:
>>> from pprint import pprint
>>> pprint(log)
{1: {0: ['1465465464555',
'345646546454',
'442343423433',
'724342342655',
'34324233454',
'24543534533'],
1: ['5465465465465455', '644654654654454', '4435423534833'],
2: ['24876867655', '74654654454', '643876867433'],
3: ['445543534655', '344876867854', '64365465433']},
2: {0: ['7454353455', '84756756454', '64563453433'],
1: ['44756756655', '34453453454', '243867867433'],
2: ['64465465455', '74454353454', '34878733'],
3: ['1449898955', '643434354', '843090909888433']}}
您可以通过以下方式对其进行迭代:
# Walk the nested dict in ascending month and day order and print each entry.
for month_index in sorted(log):
    month = log[month_index]
    for day_index in sorted(month):
        day = month[day_index]
        for number in day:
            print(number)
嗯,对于这个问题我们几乎没有答案。
这是我的贡献,我使用一些递归解决方案解决了这个问题。所以,换个思路:
def loop(stopParam, startArr, resultArr=None):
    """Recursively collect leading items of startArr until a stop marker.

    Args:
        stopParam: substring that marks the item at which to stop.
        startArr: list of strings still to be examined.
        resultArr: items collected so far; a new list is used when omitted.

    Returns:
        A tuple (collected, remaining): `collected` holds resultArr plus every
        leading item of startArr that does not contain stopParam; `remaining`
        is the rest of startArr, starting at the first item containing
        stopParam (empty if there is none).

    Note:
        One recursive call is made per collected item, so very long runs can
        hit Python's recursion limit.
    """
    if resultArr is None:
        # A fresh list per call: the base case returns this object to the
        # caller, so a shared default list could be polluted between calls.
        resultArr = []
    if not startArr or stopParam in startArr[0]:
        return (resultArr, startArr)
    return loop(stopParam, startArr[1:], resultArr + [startArr[0]])
def buildList(arr, testVal=None):
    """Recursively map each 'Month' header line to the lines that follow it.

    Args:
        arr: list of log lines; processing continues only while the first
            remaining line contains 'Month'.
        testVal: dict to add the results to; a new dict is used when omitted.

    Returns:
        The dict mapping each month header line (exactly as it appears in
        arr) to the list of lines up to the next line containing 'Month'.
    """
    if testVal is None:
        # A fresh dict per call: the original shared default dict was mutated,
        # so results accumulated across separate calls.
        testVal = {}
    if arr and 'Month' in arr[0]:
        # res[0]: lines belonging to this month; res[1]: remaining lines.
        res = loop('Month', arr[1:])
        testVal[arr[0]] = res[0]
        return buildList(res[1], testVal)
    return testVal
# Read every line of test.txt and print the month -> lines mapping.
with open("test.txt", "r") as in_file:
    righe = in_file.readlines()
# print() function form, since the question targets Python 3.
print(buildList(righe))
这是一个解决方案。