Python:如何只按日期(忽略时间部分)统计日志行数
Python: How to count by date without timestamp
这是我的数据格式:
[Mon May 02 15:38:50 2016] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
这是我的代码,我试图按日期显示行数:
# datecount.py
"""Count the lines of an Apache-style error log per year.

Usage: python datecount [FILE]
Reads standard input when FILE is omitted.
"""
import collections
import sys

# Offset of the year inside "[Mon May 02 15:38:50 2016]",
# measured from the opening "[".
YEAR_START = 21


def count_dates(lines):
    """Return a Counter mapping each line's year to the number of lines.

    Lines that contain no "[" or no matching "]" are skipped.
    """
    date_counts = collections.Counter()
    # for every line passed into the script
    for line in lines:
        # find indices of the date section
        start = line.find("[")
        if start < 0:
            continue
        end = line.find("]", start)
        if end < 0:
            # find() returned -1; slicing up to -1 would produce a bogus key
            continue
        # grab just the date
        date = line[start + YEAR_START:end]  # by YEAR
        date_counts[date] += 1
    return date_counts


def main(argv):
    """Run the script with sys.argv-style arguments and print the counts."""
    # argv[0] is the name of the program itself
    # argv[1] is optional and will be the file name
    # set input based on number of arguments
    if len(argv) == 1:
        date_counts = count_dates(sys.stdin)
    elif len(argv) == 2:
        try:
            f = open(argv[1])
        except IOError:
            # single-argument print(...) is valid on both Python 2 and 3
            print("Cannot open " + argv[1])
            sys.exit()
        with f:  # make sure the log file is closed again
            date_counts = count_dates(f)
    else:
        print("USAGE: python datecount [FILE]")
        sys.exit()
    # print top dates
    for entry in date_counts.most_common():
        sys.stdout.write(str(entry) + "\n")


if __name__ == "__main__":
    main(sys.argv)
现在输出是:
('2017', 738057)
('2016', 446204)
('2015', 9995)
('2014', 706)
但我只想按日期计算,例如:
('May 02 2016', 128)
('May 03 2016', 105)
('May 04 2016', 99)
我正在考虑用正则表达式来实现,但不知道该怎么写。
如何去掉日期中间的时间部分(时:分:秒)?
我们可以使用下面的代码得到预期的结果。希望对您有所帮助。
# datecount.py
"""Count the lines of an Apache-style error log per calendar date.

Usage: python datecount [FILE]
Reads standard input when FILE is omitted.
"""
import collections
import sys

# Offsets inside "[Mon May 02 15:38:50 2016]", measured from the opening "[".
MONTH_DAY_START = 5   # first character of "May 02"
MONTH_DAY_END = 11    # one past the last character of "May 02"
YEAR_START = 21       # first character of "2016"


def count_dates(lines):
    """Return a Counter mapping "May 02 2016"-style dates to line counts.

    The time of day between the day and the year is dropped. Lines that
    contain no "[" or no matching "]" are skipped.
    """
    date_counts = collections.Counter()
    # for every line passed into the script
    for line in lines:
        # find indices of the date section
        start = line.find("[")
        if start < 0:
            continue
        end = line.find("]", start)
        if end < 0:
            # find() returned -1; slicing up to -1 would produce a bogus key
            continue
        # grab just the date; both slice bounds are relative to `start`
        # so a "[" that is not at column 0 still yields the right text
        month_day = line[start + MONTH_DAY_START:start + MONTH_DAY_END]
        year = line[start + YEAR_START:end]
        date_counts[month_day + " " + year] += 1  # by Date and YEAR
    return date_counts


def main(argv):
    """Run the script with sys.argv-style arguments and print the counts."""
    # argv[0] is the name of the program itself
    # argv[1] is optional and will be the file name
    # set input based on number of arguments
    if len(argv) == 1:
        date_counts = count_dates(sys.stdin)
    elif len(argv) == 2:
        try:
            f = open(argv[1])
        except IOError:
            # single-argument print(...) is valid on both Python 2 and 3
            print("Cannot open " + argv[1])
            sys.exit()
        with f:  # make sure the log file is closed again
            date_counts = count_dates(f)
    else:
        print("USAGE: python datecount [FILE]")
        sys.exit()
    # print top dates
    for entry in date_counts.most_common():
        sys.stdout.write(str(entry) + "\n")


if __name__ == "__main__":
    main(sys.argv)
使用正则表达式实现:
import collections
import re
import sys

# Matches the "[Mon May 02 15:38:50 2016]" stamp at the start of a line and
# captures only the "May 02" and "2016" parts; the weekday and the time of
# day are matched but not captured, so they never reach the counter.
DATE_RE = re.compile(
    r"^\[\w{3} (\w{3} \d{1,2}) [\d:.]+ (\d{4})\]", re.MULTILINE
)

input_str = """
[Mon May 02 15:38:50 2016] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
[Mon May 03 15:38:50 2017] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
[Mon May 02 15:38:50 2016] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
"""

# findall() yields (month_day, year) tuples; join them into one date key
dateCounts = collections.Counter(
    month_day + " " + year for month_day, year in DATE_RE.findall(input_str)
)
for date in dateCounts.most_common():
    sys.stdout.write(str(date) + "\n")

# Output:
# ('May 02 2016', 2)
# ('May 03 2017', 1)
这是我的数据格式:
[Mon May 02 15:38:50 2016] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
这是我的代码,我试图按日期显示行数:
# datecount.py
"""Count the lines of an Apache-style error log per year.

Usage: python datecount [FILE]
Reads standard input when FILE is omitted.
"""
import collections
import sys

# Offset of the year inside "[Mon May 02 15:38:50 2016]",
# measured from the opening "[".
YEAR_START = 21


def count_dates(lines):
    """Return a Counter mapping each line's year to the number of lines.

    Lines that contain no "[" or no matching "]" are skipped.
    """
    date_counts = collections.Counter()
    # for every line passed into the script
    for line in lines:
        # find indices of the date section
        start = line.find("[")
        if start < 0:
            continue
        end = line.find("]", start)
        if end < 0:
            # find() returned -1; slicing up to -1 would produce a bogus key
            continue
        # grab just the date
        date = line[start + YEAR_START:end]  # by YEAR
        date_counts[date] += 1
    return date_counts


def main(argv):
    """Run the script with sys.argv-style arguments and print the counts."""
    # argv[0] is the name of the program itself
    # argv[1] is optional and will be the file name
    # set input based on number of arguments
    if len(argv) == 1:
        date_counts = count_dates(sys.stdin)
    elif len(argv) == 2:
        try:
            f = open(argv[1])
        except IOError:
            # single-argument print(...) is valid on both Python 2 and 3
            print("Cannot open " + argv[1])
            sys.exit()
        with f:  # make sure the log file is closed again
            date_counts = count_dates(f)
    else:
        print("USAGE: python datecount [FILE]")
        sys.exit()
    # print top dates
    for entry in date_counts.most_common():
        sys.stdout.write(str(entry) + "\n")


if __name__ == "__main__":
    main(sys.argv)
现在输出是:
('2017', 738057)
('2016', 446204)
('2015', 9995)
('2014', 706)
但我只想按日期计算,例如:
('May 02 2016', 128)
('May 03 2016', 105)
('May 04 2016', 99)
我正在考虑用正则表达式来实现,但不知道该怎么写。
如何去掉日期中间的时间部分(时:分:秒)?
我们可以使用下面的代码得到预期的结果。希望对您有所帮助。
# datecount.py
"""Count the lines of an Apache-style error log per calendar date.

Usage: python datecount [FILE]
Reads standard input when FILE is omitted.
"""
import collections
import sys

# Offsets inside "[Mon May 02 15:38:50 2016]", measured from the opening "[".
MONTH_DAY_START = 5   # first character of "May 02"
MONTH_DAY_END = 11    # one past the last character of "May 02"
YEAR_START = 21       # first character of "2016"


def count_dates(lines):
    """Return a Counter mapping "May 02 2016"-style dates to line counts.

    The time of day between the day and the year is dropped. Lines that
    contain no "[" or no matching "]" are skipped.
    """
    date_counts = collections.Counter()
    # for every line passed into the script
    for line in lines:
        # find indices of the date section
        start = line.find("[")
        if start < 0:
            continue
        end = line.find("]", start)
        if end < 0:
            # find() returned -1; slicing up to -1 would produce a bogus key
            continue
        # grab just the date; both slice bounds are relative to `start`
        # so a "[" that is not at column 0 still yields the right text
        month_day = line[start + MONTH_DAY_START:start + MONTH_DAY_END]
        year = line[start + YEAR_START:end]
        date_counts[month_day + " " + year] += 1  # by Date and YEAR
    return date_counts


def main(argv):
    """Run the script with sys.argv-style arguments and print the counts."""
    # argv[0] is the name of the program itself
    # argv[1] is optional and will be the file name
    # set input based on number of arguments
    if len(argv) == 1:
        date_counts = count_dates(sys.stdin)
    elif len(argv) == 2:
        try:
            f = open(argv[1])
        except IOError:
            # single-argument print(...) is valid on both Python 2 and 3
            print("Cannot open " + argv[1])
            sys.exit()
        with f:  # make sure the log file is closed again
            date_counts = count_dates(f)
    else:
        print("USAGE: python datecount [FILE]")
        sys.exit()
    # print top dates
    for entry in date_counts.most_common():
        sys.stdout.write(str(entry) + "\n")


if __name__ == "__main__":
    main(sys.argv)
使用正则表达式实现:
import collections
import re
import sys

# Matches the "[Mon May 02 15:38:50 2016]" stamp at the start of a line and
# captures only the "May 02" and "2016" parts; the weekday and the time of
# day are matched but not captured, so they never reach the counter.
DATE_RE = re.compile(
    r"^\[\w{3} (\w{3} \d{1,2}) [\d:.]+ (\d{4})\]", re.MULTILINE
)

input_str = """
[Mon May 02 15:38:50 2016] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
[Mon May 03 15:38:50 2017] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
[Mon May 02 15:38:50 2016] [error] [client XX.XX.XX.XX] File does not exist: /home/XXX/XXXX/XXX/XXX/XXX.shtml
"""

# findall() yields (month_day, year) tuples; join them into one date key
dateCounts = collections.Counter(
    month_day + " " + year for month_day, year in DATE_RE.findall(input_str)
)
for date in dateCounts.most_common():
    sys.stdout.write(str(date) + "\n")

# Output:
# ('May 02 2016', 2)
# ('May 03 2017', 1)