Python:如何减少内存消耗,让这段代码更高效?
Python: how can I reduce memory consumption and make this code more efficient?
我写了这段代码(它有效 - 我在小批量的 MBOX 文件上试过)。但是,当我在一个大小为 2.9 GB 的大约 50,000 封邮件的 MBOX 文件上尝试时,内存消耗猛增,导致计算机无法使用。这段代码在内存消耗方面有什么问题,有没有办法解决它,比如让代码按增量而不是整体处理?
此脚本的目标是生成一个 CSV 文件,其中 X 为日期,Y 为该日期收到的邮件数量,以便据此绘图,对邮件进行统计展示。
未来:我计划扩展它,读取邮件内容并按时间顺序输出到 PDF,因此需要对邮件进行排序(排序时内存消耗猛增)。
import mailbox
from email.utils import parsedate
from dateutil.parser import parse
import itertools
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
import csv
from itertools import izip
# Location of the mbox file that the rest of the script reads.
path = 'mail.mbox'
# Open the mailbox stored at that path.
mbox = mailbox.mbox(path)
def extract_date(email):
    """Return the parsed ``Date`` header of *email*, for use as a sort key.

    ``email`` is any object with a mapping-style ``get`` method (for
    example a message taken from a ``mailbox``).  The header text is handed
    to ``email.utils.parsedate`` and that function's result is returned
    unchanged.
    """
    date = email.get('Date')
    return parsedate(date)
# Sort the messages by their Date header and write them back to the mbox.
# NOTE: sorted() holds every message of the mailbox in memory at once; on a
# multi-gigabyte mailbox this is the line where memory consumption spikes.
sorted_mails = sorted(mbox, key=extract_date)
mbox.update(enumerate(sorted_mails))
mbox.flush()

# Re-open the mailbox and collect the date part of every message's Date
# header (the text before the first space of str(parse(...))).
all_dates = []
mbox = mailbox.mbox(path)
for message in mbox:
    all_dates.append(str(parse(message['date'])).split(' ')[0])

# Count the number of emails per date.  groupby() only merges *adjacent*
# equal values, so this relies on the mailbox having been sorted above.
# The group is counted lazily instead of being copied into a list first.
email_count = [(day, sum(1 for _ in group))
               for day, group in itertools.groupby(all_dates)]

# Split the (date, count) pairs into the x and y series used for plotting.
x = [day for day, _ in email_count]
y = [count for _, count in email_count]

# Produce a CSV file of x and y, for plotting.
with open('data.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(izip(x, y))

py.iplot(Data([Scatter(x=x, y=y)]))
我对这些库不是很熟悉,但我认为主要问题是您使用这一行将所有消息读入内存:
sorted_mails = sorted(mbox, key=extract_date)
这个脚本的目标是什么?你真的需要排序吗?如果你只需要生成一个包含每个日期邮件数量的 CSV,请尝试以下代码:
import mailbox
from email.utils import parsedate
from dateutil.parser import parse
import itertools
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
import csv
from itertools import izip
path = 'mail.mbox'
mbox = mailbox.mbox(path)

# Map each date to the number of emails seen on that date.  Messages are
# handled one at a time and only this dict of counts is kept, so no list
# of messages is ever built and no sorting of the mailbox is needed.
date_counts = {}
for message in mbox:
    header = message['date']
    if header is None:
        # No Date header: there is no date to count this message under.
        continue
    date = str(parse(header)).split(' ')[0]
    date_counts[date] = date_counts.get(date, 0) + 1

# Write one (date, count) row per date.  The keys are sorted so the rows
# come out in a deterministic order instead of arbitrary dict order.
with open('data.csv', 'wb') as f:
    writer = csv.writer(f)
    for date in sorted(date_counts):
        writer.writerow([date, date_counts[date]])
我写了这段代码(它可以正常运行——我在小批量的 MBOX 文件上试过)。但是,当我在一个大小为 2.9 GB、包含大约 50,000 封邮件的 MBOX 文件上尝试时,内存消耗猛增,导致计算机无法使用。这段代码在内存消耗方面有什么问题?有没有办法解决它,比如让代码按增量而不是整体处理? 此脚本的目标是生成一个 CSV 文件,其中 X 为日期,Y 为该日期收到的邮件数量,以便据此绘图,对邮件进行统计展示。 未来:我计划扩展它,读取邮件内容并按时间顺序输出到 PDF,因此需要对邮件进行排序(排序时内存消耗猛增)。
import mailbox
from email.utils import parsedate
from dateutil.parser import parse
import itertools
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
import csv
from itertools import izip
# Location of the mbox file that the rest of the script reads.
path = 'mail.mbox'
# Open the mailbox stored at that path.
mbox = mailbox.mbox(path)
def extract_date(email):
    """Return the parsed ``Date`` header of *email*, for use as a sort key.

    ``email`` is any object with a mapping-style ``get`` method (for
    example a message taken from a ``mailbox``).  The header text is handed
    to ``email.utils.parsedate`` and that function's result is returned
    unchanged.
    """
    date = email.get('Date')
    return parsedate(date)
# Sort the messages by their Date header and write them back to the mbox.
# NOTE: sorted() holds every message of the mailbox in memory at once; on a
# multi-gigabyte mailbox this is the line where memory consumption spikes.
sorted_mails = sorted(mbox, key=extract_date)
mbox.update(enumerate(sorted_mails))
mbox.flush()

# Re-open the mailbox and collect the date part of every message's Date
# header (the text before the first space of str(parse(...))).
all_dates = []
mbox = mailbox.mbox(path)
for message in mbox:
    all_dates.append(str(parse(message['date'])).split(' ')[0])

# Count the number of emails per date.  groupby() only merges *adjacent*
# equal values, so this relies on the mailbox having been sorted above.
# The group is counted lazily instead of being copied into a list first.
email_count = [(day, sum(1 for _ in group))
               for day, group in itertools.groupby(all_dates)]

# Split the (date, count) pairs into the x and y series used for plotting.
x = [day for day, _ in email_count]
y = [count for _, count in email_count]

# Produce a CSV file of x and y, for plotting.
with open('data.csv', 'wb') as f:
    writer = csv.writer(f)
    writer.writerows(izip(x, y))

py.iplot(Data([Scatter(x=x, y=y)]))
我对这些库不是很熟悉,但我认为主要问题是您使用这一行将所有消息读入内存:
sorted_mails = sorted(mbox, key=extract_date)
这个脚本的目标是什么?你真的需要排序吗?如果你只需要生成一个包含每个日期邮件数量的 CSV,请尝试以下代码:
import mailbox
from email.utils import parsedate
from dateutil.parser import parse
import itertools
import plotly.plotly as py
from plotly.graph_objs import *
import plotly.tools as tls
import csv
from itertools import izip
path = 'mail.mbox'
mbox = mailbox.mbox(path)

# Map each date to the number of emails seen on that date.  Messages are
# handled one at a time and only this dict of counts is kept, so no list
# of messages is ever built and no sorting of the mailbox is needed.
date_counts = {}
for message in mbox:
    header = message['date']
    if header is None:
        # No Date header: there is no date to count this message under.
        continue
    date = str(parse(header)).split(' ')[0]
    date_counts[date] = date_counts.get(date, 0) + 1

# Write one (date, count) row per date.  The keys are sorted so the rows
# come out in a deterministic order instead of arbitrary dict order.
with open('data.csv', 'wb') as f:
    writer = csv.writer(f)
    for date in sorted(date_counts):
        writer.writerow([date, date_counts[date]])