使用 Python 读取大型 .mbox 文件
Read a big .mbox file with Python
我想读取一个来自 Gmail 备份的 3GB 大型 .mbox 文件。下面的代码可以运行:
import mailbox

# Preview the sender, subject and decoded body of the first messages in a
# Gmail mbox export using the standard-library mailbox module.
mbox = mailbox.mbox(r"D:\All mail Including Spam and Trash.mbox")
for i, message in enumerate(mbox):
    print("from :", message['from'])
    print("subject:", message['subject'])
    if message.is_multipart():
        # get_payload(decode=True) yields bytes for leaf parts and None for
        # parts that are themselves multipart, so join as bytes (joining with
        # a str separator raises TypeError on Python 3) and skip the None
        # entries.
        decoded_parts = (
            part.get_payload(decode=True) for part in message.get_payload()
        )
        content = b''.join(
            chunk for chunk in decoded_parts if chunk is not None
        )
    else:
        content = message.get_payload(decode=True)
    print("content:", content)
    print("**************************************")
    # Stop once the message with index 10 has been printed.
    if i == 10:
        break
只是仅读取前 10 条消息就需要 40 多秒。
有没有更快的方法来访问 Python 的大型 .mbox 文件?
这是一个快速而粗糙的尝试:实现一个生成器,逐条消息地读取 mbox
文件。我选择直接丢弃 From
分隔行中的信息;我猜真正的 mailbox
库或许能提供更多信息。当然,这个实现只支持读取,不支持搜索或写回输入文件。
#!/usr/bin/env python3
import email
from email.policy import default
class MboxReader:
    """Read an mbox file sequentially, one message at a time.

    The file is opened in binary mode and consumed line by line. Any line
    that starts with ``b'From '`` is treated as a message separator and is
    discarded; the lines between two separators are parsed with
    ``email.message_from_bytes`` using the ``default`` policy.

    The reader is read-only and forward-only. It is both a context manager
    (closing the file on exit) and an iterator over the parsed messages.

    Raises:
        ValueError: if the first line of the file does not start with
            ``b'From '`` (this includes an empty file).
    """

    def __init__(self, filename):
        """Open *filename* and consume its leading ``From `` separator."""
        self.handle = open(filename, 'rb')
        self._exhausted = False
        # Read the first line unconditionally. Doing this inside an
        # ``assert`` would skip the read under ``python -O`` and leave the
        # reader misaligned with the first message.
        try:
            first_line = self.handle.readline()
        except Exception:
            self.handle.close()
            raise
        if not first_line.startswith(b'From '):
            # Do not leak the open handle when the file is rejected.
            self.handle.close()
            raise ValueError(
                "not an mbox file: first line does not start with 'From '"
            )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next parsed message, or raise StopIteration at the end."""
        if self._exhausted:
            raise StopIteration
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'':
                # End of file: the lines collected so far form the last
                # message, and no further messages follow.
                self._exhausted = True
                break
            if line.startswith(b'From '):
                # Separator of the following message; its content is dropped.
                break
            lines.append(line)
        return email.message_from_bytes(b''.join(lines), policy=default)
用法:
# Print every message in the mbox file as text; the reader is closed when
# the with-block exits.
with MboxReader(mboxfilename) as reader:
    for msg in reader:
        text = msg.as_string()
        print(text)
使用上面的 MboxReader 类,您可以通过任意键(邮件头字段名)从邮件对象中获取特定信息。然后可以生成文本文件,以便进一步分析您的邮箱。
from tqdm import tqdm

# Write the From header of every message in the mbox file to Output.txt,
# one per line, with double quotes removed.
path = "your_gmail.mbox"
# Open the reader in the with-statement so its file handle is closed when the
# export finishes or fails, instead of staying open for the rest of the run.
with MboxReader(path) as mbox, \
        open('Output.txt', 'w', encoding="utf-8") as file:
    for idx, message in tqdm(enumerate(mbox)):
        # Call message.keys() here to list the header names of a message.
        mail_from = f"{str(message['From'])}\n".replace('"', '')
        file.write(mail_from)
        print(idx, message['From'])
可以使用以下键(邮件头字段名),列在这里供参考:
['X-GM-THRID', 'X-Gmail-Labels', 'Delivered-To', 'Received', 'X-Received',
'ARC-Seal', 'ARC-Message-Signature', 'ARC-Authentication-Results',
'Return-Path', 'Received', 'Received-SPF', 'Authentication-Results',
'DKIM-Signature', 'X-Google-DKIM-Signature', 'X-Gm-Message-State',
'X-Google-Smtp-Source', 'MIME-Version', 'X-Received', 'Date', 'Reply-To',
'X-Google-Id', 'Precedence', 'List-Unsubscribe', 'Feedback-ID', 'List-Id',
'X-Notifications', 'X-Notifications-Bounce-Info', 'Message-ID', 'Subject',
'From', 'To', 'Content-Type']
希望有用:)
我想读取一个来自 Gmail 备份的 3GB 大型 .mbox 文件。下面的代码可以运行:
import mailbox

# Preview the sender, subject and decoded body of the first messages in a
# Gmail mbox export using the standard-library mailbox module.
mbox = mailbox.mbox(r"D:\All mail Including Spam and Trash.mbox")
for i, message in enumerate(mbox):
    print("from :", message['from'])
    print("subject:", message['subject'])
    if message.is_multipart():
        # get_payload(decode=True) yields bytes for leaf parts and None for
        # parts that are themselves multipart, so join as bytes (joining with
        # a str separator raises TypeError on Python 3) and skip the None
        # entries.
        decoded_parts = (
            part.get_payload(decode=True) for part in message.get_payload()
        )
        content = b''.join(
            chunk for chunk in decoded_parts if chunk is not None
        )
    else:
        content = message.get_payload(decode=True)
    print("content:", content)
    print("**************************************")
    # Stop once the message with index 10 has been printed.
    if i == 10:
        break
只是仅读取前 10 条消息就需要 40 多秒。
有没有更快的方法来访问 Python 的大型 .mbox 文件?
这是一个快速而粗糙的尝试:实现一个生成器,逐条消息地读取 mbox
文件。我选择直接丢弃 From
分隔行中的信息;我猜真正的 mailbox
库或许能提供更多信息。当然,这个实现只支持读取,不支持搜索或写回输入文件。
#!/usr/bin/env python3
import email
from email.policy import default
class MboxReader:
    """Read an mbox file sequentially, one message at a time.

    The file is opened in binary mode and consumed line by line. Any line
    that starts with ``b'From '`` is treated as a message separator and is
    discarded; the lines between two separators are parsed with
    ``email.message_from_bytes`` using the ``default`` policy.

    The reader is read-only and forward-only. It is both a context manager
    (closing the file on exit) and an iterator over the parsed messages.

    Raises:
        ValueError: if the first line of the file does not start with
            ``b'From '`` (this includes an empty file).
    """

    def __init__(self, filename):
        """Open *filename* and consume its leading ``From `` separator."""
        self.handle = open(filename, 'rb')
        self._exhausted = False
        # Read the first line unconditionally. Doing this inside an
        # ``assert`` would skip the read under ``python -O`` and leave the
        # reader misaligned with the first message.
        try:
            first_line = self.handle.readline()
        except Exception:
            self.handle.close()
            raise
        if not first_line.startswith(b'From '):
            # Do not leak the open handle when the file is rejected.
            self.handle.close()
            raise ValueError(
                "not an mbox file: first line does not start with 'From '"
            )

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_traceback):
        self.handle.close()

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next parsed message, or raise StopIteration at the end."""
        if self._exhausted:
            raise StopIteration
        lines = []
        while True:
            line = self.handle.readline()
            if line == b'':
                # End of file: the lines collected so far form the last
                # message, and no further messages follow.
                self._exhausted = True
                break
            if line.startswith(b'From '):
                # Separator of the following message; its content is dropped.
                break
            lines.append(line)
        return email.message_from_bytes(b''.join(lines), policy=default)
用法:
# Print every message in the mbox file as text; the reader is closed when
# the with-block exits.
with MboxReader(mboxfilename) as reader:
    for msg in reader:
        text = msg.as_string()
        print(text)
使用上面的 MboxReader 类,您可以通过任意键(邮件头字段名)从邮件对象中获取特定信息:
from tqdm import tqdm

# Write the From header of every message in the mbox file to Output.txt,
# one per line, with double quotes removed.
path = "your_gmail.mbox"
# Open the reader in the with-statement so its file handle is closed when the
# export finishes or fails, instead of staying open for the rest of the run.
with MboxReader(path) as mbox, \
        open('Output.txt', 'w', encoding="utf-8") as file:
    for idx, message in tqdm(enumerate(mbox)):
        # Call message.keys() here to list the header names of a message.
        mail_from = f"{str(message['From'])}\n".replace('"', '')
        file.write(mail_from)
        print(idx, message['From'])
可以使用以下键(邮件头字段名),列在这里供参考:
['X-GM-THRID', 'X-Gmail-Labels', 'Delivered-To', 'Received', 'X-Received',
'ARC-Seal', 'ARC-Message-Signature', 'ARC-Authentication-Results',
'Return-Path', 'Received', 'Received-SPF', 'Authentication-Results',
'DKIM-Signature', 'X-Google-DKIM-Signature', 'X-Gm-Message-State',
'X-Google-Smtp-Source', 'MIME-Version', 'X-Received', 'Date', 'Reply-To',
'X-Google-Id', 'Precedence', 'List-Unsubscribe', 'Feedback-ID', 'List-Id',
'X-Notifications', 'X-Notifications-Bounce-Info', 'Message-ID', 'Subject',
'From', 'To', 'Content-Type']
希望有用:)