如何解析 eml 文件并提取元数据信息

How to parse an eml file and extract metadata information

我有一个带有一些附件的 eml 文件。我想读取 eml 文件中的文本内容,并提取元数据信息,如(发件人、收件人、抄送、密件抄送、主题)。我也想下载附件。在以下代码的帮助下,我只能提取电子邮件正文中的信息/文本内容。

import email
from email import policy
from email.parser import BytesParser
import glob
file_list = glob.glob('*.eml')  # returns list of files
# Binary mode lets the email parser decode the message itself instead of
# relying on the platform's default text encoding.
with open(file_list[2], 'rb') as fp:  # select a specific email file from the list
    msg = BytesParser(policy=policy.default).parse(fp)
# preferencelist must be a sequence of subtypes; ('plain') without the
# trailing comma is just the string 'plain', so spell the tuple explicitly.
body = msg.get_body(preferencelist=('plain',))
# get_body() returns None when the message has no matching body part.
text = body.get_content() if body is not None else ''
print(text)

有一个可用于 Python 2 的模块 emaildata 可以完成这项工作。

提取元数据信息

import email
from emaildata.metadata import MetaData

# Parse the message inside a context manager so the file handle is closed
# as soon as parsing is done.
with open('message.eml') as eml_file:
    message = email.message_from_file(eml_file)
extractor = MetaData(message)
data = extractor.to_dict()
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(data.keys())

提取附件信息

import email
from emaildata.attachment import Attachment

# Parse the message inside a context manager so the file handle is closed
# as soon as parsing is done.
with open('message.eml') as eml_file:
    message = email.message_from_file(eml_file)
# The fourth item is named `nested` so it does not rebind the parsed
# `message` that is being iterated.
for content, filename, mimetype, nested in Attachment.extract(message):
    print(filename)
    # Attachments are written in binary mode so no newline translation is
    # applied to the payload.
    # NOTE(review): assumes `content` is a byte string - confirm against emaildata.
    with open(filename, 'wb') as stream:
        stream.write(content)
    # If nested is not None then it is an instance of email.message.Message
    if nested:
        print("The file {0} is a message with attachments.".format(filename))

但是这个库现在已经被弃用,无法使用了。是否有任何其他库可以提取元数据和附件相关信息?

可以在 Python 3.x 中使用以下代码访问元数据(meta-data)信息:
from email import policy
from email.parser import BytesParser
# Build the parser once, then feed it the raw bytes of the .eml file.
parser = BytesParser(policy=policy.default)
with open(eml_file, 'rb') as fp:
    msg = parser.parse(fp)

# Print the addressing and subject headers, one "Label: value" pair per line.
for label, header in (('To:', 'to'), ('From:', 'from'), ('Subject:', 'subject')):
    print(label, msg[header])

其余的 header 信息可以通过 msg.keys() 访问。

要从 eml 文件下载附件,您可以使用以下代码:

import sys
import os
import os.path
from collections import defaultdict
from email.parser import Parser

# Path of the .eml file to process; replace the placeholder with a real path.
eml_mail = 'your eml file'
# Directory the attachments are written to; replace the placeholder with a real path.
output_dir = 'mention the directory where you want the files to be download'

def parse_message(filename):
    """
    Parse the .eml file at ``filename`` and return the message object.

    The file is opened in binary mode and handed to ``BytesParser`` so that
    messages containing 8-bit or non-UTF-8 content do not raise
    ``UnicodeDecodeError`` while being read, as they could when decoded with
    the platform's default text encoding.

    Args:
        filename: Path of the .eml file to read.

    Returns:
        The message object produced by ``BytesParser().parse``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Local import keeps this function self-contained.
    from email.parser import BytesParser

    with open(filename, 'rb') as f:
        return BytesParser().parse(f)

def find_attachments(message):
    """
    Return a list of ``(params, part)`` tuples, one per attachment found.

    Every part yielded by ``message.walk()`` whose Content-Disposition is
    ``attachment`` is reported. ``params`` is a dict of the remaining
    Content-Disposition parameters (for example ``filename``) with
    surrounding quotes removed; ``part`` is the message part itself.

    The header is parsed with the standard library rather than by splitting
    on ``;`` and ``=``, so values containing ``=`` or ``;``, a trailing
    ``;`` and RFC 2231 encoded parameters no longer raise ``ValueError``.

    Args:
        message: A parsed email message object.

    Returns:
        A list of ``(dict, part)`` tuples; empty when there are no attachments.
    """
    from email.utils import collapse_rfc2231_value

    found = []
    for part in message.walk():
        # None when the header is absent, otherwise the lower-cased type.
        if part.get_content_disposition() != 'attachment':
            continue
        params = part.get_params(header='content-disposition') or []
        parsed = {}
        # The first entry is the disposition type itself, not a parameter.
        for key, val in params[1:]:
            if not key:
                # An empty name is what a trailing ';' produces.
                continue
            # Turns RFC 2231 (charset, language, value) triples into text
            # and unquotes plain values.
            val = collapse_rfc2231_value(val)
            # Non-standard single-quoted values are still unwrapped.
            if len(val) >= 2 and val.startswith("'") and val.endswith("'"):
                val = val[1:-1]
            parsed[key] = val
        found.append((parsed, part))
    return found

def run(eml_filename, output_dir):
    """
    Extract every attachment of ``eml_filename`` into ``output_dir``.

    The message is parsed with ``parse_message`` and its attachments are
    located with ``find_attachments``. ``output_dir`` is created, including
    missing parent directories, when it does not exist yet.

    Attachment names come from the email and are untrusted, so only the
    final path component is used; a name such as ``../../x`` therefore
    cannot escape ``output_dir``. Attachments with no usable filename are
    written as ``attachment-<n>``, and attachments without a decodable
    payload are skipped.

    Args:
        eml_filename: Path of the .eml file to read.
        output_dir: Directory the attachments are written to.
    """
    msg = parse_message(eml_filename)
    attachments = find_attachments(msg)
    print("Found {0} attachments...".format(len(attachments)))
    os.makedirs(output_dir, exist_ok=True)
    for index, (cdisp, part) in enumerate(attachments, start=1):
        data = part.get_payload(decode=True)
        if data is None:
            # get_payload(decode=True) returns None for multipart parts,
            # so there is no single byte string to write.
            print("Skipping attachment {0}: no decodable payload".format(index))
            continue
        raw_name = cdisp.get('filename', '')
        # Treat backslashes as separators too, then keep only the last
        # component so the file always lands directly inside output_dir.
        safe_name = os.path.basename(raw_name.replace('\\', '/'))
        if safe_name in ('', '.', '..'):
            safe_name = 'attachment-{0}'.format(index)
        towrite = os.path.join(output_dir, safe_name)
        print("Writing " + towrite)
        with open(towrite, 'wb') as fp:
            fp.write(data)


# Extract the attachments of eml_mail into output_dir.
run(eml_mail, output_dir)

可以看看 ParsEML,它可以批量提取某个目录下所有 eml 文件中的附件(最初来自 Stephan Hügel)。我还使用了 MeIOC 的一个修改版本,以 json 格式轻松提取所有元数据;如果你需要,我可以分享给你。