在 Python 中从电子邮件里提取本身为 MSG 类型的附件
Extract Attachment which itself is of type MSG from an EMAIL in python
我需要用 Python 从电子邮件中提取 MSG 类型的附件,并将该附件保存到某个位置。
我编写的脚本几乎适用于所有类型的文件,唯独不适用于 Outlook 邮件项(即作为附件的邮件)。
def parse_attachment(message_part):
    """Build an attachment record from one MIME part, or return None.

    A part is treated as an attachment when its ``Content-Disposition``
    header starts with ``attachment`` or ``inline`` (case-insensitive,
    surrounding whitespace ignored).

    Args:
        message_part: Object offering ``get``, ``get_payload`` and
            ``get_content_type`` (for example ``email.message.Message``).

    Returns:
        ``None`` when the header is missing or names another disposition.
        Otherwise a dict with ``data``, ``content_type`` and ``size``, plus
        ``name``, ``creation-date`` and ``modification-date`` when the header
        carries the matching parameters. A ``size`` parameter in the header
        replaces the computed length and is stored as the raw string.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if not content_disposition:
        return None

    dispositions = content_disposition.strip().split(";")
    # Strip the token so "attachment ; filename=..." is still recognised.
    if dispositions[0].strip().lower() not in ("attachment", "inline"):
        return None

    file_data = message_part.get_payload(decode=True)
    debug(message_part)
    attachment = {
        'data': file_data,
        'content_type': message_part.get_content_type(),
        # get_payload(decode=True) gives None for multipart containers, so
        # guard the len() call instead of raising TypeError.
        'size': len(file_data) if file_data is not None else 0,
    }

    # Header parameter name -> key used in the returned dict.
    recognised = {
        "filename": 'name',
        "creation-date": 'creation-date',
        "modification-date": 'modification-date',
        "size": 'size',
    }
    # NOTE(review): splitting on ";" mis-parses quoted values that contain a
    # semicolon; message_part.get_filename() would be more robust for names.
    for param in dispositions[1:]:
        # partition() tolerates "=" inside the value and the empty piece left
        # by a trailing ";" (both made the old two-way unpacking raise).
        name, separator, value = param.partition("=")
        if not separator:
            continue
        key = recognised.get(name.lower().strip())
        if key is not None:
            attachment[key] = value.strip().strip("\"")
    return attachment
作为附件的电子邮件必须单独处理。但是,如果使用 walk()(一个通用生成器,可按深度优先顺序遍历消息对象树的所有部分和子部分),最终会把被附加邮件的内部各部分也一并解析。
因此,必须改用 get_payload() 逐个获取电子邮件的各个部分。以下是解析电子邮件附件的方法:
def get_subject(msgobj):
    """Return the decoded ``Subject`` header of *msgobj* as one line of text.

    Encoded words are decoded with their declared charset; undecodable bytes
    are replaced rather than raising. Line breaks left over from header
    folding are removed.

    Args:
        msgobj: Mapping-like message whose ``['Subject']`` lookup yields the
            raw header value, or ``None`` when the header is absent.

    Returns:
        The decoded subject, or ``None`` when there is no Subject header.
    """
    raw_subject = msgobj['Subject']
    if raw_subject is None:
        return None

    fragments = []
    for fragment, charset in decode_header(raw_subject):
        # decode_header may hand back bytes (encoded words) or already-decoded
        # text (plain headers); only bytes need decoding.
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or 'ascii', 'replace')
            except LookupError:
                # Unknown or misspelled charset label: keep what we can.
                fragment = fragment.decode('utf-8', 'replace')
        fragments.append(fragment)

    # Folded headers can carry "\r\n"; drop both characters.
    return re.sub(r'[\r\n]', '', ''.join(fragments))
def get_msg_file_as_attachment(message_part):
    """Build an attachment record for an e-mail attached to another e-mail.

    The first payload item of *message_part* is serialised (with its
    ``From `` envelope line) and named after its subject with an ``.eml``
    suffix.

    Args:
        message_part: Part whose ``get_payload()`` returns a sequence with the
            embedded message as its first item.

    Returns:
        A dict with ``data`` (the serialised message), ``content_type``,
        ``name`` and ``size`` (length of ``data``).
    """
    embedded = message_part.get_payload()[0]
    data = embedded.as_string(unixfrom=True)
    content_type = message_part.get_content_type()
    # get_subject returns None when the embedded message has no Subject;
    # fall back to a fixed stem instead of failing on None + '.eml'.
    subject = get_subject(embedded) or 'no_subject'
    return {
        'data': data,
        'content_type': content_type,
        'name': subject + '.eml',
        'size': len(data),
    }
def parse_attachment(message_part):
    """Build an attachment record from one MIME part, or return None.

    A part is treated as an attachment when its ``Content-Disposition``
    header starts with ``attachment`` or ``inline`` (case-insensitive,
    surrounding whitespace ignored). Parts of type ``message/rfc822`` are
    delegated to ``get_msg_file_as_attachment``.

    Args:
        message_part: Object offering ``get``, ``get_content_type``,
            ``get_payload`` and ``get_filename`` (for example
            ``email.message.Message``).

    Returns:
        ``None`` when the header is missing or names another disposition;
        otherwise a dict with ``data``, ``content_type``, ``size`` and
        ``name``.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    content_type = message_part.get_content_type()
    if not content_disposition:
        return None

    # Strip the token so "attachment ; filename=..." is still recognised.
    disposition_kind = content_disposition.strip().split(";")[0].strip().lower()
    if disposition_kind not in ("attachment", "inline"):
        return None

    if content_type.lower().strip() == 'message/rfc822':
        return get_msg_file_as_attachment(message_part)

    file_data = message_part.get_payload(decode=True)
    return {
        'data': file_data,
        'content_type': content_type,
        # get_payload(decode=True) gives None for multipart containers, so
        # guard the len() call instead of raising TypeError.
        'size': len(file_data) if file_data is not None else 0,
        'name': message_part.get_filename(),
    }
我需要用 Python 从电子邮件中提取 MSG 类型的附件,并将该附件保存到某个位置。
我编写的脚本几乎适用于所有类型的文件,唯独不适用于 Outlook 邮件项(即作为附件的邮件)。
def parse_attachment(message_part):
    """Build an attachment record from one MIME part, or return None.

    A part is treated as an attachment when its ``Content-Disposition``
    header starts with ``attachment`` or ``inline`` (case-insensitive,
    surrounding whitespace ignored).

    Args:
        message_part: Object offering ``get``, ``get_payload`` and
            ``get_content_type`` (for example ``email.message.Message``).

    Returns:
        ``None`` when the header is missing or names another disposition.
        Otherwise a dict with ``data``, ``content_type`` and ``size``, plus
        ``name``, ``creation-date`` and ``modification-date`` when the header
        carries the matching parameters. A ``size`` parameter in the header
        replaces the computed length and is stored as the raw string.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    if not content_disposition:
        return None

    dispositions = content_disposition.strip().split(";")
    # Strip the token so "attachment ; filename=..." is still recognised.
    if dispositions[0].strip().lower() not in ("attachment", "inline"):
        return None

    file_data = message_part.get_payload(decode=True)
    debug(message_part)
    attachment = {
        'data': file_data,
        'content_type': message_part.get_content_type(),
        # get_payload(decode=True) gives None for multipart containers, so
        # guard the len() call instead of raising TypeError.
        'size': len(file_data) if file_data is not None else 0,
    }

    # Header parameter name -> key used in the returned dict.
    recognised = {
        "filename": 'name',
        "creation-date": 'creation-date',
        "modification-date": 'modification-date',
        "size": 'size',
    }
    # NOTE(review): splitting on ";" mis-parses quoted values that contain a
    # semicolon; message_part.get_filename() would be more robust for names.
    for param in dispositions[1:]:
        # partition() tolerates "=" inside the value and the empty piece left
        # by a trailing ";" (both made the old two-way unpacking raise).
        name, separator, value = param.partition("=")
        if not separator:
            continue
        key = recognised.get(name.lower().strip())
        if key is not None:
            attachment[key] = value.strip().strip("\"")
    return attachment
作为附件的电子邮件必须单独处理。但是,如果使用 walk()(一个通用生成器,可按深度优先顺序遍历消息对象树的所有部分和子部分),最终会把被附加邮件的内部各部分也一并解析。
因此,必须改用 get_payload() 逐个获取电子邮件的各个部分。以下是解析电子邮件附件的方法:
def get_subject(msgobj):
    """Return the decoded ``Subject`` header of *msgobj* as one line of text.

    Encoded words are decoded with their declared charset; undecodable bytes
    are replaced rather than raising. Line breaks left over from header
    folding are removed.

    Args:
        msgobj: Mapping-like message whose ``['Subject']`` lookup yields the
            raw header value, or ``None`` when the header is absent.

    Returns:
        The decoded subject, or ``None`` when there is no Subject header.
    """
    raw_subject = msgobj['Subject']
    if raw_subject is None:
        return None

    fragments = []
    for fragment, charset in decode_header(raw_subject):
        # decode_header may hand back bytes (encoded words) or already-decoded
        # text (plain headers); only bytes need decoding.
        if isinstance(fragment, bytes):
            try:
                fragment = fragment.decode(charset or 'ascii', 'replace')
            except LookupError:
                # Unknown or misspelled charset label: keep what we can.
                fragment = fragment.decode('utf-8', 'replace')
        fragments.append(fragment)

    # Folded headers can carry "\r\n"; drop both characters.
    return re.sub(r'[\r\n]', '', ''.join(fragments))
def get_msg_file_as_attachment(message_part):
    """Build an attachment record for an e-mail attached to another e-mail.

    The first payload item of *message_part* is serialised (with its
    ``From `` envelope line) and named after its subject with an ``.eml``
    suffix.

    Args:
        message_part: Part whose ``get_payload()`` returns a sequence with the
            embedded message as its first item.

    Returns:
        A dict with ``data`` (the serialised message), ``content_type``,
        ``name`` and ``size`` (length of ``data``).
    """
    embedded = message_part.get_payload()[0]
    data = embedded.as_string(unixfrom=True)
    content_type = message_part.get_content_type()
    # get_subject returns None when the embedded message has no Subject;
    # fall back to a fixed stem instead of failing on None + '.eml'.
    subject = get_subject(embedded) or 'no_subject'
    return {
        'data': data,
        'content_type': content_type,
        'name': subject + '.eml',
        'size': len(data),
    }
def parse_attachment(message_part):
    """Build an attachment record from one MIME part, or return None.

    A part is treated as an attachment when its ``Content-Disposition``
    header starts with ``attachment`` or ``inline`` (case-insensitive,
    surrounding whitespace ignored). Parts of type ``message/rfc822`` are
    delegated to ``get_msg_file_as_attachment``.

    Args:
        message_part: Object offering ``get``, ``get_content_type``,
            ``get_payload`` and ``get_filename`` (for example
            ``email.message.Message``).

    Returns:
        ``None`` when the header is missing or names another disposition;
        otherwise a dict with ``data``, ``content_type``, ``size`` and
        ``name``.
    """
    content_disposition = message_part.get("Content-Disposition", None)
    content_type = message_part.get_content_type()
    if not content_disposition:
        return None

    # Strip the token so "attachment ; filename=..." is still recognised.
    disposition_kind = content_disposition.strip().split(";")[0].strip().lower()
    if disposition_kind not in ("attachment", "inline"):
        return None

    if content_type.lower().strip() == 'message/rfc822':
        return get_msg_file_as_attachment(message_part)

    file_data = message_part.get_payload(decode=True)
    return {
        'data': file_data,
        'content_type': content_type,
        # get_payload(decode=True) gives None for multipart containers, so
        # guard the len() call instead of raising TypeError.
        'size': len(file_data) if file_data is not None else 0,
        'name': message_part.get_filename(),
    }