使用 Mailparser 在邮件正文中查找 Email-adresses
Find Email-adresses in the mailbody with Mailparser
我对这个话题还很陌生,而且我的邮件解析器仍然存在一些问题。虽然在电子邮件 header (mail.from) 中搜索和查找电子邮件确实有效,但在电子邮件 body 中不起作用。是否有人body对此有一些经验并愿意提供帮助?您可以在 "// Check for other addresses in Mail-Body (Doesn't work yet)"-评论下找到我正在谈论的功能。我认为,我的正则表达式是正确的。此外,如果 matchAll-Function 返回一个数组并且它不能保存在订阅者中。email-object,它至少应该被记录到控制台。我还手动检查了收件箱中是否有邮件中包含电子邮件地址的邮件 body。至少有两个,应该能找到..
App.js 中进行邮件解析的部分:
const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');
var config = require('./config');
var Imap = require('imap');
var imap = new Imap(config.imap);
var blacklistString = '';
String.prototype.matchAll = function(regexp) {
var matches = [];
this.replace(regexp, function() {
var arr = ([]).slice.call(arguments, 0);
var extras = arr.splice(-2);
arr.index = extras[0];
arr.input = extras[1];
matches.push(arr);
});
return matches.length ? matches : null;
};
function openInbox(subbox,cb) {
imap.openBox('INBOX.'+subbox, true, cb);
}
function getBoxes(cb) {
imap.getBoxes(cb);
}
function showBoxes(boxes) {
imap.end();
}
function logArrayElements(element) {
if(element[1].indexOf('placeholder.de')==-1){
addToBlacklistString(element[1]);
}
}
function addToBlacklistString(str) {
blacklistString += str+"\n";
}
function writeBlacklistFile() {
fs.appendFile('data/data.csv', blacklistString, function (err) {
if (err) throw err;
console.log('Saved!');
});
}
function search(searchArray, regex){
imap.search(searchArray, function(err, results) {
if (err) throw err;
var temp = 0;
var mailtemp = [];
var f = imap.fetch(results, { bodies: '' });
f.on('message', function(msg, seqno) {
console.log('Message #%d', seqno);
var prefix = '(#' + seqno + ') ';
msg.on('body', function(stream, info) {
simpleParser(stream, (err, mail)=>{
//console.log(temp);
//console.log(mail.subject);
/*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
console.log(err);
});*/
//var text = mail.text;
// New Subscriber Object
var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
subscr.nr = seqno;
//Check for From-Address
if(!!mail.from) {
//console.log(mail.from.value);
for(var i = 0; i < mail.from.value.length; i++) {
mailtemp = mail.from.value[i].address.matchAll(regex);
mailtemp.forEach(function(element){
/*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
console.log(error);
});*/
subscr.from = element[0];
});
if(!!mailtemp) {
mailtemp.forEach(logArrayElements);
}
}
}else{
//console.log(mail.text);
}
// Message-ID
if(!!mail.messageId) {
subscr.mailIdent = mail.messageId;
}
console.log(mail.messageId);
// Check for other addresses in Mail-Body (Doesn't work yet)
var regexEmails = new RegExp('/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
if(!!mail.text){
if(mail.text.matchAll(regexEmails)!=null) {
subscr.emails = mail.text.matchAll(regexEmails);
console.log(subscr.emails);
}
}
/* Split mail.text at substrings in substr-array. Extend if necessary..
*
* Also check for 'Anwalt'-Expression in splitted Substring
*
* If mail.text doesn't exist -> Check for html body and convert it to text-format
*/
//var regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
var regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');
if(!!mail.text) {
var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
for (var i = 0; i<substr.length; i++) {
if(mail.text.indexOf(substr[i]) > -1) {
var textTemp = mail.text;
var arr = textTemp.split(substr[i]);
if(arr[0].matchAll(regexAnwalt)!=null) {
subscr.anwalt = true;
};
subscr.text = arr[0];
break;
} else {
subscr.text = mail.text;
}
}
//console.log(arr);
}
else
{
var html = mail.html;
var text = htmlToText.fromString(html, {
noLinkBrackets: true,
ignoreImage: true,
uppercaseHeadings: false,
preserveNewlines: false,
wordwrap:130,
format: {
heading: function (node, fn, options) {
var h = fn(node.children, options);
return '\n==== ' + h + ' ====\n\n';
}
}
});
subscr.text = text;
}
mail.headers.forEach(function(value, key) {
//console.log(value);
});
subscr.save();
//console.log(subscr);
temp++;
});
});
msg.once('end', function() {
console.log(prefix + 'Finished');
});
});
f.once('error', function(err) {
console.log('Fetch error: ' + err);
});
f.once('end', function() {
console.log('Done fetching all messages!');
//writeBlacklistFile();
imap.end();
});
});
}
imap.once('ready', function() {
openInbox('Test',function(err, box) {
var searchArray = [['FROM', '@']];
search(searchArray,/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
});
});
imap.once('error', function(err) {
console.log(err);
});
imap.once('end', function() {
console.log('Connection ended');
});
imap.connect();
app.listen(2700, function(){
console.log("Listening on Port 2700")
});
module.exports = app;
subscriber.js
const mongoose = require('mongoose');
var subscriberSchema = mongoose.Schema({
nr: Number,
mailIdent: String,
from: String,
emails: String,
text: String,
uLink: String,
anwalt: Boolean
});
var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);
//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};
module.exports.getSubscriberByID = function(_id, callback){
Subscriber.findById(_id, callback);
};
电子邮件的正则表达式有点错误。
我也没有注意到 matchAll-Fct。正在返回一个二维数组。这是代码的更改部分:
var regexEmails = new RegExp("([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)");
var temp1 = mail.text.matchAll(regexEmails);
if(!!temp1){
//console.log(temp1);
for(var i =0; i<temp1.length; i++) {
if(temp1[0][i]!=='info@service.placeholder.de' && temp1[0][i] !== "info@placeholder.de"){
subscr.emails += temp1[0][i];
}
}
}
我对这个话题还很陌生,而且我的邮件解析器仍然存在一些问题。虽然在电子邮件 header (mail.from) 中搜索和查找电子邮件确实有效,但在电子邮件 body 中不起作用。是否有人body对此有一些经验并愿意提供帮助?您可以在 "// Check for other addresses in Mail-Body (Doesn't work yet)"-评论下找到我正在谈论的功能。我认为,我的正则表达式是正确的。此外,如果 matchAll-Function 返回一个数组并且它不能保存在订阅者中。email-object,它至少应该被记录到控制台。我还手动检查了收件箱中是否有邮件中包含电子邮件地址的邮件 body。至少有两个,应该能找到..
App.js 中进行邮件解析的部分:
const simpleParser = require('mailparser').simpleParser;
//const htmlparser = require("htmlparser2");
var fs = require('fs');
var config = require('./config');
var Imap = require('imap');
var imap = new Imap(config.imap);
var blacklistString = '';
String.prototype.matchAll = function(regexp) {
var matches = [];
this.replace(regexp, function() {
var arr = ([]).slice.call(arguments, 0);
var extras = arr.splice(-2);
arr.index = extras[0];
arr.input = extras[1];
matches.push(arr);
});
return matches.length ? matches : null;
};
function openInbox(subbox,cb) {
imap.openBox('INBOX.'+subbox, true, cb);
}
function getBoxes(cb) {
imap.getBoxes(cb);
}
function showBoxes(boxes) {
imap.end();
}
function logArrayElements(element) {
if(element[1].indexOf('placeholder.de')==-1){
addToBlacklistString(element[1]);
}
}
function addToBlacklistString(str) {
blacklistString += str+"\n";
}
function writeBlacklistFile() {
fs.appendFile('data/data.csv', blacklistString, function (err) {
if (err) throw err;
console.log('Saved!');
});
}
function search(searchArray, regex){
imap.search(searchArray, function(err, results) {
if (err) throw err;
var temp = 0;
var mailtemp = [];
var f = imap.fetch(results, { bodies: '' });
f.on('message', function(msg, seqno) {
console.log('Message #%d', seqno);
var prefix = '(#' + seqno + ') ';
msg.on('body', function(stream, info) {
simpleParser(stream, (err, mail)=>{
//console.log(temp);
//console.log(mail.subject);
/*fs.writeFile('data/'+seqno+'.txt',mail.text, function(err){
console.log(err);
});*/
//var text = mail.text;
// New Subscriber Object
var subscr = new Subscriber({nr: '', mailIdent: '', from: '', emails: '', text:'', uLink: '', anwalt: false });
subscr.nr = seqno;
//Check for From-Address
if(!!mail.from) {
//console.log(mail.from.value);
for(var i = 0; i < mail.from.value.length; i++) {
mailtemp = mail.from.value[i].address.matchAll(regex);
mailtemp.forEach(function(element){
/*fs.appendFile('data/data.csv', element[0] + "\n", function(error){
console.log(error);
});*/
subscr.from = element[0];
});
if(!!mailtemp) {
mailtemp.forEach(logArrayElements);
}
}
}else{
//console.log(mail.text);
}
// Message-ID
if(!!mail.messageId) {
subscr.mailIdent = mail.messageId;
}
console.log(mail.messageId);
// Check for other addresses in Mail-Body (Doesn't work yet)
var regexEmails = new RegExp('/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g');
if(!!mail.text){
if(mail.text.matchAll(regexEmails)!=null) {
subscr.emails = mail.text.matchAll(regexEmails);
console.log(subscr.emails);
}
}
/* Split mail.text at substrings in substr-array. Extend if necessary..
*
* Also check for 'Anwalt'-Expression in splitted Substring
*
* If mail.text doesn't exist -> Check for html body and convert it to text-format
*/
//var regexLink = new RegExp('\.de\/(unsubscribe|austragen)\/([^\"]+)');
var regexAnwalt = new RegExp('nwalt|echtsanwalt|rechtlicher');
if(!!mail.text) {
var substr = ["schrieb pplaceholder.de", "Von: \"placeholder.de", "Von: pplaceholder.de", "From: placeholder.de", "Ursprüngliche Nachricht"];
for (var i = 0; i<substr.length; i++) {
if(mail.text.indexOf(substr[i]) > -1) {
var textTemp = mail.text;
var arr = textTemp.split(substr[i]);
if(arr[0].matchAll(regexAnwalt)!=null) {
subscr.anwalt = true;
};
subscr.text = arr[0];
break;
} else {
subscr.text = mail.text;
}
}
//console.log(arr);
}
else
{
var html = mail.html;
var text = htmlToText.fromString(html, {
noLinkBrackets: true,
ignoreImage: true,
uppercaseHeadings: false,
preserveNewlines: false,
wordwrap:130,
format: {
heading: function (node, fn, options) {
var h = fn(node.children, options);
return '\n==== ' + h + ' ====\n\n';
}
}
});
subscr.text = text;
}
mail.headers.forEach(function(value, key) {
//console.log(value);
});
subscr.save();
//console.log(subscr);
temp++;
});
});
msg.once('end', function() {
console.log(prefix + 'Finished');
});
});
f.once('error', function(err) {
console.log('Fetch error: ' + err);
});
f.once('end', function() {
console.log('Done fetching all messages!');
//writeBlacklistFile();
imap.end();
});
});
}
imap.once('ready', function() {
openInbox('Test',function(err, box) {
var searchArray = [['FROM', '@']];
search(searchArray,/([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)/g);
});
});
imap.once('error', function(err) {
console.log(err);
});
imap.once('end', function() {
console.log('Connection ended');
});
imap.connect();
app.listen(2700, function(){
console.log("Listening on Port 2700")
});
module.exports = app;
subscriber.js
const mongoose = require('mongoose');
var subscriberSchema = mongoose.Schema({
nr: Number,
mailIdent: String,
from: String,
emails: String,
text: String,
uLink: String,
anwalt: Boolean
});
var Subscriber = module.exports = mongoose.model('Subscriber', subscriberSchema);
//get Subscriber
module.exports.getSubscribers = function(callback, limit){
Subscriber.find(callback).limit(limit);
};
module.exports.getSubscriberByID = function(_id, callback){
Subscriber.findById(_id, callback);
};
电子邮件的正则表达式有点错误。 我也没有注意到 matchAll-Fct。正在返回一个二维数组。这是代码的更改部分:
var regexEmails = new RegExp("([\w\.\-\_\#\+]+@[\w\.\-\_äüö]+\.[a-zA-Z]+)");
var temp1 = mail.text.matchAll(regexEmails);
if(!!temp1){
//console.log(temp1);
for(var i =0; i<temp1.length; i++) {
if(temp1[0][i]!=='info@service.placeholder.de' && temp1[0][i] !== "info@placeholder.de"){
subscr.emails += temp1[0][i];
}
}
}