在 scrapy 中无法从 html 中获取电子邮件地址
Can't get email from html in scrapy
我可以在 Chrome 的“检查”(开发者工具)中看到电子邮件地址:
<a class="obfuscatedEmail" href="mailto:info@endoskopie-stuttgart.de" rel="nofollow" style="direction: ltr; unicode-bidi: normal; white-space: nowrap;">info@endoskopie-stuttgart.de</a>
但是在页面源代码中,它看起来像
<a class="obfuscatedEmail" href="YP.KLSIKKHKA-YGDFTAFP3Y@FO3G:FKRGS4@" rel="nofollow">YP.KLSIKKHKA-YGDFTAFP3Y@FO3G@</a>
我需要用 python scrapy 抓取这个电子邮件地址。
我怎样才能得到它?
页面源代码中提供了用于对电子邮件进行去混淆处理的函数:
/**
 * First layer of deobfuscation (described by the original author as
 * "basically a reversed ROT13").
 *
 * Each character that occurs in the key alphabet is replaced by the character
 * 30 places before it, wrapping around the start of the alphabet. Characters
 * that are not part of the alphabet are copied through unchanged. The last
 * character of the input is never visited, so it does not appear in the result.
 *
 * @param {string} string Obfuscated text.
 * @returns {string} The input minus its last character, with every alphabet
 *     character shifted back.
 */
function changeLetters(string) {
    //Key alphabet. Per the original author's note it must match the string
    //used by the PHP obfuscate function (obfuscateEmail).
    var alphabet = "123456789qwertzuiopasdfghjklyxcvbnmMNBVCXYLKJHGFDSAPOIUZTREWQ";
    //Half of the alphabet (61 characters -> 30): the distance to shift back.
    var shift = (alphabet.length - 1) / 2;
    //Index of the last character, which the loop deliberately stops before.
    var stopAt = string.length - 1;
    var decoded = "";
    var idx;
    var letter;
    var pos;

    for (idx = 0; idx < stopAt; idx += 1) {
        letter = string.charAt(idx);
        pos = alphabet.indexOf(letter);

        if (pos === -1) {
            //Not an alphabet character, so it was never obfuscated.
            decoded += letter;
            continue;
        }

        //Step back by the shift, wrapping around when we run off the front.
        pos -= shift;
        if (pos < 0) {
            pos += alphabet.length;
        }
        decoded += alphabet.charAt(pos);
    }

    return decoded;
}
/**
 * Deobfuscates an e-mail link element in place.
 *
 * The element's innerHTML and its href attribute are each treated as
 * obfuscated when they contain two "@" symbols with at least one character
 * between them. An obfuscated value is first passed through changeLetters(),
 * and the text is switched to right-to-left rendering. On the first mouseover
 * both values are reversed character by character, the direction is restored
 * to left-to-right, and the mouseover handler removes itself.
 *
 * An element without an href attribute is supported: its href is treated as
 * an empty string and only the text is processed.
 *
 * @param {Element} element Reference to the html element to deobfuscate.
 */
function deObfuscateEmail( element ) {
    //Get the text of the element.
    var text = element.innerHTML,
        //getAttribute() returns null when the attribute is missing, so fall back
        //to an empty string BEFORE calling replace(); otherwise an element
        //without href would throw.
        rawHref = element.getAttribute("href") || "",
        //IE fix (per the original author): IE appends the obfuscated email to
        //the url (www.domain.com/com.liameym@em), so everything up to and
        //including the last forward slash "/" is removed. Both http and https
        //urls are stripped.
        href = rawHref.replace(/https?:\/\/(.+)\//gi, ""),
        //Control variables. If the two @ symbols are present we deobfuscate;
        //if not, the value is not obfuscated and is left alone.
        textReplace = text.search(/@.+@/),
        hrefReplace = href.search(/@.+@/),
        //Second layer of deobfuscation, run on the mouseover event:
        //the values are reversed and the css direction goes back to ltr.
        reverseEmails = function() {
            //Only if href is obfuscated.
            if( hrefReplace > -1 ) {
                element.setAttribute("href", href.split("").reverse().join("") );
            }
            //Only if text is obfuscated.
            if( textReplace > -1 ) {
                //Reverse the text of the element and
                //return the direction to normal (left to right).
                element.innerHTML = text.split("").reverse().join("");
                element.style.direction = "ltr";
                element.style.unicodeBidi = "normal";
            }
            //Values are replaced and the event isn't needed anymore.
            if( element.removeEventListener ) {
                element.removeEventListener("mouseover", reverseEmails, false);
            } else {
                // IE8-
                element.detachEvent("onmouseover", reverseEmails);
            }
        };

    //href has to be processed first; the original author reports an IE bug
    //that mixes the href and innerHTML values otherwise.
    if( hrefReplace > -1 ) {
        href = changeLetters(href);
        element.setAttribute("href", href);
    }

    //The text is still reversed at this point, so render it right-to-left
    //to show users the real address.
    if( textReplace > -1 ) {
        text = changeLetters( text );
        element.innerHTML = text;
        element.style.direction = "rtl";
        element.style.unicodeBidi = "bidi-override";
        element.style.whiteSpace = "nowrap";
    }

    //With rtl text the user can't copy or click the link, so the values are
    //replaced as soon as the user hovers over it.
    if( element.addEventListener ) {
        element.addEventListener("mouseover", reverseEmails, false);
    } else {
        // IE8-
        element.attachEvent("onmouseover", reverseEmails);
    }
}
代码看起来很多,其实很简单,可以分解成几个步骤:
- 声明一个用作密钥的字符串 `characters`
- 将混淆后的电子邮件中的每个字符替换为密钥中位于它之前 30 个位置的字符(必要时循环回绕);如果该字符不在密钥中,则保持不变
- 反转结果并删除多余的 `@`
在 python 中实现它应该只需要几行代码,一旦实现了,你就可以自己对电子邮件进行去混淆处理。
下面是反混淆函数在 python 中的一种实现:
def deobfuscate(s):
    """Decode an e-mail string obfuscated by the page's script.

    The input is reversed, every character found in the key alphabet is
    replaced by the character 30 places before it (wrapping around the start
    of the alphabet), characters outside the alphabet are kept as they are,
    and leading/trailing ``@`` markers are stripped from the result.

    Args:
        s: Obfuscated string, e.g. the raw ``href`` or link text.

    Returns:
        The decoded string.
    """
    characters = "123456789qwertzuiopasdfghjklyxcvbnmMNBVCXYLKJHGFDSAPOIUZTREWQ"
    char_len = len(characters)
    # Half of the 61-character alphabet: the distance each character moves back.
    shift = (char_len - 1) // 2

    decoded = []
    for char in reversed(s):
        pos = characters.find(char)
        if pos == -1:
            # Not part of the alphabet, so it was never obfuscated.
            decoded.append(char)
        else:
            # The modulo wraps positions that fall off the front of the alphabet.
            decoded.append(characters[(pos - shift) % char_len])
    return ''.join(decoded).strip('@')
# Example: decode the obfuscated href value taken from the page source.
s = 'YP.KLSIKKHKA-YGDFTAFP3Y@FO3G:FKRGS4@'
print(deobfuscate(s))
# Prints: mailto:info@endoskopie-stuttgart.de
我可以在 Chrome 的“检查”(开发者工具)中看到电子邮件地址:
<a class="obfuscatedEmail" href="mailto:info@endoskopie-stuttgart.de" rel="nofollow" style="direction: ltr; unicode-bidi: normal; white-space: nowrap;">info@endoskopie-stuttgart.de</a>
但是在页面源代码中,它看起来像
<a class="obfuscatedEmail" href="YP.KLSIKKHKA-YGDFTAFP3Y@FO3G:FKRGS4@" rel="nofollow">YP.KLSIKKHKA-YGDFTAFP3Y@FO3G@</a>
我需要用 python scrapy 抓取这个电子邮件地址。我怎样才能得到它?
页面源代码中提供了用于对电子邮件进行去混淆处理的函数:
/**
 * First layer of deobfuscation (described by the original author as
 * "basically a reversed ROT13").
 *
 * Each character that occurs in the key alphabet is replaced by the character
 * 30 places before it, wrapping around the start of the alphabet. Characters
 * that are not part of the alphabet are copied through unchanged. The last
 * character of the input is never visited, so it does not appear in the result.
 *
 * @param {string} string Obfuscated text.
 * @returns {string} The input minus its last character, with every alphabet
 *     character shifted back.
 */
function changeLetters(string) {
    //Key alphabet. Per the original author's note it must match the string
    //used by the PHP obfuscate function (obfuscateEmail).
    var alphabet = "123456789qwertzuiopasdfghjklyxcvbnmMNBVCXYLKJHGFDSAPOIUZTREWQ";
    //Half of the alphabet (61 characters -> 30): the distance to shift back.
    var shift = (alphabet.length - 1) / 2;
    //Index of the last character, which the loop deliberately stops before.
    var stopAt = string.length - 1;
    var decoded = "";
    var idx;
    var letter;
    var pos;

    for (idx = 0; idx < stopAt; idx += 1) {
        letter = string.charAt(idx);
        pos = alphabet.indexOf(letter);

        if (pos === -1) {
            //Not an alphabet character, so it was never obfuscated.
            decoded += letter;
            continue;
        }

        //Step back by the shift, wrapping around when we run off the front.
        pos -= shift;
        if (pos < 0) {
            pos += alphabet.length;
        }
        decoded += alphabet.charAt(pos);
    }

    return decoded;
}
/**
 * Deobfuscates an e-mail link element in place.
 *
 * The element's innerHTML and its href attribute are each treated as
 * obfuscated when they contain two "@" symbols with at least one character
 * between them. An obfuscated value is first passed through changeLetters(),
 * and the text is switched to right-to-left rendering. On the first mouseover
 * both values are reversed character by character, the direction is restored
 * to left-to-right, and the mouseover handler removes itself.
 *
 * An element without an href attribute is supported: its href is treated as
 * an empty string and only the text is processed.
 *
 * @param {Element} element Reference to the html element to deobfuscate.
 */
function deObfuscateEmail( element ) {
    //Get the text of the element.
    var text = element.innerHTML,
        //getAttribute() returns null when the attribute is missing, so fall back
        //to an empty string BEFORE calling replace(); otherwise an element
        //without href would throw.
        rawHref = element.getAttribute("href") || "",
        //IE fix (per the original author): IE appends the obfuscated email to
        //the url (www.domain.com/com.liameym@em), so everything up to and
        //including the last forward slash "/" is removed. Both http and https
        //urls are stripped.
        href = rawHref.replace(/https?:\/\/(.+)\//gi, ""),
        //Control variables. If the two @ symbols are present we deobfuscate;
        //if not, the value is not obfuscated and is left alone.
        textReplace = text.search(/@.+@/),
        hrefReplace = href.search(/@.+@/),
        //Second layer of deobfuscation, run on the mouseover event:
        //the values are reversed and the css direction goes back to ltr.
        reverseEmails = function() {
            //Only if href is obfuscated.
            if( hrefReplace > -1 ) {
                element.setAttribute("href", href.split("").reverse().join("") );
            }
            //Only if text is obfuscated.
            if( textReplace > -1 ) {
                //Reverse the text of the element and
                //return the direction to normal (left to right).
                element.innerHTML = text.split("").reverse().join("");
                element.style.direction = "ltr";
                element.style.unicodeBidi = "normal";
            }
            //Values are replaced and the event isn't needed anymore.
            if( element.removeEventListener ) {
                element.removeEventListener("mouseover", reverseEmails, false);
            } else {
                // IE8-
                element.detachEvent("onmouseover", reverseEmails);
            }
        };

    //href has to be processed first; the original author reports an IE bug
    //that mixes the href and innerHTML values otherwise.
    if( hrefReplace > -1 ) {
        href = changeLetters(href);
        element.setAttribute("href", href);
    }

    //The text is still reversed at this point, so render it right-to-left
    //to show users the real address.
    if( textReplace > -1 ) {
        text = changeLetters( text );
        element.innerHTML = text;
        element.style.direction = "rtl";
        element.style.unicodeBidi = "bidi-override";
        element.style.whiteSpace = "nowrap";
    }

    //With rtl text the user can't copy or click the link, so the values are
    //replaced as soon as the user hovers over it.
    if( element.addEventListener ) {
        element.addEventListener("mouseover", reverseEmails, false);
    } else {
        // IE8-
        element.attachEvent("onmouseover", reverseEmails);
    }
}
代码看起来很多,其实很简单,可以分解成几个步骤:
- 声明一个用作密钥的字符串 `characters`
- 将混淆后的电子邮件中的每个字符替换为密钥中位于它之前 30 个位置的字符(必要时循环回绕);如果该字符不在密钥中,则保持不变
- 反转结果并删除多余的 `@`
在 python 中实现它应该只需要几行代码,一旦实现了,你就可以自己对电子邮件进行去混淆处理。
下面是python.
# (Continues the sentence on the previous line: "... an implementation of the
# deobfuscation function in Python".)
def deobfuscate(s):
    """Decode an e-mail string obfuscated by the page's script.

    The input is reversed, every character found in the key alphabet is
    replaced by the character 30 places before it (wrapping around the start
    of the alphabet), characters outside the alphabet are kept as they are,
    and leading/trailing ``@`` markers are stripped from the result.

    Args:
        s: Obfuscated string, e.g. the raw ``href`` or link text.

    Returns:
        The decoded string.
    """
    characters = "123456789qwertzuiopasdfghjklyxcvbnmMNBVCXYLKJHGFDSAPOIUZTREWQ"
    char_len = len(characters)
    # Half of the 61-character alphabet: the distance each character moves back.
    shift = (char_len - 1) // 2

    decoded = []
    for char in reversed(s):
        pos = characters.find(char)
        if pos == -1:
            # Not part of the alphabet, so it was never obfuscated.
            decoded.append(char)
        else:
            # The modulo wraps positions that fall off the front of the alphabet.
            decoded.append(characters[(pos - shift) % char_len])
    return ''.join(decoded).strip('@')
# Example: decode the obfuscated href value taken from the page source.
s = 'YP.KLSIKKHKA-YGDFTAFP3Y@FO3G:FKRGS4@'
print(deobfuscate(s))
# Prints: mailto:info@endoskopie-stuttgart.de