在 scrapy 中无法从 html 中提取电子邮件地址

Can't get email from html in scrapy

我可以在 Chrome 的检查工具(Inspect)中看到电子邮件地址。

<a class="obfuscatedEmail" href="mailto:info@endoskopie-stuttgart.de" rel="nofollow" style="direction: ltr; unicode-bidi: normal; white-space: nowrap;">info@endoskopie-stuttgart.de</a>

但是在页面源代码中,它看起来像

<a class="obfuscatedEmail" href="YP.KLSIKKHKA-YGDFTAFP3Y@FO3G:FKRGS4@" rel="nofollow">YP.KLSIKKHKA-YGDFTAFP3Y@FO3G@</a>

我需要用 python scrapy 抓取这个电子邮件地址。我怎样才能获取到它?

页面源代码中提供了用于对电子邮件进行去混淆处理的函数:

    /**
     * First layer of deobfuscation: undoes the character substitution.
     *
     * Every character except the last one is looked up in a fixed 61-character
     * alphabet (the comments in this file say the PHP obfuscateEmail function
     * uses the same string). A character found in the alphabet is replaced by
     * the one 30 positions earlier, wrapping around to the end of the alphabet
     * when needed. Any other character is copied unchanged.
     *
     * The final character of the input is never copied to the result; in the
     * sample data it is the extra trailing "@" marker.
     *
     * The result is still in reversed order; reversing is the second layer and
     * is not done here.
     *
     * @param {string} string Obfuscated text.
     * @returns {string} Substituted text without the input's last character.
     */
    function changeLetters(string) {
        var alphabet = "123456789qwertzuiopasdfghjklyxcvbnmMNBVCXYLKJHGFDSAPOIUZTREWQ";
        var alphabetSize = alphabet.length;

        // Half of the alphabet (30 for the 61-character string above).
        var shift = (alphabetSize - 1) / 2;

        // Stop one short of the end: the last character is dropped.
        var stop = string.length - 1;

        var pieces = [];
        var index;
        var letter;
        var position;

        for (index = 0; index < stop; index += 1) {
            letter = string.charAt(index);
            position = alphabet.indexOf(letter);

            if (position === -1) {
                // Not part of the alphabet, so it was never substituted.
                pieces.push(letter);
                continue;
            }

            // Step back by `shift`; adding the size first keeps the index non-negative.
            pieces.push(alphabet.charAt((position - shift + alphabetSize) % alphabetSize));
        }

        return pieces.join("");
    }

    /**
     * Deobfuscates an email address stored in an element's markup and/or its
     * href attribute.
     *
     * Processing happens in two stages:
     *   1. Immediately: any value containing two "@" symbols is passed through
     *      changeLetters(). The text is then shown right-to-left via CSS, so the
     *      still-reversed string reads correctly.
     *   2. On the first mouseover: the text and href are reversed back and the
     *      direction is restored to ltr, so the link can be copied and clicked.
     *      The handler then unregisters itself.
     *
     * Values without two "@" symbols are treated as not obfuscated and left
     * untouched. Elements without an href attribute are supported.
     *
     * @param {Element} element Element to deobfuscate. It must expose innerHTML,
     *     style, getAttribute/setAttribute, and either addEventListener or
     *     attachEvent (IE8 and older).
     */
    function deObfuscateEmail( element ) {

        //Get the text of the element.
        var text = element.innerHTML,

            //Get href attribute. getAttribute returns null when the attribute is
            //missing, so the empty-string fallback must be applied BEFORE calling
            //replace (otherwise a TypeError is thrown).
            //The regular expression is an IE fix: IE prepends the page url to the
            //obfuscated value (www.domain.com/com.liameym@em), so everything up to
            //the last forward slash is removed. Both http and https are handled.
            href = ( element.getAttribute("href") || "" ).replace(/https?:\/\/(.+)\//gi, ""),

            //Control variables. A value is considered obfuscated only when it
            //contains two @ symbols with something in between.
            textReplace = text.search(/@.+@/),
            hrefReplace = href.search(/@.+@/),

            //Second layer of deobfuscation, run on the first mouseover.
            //By then `text` and `href` hold the changeLetters() output, so
            //reversing them yields the real address.
            reverseEmails = function(){

                //Only if href was obfuscated.
                if( hrefReplace > -1 ) {

                    element.setAttribute("href", href.split("").reverse().join("") );

                }

                //Only if text was obfuscated.
                if( textReplace > -1 ) {

                    //Reverse the text and return the direction to normal (left to right).
                    element.innerHTML = text.split("").reverse().join("");
                    element.style.direction = "ltr";
                    element.style.unicodeBidi = "normal";

                }

                //The work is done, so the event handler is no longer needed.
                if( element.removeEventListener ) {

                    element.removeEventListener("mouseover", reverseEmails, false);

                } else {

                    // IE8-
                    element.detachEvent("onmouseover", reverseEmails);

                }

            };
            //End variables and functions definitions.


        //href has to be processed first, because of the strange
        //IE bug that will mix the href and innerHTML values.
        if( hrefReplace > -1 ) {

            href = changeLetters(href);
            element.setAttribute("href", href);

        }

        //The text is still reversed after changeLetters(); render it right-to-left
        //so users see the real address instead of a reversed one.
        if( textReplace > -1 ) {

            text = changeLetters( text );
            element.innerHTML = text;
            element.style.direction = "rtl";
            element.style.unicodeBidi = "bidi-override";
            element.style.whiteSpace = "nowrap";
        }


        //With rtl text the user can't copy or click the link, so the values are
        //replaced as soon as the user hovers over it.
        if( element.addEventListener ) {

            element.addEventListener("mouseover", reverseEmails, false);

        } else {

            // IE8-
            element.attachEvent("onmouseover", reverseEmails);

        }

    }

代码看起来很多,其实很简单,可以分解成几个步骤:

  1. 声明一个用作密钥的字符串 characters
  2. 将混淆后的电子邮件中的每个字符替换为密钥字符串中位于其前 30 个位置的字符(必要时循环回绕);如果该字符不在密钥字符串中,则保持不变
  3. 反转结果并删除多余的@

在 python 中实现它应该只需要几行代码,一旦你这样做了,你就可以自己对电子邮件进行去混淆处理。

下面是反混淆函数的一种 Python 实现:

def deobfuscate(s):
    """Decode an obfuscated email string.

    The input is read back to front. Each character found in the 61-character
    alphabet is replaced by the character 30 positions earlier, wrapping
    around; any other character is kept as is. Leading and trailing '@'
    characters are then stripped from the result.

    Args:
        s: The obfuscated string, e.g. the raw ``href`` or link text.

    Returns:
        The decoded string.
    """
    alphabet = "123456789qwertzuiopasdfghjklyxcvbnmMNBVCXYLKJHGFDSAPOIUZTREWQ"
    size = len(alphabet)
    shift = (size - 1) // 2

    def decode(char):
        position = alphabet.find(char)
        if position == -1:
            # Not part of the alphabet, so it was never substituted.
            return char
        # Python's modulo is non-negative here, which gives the wrap-around.
        return alphabet[(position - shift) % size]

    return ''.join(decode(char) for char in reversed(s)).strip('@')

# Demo: decode the obfuscated href value taken from the page source.
s = 'YP.KLSIKKHKA-YGDFTAFP3Y@FO3G:FKRGS4@'
decoded = deobfuscate(s)
print(decoded)
# Prints: mailto:info@endoskopie-stuttgart.de