按照 W3C 建议检测/验证 XML 文件的编码

Detect/Verify the encoding of an XML file following the W3C recommendation

我一直在寻找一个遵循 W3C 建议(recommendation)的 C# XML 编码检测器,但没有找到。

网络上有多种解决方案,但没有一个严格遵循上述 W3C 建议。

简而言之,W3C 指出:"Each XML entity not accompanied by external encoding information and not in UTF-8 or UTF-16 encoding must begin with an XML encoding declaration, in which the first characters must be '<?xml' [...]"。因此,处理器在读入两到四个八位字节后即可检测出编码。

有人有这种算法的 c# 实现吗?

我从 A. Hristov 提出的 Java 实现出发,编写了一个 C# 实现,并整合了 TextFileEncodingDetector 项目中的建议。我希望它对社区有用。

注:代码看似较长,但只有一个public方法。

using System;
using System.IO;
using System.Xml;
using System.Text;
using System.Text.RegularExpressions;

/// <summary>
/// Detects the character encoding of an XML file following the auto-detection guidelines of the
/// W3C XML recommendation (http://www.w3.org/TR/xml/#sec-guessing): the byte-order mark and the
/// first octets of the file are inspected, and the result is reconciled with the <c>encoding</c>
/// pseudo-attribute of the XML declaration when one is present.
/// </summary>
public class XmlEncodingDetector
{
    // Number of leading bytes inspected; the XML declaration always fits in this window.
    private const int SampleSize = 0x100;

    // Code pages used to compare encodings by identity rather than by object reference.
    private const int CodePageEbcdicUsCanada = 37;
    private const int CodePageUtf16LittleEndian = 1200;
    private const int CodePageUtf16BigEndian = 1201;
    private const int CodePageUtf32LittleEndian = 12000;
    private const int CodePageUtf32BigEndian = 12001;
    private const int CodePageUtf7 = 65000;

    // Text every XML declaration starts with; used to check that a declared encoding really
    // produces the bytes found at the beginning of the file.
    private const string XmlDeclarationStart = "<?xml";

    // Matches the version and encoding pseudo-attributes of an XML declaration. Both quote styles,
    // any 1.x version and flexible white space are accepted, as allowed by the XML grammar.
    private static readonly Regex EncodingDeclarationPattern = new Regex(
        @"<\?xml\s+version\s*=\s*(?<versionQuote>[""'])1\.[0-9]+\k<versionQuote>"
        + @"\s+encoding\s*=\s*(?<encodingQuote>[""'])(?<encoding>[A-Za-z][A-Za-z0-9._\-]*)\k<encodingQuote>",
        RegexOptions.ExplicitCapture | RegexOptions.CultureInvariant);

    // Martin Duerst's "can this be UTF-8" expression (http://www.w3.org/International/questions/qa-forms-utf-8).
    // It must be applied to a string in which every char carries exactly one byte value.
    private static readonly Regex Utf8Validator = new Regex(@"\A("
        + @"[\x09\x0A\x0D\x20-\x7E]"
        + @"|[\xC2-\xDF][\x80-\xBF]"
        + @"|\xE0[\xA0-\xBF][\x80-\xBF]"
        + @"|[\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}"
        + @"|\xED[\x80-\x9F][\x80-\xBF]"
        + @"|\xF0[\x90-\xBF][\x80-\xBF]{2}"
        + @"|[\xF1-\xF3][\x80-\xBF]{3}"
        + @"|\xF4[\x80-\x8F][\x80-\xBF]{2}"
        + @")*\z");

    /// <summary>
    /// Detects the encoding of the XML file at the given path by combining the byte-level
    /// signature of the file with the encoding pseudo-attribute of its XML declaration.
    /// </summary>
    /// <param name="xmlFileName">Path of the XML file to inspect.</param>
    /// <returns>The detected encoding; UTF-8 when the file gives no indication at all.</returns>
    /// <exception cref="XmlException">The byte-level encoding contradicts the declared encoding.</exception>
    /// <exception cref="ArgumentException">The declared encoding name is not supported by the platform.</exception>
    public static Encoding DetectXmlFileEncoding(string xmlFileName)
    {
        using (FileStream xmlFileStream = File.OpenRead(xmlFileName))
        {
            return DetectXmlFileEncoding(xmlFileStream);
        }
    }

    /// <summary>
    /// Detects the encoding of the XML content of an open file stream, reading at most the next
    /// 256 bytes from the current position. The stream position is restored before returning.
    /// </summary>
    /// <param name="xmlFileStream">Seekable stream positioned at the start of the XML content.</param>
    /// <returns>The detected encoding; UTF-8 when the content gives no indication at all.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="xmlFileStream"/> is null.</exception>
    /// <exception cref="XmlException">The byte-level encoding contradicts the declared encoding.</exception>
    /// <exception cref="ArgumentException">The declared encoding name is not supported by the platform.</exception>
    public static Encoding DetectXmlFileEncoding(FileStream xmlFileStream)
    {
        if (xmlFileStream == null)
        {
            throw new ArgumentNullException("xmlFileStream");
        }

        long originalPos = xmlFileStream.Position;
        try
        {
            byte[] sample = ReadSample(xmlFileStream);

            // First choice: an explicit byte-order mark.
            bool declarationRequired = false;
            Encoding detected = DetectBOMBytes(sample);
            if (detected == null)
            {
                // No BOM: recognise the binary representation of "<?xml" (or "<?" / "<" for wide encodings).
                detected = DetectFromFirstOctets(sample, out declarationRequired);
            }

            // Read the encoding pseudo-attribute, decoding with the best guess available so far.
            Encoding declared = GetXmlDeclaredEncoding(sample, detected ?? Encoding.UTF8);

            return ReconcileEncodings(sample, detected, declared, declarationRequired);
        }
        finally
        {
            xmlFileStream.Position = originalPos;
        }
    }


    #region private methods

    /// <summary>
    /// Reads up to <see cref="SampleSize"/> bytes from the current position of the stream.
    /// The returned array is exactly as long as the number of bytes actually read.
    /// </summary>
    private static byte[] ReadSample(FileStream stream)
    {
        long remaining = stream.Length - stream.Position;
        if (remaining < 0)
        {
            remaining = 0;
        }

        byte[] sample = new byte[remaining > SampleSize ? SampleSize : (int)remaining];

        // Stream.Read may legitimately return fewer bytes than requested, so loop until done.
        int total = 0;
        while (total < sample.Length)
        {
            int read = stream.Read(sample, total, sample.Length - total);
            if (read <= 0)
            {
                break;
            }
            total += read;
        }

        if (total < sample.Length)
        {
            Array.Resize(ref sample, total);
        }
        return sample;
    }

    /// <summary>
    /// Applies the "without a byte order mark" table of the W3C guidelines to the first four octets.
    /// </summary>
    /// <param name="sample">Leading bytes of the file.</param>
    /// <param name="declarationRequired">
    /// Set to true when the octets only identify an encoding family, so that the encoding
    /// declaration has to be read to determine the exact encoding.
    /// </param>
    /// <returns>The encoding suggested by the octets, or null when nothing is recognised.</returns>
    private static Encoding DetectFromFirstOctets(byte[] sample, out bool declarationRequired)
    {
        declarationRequired = false;

        // Files shorter than four bytes cannot carry any of the signatures below.
        if (sample.Length < 4)
        {
            return null;
        }

        byte b0 = sample[0];
        byte b1 = sample[1];
        byte b2 = sample[2];
        byte b3 = sample[3];

        if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F)
        {
            // "<?" in UTF-16BE or another big-endian encoding with 16-bit code units
            declarationRequired = true;
            return Encoding.BigEndianUnicode;
        }
        if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C)
        {
            // "<" in UTF-32BE or another big-endian encoding with 32-bit code units
            declarationRequired = true;
            return Encoding.GetEncoding(CodePageUtf32BigEndian);
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00)
        {
            // "<" in UTF-32LE or another little-endian encoding with 32-bit code units
            declarationRequired = true;
            return Encoding.UTF32;
        }
        if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00)
        {
            // "<?" in UTF-16LE or another little-endian encoding with 16-bit code units
            declarationRequired = true;
            return Encoding.Unicode;
        }
        if (b0 == 0x3C && b1 == 0x3F && b2 == 0x78 && b3 == 0x6D)
        {
            // "<?xm" in UTF-8, ASCII, ISO 8859-x or any other ASCII-compatible encoding
            declarationRequired = true;
            return Encoding.ASCII;
        }
        if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94)
        {
            // "<?xm" in some EBCDIC flavour; IBM037 (US-Canada) is used to read the declaration
            declarationRequired = true;
            return Encoding.GetEncoding(CodePageEbcdicUsCanada);
        }

        return null;
    }

    /// <summary>
    /// Decides the final encoding from the byte-level detection and the declared encoding.
    /// </summary>
    /// <param name="sample">Leading bytes of the file.</param>
    /// <param name="detected">Encoding found from the BOM or the first octets; may be null.</param>
    /// <param name="declared">Encoding named by the XML declaration; may be null.</param>
    /// <param name="declarationRequired">True when <paramref name="detected"/> only identifies a family.</param>
    /// <returns>The winning encoding, never null.</returns>
    /// <exception cref="XmlException">Both encodings are known and cannot describe the same bytes.</exception>
    private static Encoding ReconcileEncodings(byte[] sample, Encoding detected, Encoding declared, bool declarationRequired)
    {
        if (declared == null)
        {
            // Without a declaration the W3C default is UTF-8, unless the bytes themselves
            // identify something else (a plain ASCII-looking start still means UTF-8).
            if (detected == null || detected.CodePage == Encoding.ASCII.CodePage)
            {
                return Encoding.UTF8;
            }
            return detected;
        }

        if (detected == null || detected.CodePage == declared.CodePage)
        {
            return declared;
        }

        // The names "UTF-16" and "UTF-32" do not state a byte order and resolve to the
        // little-endian encodings; the byte order actually observed in the file wins.
        if (declared.CodePage == CodePageUtf16LittleEndian && detected.CodePage == CodePageUtf16BigEndian)
        {
            return detected;
        }
        if (declared.CodePage == CodePageUtf32LittleEndian && detected.CodePage == CodePageUtf32BigEndian)
        {
            return detected;
        }

        if (declarationRequired)
        {
            // UTF-7 overloads ASCII bytes, so it cannot be verified by re-encoding; accept it on an ASCII start.
            if (detected.CodePage == Encoding.ASCII.CodePage && declared.CodePage == CodePageUtf7)
            {
                return declared;
            }

            // The declared encoding refines the detected family when it encodes "<?xml"
            // to exactly the bytes found at the beginning of the file.
            if (StartsWith(sample, declared.GetBytes(XmlDeclarationStart)))
            {
                return declared;
            }
        }

        throw new XmlException(string.Format("{0} ({1}, {2})",
                "The text encoding and the encoding pseudo-attribute of the XML header mismatch",
                detected, declared));
    }

    /// <summary>
    /// Tells whether <paramref name="data"/> begins with the non-empty sequence <paramref name="prefix"/>.
    /// </summary>
    private static bool StartsWith(byte[] data, byte[] prefix)
    {
        if (prefix.Length == 0 || data.Length < prefix.Length)
        {
            return false;
        }

        for (int i = 0; i < prefix.Length; i++)
        {
            if (data[i] != prefix[i])
            {
                return false;
            }
        }
        return true;
    }

    /// <summary>
    /// Looks for a standard byte-order mark at the beginning of the given bytes.
    /// </summary>
    /// <param name="BOMBytes">Leading bytes of the file.</param>
    /// <returns>The encoding identified by the byte-order mark, or null when none is recognised.</returns>
    private static Encoding DetectBOMBytes(byte[] BOMBytes)
    {
        int length = BOMBytes.Length;
        if (length < 2)
        {
            return null;
        }

        // FF FE is UTF-16LE unless it is followed by 00 00, which makes it the UTF-32LE mark.
        if (BOMBytes[0] == 0xFF && BOMBytes[1] == 0xFE
                && (length < 4 || BOMBytes[2] != 0x00 || BOMBytes[3] != 0x00))
        {
            return Encoding.Unicode;
        }

        if (BOMBytes[0] == 0xFE && BOMBytes[1] == 0xFF)
        {
            return Encoding.BigEndianUnicode;
        }

        if (length < 3)
        {
            return null;
        }

        if (BOMBytes[0] == 0xEF && BOMBytes[1] == 0xBB && BOMBytes[2] == 0xBF)
        {
            return Encoding.UTF8;
        }

        // Note: encodings such as UTF-7 that overload ASCII-valued bytes may fail to be reliably detected.
        if (BOMBytes[0] == 0x2B && BOMBytes[1] == 0x2F && BOMBytes[2] == 0x76)
        {
            return Encoding.UTF7;
        }

        if (length < 4)
        {
            return null;
        }

        if (BOMBytes[0] == 0xFF && BOMBytes[1] == 0xFE && BOMBytes[2] == 0x00 && BOMBytes[3] == 0x00)
        {
            return Encoding.UTF32;
        }

        if (BOMBytes[0] == 0x00 && BOMBytes[1] == 0x00 && BOMBytes[2] == 0xFE && BOMBytes[3] == 0xFF)
        {
            return Encoding.GetEncoding(CodePageUtf32BigEndian);
        }

        return null;
    }

    /// <summary>
    /// Extracts the encoding named by the <c>encoding</c> pseudo-attribute of the XML declaration.
    /// </summary>
    /// <param name="sample">Leading bytes of the file.</param>
    /// <param name="guessedEncoding">Encoding used to decode the sample when nothing better is found.</param>
    /// <returns>The declared encoding, or null when the sample contains no encoding declaration.</returns>
    /// <exception cref="ArgumentException">The declared name is not a supported encoding.</exception>
    private static Encoding GetXmlDeclaredEncoding(byte[] sample, Encoding guessedEncoding)
    {
        string contents = GetStringFromByteArray(sample, guessedEncoding);
        Match declaration = EncodingDeclarationPattern.Match(contents);
        Group encodingName = declaration.Groups["encoding"];

        return encodingName.Success ? Encoding.GetEncoding(encodingName.Value) : null;
    }

    /// <summary>
    /// Decodes the bytes using, in order of preference, the encoding of their byte-order mark,
    /// the encoding suggested by the byte-distribution heuristics, or the supplied guess.
    /// </summary>
    private static string GetStringFromByteArray(byte[] message, Encoding guessedEncoding)
    {
        Encoding bomEncoding = DetectBOMBytes(message);
        if (bomEncoding != null)
        {
            // Encoding.GetString does not strip the preamble, so skip it explicitly.
            int preambleLength = bomEncoding.GetPreamble().Length;
            return bomEncoding.GetString(message, preambleLength, message.Length - preambleLength);
        }

        Encoding heuristicEncoding = DetectUnicodeInByteSampleByHeuristics(message);
        return (heuristicEncoding ?? guessedEncoding).GetString(message);
    }

    /// <summary>
    /// Guesses whether BOM-less bytes are UTF-16 (from the distribution of zero bytes) or UTF-8
    /// (from validity plus the presence of byte sequences typical of western UTF-8 text).
    /// All thresholds are arbitrary and tuned for english / european text.
    /// </summary>
    /// <param name="SampleBytes">Bytes to analyse.</param>
    /// <returns>UTF-16LE, UTF-16BE or UTF-8 when the evidence is strong enough; otherwise null.</returns>
    private static Encoding DetectUnicodeInByteSampleByHeuristics(byte[] SampleBytes)
    {
        // Nothing to measure, and the ratios below would divide by zero.
        if (SampleBytes.Length == 0)
        {
            return null;
        }

        long oddBinaryNullsInSample = 0;
        long evenBinaryNullsInSample = 0;
        long suspiciousUTF8SequenceCount = 0;
        long suspiciousUTF8BytesTotal = 0;
        long likelyUSASCIIBytesInSample = 0;

        // Single pass: count zero bytes by parity, probable US-ASCII bytes, and UTF-8 sequences
        // that encode characters from the upper range of Windows-1252.
        int bytesToSkip = 0;
        for (int pos = 0; pos < SampleBytes.Length; pos++)
        {
            byte current = SampleBytes[pos];

            if (current == 0)
            {
                if (pos % 2 == 0)
                {
                    evenBinaryNullsInSample++;
                }
                else
                {
                    oddBinaryNullsInSample++;
                }
            }

            if (IsCommonUSASCIIByte(current))
            {
                likelyUSASCIIBytesInSample++;
            }

            // Bytes belonging to a sequence that was already counted are not examined again.
            if (bytesToSkip > 0)
            {
                bytesToSkip--;
                continue;
            }

            int lengthFound = DetectSuspiciousUTF8SequenceLength(SampleBytes, pos);
            if (lengthFound > 0)
            {
                suspiciousUTF8SequenceCount++;
                suspiciousUTF8BytesTotal += lengthFound;
                bytesToSkip = lengthFound - 1;
            }
        }

        double evenNullRatio = (evenBinaryNullsInSample * 2.0) / SampleBytes.Length;
        double oddNullRatio = (oddBinaryNullsInSample * 2.0) / SampleBytes.Length;

        // UTF-16LE: mostly zero bytes at odd offsets (more than 60%), few at even offsets (less than 20%).
        if (evenNullRatio < 0.2 && oddNullRatio > 0.6)
        {
            return Encoding.Unicode;
        }

        // UTF-16BE: the mirror image of the previous case.
        if (oddNullRatio < 0.2 && evenNullRatio > 0.6)
        {
            return Encoding.BigEndianUnicode;
        }

        // UTF-8: the bytes must form valid UTF-8. Each byte is mapped to the char with the same
        // value so that the validator sees the real byte values (an ASCII decode would turn every
        // byte above 0x7F into '?' and hide invalid sequences). A multi-byte sequence cut off by
        // the end of the sample is ignored, since the sample may stop in the middle of a character.
        int validatedLength = GetLengthWithoutTruncatedUtf8Tail(SampleBytes);
        char[] byteChars = new char[validatedLength];
        for (int i = 0; i < validatedLength; i++)
        {
            byteChars[i] = (char)SampleBytes[i];
        }

        if (!Utf8Validator.IsMatch(new string(byteChars)))
        {
            return null;
        }

        // Being valid UTF-8 proves little: pure ASCII is valid too. UTF-8 is only reported when
        // at least one "suspicious" sequence per 500,000 bytes is present and at least 80% of the
        // remaining bytes are plain US-ASCII (i.e. the content really looks like western text).
        long nonSuspiciousBytes = SampleBytes.Length - suspiciousUTF8BytesTotal;
        bool hasSuspiciousSequences = suspiciousUTF8SequenceCount * 500000.0 / SampleBytes.Length >= 1;
        bool mostlyAscii = nonSuspiciousBytes == 0
            || likelyUSASCIIBytesInSample * 1.0 / nonSuspiciousBytes >= 0.8;

        return (hasSuspiciousSequences && mostlyAscii) ? Encoding.UTF8 : null;
    }

    /// <summary>
    /// Returns the length of the prefix of <paramref name="bytes"/> that remains after dropping
    /// a trailing UTF-8 multi-byte sequence that is incomplete; the full length otherwise.
    /// </summary>
    private static int GetLengthWithoutTruncatedUtf8Tail(byte[] bytes)
    {
        // Walk back over at most three continuation bytes (10xxxxxx) to reach the lead byte.
        int leadIndex = bytes.Length - 1;
        int continuationBytes = 0;
        while (leadIndex >= 0 && continuationBytes < 3 && (bytes[leadIndex] & 0xC0) == 0x80)
        {
            leadIndex--;
            continuationBytes++;
        }

        if (leadIndex < 0)
        {
            return bytes.Length;
        }

        byte lead = bytes[leadIndex];
        int expectedLength = lead >= 0xF0 ? 4 : lead >= 0xE0 ? 3 : lead >= 0xC0 ? 2 : 1;

        return (bytes.Length - leadIndex) < expectedLength ? leadIndex : bytes.Length;
    }

    /// <summary>
    /// Tells whether the byte is tab, line feed, carriage return or a printable US-ASCII character.
    /// </summary>
    private static bool IsCommonUSASCIIByte(byte testByte)
    {
        return testByte == 0x09      // tab
            || testByte == 0x0A      // lf
            || testByte == 0x0D      // cr
            || (testByte >= 0x20 && testByte <= 0x7E);   // punctuation, digits and letters
    }

    /// <summary>
    /// Checks whether a UTF-8 sequence encoding a character from the upper range of Windows-1252
    /// starts at the given position.
    /// </summary>
    /// <param name="SampleBytes">Bytes being analysed.</param>
    /// <param name="currentPos">Index of the candidate lead byte.</param>
    /// <returns>The length (2 or 3) of the recognised sequence, or 0 when there is none.</returns>
    private static int DetectSuspiciousUTF8SequenceLength(byte[] SampleBytes, long currentPos)
    {
        // Every recognised sequence needs at least two bytes; never read past the end of the sample.
        long available = SampleBytes.Length - currentPos;
        if (available < 2)
        {
            return 0;
        }

        byte lead = SampleBytes[currentPos];
        byte second = SampleBytes[currentPos + 1];
        bool found;

        switch (lead)
        {
            case 0xC2:
                found = second == 0x81 || second == 0x8D || second == 0x8F
                    || second == 0x90 || second == 0x9D
                    || (second >= 0xA0 && second <= 0xBF);
                return found ? 2 : 0;

            case 0xC3:
                return (second >= 0x80 && second <= 0xBF) ? 2 : 0;

            case 0xC5:
                found = second == 0x92 || second == 0x93
                    || second == 0xA0 || second == 0xA1
                    || second == 0xB8 || second == 0xBD || second == 0xBE;
                return found ? 2 : 0;

            case 0xC6:
                return second == 0x92 ? 2 : 0;

            case 0xCB:
                return (second == 0x86 || second == 0x9C) ? 2 : 0;

            case 0xE2:
                // Three-byte sequences: the third byte must be inside the sample as well.
                if (available < 3)
                {
                    return 0;
                }

                byte third = SampleBytes[currentPos + 2];
                if (second == 0x80)
                {
                    found = third == 0x93 || third == 0x94
                        || third == 0x98 || third == 0x99 || third == 0x9A
                        || third == 0x9C || third == 0x9D || third == 0x9E
                        || third == 0xA0 || third == 0xA1 || third == 0xA2
                        || third == 0xA6
                        || third == 0xB0
                        || third == 0xB9 || third == 0xBA;
                }
                else
                {
                    found = (second == 0x82 && third == 0xAC)
                        || (second == 0x84 && third == 0xA2);
                }
                return found ? 3 : 0;

            default:
                return 0;
        }
    }

    #endregion
}