无法使用 java.nio 将 Shift_JIS 的编码更改为 UTF-8

Unable to change the encoding of a Shift_JIS to UTF-8 using java.nio

我正在尝试读取使用 Shift_JIS 编码的文件,然后将其转换为 UTF-8。当我使用 java.nio CharsetDecoder.decode 时,它会抛出以下错误。我无法查明此问题的真正原因。

java.nio.charset.UnmappableCharacterException: Input length = 2
java.nio.charset.UnmappableCharacterException: Input length = 2
    at java.nio.charset.CoderResult.throwException(CoderResult.java:278)
    at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:798)
    at CharacterSetConversionUtility.getString(CharacterSetConversionUtility.java:23)
    at CharacterSetConversionUtility.convertBetweenEncodings(CharacterSetConversionUtility.java:39)
    at CharacterSetConversionUtility.main(CharacterSetConversionUtility.java:94)

下面是代码片段

import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.charset.CharsetDecoder;

import org.mozilla.universalchardet.UniversalDetector;  
 public class CharacterSetConversionUtility
 {
    /**
     * Decodes raw bytes into a {@code String} using the named charset.
     *
     * <p>A fresh decoder is used with its default settings, so input that is
     * malformed or unmappable in the given charset is reported as an exception
     * rather than being replaced.
     *
     * @param charSet name of the charset the bytes are encoded in
     * @param bytes   the encoded bytes to decode
     * @return the decoded text
     * @throws CharacterCodingException if the bytes cannot be decoded with the charset
     */
    public static String getString(String charSet, byte[] bytes) throws CharacterCodingException {
        return Charset.forName(charSet)
                .newDecoder()
                .decode(ByteBuffer.wrap(bytes))
                .toString();
    }

/**
 * Encodes a string into bytes using the named charset.
 *
 * @param charSet name of the target charset
 * @param input   the text to encode
 * @return exactly the encoded bytes, with no trailing padding
 * @throws CharacterCodingException if the text cannot be represented in the charset
 */
public static byte[] convertToEncoding(String charSet, String input) throws CharacterCodingException
{
    CharsetEncoder encoder = Charset.forName(charSet).newEncoder();
    ByteBuffer encoded = encoder.encode(CharBuffer.wrap(input));
    // The buffer returned by encode() may have spare capacity beyond its limit,
    // so array() would append stray zero bytes to the output. Copy only the
    // bytes that were actually produced.
    byte[] result = new byte[encoded.remaining()];
    encoded.get(result);
    return result;
}

/**
 * Re-encodes bytes from one charset to another by decoding them to a
 * {@code String} and encoding that string again.
 *
 * @param originalBytes bytes encoded in {@code sourceCharSet}
 * @param sourceCharSet name of the charset the input is encoded in
 * @param destCharSet   name of the charset to produce
 * @return the text re-encoded in {@code destCharSet}
 * @throws CharacterCodingException if decoding or encoding fails
 */
public static byte[] convertBetweenEncodings(byte[] originalBytes, String sourceCharSet, String destCharSet)
        throws CharacterCodingException
{
    return convertToEncoding(destCharSet, getString(sourceCharSet, originalBytes));
}

/**
 * Utility method that detects the character encoding of a file by feeding its
 * bytes to a {@code UniversalDetector}.
 *
 * <p>The outcome is also printed to standard output. I/O failures are printed
 * to standard error and reported to the caller as a {@code null} result.
 *
 * @param fileName path of the file to inspect
 * @return the charset name reported by the detector, or {@code null} when no
 *         encoding was detected or the file could not be read
 */
public static String getCharacterEncoding(String fileName){
    byte[] buf = new byte[4096];
    String encoding = null;
    // try-with-resources guarantees the stream is closed on every path.
    try (java.io.FileInputStream fis = new java.io.FileInputStream(fileName)) {

        // (1) create the detector
        UniversalDetector detector = new UniversalDetector(null);

        // (2) feed data until the detector is done or the file ends;
        //     isDone() is checked first so no chunk is read only to be discarded
        int nread;
        while (!detector.isDone() && (nread = fis.read(buf)) > 0) {
          detector.handleData(buf, 0, nread);
        }
        // (3) signal end of input
        detector.dataEnd();

        // (4) fetch the result
        encoding = detector.getDetectedCharset();
        if (encoding != null) {
          System.out.println("Detected encoding = " + encoding);
        } else {
          System.out.println("No encoding detected.");
        }

        // (5) reset the detector
        detector.reset();

    } catch (IOException e) {
        // Also covers FileNotFoundException, which is a subclass of IOException.
        e.printStackTrace();
    }
    return encoding;
}


/**
 * Reads the sample input file, detects its encoding, converts its content to
 * UTF-8 and writes the result to the output file.
 *
 * <p>I/O and character-coding failures are printed to standard error.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String inputFile = "E:/Encoding Issue/SJISFile";
    Path path = Paths.get(inputFile);
    try {
        byte[] inputdata = Files.readAllBytes(path);
        // Detect the character encoding of the input data
        String inputCharEncoding = getCharacterEncoding(inputFile);
        if (inputCharEncoding == null) {
            // Without this guard a null name reaches Charset.forName and
            // fails with an uncaught IllegalArgumentException.
            System.err.println("Could not detect the encoding of " + inputFile + "; nothing was converted.");
            return;
        }
        // Perform a character set conversion
        byte[] outputdata = convertBetweenEncodings(inputdata, inputCharEncoding, "UTF-8");
        // try-with-resources closes the output file even if write() throws.
        try (FileOutputStream fos = new FileOutputStream("E:/Encoding Issue/convertedutf8.txt")) {
            fos.write(outputdata);
        }
    } catch (IOException e) {
        // CharacterCodingException is an IOException, so conversion errors land here too.
        e.printStackTrace();
    }
}

因此,对于您遇到的失败原因,我没有明确的答案,尽管我怀疑问题出在 String/char[]/byte[] 之间的转换上。话虽如此,我想为手头的问题提供一个更简单、更紧凑的可行解决方案:它改用 String.getBytes(charsetName) 方法,即利用 String 类本身提供的转换功能,而不是使用编码器/解码器。这适用于 Shift_JIS 字符集或任何其他字符集。此外,UniversalDetector 的使用没有任何问题,但为了简单起见,我省略了它,而是对源字符集进行了硬编码。最后,这个版本兼容 Java SE 1.6。

希望对您有所帮助:)


import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.CharBuffer;
import java.nio.charset.Charset;

/**
 * Copies a Shift_JIS encoded text file to a new file encoded as UTF-8.
 *
 * <p>The source is read through a charset-aware reader in fixed-size chunks;
 * each chunk is turned into a {@code String} and written out as UTF-8 bytes.
 */
public class JapaneseCharsetTest {

    /** Size, in chars, of the buffer used for each read from the source file. */
    public static final int CHAR_LENGTH_TO_READ = 8192;

    public static void main(String[] args) {
        new JapaneseCharsetTest().doIt();
    }

    /**
     * Runs the conversion on the hard-coded source and target paths.
     * I/O failures are printed to standard error; both files are closed
     * afterwards regardless of the outcome.
     */
    public void doIt() {
        InputStreamReader source = null;
        FileOutputStream target = null;
        try {
            FileInputStream rawInput = new FileInputStream(new File("C:/Path/To/My/ShiftJISFile.txt"));
            source = new InputStreamReader(rawInput, Charset.forName("Shift_JIS"));
            target = new FileOutputStream("C:/Path/To/My/UTF8TargetFile.txt");

            char[] chunk = new char[CHAR_LENGTH_TO_READ];
            for (int count = source.read(chunk); count != -1; count = source.read(chunk)) {
                convert(count, chunk, target);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Best-effort close: a failure here leaves nothing further to do.
            if (source != null) {
                try {
                    source.close();
                } catch (IOException ignored) {
                }
            }
            if (target != null) {
                try {
                    target.close();
                } catch (IOException ignored) {
                }
            }
        }
    }

    /**
     * Writes the first {@code len} chars of {@code inputData} to {@code fos}
     * as UTF-8 bytes.
     *
     * @param len       number of valid chars at the start of {@code inputData}
     * @param inputData buffer holding the chars that were just read
     * @param fos       destination stream
     * @throws IOException if writing fails
     */
    private void convert(int len, char[] inputData, FileOutputStream fos) throws IOException {
        // Only the first len chars are valid; the rest of the buffer may hold
        // leftovers from an earlier, longer read.
        String text = new String(inputData, 0, len);
        fos.write(text.getBytes("UTF-8"));
    }
}