无法使用 java.nio 将 Shift_JIS 的编码更改为 UTF-8
Unable to change the encoding of a Shift_JIS to UTF-8 using java.nio
我正在尝试读取使用 Shift_JIS 编码的文件,然后将其转换为 UTF-8。当我使用 java.nio CharsetDecoder.decode 时,它会抛出以下错误。我无法查明此问题的真正原因。
java.nio.charset.UnmappableCharacterException: Input length = 2
java.nio.charset.UnmappableCharacterException: Input length = 2
at java.nio.charset.CoderResult.throwException(CoderResult.java:278)
at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:798)
at CharacterSetConversionUtility.getString(CharacterSetConversionUtility.java:23)
at CharacterSetConversionUtility.convertBetweenEncodings(CharacterSetConversionUtility.java:39)
at CharacterSetConversionUtility.main(CharacterSetConversionUtility.java:94)
下面是代码片段
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.charset.CharsetDecoder;
import org.mozilla.universalchardet.UniversalDetector;
public class CharacterSetConversionUtility
{
/**
 * Decodes raw bytes into a {@code String} using the named charset.
 *
 * @param charSet name of the charset the bytes are encoded in
 * @param bytes   the encoded input
 * @return the decoded text
 * @throws CharacterCodingException if the decoder reports an error for the input
 */
public static String getString(String charSet, byte[] bytes) throws CharacterCodingException
{
    // Wrap the bytes before resolving the charset so that bad arguments fail
    // in the same order as they always have.
    final ByteBuffer encoded = ByteBuffer.wrap(bytes);
    return Charset.forName(charSet).newDecoder().decode(encoded).toString();
}
/**
 * Encodes text into bytes using the named charset.
 *
 * @param charSet name of the target charset
 * @param input   the text to encode
 * @return exactly the encoded bytes, with no padding
 * @throws CharacterCodingException if the encoder reports an error for the input
 */
public static byte[] convertToEncoding(String charSet, String input) throws CharacterCodingException
{
    CharBuffer buffer = CharBuffer.wrap(input);
    Charset charset = Charset.forName(charSet);
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer output = encoder.encode(buffer);
    // The buffer returned by encode() is usually backed by an array larger than
    // the encoded data, so array() would append stray zero bytes to the result.
    // Copy only the bytes between position and limit.
    byte[] encoded = new byte[output.remaining()];
    output.get(encoded);
    return encoded;
}
/**
 * Re-encodes bytes from one charset into another by decoding them to a
 * {@code String} and encoding that string again.
 *
 * @param originalBytes bytes encoded in {@code sourceCharSet}
 * @param sourceCharSet name of the charset of {@code originalBytes}
 * @param destCharSet   name of the charset to produce
 * @return the bytes produced by encoding the decoded text in {@code destCharSet}
 * @throws CharacterCodingException if decoding or encoding reports an error
 */
public static byte[] convertBetweenEncodings(byte[] originalBytes, String sourceCharSet, String destCharSet)
        throws CharacterCodingException
{
    return convertToEncoding(destCharSet, getString(sourceCharSet, originalBytes));
}
/**
 * Detects the character encoding of a file by feeding its bytes to a
 * {@code UniversalDetector}.
 *
 * <p>I/O failures are printed to standard error and are not rethrown.
 *
 * @param fileName path of the file to inspect
 * @return the charset name reported by the detector, or {@code null} if the
 *         detector reported none or the file could not be read
 */
public static String getCharacterEncoding(String fileName){
    byte[] buf = new byte[4096];
    String encoding = null;
    // try-with-resources closes the stream on every path; it was previously never closed.
    try (java.io.FileInputStream fis = new java.io.FileInputStream(fileName)) {
        UniversalDetector detector = new UniversalDetector(null);
        // Feed the detector until end of file or until it reports it is done.
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        // Signal end of input before asking for the result.
        detector.dataEnd();
        encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }
        detector.reset();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return encoding;
}
/**
 * Reads the sample file, detects its encoding, converts the content to UTF-8
 * and writes the result to a second file.
 *
 * <p>I/O and coding errors are printed to standard error.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String inputFile = "E:/Encoding Issue/SJISFile";
    Path path = Paths.get(inputFile);
    try {
        byte[] inputdata = Files.readAllBytes(path);
        //Detect the character encoding of the input data
        String inputCharEncoding = getCharacterEncoding(inputFile);
        if (inputCharEncoding == null) {
            // Without a charset name Charset.forName would fail with an
            // unchecked exception; stop with a clear message instead.
            System.err.println("Cannot convert " + inputFile + ": source encoding is unknown.");
            return;
        }
        //Perform a character set conversion
        byte[] outputdata = convertBetweenEncodings(inputdata, inputCharEncoding, "UTF-8");
        // try-with-resources closes the output file even when write() fails.
        try (FileOutputStream fos = new FileOutputStream("E:/Encoding Issue/convertedutf8.txt")) {
            fos.write(outputdata);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
因此,对于您遇到的失败原因,我没有明确的答案,尽管我怀疑问题出在 String、char[] 和 byte[] 之间的转换上。
也就是说,我想为手头的问题提供一个更简单、更紧凑的可行方案:它改用 String 类本身提供的转换功能(即 String.getBytes),而不是使用编码器/解码器。
这适用于 Shift_JIS 字符集以及其他任何字符集。此外,使用 UniversalDetector 本身并没有问题,但为了简单起见,我省略了它,而是直接对源字符集进行了硬编码。最后,这个版本兼容 Java SE 1.6。
希望对您有所帮助:)
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
/**
 * Converts a Shift_JIS encoded file to UTF-8 using the charset support of
 * {@code InputStreamReader} and {@code String}. Uses only APIs available in
 * Java SE 6.
 */
public class JapaneseCharsetTest {
    /** Number of chars requested from the reader per read call. */
    public static final int CHAR_LENGTH_TO_READ = 8192;

    public static void main(String[] args) {
        JapaneseCharsetTest test = new JapaneseCharsetTest();
        test.doIt();
    }

    /**
     * Converts the hard-coded Shift_JIS source file into the hard-coded UTF-8
     * target file. I/O errors are printed to standard error.
     */
    public void doIt() {
        // Resolve the charset before opening any file so that a failed lookup
        // cannot leave an open stream behind.
        Charset sourceCharset = Charset.forName("Shift_JIS");
        FileInputStream in = null;
        FileOutputStream out = null;
        try {
            in = new FileInputStream(new File("C:/Path/To/My/ShiftJISFile.txt"));
            out = new FileOutputStream("C:/Path/To/My/UTF8TargetFile.txt");
            transcode(in, sourceCharset, out);
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            // Best-effort close; a failure here is deliberately ignored.
            try {if (in != null) in.close();} catch (IOException ignored) {}
            try {if (out != null) out.close();} catch (IOException ignored) {}
        }
    }

    /**
     * Copies {@code in} to {@code out}, decoding with {@code sourceCharset} and
     * encoding as UTF-8. Neither stream is closed by this method.
     *
     * @param in            stream of bytes encoded in {@code sourceCharset}
     * @param sourceCharset charset of the input
     * @param out           destination for the UTF-8 bytes
     * @throws IOException if reading or writing fails
     */
    public static void transcode(java.io.InputStream in, Charset sourceCharset, java.io.OutputStream out)
            throws IOException {
        InputStreamReader reader = new InputStreamReader(in, sourceCharset);
        char[] chunk = new char[CHAR_LENGTH_TO_READ];
        int len;
        while ((len = reader.read(chunk)) != -1) {
            // A read may fill only part of the buffer, so convert just the
            // first len chars of it.
            out.write(new String(chunk, 0, len).getBytes("UTF-8"));
        }
    }
}
我正在尝试读取使用 Shift_JIS 编码的文件,然后将其转换为 UTF-8。当我使用 java.nio CharsetDecoder.decode 时,它会抛出以下错误。我无法查明此问题的真正原因。
java.nio.charset.UnmappableCharacterException: Input length = 2
java.nio.charset.UnmappableCharacterException: Input length = 2
at java.nio.charset.CoderResult.throwException(CoderResult.java:278)
at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:798)
at CharacterSetConversionUtility.getString(CharacterSetConversionUtility.java:23)
at CharacterSetConversionUtility.convertBetweenEncodings(CharacterSetConversionUtility.java:39)
at CharacterSetConversionUtility.main(CharacterSetConversionUtility.java:94)
下面是代码片段
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetEncoder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.charset.CharsetDecoder;
import org.mozilla.universalchardet.UniversalDetector;
public class CharacterSetConversionUtility
{
/**
 * Decodes raw bytes into a {@code String} using the named charset.
 *
 * @param charSet name of the charset the bytes are encoded in
 * @param bytes   the encoded input
 * @return the decoded text
 * @throws CharacterCodingException if the decoder reports an error for the input
 */
public static String getString(String charSet, byte[] bytes) throws CharacterCodingException
{
    // Wrap the bytes before resolving the charset so that bad arguments fail
    // in the same order as they always have.
    final ByteBuffer encoded = ByteBuffer.wrap(bytes);
    return Charset.forName(charSet).newDecoder().decode(encoded).toString();
}
/**
 * Encodes text into bytes using the named charset.
 *
 * @param charSet name of the target charset
 * @param input   the text to encode
 * @return exactly the encoded bytes, with no padding
 * @throws CharacterCodingException if the encoder reports an error for the input
 */
public static byte[] convertToEncoding(String charSet, String input) throws CharacterCodingException
{
    CharBuffer buffer = CharBuffer.wrap(input);
    Charset charset = Charset.forName(charSet);
    CharsetEncoder encoder = charset.newEncoder();
    ByteBuffer output = encoder.encode(buffer);
    // The buffer returned by encode() is usually backed by an array larger than
    // the encoded data, so array() would append stray zero bytes to the result.
    // Copy only the bytes between position and limit.
    byte[] encoded = new byte[output.remaining()];
    output.get(encoded);
    return encoded;
}
/**
 * Re-encodes bytes from one charset into another by decoding them to a
 * {@code String} and encoding that string again.
 *
 * @param originalBytes bytes encoded in {@code sourceCharSet}
 * @param sourceCharSet name of the charset of {@code originalBytes}
 * @param destCharSet   name of the charset to produce
 * @return the bytes produced by encoding the decoded text in {@code destCharSet}
 * @throws CharacterCodingException if decoding or encoding reports an error
 */
public static byte[] convertBetweenEncodings(byte[] originalBytes, String sourceCharSet, String destCharSet)
        throws CharacterCodingException
{
    return convertToEncoding(destCharSet, getString(sourceCharSet, originalBytes));
}
/**
 * Detects the character encoding of a file by feeding its bytes to a
 * {@code UniversalDetector}.
 *
 * <p>I/O failures are printed to standard error and are not rethrown.
 *
 * @param fileName path of the file to inspect
 * @return the charset name reported by the detector, or {@code null} if the
 *         detector reported none or the file could not be read
 */
public static String getCharacterEncoding(String fileName){
    byte[] buf = new byte[4096];
    String encoding = null;
    // try-with-resources closes the stream on every path; it was previously never closed.
    try (java.io.FileInputStream fis = new java.io.FileInputStream(fileName)) {
        UniversalDetector detector = new UniversalDetector(null);
        // Feed the detector until end of file or until it reports it is done.
        int nread;
        while ((nread = fis.read(buf)) > 0 && !detector.isDone()) {
            detector.handleData(buf, 0, nread);
        }
        // Signal end of input before asking for the result.
        detector.dataEnd();
        encoding = detector.getDetectedCharset();
        if (encoding != null) {
            System.out.println("Detected encoding = " + encoding);
        } else {
            System.out.println("No encoding detected.");
        }
        detector.reset();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return encoding;
}
/**
 * Reads the sample file, detects its encoding, converts the content to UTF-8
 * and writes the result to a second file.
 *
 * <p>I/O and coding errors are printed to standard error.
 *
 * @param args unused
 */
public static void main(String[] args) {
    String inputFile = "E:/Encoding Issue/SJISFile";
    Path path = Paths.get(inputFile);
    try {
        byte[] inputdata = Files.readAllBytes(path);
        //Detect the character encoding of the input data
        String inputCharEncoding = getCharacterEncoding(inputFile);
        if (inputCharEncoding == null) {
            // Without a charset name Charset.forName would fail with an
            // unchecked exception; stop with a clear message instead.
            System.err.println("Cannot convert " + inputFile + ": source encoding is unknown.");
            return;
        }
        //Perform a character set conversion
        byte[] outputdata = convertBetweenEncodings(inputdata, inputCharEncoding, "UTF-8");
        // try-with-resources closes the output file even when write() fails.
        try (FileOutputStream fos = new FileOutputStream("E:/Encoding Issue/convertedutf8.txt")) {
            fos.write(outputdata);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
因此,对于您遇到的失败原因,我没有明确的答案,尽管我怀疑问题出在 String、char[] 和 byte[] 之间的转换上。
也就是说,我想为手头的问题提供一个更简单、更紧凑的可行方案:它改用 String 类本身提供的转换功能(即 String.getBytes),而不是使用编码器/解码器。
这适用于 Shift_JIS 字符集以及其他任何字符集。此外,使用 UniversalDetector 本身并没有问题,但为了简单起见,我省略了它,而是直接对源字符集进行了硬编码。最后,这个版本兼容 Java SE 1.6。
希望对您有所帮助:)
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
/**
 * Converts a Shift_JIS encoded file to UTF-8 using the charset support of
 * {@code InputStreamReader} and {@code String}. Uses only APIs available in
 * Java SE 6.
 */
public class JapaneseCharsetTest {
    /** Number of chars requested from the reader per read call. */
    public static final int CHAR_LENGTH_TO_READ = 8192;

    public static void main(String[] args) {
        JapaneseCharsetTest test = new JapaneseCharsetTest();
        test.doIt();
    }

    /**
     * Converts the hard-coded Shift_JIS source file into the hard-coded UTF-8
     * target file. I/O errors are printed to standard error.
     */
    public void doIt() {
        // Resolve the charset before opening any file so that a failed lookup
        // cannot leave an open stream behind.
        Charset sourceCharset = Charset.forName("Shift_JIS");
        FileInputStream in = null;
        FileOutputStream out = null;
        try {
            in = new FileInputStream(new File("C:/Path/To/My/ShiftJISFile.txt"));
            out = new FileOutputStream("C:/Path/To/My/UTF8TargetFile.txt");
            transcode(in, sourceCharset, out);
        }
        catch (IOException e) {
            e.printStackTrace();
        }
        finally {
            // Best-effort close; a failure here is deliberately ignored.
            try {if (in != null) in.close();} catch (IOException ignored) {}
            try {if (out != null) out.close();} catch (IOException ignored) {}
        }
    }

    /**
     * Copies {@code in} to {@code out}, decoding with {@code sourceCharset} and
     * encoding as UTF-8. Neither stream is closed by this method.
     *
     * @param in            stream of bytes encoded in {@code sourceCharset}
     * @param sourceCharset charset of the input
     * @param out           destination for the UTF-8 bytes
     * @throws IOException if reading or writing fails
     */
    public static void transcode(java.io.InputStream in, Charset sourceCharset, java.io.OutputStream out)
            throws IOException {
        InputStreamReader reader = new InputStreamReader(in, sourceCharset);
        char[] chunk = new char[CHAR_LENGTH_TO_READ];
        int len;
        while ((len = reader.read(chunk)) != -1) {
            // A read may fill only part of the buffer, so convert just the
            // first len chars of it.
            out.write(new String(chunk, 0, len).getBytes("UTF-8"));
        }
    }
}