如何从 Java 中的文件中读取大于 0xFFFF 的 unicode 代码点
How to read unicode codepoints greater than 0xFFFF from file in Java
我正在为编译器编写词法分析器,我想知道如何读取包含大于 0xFFFF 的 unicode 代码点的 UTF-8 文件。 char
数据类型仅支持两个字节,那么如何从文件中读取 int
代码点?
我最近不得不这样做;这是我使用的代码。这是一个 Spliterator.OfInt
实现,可用于根据 Reader
的输入创建代码点的 IntStream
,或者直接使用(如果更容易的话)。或者只是从 nextCP
方法中提取逻辑。
package org.raevnos.util.iterator;
import java.util.Objects;
import java.util.Spliterator;
import java.util.function.IntConsumer;
import java.io.Reader;
import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.CharacterCodingException;
/**
 * A {@code Spliterator.OfInt} that iterates over the Unicode code points read
 * from a {@link Reader}.
 *
 * <p>A {@code Reader} produces UTF-16 {@code char} values, so a code point
 * above {@code 0xFFFF} arrives as a high surrogate followed by a low
 * surrogate. This class joins each such pair into a single {@code int}
 * code point.
 *
 * <p>Closing the spliterator closes the underlying reader.
 */
public class CPSpliterator
    implements Spliterator.OfInt, Closeable {
  private final Reader input;

  /**
   * Create a new spliterator.
   *
   * @param input The {@code Reader} to get codepoints from.
   * @throws NullPointerException if {@code input} is {@code null}.
   */
  public CPSpliterator(Reader input) {
    this.input = Objects.requireNonNull(input);
  }

  /**
   * Fetch the next codepoint from the underlying reader, accounting for
   * surrogate pairs.
   *
   * <p>A char that is not a high surrogate (including an unpaired low
   * surrogate) is returned unchanged. A high surrogate that is not followed by
   * a low surrogate is reported as an error; the char read after it has
   * already been consumed at that point.
   *
   * @return a codepoint, or -1 on end of input.
   * @throws UncheckedIOException on input errors, including a high surrogate
   *     without a matching low surrogate (cause is a
   *     {@code CharacterCodingException}).
   */
  private int nextCP() {
    try {
      int firstChar = input.read();
      if (firstChar == -1) {
        return -1;
      }
      if (!Character.isHighSurrogate((char) firstChar)) {
        return firstChar;
      }
      int secondChar = input.read();
      if (secondChar == -1 || !Character.isLowSurrogate((char) secondChar)) {
        // Hopefully shouldn't happen; a decoding Reader is expected to reject
        // malformed input before it gets this far.
        throw new CharacterCodingException();
      }
      return Character.toCodePoint((char) firstChar, (char) secondChar);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  /** Reports {@code ORDERED | NONNULL}. */
  @Override
  public int characteristics() {
    return ORDERED | NONNULL;
  }

  /** Returns {@code Long.MAX_VALUE}: the number of remaining codepoints is unknown. */
  @Override
  public long estimateSize() {
    return Long.MAX_VALUE;
  }

  /**
   * Pass every remaining codepoint to {@code f}, in order, until end of input.
   *
   * @param f receives each codepoint.
   * @throws NullPointerException if {@code f} is {@code null}.
   * @throws UncheckedIOException on input errors.
   */
  @Override
  public void forEachRemaining(IntConsumer f) {
    // Check before reading so a null action never consumes input.
    Objects.requireNonNull(f);
    int cp;
    while ((cp = nextCP()) != -1) {
      f.accept(cp);
    }
  }

  /**
   * Pass the next codepoint, if there is one, to {@code f}.
   *
   * @param f receives the codepoint.
   * @return {@code true} if a codepoint was delivered, {@code false} on end of
   *     input.
   * @throws NullPointerException if {@code f} is {@code null}.
   * @throws UncheckedIOException on input errors.
   */
  @Override
  public boolean tryAdvance(IntConsumer f) {
    // Check before reading so a null action never consumes (and loses) a codepoint.
    Objects.requireNonNull(f);
    int cp = nextCP();
    if (cp == -1) {
      return false;
    }
    f.accept(cp);
    return true;
  }

  /** Always returns {@code null}: this spliterator cannot be split. */
  @Override
  public Spliterator.OfInt trySplit() {
    return null;
  }

  /**
   * Close the underlying reader.
   *
   * @throws IOException if closing the reader fails.
   */
  @Override
  public void close() throws IOException {
    input.close();
  }
}
用法示例:
// Usage example: expose the file's code points as an IntStream.
// Files.newBufferedReader(Path) decodes the file as UTF-8.
// try-with-resources closes sp when the block exits, so the stream must be
// consumed inside the block.
try (CPSpliterator sp = new CPSpliterator(Files.newBufferedReader(Path.of(whereEver)))) {
// Second argument false = sequential (non-parallel) stream.
IntStream codepoints = StreamSupport.intStream(sp, false);
// do something with the stream
}
或
// Usage example: feed every code point to a callback without building a stream.
// try-with-resources closes sp when the block exits.
try (CPSpliterator sp = new CPSpliterator(Files.newBufferedReader(Path.of(whereEver)))) {
  // The explicit int parameter type selects the IntConsumer overload;
  // Spliterator.OfInt also declares forEachRemaining(Consumer<? super Integer>).
  sp.forEachRemaining((int cp) -> doSomething(cp));
}
等等
您也可以使用 Files.readString()
将整个文件读入一个字符串并在其上使用 String#codePoints
或其他代码点方法;不过如果内存效率很重要,上面的类更节省内存,因为它一次只读取一个字符。或者也可以一次读取一行,再将其转换为代码点。
我正在为编译器编写词法分析器,我想知道如何读取包含大于 0xFFFF 的 unicode 代码点的 UTF-8 文件。 char
数据类型仅支持两个字节,那么如何从文件中读取 int
代码点?
我最近不得不这样做;这是我使用的代码。这是一个 Spliterator.OfInt
实现,可用于根据 Reader
的输入创建代码点的 IntStream
,或者直接使用(如果更容易的话)。或者只是从 nextCP
方法中提取逻辑。
package org.raevnos.util.iterator;
import java.util.Objects;
import java.util.Spliterator;
import java.util.function.IntConsumer;
import java.io.Reader;
import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.CharacterCodingException;
/**
 * A {@code Spliterator.OfInt} that iterates over the Unicode code points read
 * from a {@link Reader}.
 *
 * <p>A {@code Reader} produces UTF-16 {@code char} values, so a code point
 * above {@code 0xFFFF} arrives as a high surrogate followed by a low
 * surrogate. This class joins each such pair into a single {@code int}
 * code point.
 *
 * <p>Closing the spliterator closes the underlying reader.
 */
public class CPSpliterator
    implements Spliterator.OfInt, Closeable {
  private final Reader input;

  /**
   * Create a new spliterator.
   *
   * @param input The {@code Reader} to get codepoints from.
   * @throws NullPointerException if {@code input} is {@code null}.
   */
  public CPSpliterator(Reader input) {
    this.input = Objects.requireNonNull(input);
  }

  /**
   * Fetch the next codepoint from the underlying reader, accounting for
   * surrogate pairs.
   *
   * <p>A char that is not a high surrogate (including an unpaired low
   * surrogate) is returned unchanged. A high surrogate that is not followed by
   * a low surrogate is reported as an error; the char read after it has
   * already been consumed at that point.
   *
   * @return a codepoint, or -1 on end of input.
   * @throws UncheckedIOException on input errors, including a high surrogate
   *     without a matching low surrogate (cause is a
   *     {@code CharacterCodingException}).
   */
  private int nextCP() {
    try {
      int firstChar = input.read();
      if (firstChar == -1) {
        return -1;
      }
      if (!Character.isHighSurrogate((char) firstChar)) {
        return firstChar;
      }
      int secondChar = input.read();
      if (secondChar == -1 || !Character.isLowSurrogate((char) secondChar)) {
        // Hopefully shouldn't happen; a decoding Reader is expected to reject
        // malformed input before it gets this far.
        throw new CharacterCodingException();
      }
      return Character.toCodePoint((char) firstChar, (char) secondChar);
    } catch (IOException e) {
      throw new UncheckedIOException(e);
    }
  }

  /** Reports {@code ORDERED | NONNULL}. */
  @Override
  public int characteristics() {
    return ORDERED | NONNULL;
  }

  /** Returns {@code Long.MAX_VALUE}: the number of remaining codepoints is unknown. */
  @Override
  public long estimateSize() {
    return Long.MAX_VALUE;
  }

  /**
   * Pass every remaining codepoint to {@code f}, in order, until end of input.
   *
   * @param f receives each codepoint.
   * @throws NullPointerException if {@code f} is {@code null}.
   * @throws UncheckedIOException on input errors.
   */
  @Override
  public void forEachRemaining(IntConsumer f) {
    // Check before reading so a null action never consumes input.
    Objects.requireNonNull(f);
    int cp;
    while ((cp = nextCP()) != -1) {
      f.accept(cp);
    }
  }

  /**
   * Pass the next codepoint, if there is one, to {@code f}.
   *
   * @param f receives the codepoint.
   * @return {@code true} if a codepoint was delivered, {@code false} on end of
   *     input.
   * @throws NullPointerException if {@code f} is {@code null}.
   * @throws UncheckedIOException on input errors.
   */
  @Override
  public boolean tryAdvance(IntConsumer f) {
    // Check before reading so a null action never consumes (and loses) a codepoint.
    Objects.requireNonNull(f);
    int cp = nextCP();
    if (cp == -1) {
      return false;
    }
    f.accept(cp);
    return true;
  }

  /** Always returns {@code null}: this spliterator cannot be split. */
  @Override
  public Spliterator.OfInt trySplit() {
    return null;
  }

  /**
   * Close the underlying reader.
   *
   * @throws IOException if closing the reader fails.
   */
  @Override
  public void close() throws IOException {
    input.close();
  }
}
用法示例:
// Usage example: expose the file's code points as an IntStream.
// Files.newBufferedReader(Path) decodes the file as UTF-8.
// try-with-resources closes sp when the block exits, so the stream must be
// consumed inside the block.
try (CPSpliterator sp = new CPSpliterator(Files.newBufferedReader(Path.of(whereEver)))) {
// Second argument false = sequential (non-parallel) stream.
IntStream codepoints = StreamSupport.intStream(sp, false);
// do something with the stream
}
或
// Usage example: feed every code point to a callback without building a stream.
// try-with-resources closes sp when the block exits.
try (CPSpliterator sp = new CPSpliterator(Files.newBufferedReader(Path.of(whereEver)))) {
  // The explicit int parameter type selects the IntConsumer overload;
  // Spliterator.OfInt also declares forEachRemaining(Consumer<? super Integer>).
  sp.forEachRemaining((int cp) -> doSomething(cp));
}
等等
您也可以使用 Files.readString()
将整个文件读入一个字符串并在其上使用 String#codePoints
或其他代码点方法;不过如果内存效率很重要,上面的类更节省内存,因为它一次只读取一个字符。或者也可以一次读取一行,再将其转换为代码点。