如何在 Java 中从文件读取大于 0xFFFF 的 Unicode 代码点

How to read unicode codepoints greater than 0xFFFF from file in Java

我正在为编译器编写词法分析器,我想知道如何读取包含大于 0xFFFF 的 unicode 代码点的 UTF-8 文件。 char 数据类型仅支持两个字节,那么如何从文件中读取 int 代码点?

我最近也不得不这样做;下面是我使用的代码。这是一个 Spliterator.OfInt 实现,可以用它把 Reader 的输入包装成代码点的 IntStream,也可以直接使用它(如果这样更方便的话),或者只把 nextCP 方法中的逻辑提取出来使用。

package org.raevnos.util.iterator;

import java.util.Objects;
import java.util.Spliterator;
import java.util.function.IntConsumer;
import java.io.Reader;
import java.io.Closeable;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.charset.CharacterCodingException;

/**
 * A {@code Spliterator.OfInt} used to iterate over Unicode code points read
 * from a {@link Reader}.
 *
 * <p>The reader delivers UTF-16 {@code char} values; a high surrogate followed
 * by a low surrogate is combined into a single supplementary code point
 * (greater than {@code 0xFFFF}). Every other {@code char} value, including an
 * unpaired low surrogate, is reported unchanged.
 *
 * <p>The spliterator is sequential only ({@link #trySplit()} always returns
 * {@code null}) and its size is unknown. Closing it closes the wrapped reader.
 */
public class CPSpliterator
    implements Spliterator.OfInt, Closeable {
    private final Reader input;

    /**
     * Create a new spliterator.
     * @param input The {@code Reader} to get codepoints from.
     * @throws NullPointerException if {@code input} is {@code null}.
     */
    public CPSpliterator(Reader input) {
        this.input = Objects.requireNonNull(input, "input");
    }

    /**
     * Fetch the next codepoint from the underlying stream, accounting for
     * surrogate pairs.
     *
     * <p>When a high surrogate is not followed by a low surrogate, the
     * following char (if any) has already been consumed from the reader by
     * the time the exception is thrown.
     *
     * @return a codepoint, or -1 on end of file.
     * @throws UncheckedIOException on input errors, including a high
     *         surrogate that is not followed by a low surrogate (the cause is
     *         then a {@link CharacterCodingException}).
     */
    private int nextCP() {
        try {
            int first = input.read();
            if (first == -1) {
                return -1;
            }
            if (!Character.isHighSurrogate((char) first)) {
                // BMP character (or an unpaired low surrogate): pass through.
                return first;
            }
            int second = input.read();
            if (second == -1 || !Character.isLowSurrogate((char) second)) {
                // Hopefully shouldn't happen; caught by Reader first.
                throw new CharacterCodingException();
            }
            return Character.toCodePoint((char) first, (char) second);
        } catch (IOException e) {
            // CharacterCodingException is an IOException, so it is wrapped too.
            throw new UncheckedIOException(e);
        }
    }

    /** Reports {@code ORDERED | NONNULL}; the source is neither sized nor splittable. */
    @Override
    public int characteristics() { return ORDERED | NONNULL; }

    /** Returns {@code Long.MAX_VALUE}, the value used for an unknown size. */
    @Override
    public long estimateSize() { return Long.MAX_VALUE; }

    /**
     * Pass every remaining codepoint to {@code f}, in order, until end of input.
     * @param f The consumer receiving each codepoint.
     * @throws NullPointerException if {@code f} is {@code null}.
     * @throws UncheckedIOException on input errors.
     */
    @Override
    public void forEachRemaining(IntConsumer f) {
        // Spliterator contract: a null action is rejected even when no
        // elements remain.
        Objects.requireNonNull(f, "f");
        int cp;
        while ((cp = nextCP()) != -1) {
            f.accept(cp);
        }
    }

    /**
     * Pass the next codepoint, if any, to {@code f}.
     * @param f The consumer receiving the codepoint.
     * @return {@code true} if a codepoint was consumed, {@code false} on end
     *         of input.
     * @throws NullPointerException if {@code f} is {@code null}.
     * @throws UncheckedIOException on input errors.
     */
    @Override
    public boolean tryAdvance(IntConsumer f) {
        // Validate before reading so a null action never consumes input.
        Objects.requireNonNull(f, "f");
        int cp = nextCP();
        if (cp == -1) {
            return false;
        }
        f.accept(cp);
        return true;
    }

    /** Always returns {@code null}: this spliterator cannot be split. */
    @Override
    public Spliterator.OfInt trySplit() { return null; }

    /**
     * Close the underlying {@code Reader}.
     * @throws IOException if closing the reader fails.
     */
    @Override
    public void close() throws IOException { input.close(); }
}

用法示例:

try (CPSpliterator sp = new CPSpliterator(Files.newBufferedReader(Path.of(whereEver)))) {
    IntStream codepoints = StreamSupport.intStream(sp, false);
    // do something with the stream
}

try (CPSpliterator sp = new CPSpliterator(Files.newBufferedReader(Path.of(whereEver)))) {
    // The explicit int parameter type selects forEachRemaining(IntConsumer)
    // and keeps the call unambiguous with the inherited
    // forEachRemaining(Consumer<? super Integer>) overload.
    sp.forEachRemaining((int cp) -> doSomething(cp));
}

等等

您也可以使用 Files.readString() 将整个文件读入一个字符串,然后对其使用 String#codePoints 或其他代码点方法;但如果内存占用很重要,上面的类效率更高,因为它一次只读取一个字符。或者也可以一次读取一行,再将该行转换为代码点。