在 java 中获取 HTML 部分

Fetch HTML part in java

我不知道如何只下载 HTML 页面的一部分。我尝试了传统方式,即通过 URL::openStream 方法配合 BufferedReader 读取,但我不确定这种方式是否会导致下载整个页面。问题是:我有一个相当大的 HTML 页面,需要从中解析出 2 个数字,而这两个数字至少每秒更新一次。上述方式只能每 2-3 秒检测到一次变化,我想知道是否有办法让它更快。所以我想,只获取页面的一部分也许会有帮助。

我认为您应该先弄清楚页面本身是如何获取这些数据的(例如 SSE 或 WebSocket),并尝试直接订阅该服务。如果无法做到,请尝试使用更高效的 XML 解析器。我推荐 https://vtd-xml.sourceforge.io/ ,它比 JDK 自带的 DOM 解析器快 10 倍。

还要小心 BufferedReader.readLine():它会为您并不真正需要的字符串分配内存,这是一种隐藏成本(这属于相当高级的话题,因为您必须考虑 CPU 内存带宽、L1 缓存未命中等因素)。

使用我提到的库的示例:

// Load the whole page into memory. readAllBytesFromTheURL() is a placeholder
// that the reader has to supply.
byte[] pageInBytes = readAllBytesFromTheURL();
VTDGen vg = new VTDGen();
vg.setDoc(pageInBytes);
// false: parse without namespace awareness (per the VTD-XML API docs).
vg.parse(false);
VTDNav vn = vg.getNav();

AutoPilot ap = new AutoPilot(vn);

// Jump to the section that we want to process.
// selectXPath() only compiles the expression; evalXPath() is what actually
// moves the VTDNav cursor to the first matching node (-1 means no match).
ap.selectXPath("/html/body/div");
String fileId = null;
if (ap.evalXPath() != -1) {
    // getElementFragment() packs the fragment into one long:
    // lower 32 bits = offset, upper 32 bits = length.
    long fragment = vn.getElementFragment();
    int offset = (int) fragment;
    int length = (int) (fragment >>> 32);
    fileId = vn.toString(offset, length);
}

我写了一个辅助类来读取 URL 的内容;元素的解析在另一个类中完成。

/**
 * Reads the content behind a URL lazily, either character by character or
 * line by line, so a caller can stop as soon as it has found what it needs.
 *
 * <p>The content is decoded as UTF-8. Both iterators return {@code null} from
 * {@code next()} once the content is exhausted (or could not be read) instead
 * of throwing, so callers may loop on {@code next() != null}.
 */
public class HTMLReaderHelper {

    /** Value of the look-ahead slot once the end of the stream was reached. */
    private static final int END_OF_STREAM = -1;

    /** Value of the look-ahead slot while no character is buffered. */
    private static final int NOTHING_BUFFERED = -2;

    private final URL currentURL;

    /**
     * @param url location of the page to read
     */
    HTMLReaderHelper(URL url) {
        currentURL = url;
    }

    /**
     * Opens the URL and returns an iterator over its characters.
     *
     * @return a new iterator, or {@code null} if the URL could not be opened
     */
    public CharIterator charIterator() {
        try {
            return new CharIterator();
        } catch (IOException ex) {
            return null;
        }
    }

    /**
     * Opens the URL and returns an iterator over its non-empty lines.
     *
     * @return a new iterator; it is simply empty if the URL could not be opened
     */
    public StringIterator stringIterator() {
        return new StringIterator();
    }

    /** Tells whether {@code c} is one of the line terminators CR or LF. */
    private static boolean isLineBreak(char c) {
        return c == '\n' || c == '\r';
    }

    /**
     * Iterates over the characters of the URL content.
     *
     * <p>The underlying stream is closed automatically at end of stream or on
     * a read error; call {@link #close()} when abandoning the iterator early.
     */
    class CharIterator implements java.util.Iterator<Character>, AutoCloseable {

        /** Buffered, UTF-8 decoding view of the URL stream. */
        private final java.io.Reader reader;

        /** False once the stream is exhausted, failed, or was closed. */
        private boolean isValid;

        /** One character of look-ahead, or one of the sentinel constants. */
        private int lookahead;

        private CharIterator() throws IOException {
            InputStream urlStream = currentURL.openStream();
            // Buffer the stream: single-byte reads on a raw URL stream are slow.
            reader = new java.io.BufferedReader(new java.io.InputStreamReader(
                    urlStream, java.nio.charset.StandardCharsets.UTF_8));
            isValid = true;
            lookahead = NOTHING_BUFFERED;
        }

        /**
         * @return {@code true} if another character is available; calling this
         *         repeatedly does not consume input
         */
        @Override
        public boolean hasNext() {
            if (lookahead == NOTHING_BUFFERED && isValid) {
                lookahead = readOrEnd();
            }
            return lookahead >= 0;
        }

        /**
         * @return the next character, or {@code null} when the content is
         *         exhausted or a read error occurred
         */
        @Override
        public Character next() {
            if (!hasNext()) {
                return null;
            }
            char c = (char) lookahead;
            lookahead = NOTHING_BUFFERED;
            return c;
        }

        /** Releases the underlying stream; the iterator is empty afterwards. */
        @Override
        public void close() {
            markInvalid();
            lookahead = END_OF_STREAM;
            try {
                reader.close();
            } catch (IOException ignored) {
                // Nothing useful can be done if closing fails.
            }
        }

        /** Reads one character; closes the stream and reports end on EOF or error. */
        private int readOrEnd() {
            try {
                int c = reader.read();
                if (c >= 0) {
                    return c;
                }
            } catch (IOException ex) {
                // Treated like end of stream, matching the original best-effort contract.
            }
            close();
            return END_OF_STREAM;
        }

        private void markInvalid() {
            isValid = false;
        }
    }

    /**
     * Iterates over the non-empty lines of the URL content. Lines are separated
     * by any run of CR/LF characters, so blank lines are skipped, and a final
     * line without a trailing line break is still returned.
     */
    class StringIterator implements java.util.Iterator<String>, AutoCloseable {

        /** Character source; {@code null} if the URL could not be opened. */
        private final CharIterator charPointer;

        /** Line read ahead by {@link #hasNext()}, or {@code null} if none. */
        private String pending;

        /** False once no further line can be produced. */
        private boolean isValid;

        private StringIterator() {
            charPointer = charIterator();
            isValid = charPointer != null;
        }

        /**
         * @return {@code true} if another line is available; calling this
         *         repeatedly does not consume input
         */
        @Override
        public boolean hasNext() {
            if (pending == null && isValid) {
                pending = readLine();
                if (pending == null) {
                    markInvalid();
                }
            }
            return pending != null;
        }

        /**
         * @return the next non-empty line without its terminator, or
         *         {@code null} when there are no more lines
         */
        @Override
        public String next() {
            if (!hasNext()) {
                return null;
            }
            String line = pending;
            pending = null;
            return line;
        }

        /** Releases the underlying stream; the iterator yields nothing new afterwards. */
        @Override
        public void close() {
            markInvalid();
            if (charPointer != null) {
                charPointer.close();
            }
        }

        /** Reads the next non-empty line, or returns {@code null} at end of content. */
        private String readLine() {
            Character current = charPointer.next();
            // Skip the terminator(s) left over from the previous line and any blank lines.
            while (current != null && isLineBreak(current)) {
                current = charPointer.next();
            }
            if (current == null) {
                return null;
            }
            StringBuilder sb = new StringBuilder();
            // Stop at the terminator OR at end of content (last line may lack a newline).
            while (current != null && !isLineBreak(current)) {
                sb.append(current.charValue());
                current = charPointer.next();
            }
            return sb.toString();
        }

        private void markInvalid() {
            isValid = false;
        }
    }
}