在 Java 中获取 HTML 页面的一部分
Fetch part of an HTML page in Java
我不清楚如何只下载 HTML 页面的一部分。我用传统方式试过,即通过 `URL::openStream` 方法和 `BufferedReader` 读取,但我不确定这种方式是否会导致下载整个页面。
问题是:我有一个相当大的 HTML 页面,需要从中解析 2 个数字,这两个数字至少每秒更新一次。用上面的方式,每 2-3 秒才能检测到一次变化,我想知道是否有办法让它更快。所以我想,只获取页面的一部分或许能帮到我。
我认为您应该先弄清楚页面是如何获取这些数据的(SSE 或 WebSocket),并尝试直接订阅该服务。如果做不到,请尝试更高效的 XML 解析器。我推荐 https://vtd-xml.sourceforge.io/ ,它比 JDK 附带的 DOM 解析器快 10 倍。
还要小心 `BufferedReader.readLine()`,因为它会为您并不真正需要的字符串分配内存,存在隐藏成本(这是相当进阶的话题,因为您必须考虑 CPU 内存带宽、L1 缓存未命中等)。
使用我提到的库的示例:
// Example: parse the downloaded page with VTD-XML and extract one element as text.
byte[] pageInBytes = readAllBytesFromTheURL();
VTDGen vg = new VTDGen();
vg.setDoc(pageInBytes);
vg.parse(false); // false = parse without namespace awareness
VTDNav vn = vg.getNav();
AutoPilot ap = new AutoPilot(vn);
// Jump to the section that we want to process
ap.selectXPath("/html/body/div");
String fileId = null;
// selectXPath only compiles the expression; evalXPath moves the cursor to the
// next matching node and returns -1 when nothing matches.
if (ap.evalXPath() != -1) {
    // getElementFragment packs the element's byte range into one long:
    // offset in the lower 32 bits, length in the upper 32 bits.
    long fragment = vn.getElementFragment();
    int offset = (int) fragment;
    int length = (int) (fragment >> 32);
    fileId = vn.toString(offset, length);
}
我写了一个辅助类来读取 URL 内容。元素的解析放在另一个类中。
/**
 * Lazily reads the content behind a URL, either character by character or
 * line by line, so that a caller can stop as soon as it has found what it
 * needs instead of materialising the whole page.
 *
 * <p>Every call to {@link #charIterator()} or {@link #stringIterator()} opens
 * a new connection to the URL. The iterators close their stream on their own
 * once the end of the content is reached or an I/O error occurs; a caller that
 * stops early should call {@code close()} on the iterator.
 *
 * <p>Both iterators keep the historical contract of returning {@code null}
 * from {@code next()} when nothing is left, rather than throwing
 * {@link java.util.NoSuchElementException}.
 */
public class HTMLReaderHelper {
    private final URL currentURL;

    /**
     * @param url location of the content; it is not opened until an iterator is requested
     */
    HTMLReaderHelper(URL url){
        currentURL = url;
    }

    /**
     * Opens the URL and returns an iterator over its characters.
     *
     * @return a new iterator, or {@code null} if the URL could not be opened
     */
    public CharIterator charIterator(){
        try {
            return new CharIterator();
        } catch(IOException ex){
            // Kept for backward compatibility: callers receive null on failure.
            return null;
        }
    }

    /**
     * Opens the URL and returns an iterator over its non-empty lines.
     *
     * @return a new iterator; it is simply empty if the URL could not be opened
     */
    public StringIterator stringIterator(){
        return new StringIterator();
    }

    /**
     * Iterates over the characters of the URL content, decoded as UTF-8
     * (NOTE(review): the page encoding is not negotiated; UTF-8 is assumed).
     * {@code hasNext()} may be called any number of times without consuming input.
     */
    class CharIterator implements java.util.Iterator<Character>, java.io.Closeable {
        /** Marker stored in {@code lookahead} when no character has been pre-read. */
        private static final int NONE = -2;
        /** Value reported by {@code Reader.read()} at end of stream. */
        private static final int END = -1;

        private final java.io.Reader reader;
        private boolean isValid;
        private int lookahead;

        private CharIterator() throws IOException {
            // Buffered so that the network stream is not hit once per character.
            reader = new java.io.BufferedReader(new java.io.InputStreamReader(
                    currentURL.openStream(), java.nio.charset.StandardCharsets.UTF_8));
            isValid = true;
            lookahead = NONE;
        }

        /** @return {@code true} if {@link #next()} would return a character */
        @Override
        public boolean hasNext() {
            return peek() != END;
        }

        /** @return the next character, or {@code null} at end of content or after an I/O error */
        @Override
        public Character next() {
            int c = peek();
            if (c == END) {
                return null;
            }
            lookahead = NONE;
            return (char) c;
        }

        /** Releases the underlying stream; afterwards the iterator is exhausted. */
        @Override
        public void close() {
            markInvalid();
        }

        /** Returns the upcoming character without consuming it, or {@code END}. */
        private int peek() {
            if (!isValid) {
                return END;
            }
            if (lookahead == NONE) {
                try {
                    lookahead = reader.read();
                } catch (IOException ex) {
                    markInvalid();
                    return END;
                }
                if (lookahead == END) {
                    // Nothing more to read: free the connection right away.
                    markInvalid();
                }
            }
            return lookahead;
        }

        private void markInvalid(){
            isValid = false;
            lookahead = END;
            try {
                reader.close();
            } catch (IOException ignored) {
                // Best effort: the iterator is already unusable at this point.
            }
        }
    }

    /**
     * Iterates over the lines of the URL content. Lines are separated by any run
     * of {@code '\n'} / {@code '\r'} characters, so empty lines are never returned.
     * A final line without a trailing line break is returned as well.
     */
    class StringIterator implements java.util.Iterator<String>, java.io.Closeable {
        private final CharIterator charPointer;
        /** Line pre-read by {@link #hasNext()} and not yet handed out, or {@code null}. */
        private String pending;
        private boolean isValid;

        private StringIterator(){
            charPointer = charIterator();
            // charIterator() yields null when the URL cannot be opened.
            isValid = charPointer != null;
        }

        /** @return {@code true} if {@link #next()} would return a line */
        @Override
        public boolean hasNext() {
            if (pending == null) {
                pending = readLine();
            }
            return pending != null;
        }

        /** @return the next non-empty line, or {@code null} when the content is exhausted */
        @Override
        public String next() {
            if (pending != null) {
                String line = pending;
                pending = null;
                return line;
            }
            return readLine();
        }

        /** Releases the underlying stream; afterwards the iterator is exhausted. */
        @Override
        public void close() {
            pending = null;
            markInvalid();
        }

        /** Reads one line from the character source, skipping leading line breaks. */
        private String readLine() {
            if (!isValid) {
                return null;
            }
            Character current = charPointer.next();
            while (current != null && isLineBreak(current)) {
                current = charPointer.next();
            }
            if (current == null) {
                markInvalid();
                return null;
            }
            StringBuilder sb = new StringBuilder();
            // Stop on a line break or on end of content (null); the latter
            // covers a last line that has no trailing line break.
            while (current != null && !isLineBreak(current)) {
                sb.append(current.charValue());
                current = charPointer.next();
            }
            return sb.toString();
        }

        private boolean isLineBreak(char c) {
            return c == '\n' || c == '\r';
        }

        private void markInvalid(){
            isValid = false;
            if (charPointer != null) {
                charPointer.close();
            }
        }
    }
}
我不清楚如何只下载 HTML 页面的一部分。我用传统方式试过,即通过 `URL::openStream` 方法和 `BufferedReader` 读取,但我不确定这种方式是否会导致下载整个页面。
问题是:我有一个相当大的 HTML 页面,需要从中解析 2 个数字,这两个数字至少每秒更新一次。用上面的方式,每 2-3 秒才能检测到一次变化,我想知道是否有办法让它更快。所以我想,只获取页面的一部分或许能帮到我。
我认为您应该先弄清楚页面是如何获取这些数据的(SSE 或 WebSocket),并尝试直接订阅该服务。如果做不到,请尝试更高效的 XML 解析器。我推荐 https://vtd-xml.sourceforge.io/ ,它比 JDK 附带的 DOM 解析器快 10 倍。
还要小心 `BufferedReader.readLine()`,因为它会为您并不真正需要的字符串分配内存,存在隐藏成本(这是相当进阶的话题,因为您必须考虑 CPU 内存带宽、L1 缓存未命中等)。
使用我提到的库的示例:
// Example: parse the downloaded page with VTD-XML and extract one element as text.
byte[] pageInBytes = readAllBytesFromTheURL();
VTDGen vg = new VTDGen();
vg.setDoc(pageInBytes);
vg.parse(false); // false = parse without namespace awareness
VTDNav vn = vg.getNav();
AutoPilot ap = new AutoPilot(vn);
// Jump to the section that we want to process
ap.selectXPath("/html/body/div");
String fileId = null;
// selectXPath only compiles the expression; evalXPath moves the cursor to the
// next matching node and returns -1 when nothing matches.
if (ap.evalXPath() != -1) {
    // getElementFragment packs the element's byte range into one long:
    // offset in the lower 32 bits, length in the upper 32 bits.
    long fragment = vn.getElementFragment();
    int offset = (int) fragment;
    int length = (int) (fragment >> 32);
    fileId = vn.toString(offset, length);
}
我写了一个辅助类来读取 URL 内容。元素的解析放在另一个类中。
public class HTMLReaderHelper {
private final URL currentURL;
/**
 * Creates a helper bound to the given URL. The URL is only stored here;
 * it is not opened by this constructor.
 *
 * @param url location of the content to read
 */
HTMLReaderHelper(URL url){
    this.currentURL = url;
}
/**
 * Creates a character iterator over the content of the URL.
 *
 * @return a new {@code CharIterator}, or {@code null} when constructing it
 *         fails with an {@code IOException}
 */
public CharIterator charIterator(){
    try {
        return new CharIterator();
    } catch (IOException ex) {
        // Failure is reported to the caller as null.
        return null;
    }
}
/**
 * Creates a line iterator over the content of the URL.
 *
 * @return a new {@code StringIterator}
 */
public StringIterator stringIterator(){
    StringIterator lines = new StringIterator();
    return lines;
}
/**
 * Iterates over the characters of the URL content, decoded as UTF-8
 * (NOTE(review): the page encoding is not negotiated; UTF-8 is assumed).
 *
 * <p>{@code hasNext()} may be called any number of times without consuming
 * input. {@code next()} returns {@code null} (it does not throw) once the
 * content is exhausted or an I/O error has occurred. The underlying stream is
 * closed automatically at that point; call {@link #close()} to release it
 * earlier.
 */
class CharIterator implements java.util.Iterator<Character>, java.io.Closeable {
    /** Marker stored in {@code lookahead} when no character has been pre-read. */
    private static final int NONE = -2;
    /** Value reported by {@code Reader.read()} at end of stream. */
    private static final int END = -1;

    private final java.io.Reader reader;
    private boolean isValid;
    private int lookahead;

    private CharIterator() throws IOException {
        // Buffered so that the network stream is not hit once per character.
        reader = new java.io.BufferedReader(new java.io.InputStreamReader(
                currentURL.openStream(), java.nio.charset.StandardCharsets.UTF_8));
        isValid = true;
        lookahead = NONE;
    }

    /** @return {@code true} if {@link #next()} would return a character */
    @Override
    public boolean hasNext() {
        return peek() != END;
    }

    /** @return the next character, or {@code null} at end of content or after an I/O error */
    @Override
    public Character next() {
        int c = peek();
        if (c == END) {
            return null;
        }
        lookahead = NONE;
        return (char) c;
    }

    /** Releases the underlying stream; afterwards the iterator is exhausted. */
    @Override
    public void close() {
        markInvalid();
    }

    /** Returns the upcoming character without consuming it, or {@code END}. */
    private int peek() {
        if (!isValid) {
            return END;
        }
        if (lookahead == NONE) {
            try {
                lookahead = reader.read();
            } catch (IOException ex) {
                markInvalid();
                return END;
            }
            if (lookahead == END) {
                // Nothing more to read: free the connection right away.
                markInvalid();
            }
        }
        return lookahead;
    }

    private void markInvalid(){
        isValid = false;
        lookahead = END;
        try {
            reader.close();
        } catch (IOException ignored) {
            // Best effort: the iterator is already unusable at this point.
        }
    }
}
/**
 * Iterates over the lines of the URL content, built on top of the character
 * iterator obtained from {@code charIterator()}.
 *
 * <p>Lines are separated by any run of {@code '\n'} / {@code '\r'} characters,
 * so empty lines are never returned. A final line without a trailing line
 * break is returned as well. {@code next()} returns {@code null} (it does not
 * throw) once the content is exhausted; if the character iterator could not
 * be created the iterator is simply empty.
 */
class StringIterator implements java.util.Iterator<String>{
    private final CharIterator charPointer;
    /** Line pre-read by {@link #hasNext()} and not yet handed out, or {@code null}. */
    private String pending;
    private boolean isValid;

    private StringIterator(){
        charPointer = charIterator();
        // charIterator() yields null when the URL cannot be opened.
        isValid = charPointer != null;
    }

    /** @return {@code true} if {@link #next()} would return a line */
    @Override
    public boolean hasNext() {
        if (pending == null) {
            pending = readLine();
        }
        return pending != null;
    }

    /** @return the next non-empty line, or {@code null} when the content is exhausted */
    @Override
    public String next() {
        if (pending != null) {
            String line = pending;
            pending = null;
            return line;
        }
        return readLine();
    }

    /** Reads one line from the character source, skipping leading line breaks. */
    private String readLine() {
        if (!isValid) {
            return null;
        }
        Character current = charPointer.next();
        while (current != null && isLineBreak(current)) {
            current = charPointer.next();
        }
        if (current == null) {
            markInvalid();
            return null;
        }
        StringBuilder sb = new StringBuilder();
        // Stop on a line break or on end of content (null); the latter
        // covers a last line that has no trailing line break.
        while (current != null && !isLineBreak(current)) {
            sb.append(current.charValue());
            current = charPointer.next();
        }
        return sb.toString();
    }

    private boolean isLineBreak(char c) {
        return c == '\n' || c == '\r';
    }

    private void markInvalid(){
        isValid = false;
    }
}
}