HtmlUnit 在获取通过脚本加载部分字段的网页 HTML 时输出大量异常日志

HtmlUnit large exception log when trying to get the HTML of a webpage that loads some fields with scripts

我正在尝试读取一个网页的 HTML。我认为其中某些字段是由脚本填充的,因为我在浏览器中可以正常看到它们,但通过抓取得到的这些字段内容却是空的。当我尝试使用 HtmlUnit 和下面的代码来实现时,日志中出现了一个很长的异常,我不知道该如何修复它。

这是我的代码:

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Small HtmlUnit example that loads a JavaScript-driven page and prints the
 * XML of every {@code tr.rowOdd} element found in it.
 */
public class HtmlUnitTest {

    /** Address of the page to scrape; it is already a complete URL, so nothing is appended to it. */
    private static final String PAGE_URL =
            "http://sports.williamhill.es/bet_esp/es/betting/t/338/LaLiga.html";

    /** Argument passed to the background-JavaScript wait call, in milliseconds. */
    private static final long JS_WAIT_MILLIS = 10000;

    /**
     * Loads {@link #PAGE_URL} with JavaScript enabled and dumps the matching rows to stdout.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {

        /* turn off annoying htmlunit warnings */
        java.util.logging.Logger.getLogger("com.gargoylesoftware").setLevel(java.util.logging.Level.OFF);

        // try-with-resources closes the client (and its windows) even if loading fails.
        try (WebClient client = new WebClient(BrowserVersion.CHROME)) {
            client.getOptions().setCssEnabled(false);
            client.getOptions().setJavaScriptEnabled(true);
            // Errors raised by the page's own scripts should not abort the page load.
            client.getOptions().setThrowExceptionOnScriptError(false);

            HtmlPage page = client.getPage(PAGE_URL);
            // Give scripts scheduled in the background a chance to run before reading the DOM.
            client.waitForBackgroundJavaScriptStartingBefore(JS_WAIT_MILLIS);

            final DomNodeList<DomNode> rows = page.querySelectorAll("tr.rowOdd");
            for (DomNode row : rows) {
                System.out.println(row.asXml());
            }
        } catch (Exception e) {
            // Example program: report the failure on stderr and exit normally.
            e.printStackTrace();
        }
    }
}

这是我得到的异常:

======= EXCEPTION START ========
EcmaError: lineNumber=[471] column=[0] lineSource=[<no source>] name=[TypeError] sourceName=[http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1] message=[TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)]
com.gargoylesoftware.htmlunit.ScriptException: TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:914)
    at net.sourceforge.htmlunit.corejs.javascript.Context.call(Context.java:599)
    at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.call(ContextFactory.java:527)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:794)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:770)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:761)
    at com.gargoylesoftware.htmlunit.html.HtmlPage.executeJavaScript(HtmlPage.java:919)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.executeInlineScriptIfNeeded(HtmlScript.java:316)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.executeScriptIfNeeded(HtmlScript.java:396)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.execute(HtmlScript.java:246)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.onAllChildrenAddedToPage(HtmlScript.java:267)
    at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:805)
    at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
    at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:761)
    at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.callEndElement(HTMLTagBalancer.java:1236)
    at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.endElement(HTMLTagBalancer.java:1136)
    at net.sourceforge.htmlunit.cyberneko.filters.DefaultFilter.endElement(DefaultFilter.java:226)
    at net.sourceforge.htmlunit.cyberneko.filters.NamespaceBinder.endElement(NamespaceBinder.java:345)
    at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scanEndElement(HTMLScanner.java:3189)
    at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scan(HTMLScanner.java:2141)
    at net.sourceforge.htmlunit.cyberneko.HTMLScanner.scanDocument(HTMLScanner.java:945)
    at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:521)
    at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:472)
    at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
    at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.parse(HTMLParser.java:1004)
    at com.gargoylesoftware.htmlunit.html.HTMLParser.parse(HTMLParser.java:253)
    at com.gargoylesoftware.htmlunit.html.HTMLParser.parseHtml(HTMLParser.java:195)
    at com.gargoylesoftware.htmlunit.DefaultPageCreator.createHtmlPage(DefaultPageCreator.java:267)
    at com.gargoylesoftware.htmlunit.DefaultPageCreator.createPage(DefaultPageCreator.java:158)
    at com.gargoylesoftware.htmlunit.WebClient.loadWebResponseInto(WebClient.java:529)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:398)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:315)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:463)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:448)
    at HtmlUnitTest.main(HtmlUnitTest.java:25)
Caused by: net.sourceforge.htmlunit.corejs.javascript.EcmaError: TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4130)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4108)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError(ScriptRuntime.java:4141)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError2(ScriptRuntime.java:4160)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.undefCallError(ScriptRuntime.java:4179)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThisHelper(ScriptRuntime.java:2509)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThis(ScriptRuntime.java:2502)
    at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpretLoop(Interpreter.java:1327)
    at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpret(Interpreter.java:815)
    at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.call(InterpretedFunction.java:111)
    at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.doTopCall(ContextFactory.java:417)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory.doTopCall(HtmlUnitContextFactory.java:325)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:3424)
    at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.exec(InterpretedFunction.java:122)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.doRun(JavaScriptEngine.java:785)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:899)
    ... 34 more
Enclosed exception: 
net.sourceforge.htmlunit.corejs.javascript.EcmaError: TypeError: Cannot call method "replace" of undefined (http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1#471)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4130)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.constructError(ScriptRuntime.java:4108)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError(ScriptRuntime.java:4141)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.typeError2(ScriptRuntime.java:4160)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.undefCallError(ScriptRuntime.java:4179)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThisHelper(ScriptRuntime.java:2509)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.getPropFunctionAndThis(ScriptRuntime.java:2502)
    at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpretLoop(Interpreter.java:1327)
    at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:471)
    at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:377)
    at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:379)
    at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:397)
    at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:415)
    at script(http://trans.staticcache.org/ob/static/cust/js/minified/main_end.js?ver=f4b42247e3c3fbf04e68fa1715088db1:418)
    at script(script in http://sports.williamhill.es/bet_esp/es/betting/t/338/LaLiga.htmlWilliam+Hill from (1174, 34) to (1192, 12):1176)
    at net.sourceforge.htmlunit.corejs.javascript.Interpreter.interpret(Interpreter.java:815)
    at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.call(InterpretedFunction.java:111)
    at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.doTopCall(ContextFactory.java:417)
    at com.gargoylesoftware.htmlunit.javascript.HtmlUnitContextFactory.doTopCall(HtmlUnitContextFactory.java:325)
    at net.sourceforge.htmlunit.corejs.javascript.ScriptRuntime.doTopCall(ScriptRuntime.java:3424)
    at net.sourceforge.htmlunit.corejs.javascript.InterpretedFunction.exec(InterpretedFunction.java:122)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.doRun(JavaScriptEngine.java:785)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine$HtmlUnitContextAction.run(JavaScriptEngine.java:899)
    at net.sourceforge.htmlunit.corejs.javascript.Context.call(Context.java:599)
    at net.sourceforge.htmlunit.corejs.javascript.ContextFactory.call(ContextFactory.java:527)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:794)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:770)
    at com.gargoylesoftware.htmlunit.javascript.JavaScriptEngine.execute(JavaScriptEngine.java:761)
    at com.gargoylesoftware.htmlunit.html.HtmlPage.executeJavaScript(HtmlPage.java:919)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.executeInlineScriptIfNeeded(HtmlScript.java:316)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.executeScriptIfNeeded(HtmlScript.java:396)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.execute(HtmlScript.java:246)
    at com.gargoylesoftware.htmlunit.html.HtmlScript.onAllChildrenAddedToPage(HtmlScript.java:267)
    at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:805)
    at org.apache.xerces.parsers.AbstractSAXParser.endElement(Unknown Source)
    at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.endElement(HTMLParser.java:761)
    at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.callEndElement(HTMLTagBalancer.java:1236)
    at net.sourceforge.htmlunit.cyberneko.HTMLTagBalancer.endElement(HTMLTagBalancer.java:1136)
    at net.sourceforge.htmlunit.cyberneko.filters.DefaultFilter.endElement(DefaultFilter.java:226)
    at net.sourceforge.htmlunit.cyberneko.filters.NamespaceBinder.endElement(NamespaceBinder.java:345)
    at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scanEndElement(HTMLScanner.java:3189)
    at net.sourceforge.htmlunit.cyberneko.HTMLScanner$ContentScanner.scan(HTMLScanner.java:2141)
    at net.sourceforge.htmlunit.cyberneko.HTMLScanner.scanDocument(HTMLScanner.java:945)
    at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:521)
    at net.sourceforge.htmlunit.cyberneko.HTMLConfiguration.parse(HTMLConfiguration.java:472)
    at org.apache.xerces.parsers.XMLParser.parse(Unknown Source)
    at com.gargoylesoftware.htmlunit.html.HTMLParser$HtmlUnitDOMBuilder.parse(HTMLParser.java:1004)
    at com.gargoylesoftware.htmlunit.html.HTMLParser.parse(HTMLParser.java:253)
    at com.gargoylesoftware.htmlunit.html.HTMLParser.parseHtml(HTMLParser.java:195)
    at com.gargoylesoftware.htmlunit.DefaultPageCreator.createHtmlPage(DefaultPageCreator.java:267)
    at com.gargoylesoftware.htmlunit.DefaultPageCreator.createPage(DefaultPageCreator.java:158)
    at com.gargoylesoftware.htmlunit.WebClient.loadWebResponseInto(WebClient.java:529)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:398)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:315)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:463)
    at com.gargoylesoftware.htmlunit.WebClient.getPage(WebClient.java:448)
    at HtmlUnitTest.main(HtmlUnitTest.java:25)
======= EXCEPTION END ========

首先,通过设置以下选项告诉 WebClient 不要因脚本错误而抛出异常:

client.getOptions().setThrowExceptionOnScriptError(false);