如何使用 HtmlUnit 显示所有 AJAX 请求
How to show all AJAX requests with HtmlUnit
我想获取网页所有网络调用的列表。这是页面的 url
https://www.upwork.com/o/jobs/browse/?q=Java&sort=renew_time_int%2Bdesc
如果查看 DeveloperConsole->Network,您将看到以下列表
这是我的代码:
/**
 * Loads a page with HtmlUnit and prints the URL of every request that went
 * through the client's web connection.
 *
 * <p>NOTE: nothing in this method waits for background JavaScript after
 * {@code getPage} returns, so requests issued later by asynchronous scripts
 * (XHR) may be missing from the printed list.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if loading the page fails at the I/O level
 */
public static void main(String[] args) throws IOException {
    // WebClient is AutoCloseable: try-with-resources releases it even when
    // getPage throws, instead of leaving the client open.
    try (final WebClient webClient = configWebClient()) {
        final List<String> list = new ArrayList<>();

        // The wrapper registers itself as the client's connection in its
        // constructor, so the instance does not need to be kept.
        new WebConnectionWrapper(webClient) {
            @Override
            public WebResponse getResponse(final WebRequest request) throws IOException {
                // Delegate first: a request is only recorded once a response came back.
                final WebResponse response = super.getResponse(request);
                list.add(request.getUrl().toString());
                return response;
            }
        };

        webClient.getPage("https://www.upwork.com/ab/find-work/");
        list.forEach(System.out::println);
    }
}
/**
 * Creates a Firefox 60 flavoured {@code WebClient} with JavaScript, redirects
 * and cookies enabled, CSS disabled, and exceptions for failing status codes
 * and script errors turned off.
 *
 * @return the configured client; the caller is responsible for closing it
 */
private static WebClient configWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_60);
    webClient.getOptions().setJavaScriptEnabled(true);
    // NOTE(review): these two calls are waits, not configuration options. No page
    // has been requested yet at this point; to be useful they have to be issued
    // after a request such as getPage().
    webClient.waitForBackgroundJavaScriptStartingBefore(5_000);
    webClient.waitForBackgroundJavaScript(3_000);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setRedirectEnabled(true);
    webClient.getOptions().setUseInsecureSSL(false);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getCookieManager().setCookiesEnabled(true);
    webClient.setAjaxController(new AjaxController());
    return webClient;
}
这是输出
https://www.upwork.com/o/jobs/browse/?q=Java&sort=renew_time_int%2Bdesc
https://www.upwork.com/o/jobs/browse/?q=Java
https://www.upwork.com:443/o/jobs/browse/js/328ecc3.js?4af40b2
https://www.googletagmanager.com/gtm.js?id=GTM-5XK7SV
https://client.perimeterx.net/PXSs13U803/main.min.js
https://assets.static-upwork.com/components/11.4.0/core.11.4.0.air2.min.js
https://assets.static-upwork.com/global-components/@latest/ugc.js
https://assets.static-upwork.com/global-components/@latest/ugc/ugc.6jcmqb32.js
https://www.upwork.com:443/static/jsui/JobSearchUI/assets/4af40b2/js/55260a3.js
如您所见,它不包含 xhr 调用。我做错了什么?
您的问题使用了两个不同的 URL;希望我用对了
- 这一点在这里已被多次提到:.waitForBackground... 方法并不是配置选项,您必须在发起 Web 请求(例如 getPage())之后再调用它们
- AJAX 中的 A 代表异步(asynchronous);webClient.getPage() 是同步调用,它返回时异步的 JavaScript 可能尚未执行完毕,因此您必须在其之后等待所有 JavaScript 完成
在使用 HtmlUnit 时调用页面似乎会产生一些 js 错误。也许这会导致无法执行此页面中的所有 javascript 代码(HtmlUnit (Rhino) 仍然不支持一些 javascript 功能;欢迎任何帮助)
/**
 * Loads the job-search page, waits for background JavaScript, and prints the
 * HTTP method and URL of every request that went through the client's web
 * connection.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if loading the page fails at the I/O level
 */
public static void main(String[] args) throws IOException {
    // WebClient is AutoCloseable: try-with-resources releases it even when
    // getPage throws, instead of leaving the client open.
    try (final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_60)) {
        // The page triggers script errors under HtmlUnit; keep going regardless.
        webClient.getOptions().setThrowExceptionOnScriptError(false);

        final List<String> list = new ArrayList<>();

        // The wrapper registers itself as the client's connection in its
        // constructor, so the instance does not need to be kept.
        new WebConnectionWrapper(webClient) {
            @Override
            public WebResponse getResponse(final WebRequest request) throws IOException {
                // Delegate first: a request is only recorded once a response came back.
                final WebResponse response = super.getResponse(request);
                list.add(request.getHttpMethod() + " " + request.getUrl());
                return response;
            }
        };

        webClient.getPage("https://www.upwork.com/o/jobs/browse/?q=Java&sort=renew_time_int%2Bdesc");
        // getPage is synchronous; the wait must come after it so that requests
        // made by asynchronous scripts get a chance to run (at most 10 seconds).
        webClient.waitForBackgroundJavaScript(10_000);

        list.forEach(System.out::println);
    }
}
我想获取网页所有网络调用的列表。这是页面的 url
https://www.upwork.com/o/jobs/browse/?q=Java&sort=renew_time_int%2Bdesc
如果查看 DeveloperConsole->Network,您将看到以下列表
这是我的代码:
/**
 * Loads a page with HtmlUnit and prints the URL of every request that went
 * through the client's web connection.
 *
 * <p>NOTE: nothing in this method waits for background JavaScript after
 * {@code getPage} returns, so requests issued later by asynchronous scripts
 * (XHR) may be missing from the printed list.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if loading the page fails at the I/O level
 */
public static void main(String[] args) throws IOException {
    // WebClient is AutoCloseable: try-with-resources releases it even when
    // getPage throws, instead of leaving the client open.
    try (final WebClient webClient = configWebClient()) {
        final List<String> list = new ArrayList<>();

        // The wrapper registers itself as the client's connection in its
        // constructor, so the instance does not need to be kept.
        new WebConnectionWrapper(webClient) {
            @Override
            public WebResponse getResponse(final WebRequest request) throws IOException {
                // Delegate first: a request is only recorded once a response came back.
                final WebResponse response = super.getResponse(request);
                list.add(request.getUrl().toString());
                return response;
            }
        };

        webClient.getPage("https://www.upwork.com/ab/find-work/");
        list.forEach(System.out::println);
    }
}
/**
 * Creates a Firefox 60 flavoured {@code WebClient} with JavaScript, redirects
 * and cookies enabled, CSS disabled, and exceptions for failing status codes
 * and script errors turned off.
 *
 * @return the configured client; the caller is responsible for closing it
 */
private static WebClient configWebClient() {
    WebClient webClient = new WebClient(BrowserVersion.FIREFOX_60);
    webClient.getOptions().setJavaScriptEnabled(true);
    // NOTE(review): these two calls are waits, not configuration options. No page
    // has been requested yet at this point; to be useful they have to be issued
    // after a request such as getPage().
    webClient.waitForBackgroundJavaScriptStartingBefore(5_000);
    webClient.waitForBackgroundJavaScript(3_000);
    webClient.getOptions().setCssEnabled(false);
    webClient.getOptions().setRedirectEnabled(true);
    webClient.getOptions().setUseInsecureSSL(false);
    webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
    webClient.getOptions().setThrowExceptionOnScriptError(false);
    webClient.getCookieManager().setCookiesEnabled(true);
    webClient.setAjaxController(new AjaxController());
    return webClient;
}
这是输出
https://www.upwork.com/o/jobs/browse/?q=Java&sort=renew_time_int%2Bdesc
https://www.upwork.com/o/jobs/browse/?q=Java
https://www.upwork.com:443/o/jobs/browse/js/328ecc3.js?4af40b2
https://www.googletagmanager.com/gtm.js?id=GTM-5XK7SV
https://client.perimeterx.net/PXSs13U803/main.min.js
https://assets.static-upwork.com/components/11.4.0/core.11.4.0.air2.min.js
https://assets.static-upwork.com/global-components/@latest/ugc.js
https://assets.static-upwork.com/global-components/@latest/ugc/ugc.6jcmqb32.js
https://www.upwork.com:443/static/jsui/JobSearchUI/assets/4af40b2/js/55260a3.js
如您所见,它不包含 xhr 调用。我做错了什么?
您的问题使用了两个不同的 URL;希望我用对了
- 这一点在这里已被多次提到:.waitForBackground... 方法并不是配置选项,您必须在发起 Web 请求(例如 getPage())之后再调用它们
- AJAX 中的 A 代表异步(asynchronous);webClient.getPage() 是同步调用,它返回时异步的 JavaScript 可能尚未执行完毕,因此您必须在其之后等待所有 JavaScript 完成
在使用 HtmlUnit 时调用页面似乎会产生一些 js 错误。也许这会导致无法执行此页面中的所有 javascript 代码(HtmlUnit (Rhino) 仍然不支持一些 javascript 功能;欢迎任何帮助)
/**
 * Loads the job-search page, waits for background JavaScript, and prints the
 * HTTP method and URL of every request that went through the client's web
 * connection.
 *
 * @param args command-line arguments (unused)
 * @throws IOException if loading the page fails at the I/O level
 */
public static void main(String[] args) throws IOException {
    // WebClient is AutoCloseable: try-with-resources releases it even when
    // getPage throws, instead of leaving the client open.
    try (final WebClient webClient = new WebClient(BrowserVersion.FIREFOX_60)) {
        // The page triggers script errors under HtmlUnit; keep going regardless.
        webClient.getOptions().setThrowExceptionOnScriptError(false);

        final List<String> list = new ArrayList<>();

        // The wrapper registers itself as the client's connection in its
        // constructor, so the instance does not need to be kept.
        new WebConnectionWrapper(webClient) {
            @Override
            public WebResponse getResponse(final WebRequest request) throws IOException {
                // Delegate first: a request is only recorded once a response came back.
                final WebResponse response = super.getResponse(request);
                list.add(request.getHttpMethod() + " " + request.getUrl());
                return response;
            }
        };

        webClient.getPage("https://www.upwork.com/o/jobs/browse/?q=Java&sort=renew_time_int%2Bdesc");
        // getPage is synchronous; the wait must come after it so that requests
        // made by asynchronous scripts get a chance to run (at most 10 seconds).
        webClient.waitForBackgroundJavaScript(10_000);

        list.forEach(System.out::println);
    }
}