有没有办法让 puppeteer 的 waitUntil "networkidle" 只考虑 XHR (ajax) 请求?
Is there a way to get puppeteer's waitUntil "networkidle" to only consider XHR (ajax) requests?
我正在使用 puppeteer 在我的测试应用程序中评估基于 javascript 的 HTML 网页。
这是我用来确保加载所有数据的行:
// Turn on request interception: from here on, every request must be explicitly
// continued or aborted by the 'request' handler below.
await page.setRequestInterception(true);
page.on("request", (request) => {
  // Read the type once instead of re-querying it for every comparison.
  const resourceType = request.resourceType();
  // Images, fonts and media are aborted; every other request is let through.
  if (["image", "font", "media"].includes(resourceType)) {
    console.log("Request intercepted! ", request.url(), resourceType);
    request.abort();
  } else {
    request.continue();
  }
});
try {
  // Navigate and wait for both the 'load' event and the 'networkidle0' condition,
  // bounded by requestCounterMaxWaitMs.
  await page.goto(url, { waitUntil: ['networkidle0', 'load'], timeout: requestCounterMaxWaitMs });
} catch (e) {
  // Navigation stays best-effort (no rethrow), but the failure is reported
  // instead of being silently dropped.
  console.warn("page.goto failed: ", e);
}
这是等待 ajax 请求 完成的最佳方式吗?
感觉不错,但我不确定是否应该使用 networkidle0、networkidle1 等?
XHR 本质上可能在应用程序运行一段时间之后才发出。如果应用程序在(例如)1 秒后才发送 XHR,而您又想等待它,那么 networkidle0 之类的选项都帮不了您。我认为,如果想“正确地”做到这一点,您应该明确知道自己在等待哪些请求,并对它们进行 await。
下面是一个示例:XHR 在应用程序中稍后才发出,而代码会等待所有这些 XHR:
const puppeteer = require('puppeteer');
// Markup for the demo page: an inline script schedules three fetch() calls
// with setTimeout delays of 1000 ms, 2000 ms and 3000 ms.
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch('https://swapi.co/api/people/1/');
}, 1000);
setTimeout(() => {
fetch('https://www.metaweather.com/api/location/search/?query=san');
}, 2000);
setTimeout(() => {
fetch('https://api.fda.gov/drug/event.json?limit=1');
}, 3000);
</script>
</body>
</html>`;
// URLs to wait for. Only a subset of the page's requests could be listed here;
// this example lists all three URLs fetched by the page above.
const requests = [
'https://swapi.co/api/people/1/',
'https://www.metaweather.com/api/location/search/?query=san',
'https://api.fda.gov/drug/event.json?limit=1'
];
/**
 * Returns a promise that resolves once every URL in `names` has been seen as
 * an outgoing XHR or fetch() request on `page`.
 *
 * A 'request' listener is registered on the page and stays registered after
 * the promise resolves; it calls `request.continue()` on every request it
 * sees, so it is meant to be used with request interception enabled.
 *
 * @param {object} page - Object exposing `on('request', handler)`, e.g. a Puppeteer page.
 * @param {string[]} names - Exact request URLs to wait for. Not mutated.
 * @returns {Promise<void>} Resolves when all listed URLs have been requested
 *   (immediately when `names` is empty).
 */
const waitForRequests = (page, names) => {
  // Work on a copy so the caller's array is left untouched.
  const pendingUrls = [...names];
  return new Promise((resolve) => {
    // Nothing to wait for: do not make the caller wait for an unrelated request.
    if (pendingUrls.length === 0) {
      resolve();
    }
    page.on('request', (request) => {
      const resourceType = request.resourceType();
      // fetch() calls are reported as "fetch" rather than "xhr"; both are ajax requests.
      if (resourceType === 'xhr' || resourceType === 'fetch') {
        // Tick the URL off the list if it is one of the observed ones.
        const index = pendingUrls.indexOf(request.url());
        if (index > -1) {
          pendingUrls.splice(index, 1);
        }
        // All observed requests have been issued.
        if (pendingUrls.length === 0) {
          resolve();
        }
      }
      // Every request, observed or not, is let through.
      request.continue();
    });
  });
};
// Demo entry point: opens the test page and blocks until every URL in
// `requests` has been requested by it.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    // Register the page.on('request') listener before navigating.
    const observedRequests = waitForRequests(page, requests);
    // Navigation is started without being awaited on its own: the point of the
    // demo is to wait for the observed requests, not for the page load.
    const navigation = page.goto(`data:text/html,${html}`);
    console.log('before xhr');
    // Await both, so a failed navigation surfaces here instead of becoming an
    // unhandled rejection while the request wait hangs.
    await Promise.all([navigation, observedRequests]);
    console.log('after all xhr');
  } finally {
    // Close the browser even when one of the steps above throws.
    await browser.close();
  }
})();
您可以使用 pending-xhr-puppeteer,这个库提供了一个 Promise,会等待所有尚未完成的 XHR 请求结束。
像这样使用它:
const puppeteer = require('puppeteer');
const { PendingXHR } = require('pending-xhr-puppeteer');
// Launch a headless browser; `args` comes from the surrounding code.
const browser = await puppeteer.launch({ headless: true, args });
const page = await browser.newPage();
// The tracker is attached to the page before navigation starts.
const pendingXHR = new PendingXHR(page);
await page.goto('http://page-with-xhr');
// At this point some XHR requests may still be in flight.
await pendingXHR.waitForAllXhrFinished();
// From here on, all XHR requests have finished.
免责声明:我是 pending-xhr-puppeteer 的维护者。
我同意前面回答中的观点,即等待所有网络活动停止(“所有数据已加载”)是一个相当模糊的概念,完全取决于您要抓取的网站的行为。
用于检测响应的选项包括:等待固定时长、在网络流量空闲后再等待固定时长、等待特定响应(或一组响应)、等待某个元素出现在页面上、等待某个谓词返回 true 等等,这些 Puppeteer 全都支持。
考虑到这一点,最典型的情况是:您在等待来自已知(或借助某种模式或前缀而部分已知)的资源 URL 的某个特定响应或一组响应,这些响应会传递您想要读取的有效载荷,和/或触发您需要检测的 DOM 交互。Puppeteer 提供了 page.waitForResponse 来做这件事。
下面是一个基于前面示例改写的例子(并展示了如何从响应中读取数据):
const puppeteer = require("puppeteer");
// Markup for the demo page: an inline script schedules four fetch() calls for
// users/1, users/2 and users/3 (after 1000 ms, 2000 ms and 3000 ms) and for
// users/4 (after 0 ms).
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/1");
}, 1000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/2");
}, 2000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/3");
}, 3000);
setTimeout(() => {
// fetch something irrelevant to us
fetch("http://jsonplaceholder.typicode.com/users/4");
}, 0);
</script>
</body>
</html>`;
// Demo entry point: loads the test page, waits for the responses to the three
// expected URLs and prints their parsed JSON bodies.
(async () => {
  const browser = await puppeteer.launch();
  try {
    // Reuse the first page of the launched browser.
    const [page] = await browser.pages();
    await page.setContent(html);
    const expectedUrls = [
      "http://jsonplaceholder.typicode.com/users/1",
      "http://jsonplaceholder.typicode.com/users/2",
      "http://jsonplaceholder.typicode.com/users/3",
    ];
    try {
      // One waiter per expected URL, each with its own 5000 ms timeout; the
      // waiters run in parallel and the result keeps the order of expectedUrls.
      const responses = await Promise.all(expectedUrls.map(url =>
        page.waitForResponse(
          response => response.url() === url,
          {timeout: 5000}
        )
      ));
      // Parse every response body as JSON.
      const data = await Promise.all(
        responses.map(response => response.json())
      );
      console.log(data);
    }
    catch (err) {
      // A missing response (timeout) or an unparsable body is reported, not rethrown.
      console.error(err);
    }
  } finally {
    // Close the browser even when pages() or setContent() throws.
    await browser.close();
  }
})();
我正在使用 puppeteer 在我的测试应用程序中评估基于 javascript 的 HTML 网页。
这是我用来确保加载所有数据的行:
// Turn on request interception: from here on, every request must be explicitly
// continued or aborted by the 'request' handler below.
await page.setRequestInterception(true);
page.on("request", (request) => {
  // Read the type once instead of re-querying it for every comparison.
  const resourceType = request.resourceType();
  // Images, fonts and media are aborted; every other request is let through.
  if (["image", "font", "media"].includes(resourceType)) {
    console.log("Request intercepted! ", request.url(), resourceType);
    request.abort();
  } else {
    request.continue();
  }
});
try {
  // Navigate and wait for both the 'load' event and the 'networkidle0' condition,
  // bounded by requestCounterMaxWaitMs.
  await page.goto(url, { waitUntil: ['networkidle0', 'load'], timeout: requestCounterMaxWaitMs });
} catch (e) {
  // Navigation stays best-effort (no rethrow), but the failure is reported
  // instead of being silently dropped.
  console.warn("page.goto failed: ", e);
}
这是等待 ajax 请求 完成的最佳方式吗?
感觉不错,但我不确定是否应该使用 networkidle0、networkidle1 等?
XHR 本质上可能在应用程序运行一段时间之后才发出。如果应用程序在(例如)1 秒后才发送 XHR,而您又想等待它,那么 networkidle0 之类的选项都帮不了您。我认为,如果想“正确地”做到这一点,您应该明确知道自己在等待哪些请求,并对它们进行 await。
下面是一个示例:XHR 在应用程序中稍后才发出,而代码会等待所有这些 XHR:
const puppeteer = require('puppeteer');
// Markup for the demo page: an inline script schedules three fetch() calls
// with setTimeout delays of 1000 ms, 2000 ms and 3000 ms.
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch('https://swapi.co/api/people/1/');
}, 1000);
setTimeout(() => {
fetch('https://www.metaweather.com/api/location/search/?query=san');
}, 2000);
setTimeout(() => {
fetch('https://api.fda.gov/drug/event.json?limit=1');
}, 3000);
</script>
</body>
</html>`;
// URLs to wait for. Only a subset of the page's requests could be listed here;
// this example lists all three URLs fetched by the page above.
const requests = [
'https://swapi.co/api/people/1/',
'https://www.metaweather.com/api/location/search/?query=san',
'https://api.fda.gov/drug/event.json?limit=1'
];
/**
 * Returns a promise that resolves once every URL in `names` has been seen as
 * an outgoing XHR or fetch() request on `page`.
 *
 * A 'request' listener is registered on the page and stays registered after
 * the promise resolves; it calls `request.continue()` on every request it
 * sees, so it is meant to be used with request interception enabled.
 *
 * @param {object} page - Object exposing `on('request', handler)`, e.g. a Puppeteer page.
 * @param {string[]} names - Exact request URLs to wait for. Not mutated.
 * @returns {Promise<void>} Resolves when all listed URLs have been requested
 *   (immediately when `names` is empty).
 */
const waitForRequests = (page, names) => {
  // Work on a copy so the caller's array is left untouched.
  const pendingUrls = [...names];
  return new Promise((resolve) => {
    // Nothing to wait for: do not make the caller wait for an unrelated request.
    if (pendingUrls.length === 0) {
      resolve();
    }
    page.on('request', (request) => {
      const resourceType = request.resourceType();
      // fetch() calls are reported as "fetch" rather than "xhr"; both are ajax requests.
      if (resourceType === 'xhr' || resourceType === 'fetch') {
        // Tick the URL off the list if it is one of the observed ones.
        const index = pendingUrls.indexOf(request.url());
        if (index > -1) {
          pendingUrls.splice(index, 1);
        }
        // All observed requests have been issued.
        if (pendingUrls.length === 0) {
          resolve();
        }
      }
      // Every request, observed or not, is let through.
      request.continue();
    });
  });
};
// Demo entry point: opens the test page and blocks until every URL in
// `requests` has been requested by it.
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.setRequestInterception(true);
    // Register the page.on('request') listener before navigating.
    const observedRequests = waitForRequests(page, requests);
    // Navigation is started without being awaited on its own: the point of the
    // demo is to wait for the observed requests, not for the page load.
    const navigation = page.goto(`data:text/html,${html}`);
    console.log('before xhr');
    // Await both, so a failed navigation surfaces here instead of becoming an
    // unhandled rejection while the request wait hangs.
    await Promise.all([navigation, observedRequests]);
    console.log('after all xhr');
  } finally {
    // Close the browser even when one of the steps above throws.
    await browser.close();
  }
})();
您可以使用 pending-xhr-puppeteer,这个库提供了一个 Promise,会等待所有尚未完成的 XHR 请求结束。
像这样使用它:
const puppeteer = require('puppeteer');
const { PendingXHR } = require('pending-xhr-puppeteer');
// Launch a headless browser; `args` comes from the surrounding code.
const browser = await puppeteer.launch({ headless: true, args });
const page = await browser.newPage();
// The tracker is attached to the page before navigation starts.
const pendingXHR = new PendingXHR(page);
await page.goto('http://page-with-xhr');
// At this point some XHR requests may still be in flight.
await pendingXHR.waitForAllXhrFinished();
// From here on, all XHR requests have finished.
免责声明:我是 pending-xhr-puppeteer 的维护者。
我同意前面回答中的观点,即等待所有网络活动停止(“所有数据已加载”)是一个相当模糊的概念,完全取决于您要抓取的网站的行为。
用于检测响应的选项包括:等待固定时长、在网络流量空闲后再等待固定时长、等待特定响应(或一组响应)、等待某个元素出现在页面上、等待某个谓词返回 true 等等,这些 Puppeteer 全都支持。
考虑到这一点,最典型的情况是:您在等待来自已知(或借助某种模式或前缀而部分已知)的资源 URL 的某个特定响应或一组响应,这些响应会传递您想要读取的有效载荷,和/或触发您需要检测的 DOM 交互。Puppeteer 提供了 page.waitForResponse 来做这件事。
下面是一个基于前面示例改写的例子(并展示了如何从响应中读取数据):
const puppeteer = require("puppeteer");
// Markup for the demo page: an inline script schedules four fetch() calls for
// users/1, users/2 and users/3 (after 1000 ms, 2000 ms and 3000 ms) and for
// users/4 (after 0 ms).
const html = `
<html>
<body>
<script>
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/1");
}, 1000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/2");
}, 2000);
setTimeout(() => {
fetch("http://jsonplaceholder.typicode.com/users/3");
}, 3000);
setTimeout(() => {
// fetch something irrelevant to us
fetch("http://jsonplaceholder.typicode.com/users/4");
}, 0);
</script>
</body>
</html>`;
// Demo entry point: loads the test page, waits for the responses to the three
// expected URLs and prints their parsed JSON bodies.
(async () => {
  const browser = await puppeteer.launch();
  try {
    // Reuse the first page of the launched browser.
    const [page] = await browser.pages();
    await page.setContent(html);
    const expectedUrls = [
      "http://jsonplaceholder.typicode.com/users/1",
      "http://jsonplaceholder.typicode.com/users/2",
      "http://jsonplaceholder.typicode.com/users/3",
    ];
    try {
      // One waiter per expected URL, each with its own 5000 ms timeout; the
      // waiters run in parallel and the result keeps the order of expectedUrls.
      const responses = await Promise.all(expectedUrls.map(url =>
        page.waitForResponse(
          response => response.url() === url,
          {timeout: 5000}
        )
      ));
      // Parse every response body as JSON.
      const data = await Promise.all(
        responses.map(response => response.json())
      );
      console.log(data);
    }
    catch (err) {
      // A missing response (timeout) or an unparsable body is reported, not rethrown.
      console.error(err);
    }
  } finally {
    // Close the browser even when pages() or setContent() throws.
    await browser.close();
  }
})();