有没有办法通过添加脚本,在 Chrome + Puppeteer 的 evaluate() 上下文中添加新函数?

Is there a way to add a script that defines new functions in the evaluate() context of Chrome + Puppeteer?

基于此,有没有办法(就像 casperjs/phantomjs 那样)在 page.evaluate() 上下文中添加我们的自定义函数?

例如,包含一个带有辅助函数 x 的文件以调用 Xpath 函数:x('//a/@href')

您可以通过一次单独的 page.evaluate() 调用,把辅助函数注册到浏览器上下文中运行。page.exposeFunction() 看起来很诱人,但它无法访问浏览器上下文(而这里需要 document 对象)。

下面是在浏览器上下文中注册辅助函数的示例,如 $x()

const puppeteer = require('puppeteer');

/**
 * Installs XPath helper functions on the global `window` object.
 *
 * Meant to be handed to `page.evaluate()` so that it executes inside the
 * browser context, where `window`, `document` and `XPathResult` are defined.
 * Once it has run, `$x(xPath)` evaluates `xPath` against the whole document
 * with `XPathResult.FIRST_ORDERED_NODE_TYPE` and returns the result's
 * `singleNodeValue`.
 */
const addHelperFunctions = () => {
    window.$x = (xPath) => {
        const firstMatch = document.evaluate(
            xPath,
            document,
            null,
            XPathResult.FIRST_ORDERED_NODE_TYPE,
            null
        );
        return firstMatch.singleNodeValue;
    };
};

// Example: register the helpers through page.evaluate(), then use them in a
// later page.evaluate() call on the same page.
(async () => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto('https://en.wikipedia.org', { waitUntil: 'networkidle2' });

        // Runs addHelperFunctions inside the page, which defines window.$x there.
        await page.evaluate(addHelperFunctions);

        const text = await page.evaluate(() => {
            // $x() is now available
            const featureArticle = $x('//*[@id="mp-tfa"]');

            return featureArticle.textContent;
        });
        console.log(text);
    } finally {
        // Close the browser even when navigation or evaluation throws, so a
        // failed run does not leave the launched browser open.
        await browser.close();
    }
})().catch((error) => {
    // Report the failure explicitly instead of leaving the promise floating.
    console.error(error);
    process.exitCode = 1;
});

您还可以将辅助函数保存在单独的文件中,然后使用 page.addScriptTag() 将它们注入到浏览器上下文中。

这是一个例子:

helperFunctions.js

// Browser-side helper: evaluates `xPath` against the whole document with
// XPathResult.FIRST_ORDERED_NODE_TYPE and returns the result's
// `singleNodeValue`. Must be loaded in a page (needs `window` and `document`).
window.$x = (xPath) => {
    const firstMatch = document.evaluate(
        xPath,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
    );
    return firstMatch.singleNodeValue;
};

并使用它:

const puppeteer = require('puppeteer');

// Example: inject the helpers from a separate file with page.addScriptTag(),
// then use them in a later page.evaluate() call on the same page.
(async () => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto('https://en.wikipedia.org', { waitUntil: 'networkidle2' });

        // Loads ./helperFunctions.js into the page as a <script> tag.
        await page.addScriptTag({ path: './helperFunctions.js' });

        const text = await page.evaluate(() => {
            // $x() is now available
            const featureArticle = $x('//*[@id="mp-tfa"]');

            return featureArticle.textContent;
        });
        console.log(text);
    } finally {
        // Close the browser even when navigation, injection or evaluation
        // throws, so a failed run does not leave the launched browser open.
        await browser.close();
    }
})().catch((error) => {
    // Report the failure explicitly instead of leaving the promise floating.
    console.error(error);
    process.exitCode = 1;
});

基于 getElementByXPath() 和 getElementsByXPath() 的另一种解决方案。优点是可以相对于某个特定节点(第二个参数)来求值 XPath 表达式。

// Browser-side helper: evaluates `xPath` against the whole document with
// XPathResult.FIRST_ORDERED_NODE_TYPE and returns the result's
// `singleNodeValue`. Must be loaded in a page (needs `window` and `document`).
window.$x = (xPath) => {
    const firstMatch = document.evaluate(
        xPath,
        document,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
    );
    return firstMatch.singleNodeValue;
};

/**
 * Returns the first node matched by an XPath expression.
 *
 * @param {string} expression - XPath expression to evaluate.
 * @param {Node} [scope] - Context node for the evaluation; any falsy value
 *     falls back to `document`.
 * @returns {Node|undefined} The first item of the ordered snapshot, or
 *     `undefined` when the snapshot is empty.
 */
window.getElementByXPath = function getElementByXPath(expression, scope) {
    const contextNode = scope || document;
    const snapshot = document.evaluate(
        expression,
        contextNode,
        null,
        XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
        null
    );
    return snapshot.snapshotLength > 0 ? snapshot.snapshotItem(0) : undefined;
};

/**
 * Returns every node matched by an XPath expression, in snapshot order.
 *
 * @param {string} expression - XPath expression to evaluate.
 * @param {Node} [scope] - Context node for the evaluation; any falsy value
 *     falls back to `document`.
 * @returns {Node[]} A new array holding each snapshot item (empty when
 *     nothing matches).
 */
window.getElementsByXPath = function getElementsByXPath(expression, scope) {
    const contextNode = scope || document;
    const snapshot = document.evaluate(
        expression,
        contextNode,
        null,
        XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
        null
    );
    return Array.from(
        { length: snapshot.snapshotLength },
        (_, index) => snapshot.snapshotItem(index)
    );
};

一个实际应用的代码示例:

const puppeteer = require('puppeteer');

// Example: scrape the rich-list table using the XPath helpers injected from
// ./helperFunctions.js, and print the result as JSON.
(async () => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();

        await page.goto('https://99bitcoins.com/bitcoin-rich-list-top100/#addresses', { waitUntil: 'networkidle2' });
        await page.addScriptTag({ path: './helperFunctions.js' });

        const result = await page.evaluate(() => {
            // Upper bound on the number of entries collected.
            const MAX_ENTRIES = 100;
            const obj = {};
            const rows = getElementsByXPath('//table[@class="t99btc-rich-list"]//tr');

            // Row 0 is skipped (presumably the table header - confirm against
            // the page). Never index past the rows that were actually found:
            // an undefined row would make getElementByXPath fall back to
            // `document` and the lookup would then fail.
            const lastIndex = Math.min(MAX_ENTRIES, rows.length - 1);
            for (let i = 1; i <= lastIndex; i++) {
                const hashNode = getElementByXPath('./td/a', rows[i]);
                const balanceNode = getElementByXPath('./td[3]', rows[i]);
                // Skip rows that lack the expected cells instead of throwing
                // on `.innerText` of undefined.
                if (!hashNode || !balanceNode) {
                    continue;
                }
                obj[i] = {
                    hash: hashNode.innerText,
                    balance: balanceNode.innerText,
                };
            }

            return obj;
        });
        console.log(JSON.stringify(result, null, 4));
    } finally {
        // Close the browser even when navigation, injection or evaluation
        // throws, so a failed run does not leave the launched browser open.
        await browser.close();
    }
})().catch((error) => {
    // Report the failure explicitly instead of leaving the promise floating.
    console.error(error);
    process.exitCode = 1;
});