Puppeteer 从 HTML 数据创建 PDF 文件挂起 Windows 10 个系统

Puppeteer create PDF files from HTML data hangs Windows 10 system

我创建了一个应用程序,通过从多个 excel 工作簿中提取数据来处理学生的成绩。问题是使用 Puppeteer 生成 PDF 文件,使系统陷入循环,直到挂起系统。

实际上,我已经使用捆绑为 pdf-creator-node 的 PhantomJs 测试了下面相同的代码,并且能够在 3 分钟内轻松生成 150 个 PDF 文件。我放弃 PhantomJs 的唯一挑战是 CSS 文件中的所有样式都没有包含,即使我将它作为内联样式插入 header 中,起诉 JS 的替换功能。另一个是 PhantomJs 不再处于积极开发中。我在网上搜索了一下,发现只有 Puppeteer 是有效的解决方案,并且有积极的开发和支持。

我尝试在循环中的 pdfCreator() 末尾使用 page.close(),在 pdfGenerator() 末尾使用 browser.close()。我做错了什么?

以下是server.js和PdfGenerator.js文件中的代码,以及错误示例,以及系统爬出挂起状态后我的任务管理器的屏幕截图。对于 HTML 代,我使用了 Mustache。我排除了 server.js 中的一些代码行,因为总字符数超过 60k。

server.js


// [codes were removed here]


        if(getCode == 'compute-result') {
          // declare variable
          let setData = null;
          let setTitle = 'Results Computation...';
          let setArgs = getArgs;
          // dataFromFile = ReadFile(pathCodeTextFile);
          // setArgs = Number(dataFromFile);
          setCode = 'compute-result';
          let setView = [];
          let setNext = true;
          let countTerms = [];
          
          // if(getArg > 0) {

            // Final Result computation
            const getJson = ReadFile(pathJsonResults);
            // const getCtrl = ReadFile(pathJsonCtrl);
            const getResultObject = JSON.parse(getJson);
            getResult = getResultObject;
            const totalResults = getResult.firstTerm.length + getResult.secondTerm.length + getResult.thirdTerm.length;

            if(setView.length < 1 && getResult != null) {
              setData = 'PDFs for Students Results initiating...';
              setView.unshift('Reading saved data...');
              client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: null, view: JSON.stringify(setView)});
            }

          Sleep(2000).then(() => {

            if(getResult != null) {          
              setData = 'Students Results will be ready in a moment';
              client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
            }

            const wacthFiles = (file, className, termName, sessionName, completed, pdfList) => {
              try {
                if(typeof file == 'string' && !FileExists(pathJsonPdfList)) {

                  if(pdfList.length < 2){
                    setData = 'Saving PDFs to downladable files...';
                  }

                  if(className != null && termName != null && sessionName != null) {
                    setTitle = `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}...`;
                    setView.unshift(file);
                    if(!countTerms.includes(termName)) {
                      countTerms.push(termName)
                    }

                    // setCode = -1000 - pdfList.length;
                    // console.log('PDF PROGRESS: ', `${pdfList.length} Result PDF${pdfList.length > 1?'s':''}... ${setCode}`);
                  
                    // when all PDFs are created
                    if(completed) {
                      setTitle = setTitle.replace('...', ' [completed]');
                      setData = 'Result Download button is Active. You may click it now.';
                      setView.unshift('=== PDF GENERATION COMPLETED ===');
                      setView.unshift(`A total of ${pdfList.length} students' Results were generated`);
                      WriteFile(pathJsonPdfList, JSON.stringify(pdfList));

                      // set donwload button active
                      setCode = Number(codeTextFilePdfCompleted);
                      setNext = false;
                      getResult = null;
                      let termString = countTerms.toString();
                      termString = ReplaceAll(termString, '-term', '');
                      termString = ReplaceAll(termString, ',', '-');
                      const addTxt = `${className} _${termString} Term${countTerms.length>1?'s':''} (${sessionName})`;
                      WriteFile(pathCodeTextFile, addTxt);
                      // console.log('======== PDF GENERATION ENDS ================');
                    } else {
                      setCode = -1 * pdfList.length;
                    }
                      client.emit('query', {data: setData, title: setTitle, code: setCode, next: setNext, args: setArgs, view: JSON.stringify(setView)});
                    }
                }
                
              } catch (error) {
                console.log('ERROR ON WATCHER: ', error);
              }
            }


            if(!FileExists(pathJsonPdfList) && getResult !== null) {
              PdfGenerator(getResult, wacthFiles);
            }

            // Watcher(pathWatchResults, setCode, wacthDir, 10000);
          });
          // }
        }


      }
    } catch (error) {
  })

  client.on('disconnect', () => {
    console.log('SERVER: Disconnected');
});



server.listen(portApi, () =>{
  console.log('Server listens on port 8881')
});

// serve static files
app.use(express.static(pathPublic));

// [codes were removed here]

PdfGenerator.js 问题在于这些功能:PdfGenerator & createPdf

'use strict';
process.setMaxListeners(Infinity) // fix for Puppeteer MaxListenerExceededWarning
const Puppeteer = require('puppeteer')
const {HtmlGenerator} = require('../components/HtmlGenerator')
const {WriteFile, FileExists, RandomNumber, RoundNumber, IsNumberFraction, ReadFile} = require('../components/Functions')


if (process.env.NODE_ENV !== 'production') {
    require('dotenv').config();
}

const pathFirstTermResults = process.env.DIR_FIRST_TERM_RESULTS;
const pathSecondTermResults = process.env.DIR_SECOND_TERM_RESULTS;
const pathThirdTermResults = process.env.DIR_THIRD_TERM_RESULTS;
const publicDir = process.env.DIR_PUBLIC;
const cssFile = process.env.PATH_CSS_FILENAME;
const pathCssRaw = __dirname + '\' + publicDir + '\' + cssFile;
const pathCss = pathCssRaw.replace(`\uploads`, '');
const tagCssReplace = process.env.TAG_CSS_REPLACE;
let jsonDir = process.env.PATH_JSON;
jsonDir = jsonDir.split('/').pop();
let htmlDir = process.env.DIR_HTML;
htmlDir = __dirname + '\' + htmlDir.split('/').pop();
const htmlType1 = htmlDir +  '\' + process.env.HTML_TYPE1;
const htmlType2 = htmlDir +  '\' + process.env.HTML_TYPE2;
const htmlType3 = htmlDir +  '\' + process.env.HTML_TYPE3;
const pathJsonPdfList = './' + jsonDir + '/' + process.env.JSON_PDF_LIST_FILENAME;
const pathJsonPdfContent = __dirname + '\' + jsonDir + '\' + process.env.JSON_PDF_CONTENT;

const firstTermDir = 'first-term';
const secondTermDir = 'second-term';
const thirdTermDir = 'third-term';

let cumulativeFirstTermTotalList = {};
let cumulativeSecondTermTotalList = {};

let firstTermOnce = true;
let secondTermOnce = true;
let thirdTermOnce = true;
let isActive = false;

const getPath = (p, f) => {
    let dir = pathFirstTermResults;
    switch (p) {
        case firstTermDir:
            dir = pathFirstTermResults;
            break;
        case secondTermDir:
            dir = pathSecondTermResults;
            break;
        case thirdTermDir:
            dir = pathThirdTermResults;
            break;
    
        default:
            break;
    }
    return dir + f
}

const resolution = {
    x: 1920,
    y: 1080
}

const args = [
    '--disable-gpu',
    `--window-size=${resolution.x},${resolution.y}`,
    '--no-sandbox',
]

const createPdf = (page, content, templateType, filename, className, term, sessionName, isProcessActive, pdfFileList, cb) => {
    
    let path, document, options;
    path = getPath(term, filename);

    if(path != null) {

        let options = {
            path: path,
            format: 'A4',
            printBackground: true,
            margin: {
                left: '0px',
                top: '0px',
                right: '0px',
                bottom: '0px'
            }
        }
        
        let templateData = '';
        switch (templateType) {
            case '1':
                templateData = ReadFile(htmlType1);
                break;
            case '2':
                templateData = ReadFile(htmlType2);
                break;
            case '3':
                templateData = ReadFile(htmlType3);
                break;
        
            default:
                templateData = ReadFile(htmlType1);
                break;
        }
        
        (async() => {
            const html = HtmlGenerator(content, templateData);

            if(html != undefined && html !== '' && html != null) {
            // create PDF file
            cb(filename, className, term, sessionName, isProcessActive, pdfFileList);

                // get style from .css & replace
                const css = ReadFile(pathCss);

                await page.setContent(html, { waitUntil: 'networkidle0'});
                await page.addStyleTag(css);
                await page.pdf(options);
                page.close();
            }
        })()
    }
}


const pdfGenerator = (json, cb) => {
    let data  = {};
    let pdfFileList = [];

    if(typeof json == 'string') {
        data = JSON.parse(json)
    } else {
        data = json;
    }

    try {        

    // declare defaults
    let filename = 'Student' + '.pdf';
    let termName = firstTermDir;
    const templateType = data.keys.templateType;
    const session = data.classInfo.Session;
    const sessionName = session.replace('/', '-');
    const students = data.students;
    const className = data.classInfo.Class_Name;
    const recordFirstTerm = data.firstTerm;
    const recordSecondTerm = data.secondTerm;
    const recordThirdTerm = data.thirdTerm;
    
    let pdfCreatedList = [];
    let isReset = false;

    let totalResultsExpected = Object.keys(recordFirstTerm).length + Object.keys(recordSecondTerm).length + Object.keys(recordThirdTerm).length;
    let totalResultsCount = 0;
    let jsonForPdf = {};
    let record = {};
    let sRecord, path, id, fName, lName;

    // get each student 
    let logEndOnce = true;
    let logBeforeOnce = true;
    logBeforeOnce && console.log('==============    ***     ================');
    logBeforeOnce && console.log('======== PDF GENERATION BEGINS ================');
    

    const computeResult = (page, setTerm, setRecord, setReset) => {
        const termName = setTerm;
        const record = setRecord;
        let isReset = setReset;

        logBeforeOnce && console.log(`====== ${termName} RESULTS BEGINS ======`);
            for(let elem of students){
                id = elem.id;
                fName = elem.firstName;
                lName = elem.lastName;
                filename = `${lName} ${fName} _${termName} ${sessionName}.pdf`;
                // sRecord = record.filter(function (entry) { return entry[id] !== undefined; });
                sRecord = record[id];
                path = getPath(termName, filename);
    
                // create pdf
                if(!FileExists(path) && !FileExists(pathJsonPdfList)){
                
                    // generate final JSON for the student
                    // isReset = (pdfCreatedList.includes(id))? false: true;
                    
                    jsonForPdf = finalJson(elem, sRecord, data, termName);
                    (pdfFileList.length < 1) && WriteFile(pathJsonPdfContent, JSON.stringify(jsonForPdf));
        
                    pdfFileList.push({
                      'term': termName,
                      'file': filename
                    });
                    totalResultsCount = pdfFileList.length;
                    const pdfDate = new Date();
                    console.log(`${filename} (${totalResultsCount}/${totalResultsExpected}) at ${pdfDate.getHours()}hr${pdfDate.getHours()>1?'s':''} - ${pdfDate.getMinutes()}min${pdfDate.getMinutes()>1?'s':''} - ${pdfDate.getSeconds()}sec${pdfDate.getSeconds()>1?'s':''}`);

                    isActive = (totalResultsExpected === totalResultsCount)? true: false;
                    logEndOnce = false;
                    // cb(filename, className, termName, sessionName, isActive, pdfFileList);
                    // WriteFile(path, null);
                    isReset = true;
                    createPdf(page, jsonForPdf, templateType, filename, className, termName, sessionName, isActive, pdfFileList, cb);
                }
            }


            logBeforeOnce && console.log(`====== ${termName} RESULTS ENDS ======`);
    }

    // get each student result for First Term
    const computeFirstTerm = (p) => {
        return new Promise((resolve) => {
            if(data.keys.firstTerm === '1') {
                termName = firstTermDir;
                record = recordFirstTerm;
                pdfCreatedList = [];
                isReset = false;

                computeResult(p, termName, record, isReset)
            }
            resolve()
        })
    }

    // get each student result for Second Term
    const computeSecondTerm = (p) => {
        return new Promise((resolve) => {
            if(data.keys.secondTerm === '1') {
                termName = secondTermDir;
                record = recordSecondTerm;
                pdfCreatedList = [];
                isReset = false;

                computeResult(p, termName, record, isReset)
            }
            resolve()
        })
    }

    // get each student result for Third Term
    const computeThirdTerm = (p) => {
        return new Promise((resolve) => {
            if(data.keys.thirdTerm === '1') {
                termName = thirdTermDir;
                record = recordThirdTerm;
                pdfCreatedList = [];
                isReset = false;

                computeResult(p, termName, record, isReset)
            }
            resolve()
        })
    }

    (async () => {
        browser = await Puppeteer.launch({
            headless: true,
            handleSIGINT: false,
            args: args,
        });

        const page = await browser.newPage();
    
        await page.setViewport({
            width: resolution.x,
            height: resolution.y,
        })

        await computeFirstTerm(page);
        await computeSecondTerm(page);
        await computeThirdTerm(page);
        browser.close()
    })()
    

    
    if(totalResultsExpected === totalResultsCount && totalResultsCount !== 0 && !logEndOnce) {
        logEndOnce = true;
        logBeforeOnce = false;
        console.log('======== PDF GENERATION ENDS ================');
    }



    } catch (error) {
        console.log('==== ERROR IN PDF GENERATION: ', error)
    }


}

module.exports = {
    PdfGenerator: pdfGenerator
}

错误

info Visit https://yarnpkg.com/en/docs/cli/run for documentation about this command.

lerna ERR! yarn run start stderr:

<--- Last few GCs --->

[9884:000002D68A73C6B0]  1665171 ms: Scavenge 44.1 (45.8) -> 43.2 (45.8) MB, 223.9 / 0.0 ms  (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0]  1684089 ms: Scavenge 44.1 (45.8) -> 43.3 (45.8) MB, 587.3 / 0.0 ms  (average mu = 0.956, current mu = 0.952) allocation failure
[9884:000002D68A73C6B0]  1749901 ms: Scavenge 44.2 (45.8) -> 43.3 (45.8) MB, 5099.0 / 0.0 ms  (average mu = 0.956, current mu = 0.952) allocation failure


<--- JS stacktrace --->

FATAL ERROR: Committing semi space failed. Allocation failed - JavaScript heap out of memory
 1: 00007FF6ED61013F
 2: 00007FF6ED59F396
 3: 00007FF6ED5A024D
 4: 00007FF6EDED19EE
 5: 00007FF6EDEBBECD
 6: 00007FF6EDD5F61C
 7: 00007FF6EDD6933F
 8: 00007FF6EDD5BF19
 9: 00007FF6EDD5A0D0
10: 00007FF6EDD7EA06
11: 00007FF6EDAB1CD5
12: 00007FF6EDF5F3E1
13: 00007FF6EDF602E9
14: 000002D68C4EF69E
error Command failed with exit code 134.

任务管理器的屏幕截图,Chromium 运行 超过 50 的多个实例。

感谢任何帮助。我希望这可以解决,让我顺利生成 PDF。 谢谢。

示例解决方案(限制并行浏览器)

我为您创建了一个 PdfPrinter class,您可以将其集成到您的设置中。它允许您限制并行 pdf 生成作业的数量,并允许设置限制并为您管理 opening/closing 浏览器。 PdfPrinter class 也是高度耦合的,需要进行一些修改才能将其用作通用队列。从逻辑上讲,这可以修改为通用队列。

您可以尝试将其集成到您的代码中。这是一个完整的工作测试示例,带有简化的 pdf(没有从 excel..)

获取实际数据的部分

据我了解您的代码,您不需要在所有函数周围传递 page。首先创建您的 html + css 然后使用 pdfPrinter 并让它处理 page 创建 + 浏览器启动..

(我喜欢编写这样的代码,所以我直接往前走..)


var puppeteer = require('puppeteer')

const defaultPrinterOptions = {
    format: 'A4',
    printBackground: true,
    margin: {
        left: '0px',
        top: '0px',
        right: '0px',
        bottom: '0px'
    }
}

class PdfPrinter {

    maxBrowsers = 2
    enqueuedPrintJobs = []
    failedJobs = []
    browserInstances = 0

    // max browser instances in parallel 
    constructor(maxBrowsers) {
        this.maxBrowsers = maxBrowsers
    }

    /**
     * 
     * @param {*} html the html content to print
     * @param {*} css to apply to the page
     * @param {*} printOptions options passed to puppeteer
     */
    // enqueues a print but the exact end moment cannot be known..
    enqueuePrint = (html, css, path, done) => {
        // merge custom options with defaultOptions..
        const printOptions = {
            ...defaultPrinterOptions,

            // add the path to the options.
            path: path
        }

        // create a function which can be stored in an array
        // it will later be grabbed by startPrinter() OR at the time any 
        // brwoser freed up.. 
        // the function needs to be passed the actual used browser instance!
        this.enqueuedPrintJobs.push(async(browser) => {

            // catch the error which may be produced when printing something..
            try {
                // print the document
                await this.print(browser, html, css, printOptions)
            } catch (err) {
                console.error('error when printing document..CLosing browser and starting a new job!!', printOptions.path)
                console.error(err)

                // store someting so you now what failed and coudl be retried or something..
                this.failedJobs.push({ html, css, path: printOptions.path })

                // puppeteer can run into erros too!! 
                // so close the browser and launch a new one!
                await this.closeBrowser(browser)
                browser = await this.launchBrowser()
            }

            // after the print, call done() so the promise is resovled in the right moment when 
            // this particular print has ended.!
            done()

            // start the next job right now  if there are any left.
            const job = this.enqueuedPrintJobs.shift()

            if (!job) {
                console.log('No print jobs available anymore. CLosing this browser instance.. Remaining browsers now:', this.maxBrowsers - this.browserInstances + 1)
                await this.closeBrowser(browser)
                return
            }

            // job is actually this function itself! It will be executed
            // and automatically grab a new job after completion :)
            // we pass the same browser instance to the next job!.
            await job(browser)
        })

        // whenever a print job added make sure to start the printer
        // this starts new browser instances if the limit is not exceeded resp. if no browser is instantiated yet,
        // and does nothing if maximum browser count is reached..
        this.tryStartPrinter()
    }

    // same as enqueuePrint except it wraps it in a promise so we can now the
    // exact end moment and await it..
    enqueuePrintPromise(html, css, path) {
        return new Promise((resolve, reject) => {
            try {
                this.enqueuePrint(html, css, path, resolve)
            } catch (err) {
                console.error('unexpected error when setting up print job..', err)
                reject(err)
            }
        })

    }

    // If browser instance limit is not reached will isntantiate a new one and run a print job with it.
    // a print job will automatically grab a next job with the created browser if there are any left.
    tryStartPrinter = async() => {

        // Max browser count in use OR no jobs left.
        if (this.browserInstances >= this.maxBrowsers || this.enqueuedPrintJobs.length === 0) {
            return
        }
        // browser instances available! 
        // create a new one 

        console.log('launching new browser. Available after launch:', this.maxBrowsers - this.browserInstances - 1)
        const browser = await this.launchBrowser()
        
        // run job
        const job = this.enqueuedPrintJobs.shift()
        await job(browser)

    }

    closeBrowser = async(browser) => {


        // decrement browsers in use!
        // important to call before closing browser!!
        this.browserInstances--
        await browser.close()

    }

    launchBrowser = async() => {
        // increment browsers in use!
        // important to increase before actualy launching (async stuff..)
        this.browserInstances++

        // this code you have to adjust according your enviromnemt..
        const browser = await puppeteer.launch({ headless: true })

        return browser
    }


    // The actual print function which creates a pdf.
    print = async(browser, html, css, printOptions) => {

        console.log('Converting page to pdf. path:', printOptions.path)
            // Run pdf creation in seperate page.
        const page = await browser.newPage()

        await page.setContent(html, { waitUntil: 'networkidle0' });
        await page.addStyleTag({ content: css });
        await page.pdf(printOptions);
        await page.close();

    }

}

// testing the PDFPrinter with some jobs.
// make sure to run the printer in an `async` function so u can 
// use await... 
const testPrinterQueue = async() => {

    // config
    const maxOpenedBrowsers = 5 // amount of browser instances which are allowed to be opened in parallel
    const testJobCount = 100 // amount of test pdf jobs to be created
    const destDir = 'C:\somepath' // the directory to store the pdfs in..


    // create sample jobs for testing...
    const jobs = []
    for (let i = 0; i < testJobCount; i++) {
        jobs.push({
            html: `<h1>job number [${i}]</h1>`,
            css: 'h1 { background-color: red; }',
            path: require('path').join(destDir, `pdf_${i}.pdf`)
        })
    }

    // track time
    const label = 'printed a total of ' + testJobCount + ' pdfs!'
    console.time(label)

    // run the actual pdf generation..
    const printer = new PdfPrinter(maxOpenedBrowsers)

    const jobProms = []
    for (let job of jobs) {

        // run jobs in parallel. Each job wil be runned async and return a Promise therefor
        jobProms.push(
            printer.enqueuePrintPromise(job.html, job.css, job.path)
        )
    }

    console.log('All jobs enqueued!! Wating for finish now.')

    // helper function which awaits all the print jobs, resp. an array of promises.
    await Promise.all(jobProms)
    console.timeEnd(label)

    // failed jobs::
    console.log('jobs failed:', printer.failedJobs)

    // as file:
    await require('fs').promises.writeFile('failed-jobs.json', JSON.stringify(printer.failedJobs))
}


testPrinterQueue().then(() => {
    console.log('done with everyting..')
}).catch(err => {
    console.error('unexpected error occured while printing all pages...', err)
})

您只需调整 testPrinterQueue() 开头的 destDir / openedBrowserstestJobCount 变量即可使其正常工作。

是什么导致了您的代码中的问题

我们来看看这篇

(async () => {
        browser = await Puppeteer.launch({
            headless: true,
            handleSIGINT: false,
            args: args,
        });

        const page = await browser.newPage();
    
        await page.setViewport({
            width: resolution.x,
            height: resolution.y,
        })

        await computeFirstTerm(page);
        await computeSecondTerm(page);
        await computeThirdTerm(page);
        browser.close()
    })()

您创建了一个立即执行的匿名函数。在函数内,使用 await 正确等待所有语句。但是,如果您 运行 在应用程序的同步部分中使用这整个部分,则整个功能将立即启动,但不会等待 运行 下一个代码。

检查这个例子:

//utility
function wait(ms) {
    return new Promise(resolve => {
        setTimeout(resolve, ms)
    })
}

const AsyncFunction = async() => {
    console.log('Async named function started')
        // simulate execution time of 2 seconds
    await wait(2000)

    console.log('Async named function ended')
};


function SyncFunction() {
    console.log('sync function started')

    // example of async function execution within a sync function..
    AsyncFunction();

    // what you have done in your code:
    (async() => {
        console.log('Async anonymus function started')
        await wait(3000)
        console.log('Async anonymus function ended')

    })()


    // what
    console.log('sync function ended.')
}

SyncFunction()
console.log('done')

注意输出:

Async named function started
Async anonymus function started
sync function ended. // => sync function already ended 
done   // sync function ended and code continues execution.
Async named function ended
Async anonymus function ended

要正确等待 async 内容,您需要将整个应用程序置于异步范围内:

//utility
function wait(ms) {
    return new Promise(resolve => {
        setTimeout(resolve, ms)
    })
}

const AsyncFunction = async() => {
    console.log('Async named function started')
        // simulate execution time of 2 seconds
    await wait(2000)

    console.log('Async named function ended')
};

// this is now async!!
async function SyncFunction() {
    console.log('sync function started')

    // example of async function execution within a sync function..
    await AsyncFunction();

    // what you have done in your code:
    await (async() => {
        console.log('Async anonymus function started')
        await wait(3000)
        console.log('Async anonymus function ended')

    })()


    // what
    console.log('sync function ended.')
}

SyncFunction().then(() => {
    console.log('done')
}).catch(err => {
    console.error('unexpected error occured..')
})

这个输出就是我们想要的

sync function started
Async named function started
Async named function ended
Async anonymus function started
Async anonymus function ended
sync function ended.
done

希望这能帮助你理解。

随时发表评论。