PDFJS Firebase Cloud Functions:证书已过期

PDFJS Firebase Cloud Functions: certificate has expired

我正在使用 PDFJS 从 PDF 文档中提取纯文本,并通过 Firebase Cloud Functions 定时执行提取任务。一切运行正常,直到有一天部分 PDF 在提取时开始返回 certificate has expired 错误,这些 PDF 主要来自两个域名。

我检查过,这些受影响的域名仍然拥有有效的 SSL 证书,并且在本地机器上运行纯文本提取代码没有任何问题。但一旦部署到 Firebase Cloud Functions,就会抛出 certificate has expired 错误。

Error
    at BaseExceptionClosure (/srv/node_modules/pdfjs-dist/build/pdf.js:666:29)
    at Object.<anonymous> (/srv/node_modules/pdfjs-dist/build/pdf.js:669:2)
    at __w_pdfjs_require__ (/srv/node_modules/pdfjs-dist/build/pdf.js:52:30)
    at Object.defineProperty.value (/srv/node_modules/pdfjs-dist/build/pdf.js:129:23)
    at __w_pdfjs_require__ (/srv/node_modules/pdfjs-dist/build/pdf.js:52:30)
    at pdfjsVersion (/srv/node_modules/pdfjs-dist/build/pdf.js:116:18)
    at /srv/node_modules/pdfjs-dist/build/pdf.js:119:10
    at webpackUniversalModuleDefinition (/srv/node_modules/pdfjs-dist/build/pdf.js:25:20)
    at Object.<anonymous> (/srv/node_modules/pdfjs-dist/build/pdf.js:32:3)
    at Module._compile (module.js:653:30)
    at Object.Module._extensions..js (module.js:664:10)
    at Module.load (module.js:566:32)
    at tryModuleLoad (module.js:506:12)
    at Function.Module._load (module.js:498:3)
    at Module.require (module.js:597:17)
    at require (internal/module.js:11:18)
    at Object.<anonymous> (/srv/pdf/pdf.js:7:18)
    at Module._compile (module.js:653:30)
    at Object.Module._extensions..js (module.js:664:10)
    at Module.load (module.js:566:32)
    at tryModuleLoad (module.js:506:12)
    at Function.Module._load (module.js:498:3)
  message: 'certificate has expired',
  name: 'UnknownErrorException',
  details: 'UnknownErrorException: certificate has expired' }" 

代码:

const pdfjslib = require('pdfjs-dist');
const functions = require('firebase-functions');

module.exports = functions.https.onRequest((req, res) => {
    let url = req.query.url

    return extractPlainTextFromPdf(url)
    .then(pb => {
        return res.send(pb)
    })
    .catch(err => {
        console.log(err)
        return res.send("Err occured")
    })
});

/**
 * Extracts the plain text of the PDF located at `pdfUrl`.
 *
 * Builds the loader options with setupPdfOptions and delegates the actual
 * work to getPlainBody. A failure is logged and then re-thrown, so the
 * returned promise rejects and the caller's own `.catch` handler runs.
 *
 * @param {string} pdfUrl - Location of the PDF to read.
 * @returns {Promise<*>} Resolves with whatever getPlainBody resolves with;
 *     rejects with the original error when extraction fails.
 */
function extractPlainTextFromPdf(pdfUrl) {
    const options = setupPdfOptions(pdfUrl)
    return getPlainBody(options)
    .catch((err) => {
        console.log("Err plainBody", err)
        // Re-throw: swallowing the error here would resolve the promise with
        // undefined and hide the failure from the caller.
        throw err
    })
}

/**
 * Loads the PDF described by `options` and hands it to extractTexts.
 *
 * @param {object} options - Loader options, forwarded unchanged to getDocument.
 * @returns {Promise<*>} Resolves with the result of
 *     extractTexts(document, document.numPages); rejects if loading or
 *     extraction fails.
 */
async function getPlainBody(options) {
    const pdfDocument = await getDocument(options)
    const pageCount = pdfDocument.numPages
    return extractTexts(pdfDocument, pageCount)
}

/**
 * Starts loading a PDF through pdfjslib and exposes the loaded document.
 *
 * @param {object} options - Passed unchanged to pdfjslib.getDocument.
 * @returns {Promise<*>} Resolves with the document that the loading task's
 *     `promise` resolves with.
 */
function getDocument(options) {
    const { promise: documentReady } = pdfjslib.getDocument(options)
    // The pass-through continuation hands callers a derived promise instead
    // of the loading task's own promise object.
    return documentReady.then((pdfDocument) => pdfDocument)
}

/**
 * Builds the options object used to load a PDF from `url`.
 *
 * @param {string} url - Location of the PDF.
 * @returns {{url: string, httpHeaders: Object}} The given `url` together with
 *     an `httpHeaders` map holding a fixed "User-Agent" entry.
 */
function setupPdfOptions(url) {
    const httpHeaders = { "User-Agent": "MY-USER-AGENT" }
    return { url, httpHeaders }
}

这是面临上述问题的两个示例 pdf。

https://www.nea.gov.sg/docs/default-source/our-services/building-planning/notification-of-new-edition-of-code-of-practice-on-environment-health-(2020-edition).pdf

https://www.nparks.gov.sg/-/media/nparks-real-content/partner-us/developers-architects-and-engineers/circular_2020_0106_nparks.pdf?la=en&hash=F25A74CC8667D5D98EDF3A9C186E235330D228A8

编辑:

//package.json
{
  "name": "functions",
  "description": "Cloud Functions for Firebase",
  "scripts": {
    "serve": "firebase serve --only functions",
    "shell": "firebase functions:shell",
    "start": "npm run shell",
    "deploy": "firebase deploy --only functions",
    "logs": "firebase functions:log"
  },
  "engines": {
    "node": "8"
  },
  "dependencies": {
    "@google-cloud/functions-framework": "^1.5.1",
    "@google-cloud/vision": "^1.11.0",
    "aws-sdk": "^2.667.0",
    "axios": "^0.19.2",
    "cheerio": "^1.0.0-rc.3",
    "diff-match-patch": "^1.0.4",
    "firebase-admin": "^8.11.0",
    "firebase-functions": "^3.6.1",
    "moment": "^2.25.0",
    "nodemailer": "^6.4.6",
    "pdfjs-dist": "^2.3.200",
    "request": "^2.88.2",
    "request-promise": "^4.2.5"
  },
  "devDependencies": {
    "firebase-functions-test": "^0.1.6"
  },
  "private": true
}

Cloud Functions 上的 Node.js 8 运行时已被弃用(Node.js 8 on Cloud Functions is deprecated)。我认为 Node 8 运行时中的 openssl 以及其他一些软件包已经过时,从而导致了这类奇怪的 SSL 问题;我在一些老旧的 Linux 发行版(例如 Ubuntu 10.04)上也遇到过同样的现象。

"The Node.js 8 runtime will be deprecated on 2020-06-05. To ensure that your functions are on a supported version of Node.js, migrate them to Node.js 10."