使用nodejs服务器和reactjs网页从pdf中提取文本的问题

Question

下面是我的代码 textractUtils.js -

const _ = require("lodash");
const aws = require("aws-sdk");
const config = require("./config");

aws.config.update({
  accessKeyId: config.awsAccesskeyID,
  secretAccessKey: config.awsSecretAccessKey,
  region: config.awsRegion
});

const textract = new aws.Textract();

const getText = (result, blocksMap) => {
  let text = "";

  if (_.has(result, "Relationships")) {
    result.Relationships.forEach(relationship => {
      if (relationship.Type === "CHILD") {
        relationship.Ids.forEach(childId => {
          const word = blocksMap[childId];
          if (word.BlockType === "WORD") {
            text += `${word.Text} `;
          }
          if (word.BlockType === "SELECTION_ELEMENT") {
            if (word.SelectionStatus === "SELECTED") {
              text += `X `;
            }
          }
        });
      }
    });
  }

  return text.trim();
};

const findValueBlock = (keyBlock, valueMap) => {
  let valueBlock;
  keyBlock.Relationships.forEach(relationship => {
    if (relationship.Type === "VALUE") {
      // eslint-disable-next-line array-callback-return
      relationship.Ids.every(valueId => {
        if (_.has(valueMap, valueId)) {
          valueBlock = valueMap[valueId];
          return false;
        }
      });
    }
  });

  return valueBlock;
};

const getKeyValueRelationship = (keyMap, valueMap, blockMap) => {
  const keyValues = {};

  const keyMapValues = _.values(keyMap);

  keyMapValues.forEach(keyMapValue => {
    const valueBlock = findValueBlock(keyMapValue, valueMap);
    const key = getText(keyMapValue, blockMap);
    const value = getText(valueBlock, blockMap);
    keyValues[key] = value;
  });

  return keyValues;
};

const getKeyValueMap = blocks => {
  const keyMap = {};
  const valueMap = {};
  const blockMap = {};

  let blockId;
  blocks.forEach(block => {
    blockId = block.Id;
    blockMap[blockId] = block;

    if (block.BlockType === "KEY_VALUE_SET") {
      if (_.includes(block.EntityTypes, "KEY")) {
        keyMap[blockId] = block;
      } else {
        valueMap[blockId] = block;
      }
    }
  });

  return { keyMap, valueMap, blockMap };
};

module.exports = async buffer => {
  const params = {
    Document: {
      /* required */
      Bytes: buffer
    },
    FeatureTypes: ["FORMS"]
  };

  const request = textract.analyzeDocument(params);
  const data = await request.promise();

  if (data && data.Blocks) {
    const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks);
    const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap);

    return keyValues;
  }

  // in case no blocks are found return undefined
  return undefined;
};

它适用于图像但不适用于 pdf（既不是单页也不是多页）。下面是我导入pdf-运行的时候出现的错误-

(node:2001) UnhandledPromiseRejectionWarning: UnsupportedDocumentException: Request has unsupported document format
    at Request.extractError (/home/<user>/textract-lab/node_modules/aws-sdk/lib/protocol/json.js:51:27)
    at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:106:20)
    at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:78:10)
    at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:683:14)
    at Request.transition (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:22:10)
    at AcceptorStateMachine.runTo (/home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:14:12)
    at /home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:26:10
    at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:38:9)
    at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:685:12)
    at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:116:18)
(node:2001) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:2001) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.

我尝试过的是非文本包含图像、文本包含图像、table 包含图像、单页 pdf 和多页 pdf。我还有一个概念上的疑问，如果我已经导入了 aws-sdk，为什么我应该为 pdf 编码，因为 aws-sdk for textract 会处理 pdf、png、jpeg 和 jpg 格式的图像？我必须对 textractUtils.js 进行哪些更改才能处理 epdf 文件？

Answer 1

AnalyzeDocumentAPI操作仅支持PNG或JPEG格式的图片。来自 Textract documentation:

Amazon Textract synchronous operations (DetectDocumentText and AnalyzeDocument) support the PNG and JPEG image formats. Asynchronous operations (StartDocumentTextDetection, StartDocumentAnalysis) also support the PDF file format.

您应该使用异步操作来处理您的 PDF 文档。否则，解决方法是在您的代码中将 PDF 文档转换为图像，然后对这些图像使用同步 API 操作来处理文档。

使用nodejs服务器和reactjs网页从pdf中提取文本的问题

Problem with text extraction from pdf using nodejs server and reactjs webpage

machine-learning

amazon-web-services

node.js

reactjs

amazon-textract