使用nodejs服务器和reactjs网页从pdf中提取文本的问题
Problem with text extraction from pdf using nodejs server and reactjs webpage
下面是我的代码 textractUtils.js -
const _ = require("lodash");
const aws = require("aws-sdk");
const config = require("./config");
// Apply the credentials and region read from ./config to the AWS SDK's global configuration.
aws.config.update({
accessKeyId: config.awsAccesskeyID,
secretAccessKey: config.awsSecretAccessKey,
region: config.awsRegion
});
// Textract client shared by this module; constructed after the global config update above.
const textract = new aws.Textract();
/**
 * Concatenates the text carried by a block's CHILD blocks.
 *
 * WORD children contribute their `Text`; SELECTION_ELEMENT children
 * contribute an "X" when their `SelectionStatus` is "SELECTED". All other
 * child block types are ignored.
 *
 * @param {object} [result] - Block whose `Relationships` are walked. A missing
 *   block, or one without a `Relationships` array, yields "".
 * @param {object} blocksMap - Lookup of block Id -> block.
 * @returns {string} Space-separated text with surrounding whitespace trimmed.
 */
const getText = (result, blocksMap) => {
  if (!result || !Array.isArray(result.Relationships)) {
    return "";
  }
  let text = "";
  result.Relationships.forEach(relationship => {
    if (relationship.Type !== "CHILD" || !Array.isArray(relationship.Ids)) {
      return;
    }
    relationship.Ids.forEach(childId => {
      const child = blocksMap ? blocksMap[childId] : undefined;
      // A referenced Id may be absent from the lookup; skip it instead of
      // throwing on `undefined.BlockType`.
      if (!child) {
        return;
      }
      if (child.BlockType === "WORD") {
        text += `${child.Text} `;
      } else if (
        child.BlockType === "SELECTION_ELEMENT" &&
        child.SelectionStatus === "SELECTED"
      ) {
        text += `X `;
      }
    });
  });
  return text.trim();
};
/**
 * Finds the VALUE block that a KEY block points at.
 *
 * For every relationship of type "VALUE" on `keyBlock`, the Ids are scanned
 * in order and the first one present in `valueMap` is taken. If several
 * VALUE relationships match, the last relationship's match is returned.
 *
 * @param {object} keyBlock - KEY block; its `Relationships` array is scanned.
 * @param {object} valueMap - Lookup of block Id -> VALUE block.
 * @returns {object|undefined} The matching VALUE block, or undefined when
 *   there is no match or `keyBlock` has no relationships.
 */
const findValueBlock = (keyBlock, valueMap) => {
  const relationships =
    keyBlock && Array.isArray(keyBlock.Relationships) ? keyBlock.Relationships : [];
  if (valueMap == null) {
    return undefined;
  }
  let valueBlock;
  relationships.forEach(relationship => {
    if (relationship.Type !== "VALUE" || !Array.isArray(relationship.Ids)) {
      return;
    }
    // Explicit loop with break: the earlier `every` callback returned
    // undefined for non-matching Ids, which stopped the scan after the
    // first Id whether or not it matched.
    for (const valueId of relationship.Ids) {
      if (Object.prototype.hasOwnProperty.call(valueMap, valueId)) {
        valueBlock = valueMap[valueId];
        break;
      }
    }
  });
  return valueBlock;
};
/**
 * Builds a plain object mapping each key block's text to the text of the
 * value block found for it.
 *
 * @param {object} keyMap - Lookup of block Id -> KEY block.
 * @param {object} valueMap - Lookup of block Id -> VALUE block.
 * @param {object} blockMap - Lookup of block Id -> block, used to resolve text.
 * @returns {object} Key text -> value text. A later key with the same text
 *   overwrites an earlier one.
 */
const getKeyValueRelationship = (keyMap, valueMap, blockMap) =>
  _.values(keyMap).reduce((pairs, keyBlock) => {
    const matchedValue = findValueBlock(keyBlock, valueMap);
    const label = getText(keyBlock, blockMap);
    const content = getText(matchedValue, blockMap);
    pairs[label] = content;
    return pairs;
  }, {});
/**
 * Indexes a list of blocks by Id.
 *
 * Every block goes into `blockMap`. Blocks whose `BlockType` is
 * "KEY_VALUE_SET" additionally go into `keyMap` when their `EntityTypes`
 * include "KEY", and into `valueMap` otherwise.
 *
 * @param {Array<object>} blocks - Blocks to index.
 * @returns {{keyMap: object, valueMap: object, blockMap: object}} The three lookups.
 */
const getKeyValueMap = blocks => {
  const lookups = { keyMap: {}, valueMap: {}, blockMap: {} };
  blocks.forEach(block => {
    const id = block.Id;
    lookups.blockMap[id] = block;
    if (block.BlockType !== "KEY_VALUE_SET") {
      return;
    }
    const bucket = _.includes(block.EntityTypes, "KEY") ? lookups.keyMap : lookups.valueMap;
    bucket[id] = block;
  });
  return lookups;
};
module.exports = async buffer => {
const params = {
Document: {
/* required */
Bytes: buffer
},
FeatureTypes: ["FORMS"]
};
const request = textract.analyzeDocument(params);
const data = await request.promise();
if (data && data.Blocks) {
const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks);
const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap);
return keyValues;
}
// in case no blocks are found return undefined
return undefined;
};
它适用于图像,但不适用于 PDF(单页和多页都不行)。下面是我导入 PDF 并运行时出现的错误:
(node:2001) UnhandledPromiseRejectionWarning: UnsupportedDocumentException: Request has unsupported document format
at Request.extractError (/home/<user>/textract-lab/node_modules/aws-sdk/lib/protocol/json.js:51:27)
at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:106:20)
at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:78:10)
at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:683:14)
at Request.transition (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:22:10)
at AcceptorStateMachine.runTo (/home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:14:12)
at /home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:26:10
at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:38:9)
at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:685:12)
at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:116:18)
(node:2001) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:2001) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
我尝试过的输入包括:不含文本的图像、含文本的图像、含表格的图像、单页 PDF 和多页 PDF。我还有一个概念上的疑问:既然我已经导入了 aws-sdk,而用于 Textract 的 aws-sdk 本身就会处理 pdf、png、jpeg 和 jpg 格式,为什么我还需要专门为 PDF 编写代码?
我必须对 textractUtils.js 进行哪些更改才能处理 PDF 文件?
AnalyzeDocument API 操作仅支持 PNG 或 JPEG 格式的图片。以下内容引自 Textract 文档:
Amazon Textract synchronous operations (DetectDocumentText and AnalyzeDocument) support the PNG and JPEG image formats. Asynchronous operations (StartDocumentTextDetection, StartDocumentAnalysis) also support the PDF file format.
您应该使用异步操作来处理您的 PDF 文档。否则,解决方法是在您的代码中将 PDF 文档转换为图像,然后对这些图像使用同步 API 操作来处理文档。
下面是我的代码 textractUtils.js -
const _ = require("lodash");
const aws = require("aws-sdk");
const config = require("./config");
// Apply the credentials and region read from ./config to the AWS SDK's global configuration.
aws.config.update({
accessKeyId: config.awsAccesskeyID,
secretAccessKey: config.awsSecretAccessKey,
region: config.awsRegion
});
// Textract client shared by this module; constructed after the global config update above.
const textract = new aws.Textract();
/**
 * Concatenates the text carried by a block's CHILD blocks.
 *
 * WORD children contribute their `Text`; SELECTION_ELEMENT children
 * contribute an "X" when their `SelectionStatus` is "SELECTED". All other
 * child block types are ignored.
 *
 * @param {object} [result] - Block whose `Relationships` are walked. A missing
 *   block, or one without a `Relationships` array, yields "".
 * @param {object} blocksMap - Lookup of block Id -> block.
 * @returns {string} Space-separated text with surrounding whitespace trimmed.
 */
const getText = (result, blocksMap) => {
  if (!result || !Array.isArray(result.Relationships)) {
    return "";
  }
  let text = "";
  result.Relationships.forEach(relationship => {
    if (relationship.Type !== "CHILD" || !Array.isArray(relationship.Ids)) {
      return;
    }
    relationship.Ids.forEach(childId => {
      const child = blocksMap ? blocksMap[childId] : undefined;
      // A referenced Id may be absent from the lookup; skip it instead of
      // throwing on `undefined.BlockType`.
      if (!child) {
        return;
      }
      if (child.BlockType === "WORD") {
        text += `${child.Text} `;
      } else if (
        child.BlockType === "SELECTION_ELEMENT" &&
        child.SelectionStatus === "SELECTED"
      ) {
        text += `X `;
      }
    });
  });
  return text.trim();
};
/**
 * Finds the VALUE block that a KEY block points at.
 *
 * For every relationship of type "VALUE" on `keyBlock`, the Ids are scanned
 * in order and the first one present in `valueMap` is taken. If several
 * VALUE relationships match, the last relationship's match is returned.
 *
 * @param {object} keyBlock - KEY block; its `Relationships` array is scanned.
 * @param {object} valueMap - Lookup of block Id -> VALUE block.
 * @returns {object|undefined} The matching VALUE block, or undefined when
 *   there is no match or `keyBlock` has no relationships.
 */
const findValueBlock = (keyBlock, valueMap) => {
  const relationships =
    keyBlock && Array.isArray(keyBlock.Relationships) ? keyBlock.Relationships : [];
  if (valueMap == null) {
    return undefined;
  }
  let valueBlock;
  relationships.forEach(relationship => {
    if (relationship.Type !== "VALUE" || !Array.isArray(relationship.Ids)) {
      return;
    }
    // Explicit loop with break: the earlier `every` callback returned
    // undefined for non-matching Ids, which stopped the scan after the
    // first Id whether or not it matched.
    for (const valueId of relationship.Ids) {
      if (Object.prototype.hasOwnProperty.call(valueMap, valueId)) {
        valueBlock = valueMap[valueId];
        break;
      }
    }
  });
  return valueBlock;
};
/**
 * Builds a plain object mapping each key block's text to the text of the
 * value block found for it.
 *
 * @param {object} keyMap - Lookup of block Id -> KEY block.
 * @param {object} valueMap - Lookup of block Id -> VALUE block.
 * @param {object} blockMap - Lookup of block Id -> block, used to resolve text.
 * @returns {object} Key text -> value text. A later key with the same text
 *   overwrites an earlier one.
 */
const getKeyValueRelationship = (keyMap, valueMap, blockMap) =>
  _.values(keyMap).reduce((pairs, keyBlock) => {
    const matchedValue = findValueBlock(keyBlock, valueMap);
    const label = getText(keyBlock, blockMap);
    const content = getText(matchedValue, blockMap);
    pairs[label] = content;
    return pairs;
  }, {});
/**
 * Indexes a list of blocks by Id.
 *
 * Every block goes into `blockMap`. Blocks whose `BlockType` is
 * "KEY_VALUE_SET" additionally go into `keyMap` when their `EntityTypes`
 * include "KEY", and into `valueMap` otherwise.
 *
 * @param {Array<object>} blocks - Blocks to index.
 * @returns {{keyMap: object, valueMap: object, blockMap: object}} The three lookups.
 */
const getKeyValueMap = blocks => {
  const lookups = { keyMap: {}, valueMap: {}, blockMap: {} };
  blocks.forEach(block => {
    const id = block.Id;
    lookups.blockMap[id] = block;
    if (block.BlockType !== "KEY_VALUE_SET") {
      return;
    }
    const bucket = _.includes(block.EntityTypes, "KEY") ? lookups.keyMap : lookups.valueMap;
    bucket[id] = block;
  });
  return lookups;
};
module.exports = async buffer => {
const params = {
Document: {
/* required */
Bytes: buffer
},
FeatureTypes: ["FORMS"]
};
const request = textract.analyzeDocument(params);
const data = await request.promise();
if (data && data.Blocks) {
const { keyMap, valueMap, blockMap } = getKeyValueMap(data.Blocks);
const keyValues = getKeyValueRelationship(keyMap, valueMap, blockMap);
return keyValues;
}
// in case no blocks are found return undefined
return undefined;
};
它适用于图像,但不适用于 PDF(单页和多页都不行)。下面是我导入 PDF 并运行时出现的错误:
(node:2001) UnhandledPromiseRejectionWarning: UnsupportedDocumentException: Request has unsupported document format
at Request.extractError (/home/<user>/textract-lab/node_modules/aws-sdk/lib/protocol/json.js:51:27)
at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:106:20)
at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:78:10)
at Request.emit (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:683:14)
at Request.transition (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:22:10)
at AcceptorStateMachine.runTo (/home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:14:12)
at /home/<user>/textract-lab/node_modules/aws-sdk/lib/state_machine.js:26:10
at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:38:9)
at Request.<anonymous> (/home/<user>/textract-lab/node_modules/aws-sdk/lib/request.js:685:12)
at Request.callListeners (/home/<user>/textract-lab/node_modules/aws-sdk/lib/sequential_executor.js:116:18)
(node:2001) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). (rejection id: 1)
(node:2001) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
我尝试过的输入包括:不含文本的图像、含文本的图像、含表格的图像、单页 PDF 和多页 PDF。我还有一个概念上的疑问:既然我已经导入了 aws-sdk,而用于 Textract 的 aws-sdk 本身就会处理 pdf、png、jpeg 和 jpg 格式,为什么我还需要专门为 PDF 编写代码?我必须对 textractUtils.js 进行哪些更改才能处理 PDF 文件?
AnalyzeDocument API 操作仅支持 PNG 或 JPEG 格式的图片。以下内容引自 Textract 文档:
Amazon Textract synchronous operations (DetectDocumentText and AnalyzeDocument) support the PNG and JPEG image formats. Asynchronous operations (StartDocumentTextDetection, StartDocumentAnalysis) also support the PDF file format.
您应该使用异步操作来处理您的 PDF 文档。否则,解决方法是在您的代码中将 PDF 文档转换为图像,然后对这些图像使用同步 API 操作来处理文档。