使用 textract 模块抓取 PDF
PDF scraping using textract module
我有一个 Node.js 应用程序,它必须对在线 pdf 进行一些网络抓取。
这是一段代码:
var textract = require('textract');
const util = require('util');
// Container for the functions this module exposes.
var methods = {};
// PDFs to process: each entry pairs a year (used to select the parser) with the PDF's URL.
var urls = [
{year: '2016', link: 'http://www.url2016.pdf'},
{year: '2015', link: 'http://www.url2015.pdf'}
];
// NOTE(review): not referenced anywhere in the code shown here — confirm it is still needed.
var result = [];
// Promise-returning wrapper around textract.fromUrl, created with util.promisify.
const textractFromUrl = util.promisify(textract.fromUrl);
/**
 * Starts the extraction run over every entry in `urls`.
 * Neither `req` nor `res` is used here, so this function sends no response itself.
 *
 * @param {object} req - unused (presumably the HTTP request; confirm against the caller)
 * @param {object} res - unused
 * @returns {Promise<undefined>} the promise returned by `extractText()`
 */
methods.download = function(req, res) {
return extractText();
}
/**
 * Fetch the text of every PDF listed in `urls`, one after the other, and pass
 * it to the parser that matches the entry's year.
 *
 * Errors are not propagated: the first failure is logged and ends the run.
 *
 * @returns {Promise<undefined>} settles when all entries were handled or a failure was logged
 */
async function extractText() {
  const options = { preserveLineBreaks: true };
  try {
    for (const entry of urls) {
      const pdfText = await textractFromUrl(entry.link, options);
      const year = entry.year;
      if (year === '2015') {
        await extractTextType1(entry, pdfText);
      } else if (year === '2016') {
        await extractTextType2(entry, pdfText);
      } else {
        console.log('Error: no switch case');
      }
    }
  } catch (failure) {
    console.log('catch block');
    console.log(failure);
  }
}
如您所见,我使用 textract
包来抓取 pdf。
当我运行这个应用程序时,得到如下输出:
catch block
{ Error: Error for type: [[ application/pdf ]], file: [[ C:\Users\myUserName\AppData\Local\Temp710848773.pdf ]], extractor for type exists, but failed to initialize. Message: INFO: 'pdftotext' does not appear to be installed, so textract will be unable to extract PDFs.
at extract (C:\Users\myUserName\projectPath\node_modules\textract\lib\extract.js:147:15)
at Timeout._onTimeout (C:\Users\myUserName\projectPath\node_modules\textract\lib\extract.js:155:7)
at ontimeout (timers.js:466:11)
at tryOnTimeout (timers.js:304:5)
at Timer.listOnTimeout (timers.js:267:5) typeNotFound: true }
npm 上 textract 模块的页面写明:提取 PDF 需要先安装 pdftotext(见链接)。
所以我去 http://www.foolabs.com/xpdf/download.html 下载并安装了“XpdfReader:Windows 64 位”。
我再次尝试使用 node app.js
运行应用程序(app.js
是我的应用程序的主文件),但遇到了同样的错误,所以我又下载了“Xpdf 工具:Windows 64 位”。
这是一个 zip 文件,我解压了该文件,然后尝试安装 pdftotext.exe
,但是当我双击 pdftotext.exe
时,什么都没有发生。
我也尝试过以管理员权限安装它,同样没有任何反应。
我正在使用 Windows 10、64 位。
我必须做什么?
编辑 1
按照建议,我将 pdftotext.exe
文件复制到 C:\Windows\System32
。然后我再次运行程序,遇到了这个问题:
catch block
TypeError: Cannot read property 'split' of undefined
at extractTextType4 (C:\myUserName\projectPath\file.js:301:28)
at extractText (C:\myUserName\projectPath\file.js:78:12)
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:182:7)
我必须更改环境变量吗?
编辑 2
我的C:\myUserName\projectPath\file.js
是这样的:
var textract = require('textract');
const util = require('util');
var utilFunc = require('../helpers/utilFunc.js'); // my lib
var postgreSQLlib = require('../middlewares/postgreSQLlib.js'); // my lib
// Container for the functions this module exports.
var methods = {};
// PDFs to process: each entry pairs a year (used to select the parser) with the PDF's URL.
var urls = [
{year: '2014', link: 'http://www.salute.gov.it/imgs/C_17_tavole_20_allegati_iitemAllegati_0_fileAllegati_itemFile_2_file.pdf'},
{year: '2013', link: 'http://www.salute.gov.it/imgs/C_17_tavole_20_allegati_iitemAllegati_0_fileAllegati_itemFile_1_file.pdf'},
{year: '2012', link: 'http://www.salute.gov.it/imgs/C_17_tavole_20_allegati_iitemAllegati_5_fileAllegati_itemFile_0_file.pdf'}
];
// NOTE(review): not referenced anywhere in the code shown here — confirm it is still needed.
var result = [];
// Promise-returning wrapper around textract.fromUrl, created with util.promisify.
const textractFromUrl = util.promisify(textract.fromUrl);
/**
 * Starts the extraction run over every entry in `urls`.
 * Neither `req` nor `res` is used here, so this function sends no response itself.
 * NOTE(review): no database write is visible in this file — confirm where the data is persisted.
 *
 * @param {object} req - unused (presumably the HTTP request; confirm against the caller)
 * @param {object} res - unused
 * @returns {Promise<undefined>} the promise returned by `extractText()`
 */
methods.download = function(req, res) {
return extractText();
}
/**
 * Fetch the text of every PDF listed in `urls`, one after the other, and pass
 * it to the parser that matches the entry's year.
 *
 * Errors are not propagated: the first failure is logged and ends the run.
 *
 * @returns {Promise<undefined>} settles when all entries were handled or a failure was logged
 */
async function extractText() {
  const options = { preserveLineBreaks: true };
  try {
    for (const entry of urls) {
      const pdfText = await textractFromUrl(entry.link, options);
      const year = entry.year;
      if (year === '2012') {
        await extractTextType1(entry, pdfText);
      } else if (year === '2013') {
        await extractTextType2(entry, pdfText);
      } else if (year === '2014') {
        await extractTextType3(entry, pdfText);
      } else {
        console.log('Error: no switch case');
      }
    }
  } catch (failure) {
    console.log('catch block');
    console.log(failure);
  }
}
/**
 * Turn a parsed coverage table into one record per (region, vaccine) cell and
 * append each record to `covDataItAll`.
 *
 * The `vaccines` and `regions` arguments are accepted for call compatibility
 * only: both are re-read from the first two rows of `map`.
 *
 * NOTE(review): `covDataItAll` is not declared in the part of the file shown
 * here — confirm it exists at module scope (the otherwise unused `result`
 * array may have been intended).
 *
 * @param {string[]} vaccines - ignored; replaced by the first row of `map`
 * @param {string[]} regions - ignored; replaced by the second row of `map`
 * @param {Array<string[]>} map - `[vaccineNames, regionNames, ...valueRows]`;
 *   the two header rows are removed from this array in place
 * @param {{year: string}} url - entry whose `year` is parsed and stored on each record
 * @returns {Promise<Array>} resolves to an empty array once every record was appended
 * @throws {TypeError} if fewer than 22 value rows follow the two header rows
 * @throws {Error} anything thrown while building a record; it now reaches the
 *   caller instead of being lost in a discarded promise
 */
function saveOnObject(vaccines, regions, map, url) {
  const REGION_ROW_COUNT = 22; // every supported PDF layout yields 22 region rows

  const vaccineNames = map.shift();
  const regionNames = map.shift();

  for (let rowIndex = 0; rowIndex < REGION_ROW_COUNT; rowIndex++) {
    const row = map[rowIndex];
    if (row === undefined || row === null) {
      throw new TypeError(
        `saveOnObject: expected ${REGION_ROW_COUNT} value rows, but row ${rowIndex} is missing`
      );
    }
    // The callback is deliberately synchronous: an async callback passed to
    // forEach returns a promise nobody awaits, so its errors never reach the caller.
    row.forEach((value, columnIndex) => {
      // make the values uniform
      const vac = utilFunc.makeUniform(vaccineNames[columnIndex], 'vaccine');
      const reg = utilFunc.makeUniform(regionNames[rowIndex], 'region');
      const perc = utilFunc.makeUniform(value, 'value');
      // create json object
      covDataItAll.push(
        utilFunc.createJsonObjectCov(parseFloat(url.year), 'Italy', vac, reg, perc)
      );
    });
  }

  // Kept promise-returning (resolving to an empty array, as before) for callers that await it.
  return Promise.resolve([]);
}
/**
 * Parse the table text of the "2000-2012" PDF layout and pass the result to
 * `saveOnObject`.
 *
 * Only the first 23 lines of `text` are used. Line 0 is the header: its first
 * token is dropped and the remaining tokens are the vaccine names. Lines 1-22
 * each describe one region: the leading tokens form the region name and the
 * remaining tokens are that region's values. Carriage returns are removed
 * from every token.
 *
 * @param {{year: string, link: string}} url - entry from `urls`, passed through unchanged
 * @param {string} text - text extracted from the PDF, with line breaks preserved
 * @returns {*} whatever `saveOnObject` returns
 * @throws {TypeError} if `text` is not a string
 * @throws {Error} if `text` has fewer than 23 lines
 */
function extractTextType1(url, text) {
  const TABLE_LINE_COUNT = 23;
  // Number of space-separated tokens that make up each region's name, in row order.
  const regionLength = [1, 2, 1, 2, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1];

  // Fail with a readable message instead of "Cannot read property 'split' of undefined".
  if (typeof text !== 'string') {
    throw new TypeError(`extractTextType1: text must be a string, got ${typeof text}`);
  }
  const lines = text.split('\n');
  if (lines.length < TABLE_LINE_COUNT) {
    throw new Error(
      `extractTextType1: expected at least ${TABLE_LINE_COUNT} lines of text, got ${lines.length}`
    );
  }

  // text to matrix; removing \r before splitting on spaces gives the same tokens as removing it afterwards
  const matrix = lines
    .slice(0, TABLE_LINE_COUNT)
    .map((line) => line.replace(/\r/g, '').split(' '));

  // create vaccines array (header row without its first token)
  const vaccines = matrix[0].slice(1);
  const regions = [];
  const map = [vaccines, regions];

  // split each region row into its name (joined with spaces) and its values
  regionLength.forEach((tokenCount, i) => {
    const row = matrix[i + 1];
    regions.push(row.splice(0, tokenCount).join(' '));
    map.push(row);
  });

  return saveOnObject(vaccines, regions, map, url);
}
/**
 * Parse the table text of the 2013 PDF layout and pass the result to
 * `saveOnObject`.
 *
 * Only the first 36 lines of `text` are used. Lines 0-13 carry the vaccine
 * names: the second token of line 0, the first token of lines 1-9, and two
 * names that are each assembled from a pair of lines (10+11 and 12+13).
 * Lines 14-35 each describe one region: the first token is the region name
 * and the remaining tokens are that region's values.
 *
 * @param {{year: string, link: string}} url - entry from `urls`, passed through unchanged
 * @param {string} text - text extracted from the PDF, with line breaks preserved
 * @returns {*} whatever `saveOnObject` returns
 * @throws {TypeError} if `text` is not a string
 * @throws {Error} if `text` has fewer than 36 lines
 */
function extractTextType2(url, text) {
  const TABLE_LINE_COUNT = 36;
  const FIRST_REGION_LINE = 14;
  const stripCarriageReturns = (token) => token.replace(/\r/g, '');
  // Removes the "(a)", "(b)", "(c)" markers and then carriage returns, in that order.
  const cleanVaccineName = (name) =>
    stripCarriageReturns(
      name.replace(/\(a\)/g, '').replace(/\(b\)/g, '').replace(/\(c\)/g, '')
    );

  // Fail with a readable message instead of "Cannot read property 'split' of undefined".
  if (typeof text !== 'string') {
    throw new TypeError(`extractTextType2: text must be a string, got ${typeof text}`);
  }
  const lines = text.split('\n');
  if (lines.length < TABLE_LINE_COUNT) {
    throw new Error(
      `extractTextType2: expected at least ${TABLE_LINE_COUNT} lines of text, got ${lines.length}`
    );
  }

  // text to matrix
  const matrix = lines.slice(0, TABLE_LINE_COUNT).map((line) => line.split(' '));

  // create vaccines array
  const vaccines = [cleanVaccineName(matrix[0][1])];
  for (let i = 1; i < 10; i++) {
    vaccines.push(cleanVaccineName(matrix[i][0]));
  }
  // matrix[11] and matrix[13] are whole token arrays; concat() stringifies them
  // (comma-separated when a line holds more than one token).
  vaccines.push(cleanVaccineName(''.concat(matrix[10][0], matrix[11])));
  vaccines.push(cleanVaccineName(''.concat(matrix[12][0], ' ', matrix[13])));

  // create regions array and the rows of values
  const regionRows = matrix.slice(FIRST_REGION_LINE);
  const regions = regionRows.map((row) => stripCarriageReturns(row[0]));
  const map = [vaccines, regions];
  regionRows.forEach((row) => {
    row.shift(); // drop the region name, keep the values
    map.push(row.map(stripCarriageReturns));
  });

  return saveOnObject(vaccines, regions, map, url);
}
/**
 * Parse the table text of the 2014 PDF layout and pass the result to
 * `saveOnObject`.
 *
 * Only the first 36 lines of `text` are used. Lines 0-13 carry the vaccine
 * names: the third token of line 0, the first token of lines 1-9, and two
 * names that are each assembled from a pair of lines (10+11 and 12+13).
 * Lines 14-35 each describe one region: the leading tokens form the region
 * name and the remaining tokens are that region's values.
 *
 * @param {{year: string, link: string}} url - entry from `urls`, passed through unchanged
 * @param {string} text - text extracted from the PDF, with line breaks preserved
 * @returns {*} whatever `saveOnObject` returns
 * @throws {TypeError} if `text` is not a string
 * @throws {Error} if `text` has fewer than 36 lines
 */
function extractTextType3(url, text) {
  const TABLE_LINE_COUNT = 36;
  const FIRST_REGION_LINE = 14;
  // Number of space-separated tokens that make up each region's name, in row
  // order (needed to split name tokens from value tokens).
  const regionLength = [1, 2, 1, 3, 3, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2];
  const stripCarriageReturns = (token) => token.replace(/\r/g, '');
  // Removes the "(a)", "(b)", "(c)" markers and then carriage returns, in that order.
  const cleanVaccineName = (name) =>
    stripCarriageReturns(
      name.replace(/\(a\)/g, '').replace(/\(b\)/g, '').replace(/\(c\)/g, '')
    );

  // Fail with a readable message instead of "Cannot read property 'split' of undefined".
  if (typeof text !== 'string') {
    throw new TypeError(`extractTextType3: text must be a string, got ${typeof text}`);
  }
  const lines = text.split('\n');
  if (lines.length < TABLE_LINE_COUNT) {
    throw new Error(
      `extractTextType3: expected at least ${TABLE_LINE_COUNT} lines of text, got ${lines.length}`
    );
  }

  // text to matrix
  const matrix = lines.slice(0, TABLE_LINE_COUNT).map((line) => line.split(' '));

  // create vaccines array
  const vaccines = [cleanVaccineName(matrix[0][2])];
  for (let i = 1; i < 10; i++) {
    vaccines.push(cleanVaccineName(matrix[i][0]));
  }
  // matrix[11] and matrix[13] are whole token arrays; concat() stringifies them
  // (comma-separated when a line holds more than one token).
  vaccines.push(
    cleanVaccineName(''.concat(matrix[10][0], ' ', matrix[10][1], ' ', matrix[11]))
  );
  vaccines.push(cleanVaccineName(''.concat(matrix[12][0], ' ', matrix[13])));

  // split each region row into its name (joined with spaces) and its values
  const regions = [];
  const map = [vaccines, regions];
  regionLength.forEach((tokenCount, i) => {
    const row = matrix[FIRST_REGION_LINE + i];
    regions.push(stripCarriageReturns(row.splice(0, tokenCount).join(' ')));
    map.push(row.map(stripCarriageReturns));
  });

  return saveOnObject(vaccines, regions, map, url);
}
// Export the methods object; in the code shown here its only member is `download`.
module.exports = methods;
将 pdftotext.exe
文件复制到 c:\windows
或 c:\windows\system32
。
基本上,任何已添加到 PATH 中的文件夹都可以。然后它应该就能工作了。
从源码里可以看到,
代码假定 pdftotext
位于 PATH
中。
我有一个 Node.js 应用程序,它必须对在线 pdf 进行一些网络抓取。 这是一段代码:
var textract = require('textract');
const util = require('util');
// Container for the functions this module exposes.
var methods = {};
// PDFs to process: each entry pairs a year (used to select the parser) with the PDF's URL.
var urls = [
{year: '2016', link: 'http://www.url2016.pdf'},
{year: '2015', link: 'http://www.url2015.pdf'}
];
// NOTE(review): not referenced anywhere in the code shown here — confirm it is still needed.
var result = [];
// Promise-returning wrapper around textract.fromUrl, created with util.promisify.
const textractFromUrl = util.promisify(textract.fromUrl);
/**
 * Starts the extraction run over every entry in `urls`.
 * Neither `req` nor `res` is used here, so this function sends no response itself.
 *
 * @param {object} req - unused (presumably the HTTP request; confirm against the caller)
 * @param {object} res - unused
 * @returns {Promise<undefined>} the promise returned by `extractText()`
 */
methods.download = function(req, res) {
return extractText();
}
/**
 * Fetch the text of every PDF listed in `urls`, one after the other, and pass
 * it to the parser that matches the entry's year.
 *
 * Errors are not propagated: the first failure is logged and ends the run.
 *
 * @returns {Promise<undefined>} settles when all entries were handled or a failure was logged
 */
async function extractText() {
  const options = { preserveLineBreaks: true };
  try {
    for (const entry of urls) {
      const pdfText = await textractFromUrl(entry.link, options);
      const year = entry.year;
      if (year === '2015') {
        await extractTextType1(entry, pdfText);
      } else if (year === '2016') {
        await extractTextType2(entry, pdfText);
      } else {
        console.log('Error: no switch case');
      }
    }
  } catch (failure) {
    console.log('catch block');
    console.log(failure);
  }
}
如您所见,我使用 textract
包来抓取 pdf。
当我运行这个应用程序时,得到如下输出:
catch block
{ Error: Error for type: [[ application/pdf ]], file: [[ C:\Users\myUserName\AppData\Local\Temp710848773.pdf ]], extractor for type exists, but failed to initialize. Message: INFO: 'pdftotext' does not appear to be installed, so textract will be unable to extract PDFs.
at extract (C:\Users\myUserName\projectPath\node_modules\textract\lib\extract.js:147:15)
at Timeout._onTimeout (C:\Users\myUserName\projectPath\node_modules\textract\lib\extract.js:155:7)
at ontimeout (timers.js:466:11)
at tryOnTimeout (timers.js:304:5)
at Timer.listOnTimeout (timers.js:267:5) typeNotFound: true }
npm 上 textract 模块的页面写明:提取 PDF 需要先安装 pdftotext(见链接)。
所以我去 http://www.foolabs.com/xpdf/download.html 下载并安装了“XpdfReader:Windows 64 位”。
我再次尝试使用 node app.js
运行应用程序(app.js
是我的应用程序的主文件),但遇到了同样的错误,所以我又下载了“Xpdf 工具:Windows 64 位”。
这是一个 zip 文件,我解压了该文件,然后尝试安装 pdftotext.exe
,但是当我双击 pdftotext.exe
时,什么都没有发生。
我也尝试过以管理员权限安装它,同样没有任何反应。
我正在使用 Windows 10、64 位。
我必须做什么?
编辑 1
按照建议,我将 pdftotext.exe
文件复制到 C:\Windows\System32
。然后我再次运行程序,遇到了这个问题:
catch block
TypeError: Cannot read property 'split' of undefined
at extractTextType4 (C:\myUserName\projectPath\file.js:301:28)
at extractText (C:\myUserName\projectPath\file.js:78:12)
at <anonymous>
at process._tickCallback (internal/process/next_tick.js:182:7)
我必须更改环境变量吗?
编辑 2
我的C:\myUserName\projectPath\file.js
是这样的:
var textract = require('textract');
const util = require('util');
var utilFunc = require('../helpers/utilFunc.js'); // my lib
var postgreSQLlib = require('../middlewares/postgreSQLlib.js'); // my lib
// Container for the functions this module exports.
var methods = {};
// PDFs to process: each entry pairs a year (used to select the parser) with the PDF's URL.
var urls = [
{year: '2014', link: 'http://www.salute.gov.it/imgs/C_17_tavole_20_allegati_iitemAllegati_0_fileAllegati_itemFile_2_file.pdf'},
{year: '2013', link: 'http://www.salute.gov.it/imgs/C_17_tavole_20_allegati_iitemAllegati_0_fileAllegati_itemFile_1_file.pdf'},
{year: '2012', link: 'http://www.salute.gov.it/imgs/C_17_tavole_20_allegati_iitemAllegati_5_fileAllegati_itemFile_0_file.pdf'}
];
// NOTE(review): not referenced anywhere in the code shown here — confirm it is still needed.
var result = [];
// Promise-returning wrapper around textract.fromUrl, created with util.promisify.
const textractFromUrl = util.promisify(textract.fromUrl);
/**
 * Starts the extraction run over every entry in `urls`.
 * Neither `req` nor `res` is used here, so this function sends no response itself.
 * NOTE(review): no database write is visible in this file — confirm where the data is persisted.
 *
 * @param {object} req - unused (presumably the HTTP request; confirm against the caller)
 * @param {object} res - unused
 * @returns {Promise<undefined>} the promise returned by `extractText()`
 */
methods.download = function(req, res) {
return extractText();
}
/**
 * Fetch the text of every PDF listed in `urls`, one after the other, and pass
 * it to the parser that matches the entry's year.
 *
 * Errors are not propagated: the first failure is logged and ends the run.
 *
 * @returns {Promise<undefined>} settles when all entries were handled or a failure was logged
 */
async function extractText() {
  const options = { preserveLineBreaks: true };
  try {
    for (const entry of urls) {
      const pdfText = await textractFromUrl(entry.link, options);
      const year = entry.year;
      if (year === '2012') {
        await extractTextType1(entry, pdfText);
      } else if (year === '2013') {
        await extractTextType2(entry, pdfText);
      } else if (year === '2014') {
        await extractTextType3(entry, pdfText);
      } else {
        console.log('Error: no switch case');
      }
    }
  } catch (failure) {
    console.log('catch block');
    console.log(failure);
  }
}
/**
 * Turn a parsed coverage table into one record per (region, vaccine) cell and
 * append each record to `covDataItAll`.
 *
 * The `vaccines` and `regions` arguments are accepted for call compatibility
 * only: both are re-read from the first two rows of `map`.
 *
 * NOTE(review): `covDataItAll` is not declared in the part of the file shown
 * here — confirm it exists at module scope (the otherwise unused `result`
 * array may have been intended).
 *
 * @param {string[]} vaccines - ignored; replaced by the first row of `map`
 * @param {string[]} regions - ignored; replaced by the second row of `map`
 * @param {Array<string[]>} map - `[vaccineNames, regionNames, ...valueRows]`;
 *   the two header rows are removed from this array in place
 * @param {{year: string}} url - entry whose `year` is parsed and stored on each record
 * @returns {Promise<Array>} resolves to an empty array once every record was appended
 * @throws {TypeError} if fewer than 22 value rows follow the two header rows
 * @throws {Error} anything thrown while building a record; it now reaches the
 *   caller instead of being lost in a discarded promise
 */
function saveOnObject(vaccines, regions, map, url) {
  const REGION_ROW_COUNT = 22; // every supported PDF layout yields 22 region rows

  const vaccineNames = map.shift();
  const regionNames = map.shift();

  for (let rowIndex = 0; rowIndex < REGION_ROW_COUNT; rowIndex++) {
    const row = map[rowIndex];
    if (row === undefined || row === null) {
      throw new TypeError(
        `saveOnObject: expected ${REGION_ROW_COUNT} value rows, but row ${rowIndex} is missing`
      );
    }
    // The callback is deliberately synchronous: an async callback passed to
    // forEach returns a promise nobody awaits, so its errors never reach the caller.
    row.forEach((value, columnIndex) => {
      // make the values uniform
      const vac = utilFunc.makeUniform(vaccineNames[columnIndex], 'vaccine');
      const reg = utilFunc.makeUniform(regionNames[rowIndex], 'region');
      const perc = utilFunc.makeUniform(value, 'value');
      // create json object
      covDataItAll.push(
        utilFunc.createJsonObjectCov(parseFloat(url.year), 'Italy', vac, reg, perc)
      );
    });
  }

  // Kept promise-returning (resolving to an empty array, as before) for callers that await it.
  return Promise.resolve([]);
}
/**
 * Parse the table text of the "2000-2012" PDF layout and pass the result to
 * `saveOnObject`.
 *
 * Only the first 23 lines of `text` are used. Line 0 is the header: its first
 * token is dropped and the remaining tokens are the vaccine names. Lines 1-22
 * each describe one region: the leading tokens form the region name and the
 * remaining tokens are that region's values. Carriage returns are removed
 * from every token.
 *
 * @param {{year: string, link: string}} url - entry from `urls`, passed through unchanged
 * @param {string} text - text extracted from the PDF, with line breaks preserved
 * @returns {*} whatever `saveOnObject` returns
 * @throws {TypeError} if `text` is not a string
 * @throws {Error} if `text` has fewer than 23 lines
 */
function extractTextType1(url, text) {
  const TABLE_LINE_COUNT = 23;
  // Number of space-separated tokens that make up each region's name, in row order.
  const regionLength = [1, 2, 1, 2, 3, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1];

  // Fail with a readable message instead of "Cannot read property 'split' of undefined".
  if (typeof text !== 'string') {
    throw new TypeError(`extractTextType1: text must be a string, got ${typeof text}`);
  }
  const lines = text.split('\n');
  if (lines.length < TABLE_LINE_COUNT) {
    throw new Error(
      `extractTextType1: expected at least ${TABLE_LINE_COUNT} lines of text, got ${lines.length}`
    );
  }

  // text to matrix; removing \r before splitting on spaces gives the same tokens as removing it afterwards
  const matrix = lines
    .slice(0, TABLE_LINE_COUNT)
    .map((line) => line.replace(/\r/g, '').split(' '));

  // create vaccines array (header row without its first token)
  const vaccines = matrix[0].slice(1);
  const regions = [];
  const map = [vaccines, regions];

  // split each region row into its name (joined with spaces) and its values
  regionLength.forEach((tokenCount, i) => {
    const row = matrix[i + 1];
    regions.push(row.splice(0, tokenCount).join(' '));
    map.push(row);
  });

  return saveOnObject(vaccines, regions, map, url);
}
/**
 * Parse the table text of the 2013 PDF layout and pass the result to
 * `saveOnObject`.
 *
 * Only the first 36 lines of `text` are used. Lines 0-13 carry the vaccine
 * names: the second token of line 0, the first token of lines 1-9, and two
 * names that are each assembled from a pair of lines (10+11 and 12+13).
 * Lines 14-35 each describe one region: the first token is the region name
 * and the remaining tokens are that region's values.
 *
 * @param {{year: string, link: string}} url - entry from `urls`, passed through unchanged
 * @param {string} text - text extracted from the PDF, with line breaks preserved
 * @returns {*} whatever `saveOnObject` returns
 * @throws {TypeError} if `text` is not a string
 * @throws {Error} if `text` has fewer than 36 lines
 */
function extractTextType2(url, text) {
  const TABLE_LINE_COUNT = 36;
  const FIRST_REGION_LINE = 14;
  const stripCarriageReturns = (token) => token.replace(/\r/g, '');
  // Removes the "(a)", "(b)", "(c)" markers and then carriage returns, in that order.
  const cleanVaccineName = (name) =>
    stripCarriageReturns(
      name.replace(/\(a\)/g, '').replace(/\(b\)/g, '').replace(/\(c\)/g, '')
    );

  // Fail with a readable message instead of "Cannot read property 'split' of undefined".
  if (typeof text !== 'string') {
    throw new TypeError(`extractTextType2: text must be a string, got ${typeof text}`);
  }
  const lines = text.split('\n');
  if (lines.length < TABLE_LINE_COUNT) {
    throw new Error(
      `extractTextType2: expected at least ${TABLE_LINE_COUNT} lines of text, got ${lines.length}`
    );
  }

  // text to matrix
  const matrix = lines.slice(0, TABLE_LINE_COUNT).map((line) => line.split(' '));

  // create vaccines array
  const vaccines = [cleanVaccineName(matrix[0][1])];
  for (let i = 1; i < 10; i++) {
    vaccines.push(cleanVaccineName(matrix[i][0]));
  }
  // matrix[11] and matrix[13] are whole token arrays; concat() stringifies them
  // (comma-separated when a line holds more than one token).
  vaccines.push(cleanVaccineName(''.concat(matrix[10][0], matrix[11])));
  vaccines.push(cleanVaccineName(''.concat(matrix[12][0], ' ', matrix[13])));

  // create regions array and the rows of values
  const regionRows = matrix.slice(FIRST_REGION_LINE);
  const regions = regionRows.map((row) => stripCarriageReturns(row[0]));
  const map = [vaccines, regions];
  regionRows.forEach((row) => {
    row.shift(); // drop the region name, keep the values
    map.push(row.map(stripCarriageReturns));
  });

  return saveOnObject(vaccines, regions, map, url);
}
/**
 * Parse the table text of the 2014 PDF layout and pass the result to
 * `saveOnObject`.
 *
 * Only the first 36 lines of `text` are used. Lines 0-13 carry the vaccine
 * names: the third token of line 0, the first token of lines 1-9, and two
 * names that are each assembled from a pair of lines (10+11 and 12+13).
 * Lines 14-35 each describe one region: the leading tokens form the region
 * name and the remaining tokens are that region's values.
 *
 * @param {{year: string, link: string}} url - entry from `urls`, passed through unchanged
 * @param {string} text - text extracted from the PDF, with line breaks preserved
 * @returns {*} whatever `saveOnObject` returns
 * @throws {TypeError} if `text` is not a string
 * @throws {Error} if `text` has fewer than 36 lines
 */
function extractTextType3(url, text) {
  const TABLE_LINE_COUNT = 36;
  const FIRST_REGION_LINE = 14;
  // Number of space-separated tokens that make up each region's name, in row
  // order (needed to split name tokens from value tokens).
  const regionLength = [1, 2, 1, 3, 3, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2];
  const stripCarriageReturns = (token) => token.replace(/\r/g, '');
  // Removes the "(a)", "(b)", "(c)" markers and then carriage returns, in that order.
  const cleanVaccineName = (name) =>
    stripCarriageReturns(
      name.replace(/\(a\)/g, '').replace(/\(b\)/g, '').replace(/\(c\)/g, '')
    );

  // Fail with a readable message instead of "Cannot read property 'split' of undefined".
  if (typeof text !== 'string') {
    throw new TypeError(`extractTextType3: text must be a string, got ${typeof text}`);
  }
  const lines = text.split('\n');
  if (lines.length < TABLE_LINE_COUNT) {
    throw new Error(
      `extractTextType3: expected at least ${TABLE_LINE_COUNT} lines of text, got ${lines.length}`
    );
  }

  // text to matrix
  const matrix = lines.slice(0, TABLE_LINE_COUNT).map((line) => line.split(' '));

  // create vaccines array
  const vaccines = [cleanVaccineName(matrix[0][2])];
  for (let i = 1; i < 10; i++) {
    vaccines.push(cleanVaccineName(matrix[i][0]));
  }
  // matrix[11] and matrix[13] are whole token arrays; concat() stringifies them
  // (comma-separated when a line holds more than one token).
  vaccines.push(
    cleanVaccineName(''.concat(matrix[10][0], ' ', matrix[10][1], ' ', matrix[11]))
  );
  vaccines.push(cleanVaccineName(''.concat(matrix[12][0], ' ', matrix[13])));

  // split each region row into its name (joined with spaces) and its values
  const regions = [];
  const map = [vaccines, regions];
  regionLength.forEach((tokenCount, i) => {
    const row = matrix[FIRST_REGION_LINE + i];
    regions.push(stripCarriageReturns(row.splice(0, tokenCount).join(' ')));
    map.push(row.map(stripCarriageReturns));
  });

  return saveOnObject(vaccines, regions, map, url);
}
// Export the methods object; in the code shown here its only member is `download`.
module.exports = methods;
将 pdftotext.exe
文件复制到 c:\windows
或 c:\windows\system32
。
基本上,任何已添加到 PATH 中的文件夹都可以。然后它应该就能工作了。
源码里可以看到
代码假定 pdftotext
在路径