从 PDF 中提取字体名称
Extract Font Name from PDF
我正在使用 pdf.js 从 PDF 中提取文本,但字体名称显示为 g_d0_f6 等内部标识符。
我需要真实的字体名称,以便选用相应的映射表将文本转换为 Unicode。以下代码派生自 pdf2svg.js 示例:
var fs = require('fs');
var util = require('util');
var path = require('path');
var stream = require('stream');
// HACK few hacks to let PDF.js be loaded not as a module in global space.
require('./domstubs.js').setStubs(global);
var pdfjsLib = require('pdfjs-dist');
// PDF to read: the first command-line argument, or a default sample path.
var pdfPath = process.argv[2] || '../../web/compressed.tracemonkey-pldi-09.pdf';
// The document is handed to PDF.js as the raw bytes of the file.
var data = new Uint8Array(fs.readFileSync(pdfPath));
var loadingTask = pdfjsLib.getDocument({
  data: data,
  nativeImageDecoderSupport: pdfjsLib.NativeImageDecoding.DISPLAY,
});

loadingTask.promise.then(function (doc) {
  // Fetches one page and logs its text content object.
  function printPage(pageNumber) {
    return doc.getPage(pageNumber)
      .then(function (page) {
        return page.getTextContent();
      })
      .then(function (textContent) {
        console.log(textContent);
      });
  }

  var pageNumbers = [];
  for (var n = 1; n <= doc.numPages; n++) {
    pageNumbers.push(n);
  }

  // Chain the pages so each one is processed only after the previous one
  // has finished; the resulting promise settles after the last page.
  return pageNumbers.reduce(function (chain, pageNumber) {
    return chain.then(function () {
      return printPage(pageNumber);
    });
  }, Promise.resolve());
}).then(function () {
  console.log('# End of Document');
}, function (err) {
  console.error('Error: ' + err);
});
示例输出:-
{ items:
[ { str: 'bl fp=k osQ ckjs esa cPpksa ls ckrphr djsa & ;g LowQy esa fdl le; dk n`\'; gS\ cPps',
dir: 'ltr',
width: 396.2250000000001,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'D;k dj jgs gSa\ cPps dkSu&dkSu ls [ksy] [ksy j',
dir: 'ltr',
width: 216.1650000000001,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'g',
dir: 'ltr',
width: 6.42,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 's gSa\ fp=k esa fdrus cPps gSa vkSj fdrus',
dir: 'ltr',
width: 173.865,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'cM+s gSa\ vkil esa dkSu D;k ckr dj jgk gksxk\ cPpksa ls fp=k esa lcosQ fy, uke lkspus',
dir: 'ltr',
width: 396.54000000000013,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'dks dgasaA',
dir: 'ltr',
width: 40.74,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'csVh cpkvks',
dir: 'ltr',
width: 66.725,
height: 17,
transform: [Array],
fontName: 'g_d0_f2' },
{ str: 'csVh i<+kvksA',
dir: 'ltr',
width: 66.75899999999999,
height: 17,
transform: [Array],
fontName: 'g_d0_f2' },
{ str: '2018-19',
dir: 'ltr',
width: 36.690000000000005,
height: 10,
transform: [Array],
fontName: 'g_d0_f3' } ],
styles:
{ g_d0_f1:
{ fontFamily: 'sans-serif',
ascent: 0.837,
descent: -0.216,
vertical: false },
g_d0_f2:
{ fontFamily: 'sans-serif',
ascent: 0.786,
descent: -0.181,
vertical: false },
g_d0_f3:
{ fontFamily: 'sans-serif',
ascent: 0.9052734375,
descent: -0.2119140625,
vertical: false } } }
这里是使用嵌入字体的 pdf:http://ncert.nic.in/textbook/pdf/ahhn101.pdf
这是一个相关问题,但建议的 commonObjs 是空的:
我认为您是在正确的轨道上:page.commonObjs
是找到实际字体名称的地方。但是,page.commonObjs
仅在访问页面的 text/operators 时才会填充,因此如果您在此之前查看,您会发现它是空的。
注意:下面的答案与 pdf.js 没有任何关系,但它回答了问题,从 PDF 中提取字体名称。
我没有找到基于 pdf.js 的解决方案,所以转而使用 mutool,它可以通过以下命令获取每一页的字体信息。
mutool info -F input.pdf 0-2147483647
然后我借用了一个基于 spawn 的辅助函数来运行该命令,
并通过一些正则表达式和模式匹配把输出解析成要返回的数据。
/**
 * Parses the text printed by `mutool info -F <file> <pages>` into one entry
 * per page that lists fonts.
 *
 * A page is recognised by a header of the form
 *   Page <n>:
 *   Fonts (<count>):
 * and every non-empty line after that header is treated as one font line.
 *
 * The function is intentionally kept `async` (although it never awaits) so
 * that callers relying on the returned Promise keep working.
 *
 * @param {string} str - Raw stdout of the mutool command.
 * @returns {Promise<Array<{page: string, fonts: string, fontList: Array<string|null>}>>}
 *   `page` and `fonts` are the digit strings captured from the header;
 *   `fontList` holds one extracted name (or `null` when nothing could be
 *   extracted) per font line.
 */
const extractFontData = async str => {
  // Accept both LF and CRLF so output captured on Windows parses as well.
  const pageHeader = /Page (\d+):\r?\nFonts \((\d+)\):/;
  // Name that follows the "+" of a subset tag, e.g. "ABCDEF+Arial,Bold" -> "Arial,Bold".
  const subsetName = /\+([a-zA-Z0-9_-]+[.,]?[a-zA-Z0-9_-]+)/;
  // Fallback for fonts without a subset tag: take the whole quoted name.
  // NOTE(review): assumes mutool prints font names in single quotes - confirm
  // against the mutool version in use; without quotes the result stays null.
  const quotedName = /'([^']+)'/;

  const singleFont = fontData => {
    const match = fontData.match(subsetName) || fontData.match(quotedName);
    return match && match[1];
  };

  const pages = [];
  for (const singlePageData of str.split("Page ")) {
    // split() removed the "Page " prefix; restore it so the header regex applies.
    const header = `Page ${singlePageData}`.match(pageHeader);
    if (!header) {
      continue;
    }
    const lines = singlePageData.split(/\r?\n/).filter(line => line.length);
    // The first two non-empty lines are "<n>:" and "Fonts (<count>):".
    pages.push({
      page: header[1],
      fonts: header[2],
      fontList: lines.slice(2).map(singleFont),
    });
  }
  return pages;
};
// Taken and adjusted from:
/**
 * Spawns a child process and collects everything it writes to stdout.
 *
 * @param {...*} cmd - Arguments forwarded unchanged to `child_process.spawn`
 *   (command, argument list, options).
 * @returns {Promise<string>} Resolves with the captured stdout when the
 *   process closes (the exit code is not inspected); rejects when the child
 *   process emits an "error" event.
 */
function run(...cmd) {
  return new Promise((resolve, reject) => {
    const { spawn } = require("child_process");
    const command = spawn(...cmd);
    let result = "";
    // Decode at the stream level: converting each chunk on its own would
    // corrupt a multi-byte UTF-8 character that is split across two chunks.
    command.stdout.setEncoding("utf8");
    command.stdout.on("data", chunk => {
      result += chunk;
    });
    command.on("close", () => {
      resolve(result);
    });
    command.on("error", err => {
      reject(err);
    });
  });
}
/**
 * Runs `mutool info -F` on a PDF for the page range 0-2147483647 and returns
 * the parsed font data.
 *
 * @param {string} filePath - Path of the PDF file passed to mutool.
 * @returns {Promise<*>} Whatever `extractFontData` produces for mutool's stdout.
 */
async function wrapper(filePath) {
  const mutoolArgs = ["info", "-F", filePath, "0-2147483647"];
  const mutoolOutput = await run("mutool", mutoolArgs);
  return extractFontData(mutoolOutput);
}
示例用法:
// Print the parsed font data; report failures (for example when the mutool
// binary cannot be started) instead of leaving the rejection unhandled.
wrapper("ahhn101.pdf")
  .then(data => console.log(data))
  .catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
结果:
我正在使用 pdf.js 从 PDF 中提取文本,但字体名称显示为 g_d0_f6 等内部标识符。
我需要真实的字体名称,以便选用相应的映射表将文本转换为 Unicode。以下代码派生自 pdf2svg.js 示例:
var fs = require('fs');
var util = require('util');
var path = require('path');
var stream = require('stream');
// HACK few hacks to let PDF.js be loaded not as a module in global space.
require('./domstubs.js').setStubs(global);
var pdfjsLib = require('pdfjs-dist');
// PDF to read: the first command-line argument, or a default sample path.
var pdfPath = process.argv[2] || '../../web/compressed.tracemonkey-pldi-09.pdf';
// The document is handed to PDF.js as the raw bytes of the file.
var data = new Uint8Array(fs.readFileSync(pdfPath));
var loadingTask = pdfjsLib.getDocument({
  data: data,
  nativeImageDecoderSupport: pdfjsLib.NativeImageDecoding.DISPLAY,
});

loadingTask.promise.then(function (doc) {
  // Fetches one page and logs its text content object.
  function printPage(pageNumber) {
    return doc.getPage(pageNumber)
      .then(function (page) {
        return page.getTextContent();
      })
      .then(function (textContent) {
        console.log(textContent);
      });
  }

  var pageNumbers = [];
  for (var n = 1; n <= doc.numPages; n++) {
    pageNumbers.push(n);
  }

  // Chain the pages so each one is processed only after the previous one
  // has finished; the resulting promise settles after the last page.
  return pageNumbers.reduce(function (chain, pageNumber) {
    return chain.then(function () {
      return printPage(pageNumber);
    });
  }, Promise.resolve());
}).then(function () {
  console.log('# End of Document');
}, function (err) {
  console.error('Error: ' + err);
});
示例输出:-
{ items:
[ { str: 'bl fp=k osQ ckjs esa cPpksa ls ckrphr djsa & ;g LowQy esa fdl le; dk n`\'; gS\ cPps',
dir: 'ltr',
width: 396.2250000000001,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'D;k dj jgs gSa\ cPps dkSu&dkSu ls [ksy] [ksy j',
dir: 'ltr',
width: 216.1650000000001,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'g',
dir: 'ltr',
width: 6.42,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 's gSa\ fp=k esa fdrus cPps gSa vkSj fdrus',
dir: 'ltr',
width: 173.865,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'cM+s gSa\ vkil esa dkSu D;k ckr dj jgk gksxk\ cPpksa ls fp=k esa lcosQ fy, uke lkspus',
dir: 'ltr',
width: 396.54000000000013,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'dks dgasaA',
dir: 'ltr',
width: 40.74,
height: 15,
transform: [Array],
fontName: 'g_d0_f1' },
{ str: 'csVh cpkvks',
dir: 'ltr',
width: 66.725,
height: 17,
transform: [Array],
fontName: 'g_d0_f2' },
{ str: 'csVh i<+kvksA',
dir: 'ltr',
width: 66.75899999999999,
height: 17,
transform: [Array],
fontName: 'g_d0_f2' },
{ str: '2018-19',
dir: 'ltr',
width: 36.690000000000005,
height: 10,
transform: [Array],
fontName: 'g_d0_f3' } ],
styles:
{ g_d0_f1:
{ fontFamily: 'sans-serif',
ascent: 0.837,
descent: -0.216,
vertical: false },
g_d0_f2:
{ fontFamily: 'sans-serif',
ascent: 0.786,
descent: -0.181,
vertical: false },
g_d0_f3:
{ fontFamily: 'sans-serif',
ascent: 0.9052734375,
descent: -0.2119140625,
vertical: false } } }
这里是使用嵌入字体的 pdf:http://ncert.nic.in/textbook/pdf/ahhn101.pdf
这是一个相关问题,但建议的 commonObjs 是空的:
我认为您是在正确的轨道上:page.commonObjs
是找到实际字体名称的地方。但是,page.commonObjs
仅在访问页面的 text/operators 时才会填充,因此如果您在此之前查看,您会发现它是空的。
注意:下面的答案与 pdf.js 没有任何关系,但它回答了问题,从 PDF 中提取字体名称。
我没有找到基于 pdf.js 的解决方案,所以转而使用 mutool,它可以通过以下命令获取每一页的字体信息。
mutool info -F input.pdf 0-2147483647
然后我借用了一个基于 spawn 的辅助函数来运行该命令,
并通过一些正则表达式和模式匹配把输出解析成要返回的数据。
/**
 * Parses the text printed by `mutool info -F <file> <pages>` into one entry
 * per page that lists fonts.
 *
 * A page is recognised by a header of the form
 *   Page <n>:
 *   Fonts (<count>):
 * and every non-empty line after that header is treated as one font line.
 *
 * The function is intentionally kept `async` (although it never awaits) so
 * that callers relying on the returned Promise keep working.
 *
 * @param {string} str - Raw stdout of the mutool command.
 * @returns {Promise<Array<{page: string, fonts: string, fontList: Array<string|null>}>>}
 *   `page` and `fonts` are the digit strings captured from the header;
 *   `fontList` holds one extracted name (or `null` when nothing could be
 *   extracted) per font line.
 */
const extractFontData = async str => {
  // Accept both LF and CRLF so output captured on Windows parses as well.
  const pageHeader = /Page (\d+):\r?\nFonts \((\d+)\):/;
  // Name that follows the "+" of a subset tag, e.g. "ABCDEF+Arial,Bold" -> "Arial,Bold".
  const subsetName = /\+([a-zA-Z0-9_-]+[.,]?[a-zA-Z0-9_-]+)/;
  // Fallback for fonts without a subset tag: take the whole quoted name.
  // NOTE(review): assumes mutool prints font names in single quotes - confirm
  // against the mutool version in use; without quotes the result stays null.
  const quotedName = /'([^']+)'/;

  const singleFont = fontData => {
    const match = fontData.match(subsetName) || fontData.match(quotedName);
    return match && match[1];
  };

  const pages = [];
  for (const singlePageData of str.split("Page ")) {
    // split() removed the "Page " prefix; restore it so the header regex applies.
    const header = `Page ${singlePageData}`.match(pageHeader);
    if (!header) {
      continue;
    }
    const lines = singlePageData.split(/\r?\n/).filter(line => line.length);
    // The first two non-empty lines are "<n>:" and "Fonts (<count>):".
    pages.push({
      page: header[1],
      fonts: header[2],
      fontList: lines.slice(2).map(singleFont),
    });
  }
  return pages;
};
// Taken and adjusted from:
/**
 * Spawns a child process and collects everything it writes to stdout.
 *
 * @param {...*} cmd - Arguments forwarded unchanged to `child_process.spawn`
 *   (command, argument list, options).
 * @returns {Promise<string>} Resolves with the captured stdout when the
 *   process closes (the exit code is not inspected); rejects when the child
 *   process emits an "error" event.
 */
function run(...cmd) {
  return new Promise((resolve, reject) => {
    const { spawn } = require("child_process");
    const command = spawn(...cmd);
    let result = "";
    // Decode at the stream level: converting each chunk on its own would
    // corrupt a multi-byte UTF-8 character that is split across two chunks.
    command.stdout.setEncoding("utf8");
    command.stdout.on("data", chunk => {
      result += chunk;
    });
    command.on("close", () => {
      resolve(result);
    });
    command.on("error", err => {
      reject(err);
    });
  });
}
/**
 * Runs `mutool info -F` on a PDF for the page range 0-2147483647 and returns
 * the parsed font data.
 *
 * @param {string} filePath - Path of the PDF file passed to mutool.
 * @returns {Promise<*>} Whatever `extractFontData` produces for mutool's stdout.
 */
async function wrapper(filePath) {
  const mutoolArgs = ["info", "-F", filePath, "0-2147483647"];
  const mutoolOutput = await run("mutool", mutoolArgs);
  return extractFontData(mutoolOutput);
}
示例用法:
// Print the parsed font data; report failures (for example when the mutool
// binary cannot be started) instead of leaving the rejection unhandled.
wrapper("ahhn101.pdf")
  .then(data => console.log(data))
  .catch(err => {
    console.error(err);
    process.exitCode = 1;
  });
结果: