是否有任何缩放矩形坐标的 PDF 命令?
Is there any PDF command, that scales rectangle coordinates?
我有一个应用程序,可以从 PDF 文件中提取文本和矩形以供进一步分析。我使用 iTextSharp 进行提取,一切顺利,直到我偶然发现一个文档,其中有一些奇怪的表格单元格矩形。我检索到的绘图命令中的值似乎比最终呈现的矩形的实际尺寸大 10 倍。
举个例子:
2577 831.676 385.996 3.99609 re
同时,在查看文档时,所有矩形似乎都正确地位于文档页面的边界之内。我的猜测是应该存在某种缩放命令,指示这些值需要按比例缩小。这个假设是否正确?否则,如此大的矩形怎么可能被呈现在页面边界之内?
pdf文档在这个link后面:https://www.dropbox.com/s/gyvon0dwk6a9cj0/prEVS_ISO_11620_KOM_et.pdf?dl=0
处理从 PRStream 中提取维度的代码如下:
/// <summary>
/// Tokenizes a content stream, tracks the current transformation matrix (CTM)
/// through the <c>q</c>, <c>Q</c> and <c>cm</c> operators, and returns one
/// rectangle per <c>re</c> operator with its corners mapped through the CTM
/// that is in effect where the rectangle is drawn.
/// </summary>
/// <param name="stream">Content stream whose decoded bytes are scanned.</param>
/// <returns>
/// The rectangles found, in stream order. Each is built from the transformed
/// start corner and the transformed opposite corner of the <c>re</c> operands.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>TryMultiplyMatrix</c> reports failure while concatenating a
/// <c>cm</c> matrix onto the current CTM.
/// </exception>
private static List<PdfRect> GetRectsAndLinesFromStream(PRStream stream)
{
    var streamBytes = PdfReader.GetStreamBytes(stream);
    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
    List<string> newBuf = new List<string>();
    List<PdfRect> rects = new List<PdfRect>();
    // Current transformation matrix; null stands for the identity matrix.
    float[,] ctm = null;
    // Graphics-state stack for q/Q. Entries may be null (identity).
    // Matrices are never mutated in place, so sharing references is safe.
    List<float[,]> ctms = new List<float[,]>();
    // Format definition for string -> float conversion.
    var format = new System.Globalization.NumberFormatInfo();
    format.NegativeSign = "-";
    while (tokenizer.NextToken())
    {
        // Keep every token so an operator can look back at its operands.
        newBuf.Add(tokenizer.StringValue);
        if (tokenizer.TokenType != PRTokeniser.TokType.OTHER)
        {
            continue;
        }

        string op = newBuf[newBuf.Count - 1];
        if (op == "re")
        {
            // "x y width height re": four operands must precede the operator.
            if (newBuf.Count < 5)
            {
                continue;
            }

            float x = (float)double.Parse(newBuf[newBuf.Count - 5], format);
            float y = (float)double.Parse(newBuf[newBuf.Count - 4], format);
            float width = (float)double.Parse(newBuf[newBuf.Count - 3], format);
            float height = (float)double.Parse(newBuf[newBuf.Count - 2], format);
            float oppositeX = x + width;
            float oppositeY = y + height;

            float startPointX = x;
            float startPointY = y;
            float endPointX = oppositeX;
            float endPointY = oppositeY;
            if (ctm != null)
            {
                float a = ctm[0, 0];
                float b = ctm[0, 1];
                float c = ctm[1, 0];
                float d = ctm[1, 1];
                float e = ctm[2, 0];
                float f = ctm[2, 1];
                // The operands are user-space coordinates, so apply the CTM
                // forwards: x' = x*a + y*c + e ; y' = x*b + y*d + f.
                // Both results are computed from the untransformed x and y.
                startPointX = x * a + y * c + e;
                startPointY = x * b + y * d + f;
                endPointX = oppositeX * a + oppositeY * c + e;
                endPointY = oppositeX * b + oppositeY * d + f;
            }
            rects.Add(new PdfRect(startPointX, startPointY, endPointX, endPointY));
        }
        else if (op == "q")
        {
            // Save the graphics state. Push even when the CTM is still the
            // identity (null) so that every Q pairs with its own q.
            ctms.Add(ctm);
        }
        else if (op == "Q")
        {
            // Restore the most recently saved CTM, if any.
            if (ctms.Count > 0)
            {
                ctm = ctms[ctms.Count - 1];
                ctms.RemoveAt(ctms.Count - 1);
            }
        }
        else if (op == "cm")
        {
            // "a b c d e f cm": six operands must precede the operator.
            if (newBuf.Count < 7)
            {
                continue;
            }

            float a = (float)double.Parse(newBuf[newBuf.Count - 7], format);
            float b = (float)double.Parse(newBuf[newBuf.Count - 6], format);
            float c = (float)double.Parse(newBuf[newBuf.Count - 5], format);
            float d = (float)double.Parse(newBuf[newBuf.Count - 4], format);
            float e = (float)double.Parse(newBuf[newBuf.Count - 3], format);
            float f = (float)double.Parse(newBuf[newBuf.Count - 2], format);
            float[,] appended = new float[3, 3] {
                {a, b, 0},
                {c, d, 0},
                {e, f, 1}
            };
            if (ctm == null)
            {
                ctm = appended;
            }
            else
            {
                // cm always concatenates onto the current CTM: CTM' = M x CTM,
                // so the new matrix is the left-hand factor.
                // NOTE(review): assumes TryMultiplyMatrix(left, right, out r)
                // yields r = left x right - confirm against its definition.
                float[,] combined;
                if (!TryMultiplyMatrix(appended, ctm, out combined))
                {
                    throw new InvalidOperationException("Invalid transform matrix");
                }
                ctm = combined;
            }
        }
    }
    return rects;
}
您要查找的命令是 cm。您读过《The ABC of PDF with iText》吗?这本书还没有完成,但您已经可以下载前五章了。
这是显示 cm 运算符的表格的屏幕截图:
下面的示例以完全相同的方式、使用完全相同的语法创建了 5 个形状:
它们被添加在不同的位置,甚至是不同的大小和形状,因为图形状态的变化:坐标系改变了,形状在改变后的坐标系中呈现。
我有一个应用程序,可以从 PDF 文件中提取文本和矩形以供进一步分析。我使用 iTextSharp 进行提取,一切顺利,直到我偶然发现一个文档,其中有一些奇怪的表格单元格矩形。我检索到的绘图命令中的值似乎比最终呈现的矩形的实际尺寸大 10 倍。
举个例子:
2577 831.676 385.996 3.99609 re
同时,在查看文档时,所有矩形似乎都正确地位于文档页面的边界之内。我的猜测是应该存在某种缩放命令,指示这些值需要按比例缩小。这个假设是否正确?否则,如此大的矩形怎么可能被呈现在页面边界之内?
pdf文档在这个link后面:https://www.dropbox.com/s/gyvon0dwk6a9cj0/prEVS_ISO_11620_KOM_et.pdf?dl=0
处理从 PRStream 中提取维度的代码如下:
/// <summary>
/// Tokenizes a content stream, tracks the current transformation matrix (CTM)
/// through the <c>q</c>, <c>Q</c> and <c>cm</c> operators, and returns one
/// rectangle per <c>re</c> operator with its corners mapped through the CTM
/// that is in effect where the rectangle is drawn.
/// </summary>
/// <param name="stream">Content stream whose decoded bytes are scanned.</param>
/// <returns>
/// The rectangles found, in stream order. Each is built from the transformed
/// start corner and the transformed opposite corner of the <c>re</c> operands.
/// </returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>TryMultiplyMatrix</c> reports failure while concatenating a
/// <c>cm</c> matrix onto the current CTM.
/// </exception>
private static List<PdfRect> GetRectsAndLinesFromStream(PRStream stream)
{
    var streamBytes = PdfReader.GetStreamBytes(stream);
    var tokenizer = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
    List<string> newBuf = new List<string>();
    List<PdfRect> rects = new List<PdfRect>();
    // Current transformation matrix; null stands for the identity matrix.
    float[,] ctm = null;
    // Graphics-state stack for q/Q. Entries may be null (identity).
    // Matrices are never mutated in place, so sharing references is safe.
    List<float[,]> ctms = new List<float[,]>();
    // Format definition for string -> float conversion.
    var format = new System.Globalization.NumberFormatInfo();
    format.NegativeSign = "-";
    while (tokenizer.NextToken())
    {
        // Keep every token so an operator can look back at its operands.
        newBuf.Add(tokenizer.StringValue);
        if (tokenizer.TokenType != PRTokeniser.TokType.OTHER)
        {
            continue;
        }

        string op = newBuf[newBuf.Count - 1];
        if (op == "re")
        {
            // "x y width height re": four operands must precede the operator.
            if (newBuf.Count < 5)
            {
                continue;
            }

            float x = (float)double.Parse(newBuf[newBuf.Count - 5], format);
            float y = (float)double.Parse(newBuf[newBuf.Count - 4], format);
            float width = (float)double.Parse(newBuf[newBuf.Count - 3], format);
            float height = (float)double.Parse(newBuf[newBuf.Count - 2], format);
            float oppositeX = x + width;
            float oppositeY = y + height;

            float startPointX = x;
            float startPointY = y;
            float endPointX = oppositeX;
            float endPointY = oppositeY;
            if (ctm != null)
            {
                float a = ctm[0, 0];
                float b = ctm[0, 1];
                float c = ctm[1, 0];
                float d = ctm[1, 1];
                float e = ctm[2, 0];
                float f = ctm[2, 1];
                // The operands are user-space coordinates, so apply the CTM
                // forwards: x' = x*a + y*c + e ; y' = x*b + y*d + f.
                // Both results are computed from the untransformed x and y.
                startPointX = x * a + y * c + e;
                startPointY = x * b + y * d + f;
                endPointX = oppositeX * a + oppositeY * c + e;
                endPointY = oppositeX * b + oppositeY * d + f;
            }
            rects.Add(new PdfRect(startPointX, startPointY, endPointX, endPointY));
        }
        else if (op == "q")
        {
            // Save the graphics state. Push even when the CTM is still the
            // identity (null) so that every Q pairs with its own q.
            ctms.Add(ctm);
        }
        else if (op == "Q")
        {
            // Restore the most recently saved CTM, if any.
            if (ctms.Count > 0)
            {
                ctm = ctms[ctms.Count - 1];
                ctms.RemoveAt(ctms.Count - 1);
            }
        }
        else if (op == "cm")
        {
            // "a b c d e f cm": six operands must precede the operator.
            if (newBuf.Count < 7)
            {
                continue;
            }

            float a = (float)double.Parse(newBuf[newBuf.Count - 7], format);
            float b = (float)double.Parse(newBuf[newBuf.Count - 6], format);
            float c = (float)double.Parse(newBuf[newBuf.Count - 5], format);
            float d = (float)double.Parse(newBuf[newBuf.Count - 4], format);
            float e = (float)double.Parse(newBuf[newBuf.Count - 3], format);
            float f = (float)double.Parse(newBuf[newBuf.Count - 2], format);
            float[,] appended = new float[3, 3] {
                {a, b, 0},
                {c, d, 0},
                {e, f, 1}
            };
            if (ctm == null)
            {
                ctm = appended;
            }
            else
            {
                // cm always concatenates onto the current CTM: CTM' = M x CTM,
                // so the new matrix is the left-hand factor.
                // NOTE(review): assumes TryMultiplyMatrix(left, right, out r)
                // yields r = left x right - confirm against its definition.
                float[,] combined;
                if (!TryMultiplyMatrix(appended, ctm, out combined))
                {
                    throw new InvalidOperationException("Invalid transform matrix");
                }
                ctm = combined;
            }
        }
    }
    return rects;
}
您要查找的命令是 cm。您读过《The ABC of PDF with iText》吗?这本书还没有完成,但您已经可以下载前五章了。
这是显示 cm 运算符的表格的屏幕截图:
下面的示例以完全相同的方式、使用完全相同的语法创建了 5 个形状:
它们被添加在不同的位置,甚至是不同的大小和形状,因为图形状态的变化:坐标系改变了,形状在改变后的坐标系中呈现。