如何使用pdfbox获取页面内容高度
How to get page content height using pdfbox
是否可以使用pdfbox获取页面内容的高度?
我想我已经尝试了所有方法,但页面的每个边界框 (PDRectangle) 返回的都是全高:842。
一开始我以为这是因为页码放在了页面的底部,但是当我在 Illustrator 中打开 pdf 时,整个内容都在复合元素内,并没有扩展到整个页面高度。所以如果illustrator可以把它看作一个单独的元素并计算它的高度,我想这在pdfbox中应该也是可能的。
示例页面:
一般
PDF 规范允许 PDF 提供多个页面边界,参见 this answer。除了它们之外,内容边界可能仅来自页面内容,例如来自
表单 XObjects:
A form XObject is a PDF content stream that is a self-contained description of any sequence of graphics objects (including path objects, text objects, and sampled images). A form XObject may be painted multiple times—either on several pages or at several locations on the same page—and produces the same results each time, subject only to the graphics state at the time it is invoked.
剪切路径:
The graphics state shall contain a current clipping path that limits the regions of the page affected by painting operators. The closed subpaths of this path shall define the area that can be painted. Marks falling inside this area shall be applied to the page; those falling outside it shall not be.
...
要找到它们中的任何一个,必须解析页面内容,寻找适当的操作,并计算结果边界。
在 OP 的情况下
您的每个示例 PDF 仅明确定义一个页面边界,MediaBox。因此,所有其他 PDF 页面边界(CropBox、BleedBox、TrimBox、ArtBox) 默认为它。所以难怪在你的尝试中
each (PDRectangle) returns full height of the page: 842
这些 PDF 都不包含表单 XObject,但都使用了剪切路径。
对于测试-pdf4.pdf:
Start at: 28.31999969482422, 813.6799926757812
Line to: 565.9199829101562, 813.6799926757812
Line to: 565.9199829101562, 660.2196655273438
Line to: 28.31999969482422, 660.2196655273438
Line to: 28.31999969482422, 813.6799926757812
(这可能与您问题中的草图相符。)
对于测试-pdf5.pdf:
Start at: 23.0, 34.0
Line to: 572.0, 34.0
Line to: 572.0, -751.0
Line to: 23.0, -751.0
Line to: 23.0, 34.0
和
Start at: 23.0, 819.0
Line to: 572.0, 819.0
Line to: 572.0, 34.0
Line to: 23.0, 34.0
Line to: 23.0, 819.0
由于它与草图相符,我推测 Illustrator 会把在非平凡剪切路径生效期间绘制的所有内容视为一个复合元素,并以该剪切路径作为其边框。
使用 PDFBox 查找剪切路径
我使用 PDFBox 找到了上面提到的剪切路径。我使用了正在开发的 2.0.0 版本的当前 SNAPSHOT,因为与当前版本 1.8.8 相比,所需的 API 有了很大改进。
我扩展了 PDFGraphicsStreamEngine,得到 ClipPathFinder 类:
/**
 * Graphics stream engine that collects every path a page's content stream uses as a clipping path.
 *
 * Path construction callbacks build up {@link #currentPath}; {@link #clip(int)} stores the path
 * built so far in {@link #paths}; every painting callback discards the path under construction.
 * Iterating over an instance yields the collected clip paths in the order they were recorded.
 *
 * Callbacks that arrive while no path is under construction are handled without dereferencing
 * the missing path, so a content stream with stray operators no longer triggers a
 * NullPointerException.
 */
public class ClipPathFinder extends PDFGraphicsStreamEngine implements Iterable<Path>
{
    public ClipPathFinder(PDPage page)
    {
        super(page);
    }

    /**
     * Processes the page given to the constructor and records its clip paths.
     *
     * @throws IOException if processing the page fails
     */
    public void findClipPaths() throws IOException
    {
        processPage(getPage());
    }

    //
    // PDFGraphicsStreamEngine overrides
    //
    @Override
    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException
    {
        startPathIfNecessary();
        currentPath.appendRectangle(toFloat(p0), toFloat(p1), toFloat(p2), toFloat(p3));
    }

    @Override
    public void drawImage(PDImage pdImage) throws IOException
    {
        // Images do not contribute to clip paths.
    }

    @Override
    public void clip(int windingRule) throws IOException
    {
        if (currentPath == null)
        {
            // Clip requested with no path under construction: nothing to record.
            return;
        }
        currentPath.complete(windingRule);
        paths.add(currentPath);
        currentPath = null;
    }

    @Override
    public void moveTo(float x, float y) throws IOException
    {
        startPathIfNecessary();
        currentPath.moveTo(x, y);
    }

    @Override
    public void lineTo(float x, float y) throws IOException
    {
        if (currentPath == null)
        {
            // No path and hence no current point: begin a path at the target instead of failing.
            moveTo(x, y);
            return;
        }
        currentPath.lineTo(x, y);
    }

    @Override
    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException
    {
        if (currentPath == null)
        {
            // No path and hence no current point: begin a path at the curve's end point.
            moveTo(x3, y3);
            return;
        }
        currentPath.curveTo(x1, y1, x2, y2, x3, y3);
    }

    /**
     * @return the current point of the path under construction, or {@code null} when no path
     *         is under construction
     */
    @Override
    public Point2D.Float getCurrentPoint() throws IOException
    {
        return currentPath == null ? null : currentPath.getCurrentPoint();
    }

    @Override
    public void closePath() throws IOException
    {
        if (currentPath != null)
        {
            currentPath.closePath();
        }
    }

    @Override
    public void endPath() throws IOException
    {
        currentPath = null;
    }

    @Override
    public void strokePath() throws IOException
    {
        currentPath = null;
    }

    @Override
    public void fillPath(int windingRule) throws IOException
    {
        currentPath = null;
    }

    @Override
    public void fillAndStrokePath(int windingRule) throws IOException
    {
        currentPath = null;
    }

    @Override
    public void shadingFill(COSName shadingName) throws IOException
    {
        currentPath = null;
    }

    /** Creates a fresh path if none is under construction. */
    void startPathIfNecessary()
    {
        if (currentPath == null)
        {
            currentPath = new Path();
        }
    }

    /**
     * Converts a point to {@code Point2D.Float}; {@code null} and points that already are
     * {@code Point2D.Float} are passed through unchanged.
     */
    Point2D.Float toFloat(Point2D p)
    {
        if (p == null || (p instanceof Point2D.Float))
        {
            return (Point2D.Float) p;
        }
        return new Point2D.Float((float) p.getX(), (float) p.getY());
    }

    //
    // Iterable<Path> implementation
    //
    public Iterator<Path> iterator()
    {
        return paths.iterator();
    }

    /** Path under construction, or {@code null} between paths. */
    Path currentPath = null;
    /** Clip paths recorded so far, in recording order. */
    final List<Path> paths = new ArrayList<Path>();
}
它使用下面这个辅助类来表示路径:
/**
 * A path made of subpaths, together with the winding rule it was completed with.
 *
 * Subpaths are started by {@link #moveTo(float, float)} or
 * {@link #appendRectangle(Point2D.Float, Point2D.Float, Point2D.Float, Point2D.Float)} and are
 * moved into the subpath list when they are finished. The path keeps a current point, which the
 * inner {@link SubPath} instances update as segments are added.
 *
 * Once a subpath has been closed (or a rectangle appended) no subpath is open any more, yet the
 * current point is still defined. A following {@code lineTo}/{@code curveTo} therefore begins a
 * new subpath at the current point, and a following {@code closePath} is a no-op; previously
 * both threw a NullPointerException.
 */
public class Path implements Iterable<Path.SubPath>
{
    /** A single segment running from {@code start} to {@code end}. */
    public static class Segment
    {
        Segment(Point2D.Float start, Point2D.Float end)
        {
            this.start = start;
            this.end = end;
        }

        public Point2D.Float getStart()
        {
            return start;
        }

        public Point2D.Float getEnd()
        {
            return end;
        }

        final Point2D.Float start, end;
    }

    /** A start point followed by line and curve segments, optionally closed. */
    public class SubPath implements Iterable<Segment>
    {
        /** Straight segment. */
        public class Line extends Segment
        {
            Line(Point2D.Float start, Point2D.Float end)
            {
                super(start, end);
            }

            //
            // Object override
            //
            @Override
            public String toString()
            {
                StringBuilder builder = new StringBuilder();
                builder.append(" Line to: ")
                       .append(end.getX())
                       .append(", ")
                       .append(end.getY())
                       .append('\n');
                return builder.toString();
            }
        }

        /** Curve segment with two control points. */
        public class Curve extends Segment
        {
            Curve(Point2D.Float start, Point2D.Float control1, Point2D.Float control2, Point2D.Float end)
            {
                super(start, end);
                this.control1 = control1;
                this.control2 = control2;
            }

            public Point2D getControl1()
            {
                return control1;
            }

            public Point2D getControl2()
            {
                return control2;
            }

            //
            // Object override
            //
            @Override
            public String toString()
            {
                StringBuilder builder = new StringBuilder();
                builder.append(" Curve to: ")
                       .append(end.getX())
                       .append(", ")
                       .append(end.getY())
                       .append(" with Control1: ")
                       .append(control1.getX())
                       .append(", ")
                       .append(control1.getY())
                       .append(" and Control2: ")
                       .append(control2.getX())
                       .append(", ")
                       .append(control2.getY())
                       .append('\n');
                return builder.toString();
            }

            final Point2D control1, control2;
        }

        /** Starts a subpath at {@code start} and makes it the enclosing path's current point. */
        SubPath(Point2D.Float start)
        {
            this.start = start;
            currentPoint = start;
        }

        public Point2D getStart()
        {
            return start;
        }

        void lineTo(float x, float y)
        {
            Point2D.Float end = new Point2D.Float(x, y);
            segments.add(new Line(currentPoint, end));
            currentPoint = end;
        }

        void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
        {
            Point2D.Float control1 = new Point2D.Float(x1, y1);
            Point2D.Float control2 = new Point2D.Float(x2, y2);
            Point2D.Float end = new Point2D.Float(x3, y3);
            segments.add(new Curve(currentPoint, control1, control2, end));
            currentPoint = end;
        }

        /** Marks the subpath closed and moves the current point back to its start. */
        void closePath()
        {
            closed = true;
            currentPoint = start;
        }

        //
        // Iterable<Segment> implementation
        //
        public Iterator<Segment> iterator()
        {
            return segments.iterator();
        }

        /** Renders the subpath after the given header text; shared by the toString variants. */
        String describe(String header)
        {
            StringBuilder builder = new StringBuilder();
            builder.append(header)
                   .append(start.getX())
                   .append(", ")
                   .append(start.getY())
                   .append('\n');
            for (Segment segment : segments)
            {
                builder.append(segment);
            }
            if (closed)
            {
                builder.append(" Closed\n");
            }
            builder.append(" }\n");
            return builder.toString();
        }

        //
        // Object override
        //
        @Override
        public String toString()
        {
            return describe(" {\n Start at: ");
        }

        boolean closed = false;
        final Point2D.Float start;
        final List<Segment> segments = new ArrayList<Path.Segment>();
    }

    /** Closed subpath through the four corner points p0, p1, p2, p3. */
    public class Rectangle extends SubPath
    {
        Rectangle(Point2D.Float p0, Point2D.Float p1, Point2D.Float p2, Point2D.Float p3)
        {
            super(p0);
            lineTo((float) p1.getX(), (float) p1.getY());
            lineTo((float) p2.getX(), (float) p2.getY());
            lineTo((float) p3.getX(), (float) p3.getY());
            closePath();
        }

        //
        // Object override
        //
        @Override
        public String toString()
        {
            return describe(" {\n Rectangle\n Start at: ");
        }
    }

    /** @return the winding rule passed to {@link #complete(int)}, or -1 if not completed yet */
    public int getWindingRule()
    {
        return windingRule;
    }

    /** Finishes any open subpath and records the winding rule. */
    void complete(int windingRule)
    {
        finishSubPath();
        this.windingRule = windingRule;
    }

    /** Appends a complete rectangle subpath; afterwards no subpath is open. */
    void appendRectangle(Point2D.Float p0, Point2D.Float p1, Point2D.Float p2, Point2D.Float p3) throws IOException
    {
        finishSubPath();
        currentSubPath = new Rectangle(p0, p1, p2, p3);
        finishSubPath();
    }

    /** Finishes any open subpath and opens a new one at the given point. */
    void moveTo(float x, float y) throws IOException
    {
        finishSubPath();
        currentSubPath = new SubPath(new Point2D.Float(x, y));
    }

    /**
     * Adds a line from the current point.
     *
     * @throws IllegalStateException if the path has no current point yet
     */
    void lineTo(float x, float y) throws IOException
    {
        openSubPath("lineTo").lineTo(x, y);
    }

    /**
     * Adds a curve from the current point.
     *
     * @throws IllegalStateException if the path has no current point yet
     */
    void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException
    {
        openSubPath("curveTo").curveTo(x1, y1, x2, y2, x3, y3);
    }

    /** @return the current point, or {@code null} if no subpath has been started yet */
    Point2D.Float getCurrentPoint() throws IOException
    {
        return currentPoint;
    }

    /** Closes and finishes the open subpath; does nothing when no subpath is open. */
    void closePath() throws IOException
    {
        if (currentSubPath == null)
        {
            // Already closed (e.g. right after a rectangle) or nothing started yet.
            return;
        }
        currentSubPath.closePath();
        finishSubPath();
    }

    /** Moves the open subpath, if any, into the list of finished subpaths. */
    void finishSubPath()
    {
        if (currentSubPath != null)
        {
            subPaths.add(currentSubPath);
            currentSubPath = null;
        }
    }

    /**
     * Returns the open subpath, beginning a new one at the current point when the previous
     * subpath has already been finished.
     */
    private SubPath openSubPath(String operation)
    {
        if (currentSubPath == null)
        {
            if (currentPoint == null)
            {
                throw new IllegalStateException(operation + " requires a current point; call moveTo first");
            }
            currentSubPath = new SubPath(currentPoint);
        }
        return currentSubPath;
    }

    //
    // Iterable<Path.SubPath> implementation
    //
    public Iterator<SubPath> iterator()
    {
        return subPaths.iterator();
    }

    //
    // Object override
    //
    @Override
    public String toString()
    {
        StringBuilder builder = new StringBuilder();
        builder.append("{\n Winding: ")
               .append(windingRule)
               .append('\n');
        for (SubPath subPath : subPaths)
        {
            builder.append(subPath);
        }
        builder.append("}\n");
        return builder.toString();
    }

    /** Current point of the path; written by the inner SubPath instances. */
    Point2D.Float currentPoint = null;
    /** Subpath still accepting segments, or {@code null} when none is open. */
    SubPath currentSubPath = null;
    int windingRule = -1;
    final List<SubPath> subPaths = new ArrayList<Path.SubPath>();
}
ClipPathFinder 类的用法如下:
// Load the document, print every clip path found on one page, and always release the document.
PDDocument document = PDDocument.load(PDFRESOURCE, null);
try
{
    PDPage page = document.getPage(PAGENUMBER);
    ClipPathFinder finder = new ClipPathFinder(page);
    finder.findClipPaths();
    for (Path path : finder)
    {
        System.out.println(path);
    }
}
finally
{
    // Close even when page processing throws, so the document is not leaked.
    document.close();
}
是否可以使用pdfbox获取页面内容的高度? 我想我已经尝试了所有方法,但页面的每个边界框 (PDRectangle) 返回的都是全高:842。 一开始我以为这是因为页码放在了页面的底部,但是当我在 Illustrator 中打开 pdf 时,整个内容都在复合元素内,并没有扩展到整个页面高度。所以如果illustrator可以把它看作一个单独的元素并计算它的高度,我想这在pdfbox中应该也是可能的。
示例页面:
一般
PDF 规范允许 PDF 提供多个页面边界,参见 this answer。除了它们之外,内容边界可能仅来自页面内容,例如来自
表单 XObjects:
A form XObject is a PDF content stream that is a self-contained description of any sequence of graphics objects (including path objects, text objects, and sampled images). A form XObject may be painted multiple times—either on several pages or at several locations on the same page—and produces the same results each time, subject only to the graphics state at the time it is invoked.
剪切路径:
The graphics state shall contain a current clipping path that limits the regions of the page affected by painting operators. The closed subpaths of this path shall define the area that can be painted. Marks falling inside this area shall be applied to the page; those falling outside it shall not be.
...
要找到它们中的任何一个,必须解析页面内容,寻找适当的操作,并计算结果边界。
在 OP 的情况下
您的每个示例 PDF 仅明确定义一个页面边界,MediaBox。因此,所有其他 PDF 页面边界(CropBox、BleedBox、TrimBox、ArtBox) 默认为它。所以难怪在你的尝试中
each (PDRectangle) returns full height of the page: 842
这些 PDF 都不包含表单 XObject,但都使用了剪切路径。
对于测试-pdf4.pdf:
Start at: 28.31999969482422, 813.6799926757812 Line to: 565.9199829101562, 813.6799926757812 Line to: 565.9199829101562, 660.2196655273438 Line to: 28.31999969482422, 660.2196655273438 Line to: 28.31999969482422, 813.6799926757812
(这可能与您问题中的草图相符。)
对于测试-pdf5.pdf:
Start at: 23.0, 34.0 Line to: 572.0, 34.0 Line to: 572.0, -751.0 Line to: 23.0, -751.0 Line to: 23.0, 34.0
和
Start at: 23.0, 819.0 Line to: 572.0, 819.0 Line to: 572.0, 34.0 Line to: 23.0, 34.0 Line to: 23.0, 819.0
由于它与草图相符,我推测 Illustrator 会把在非平凡剪切路径生效期间绘制的所有内容视为一个复合元素,并以该剪切路径作为其边框。
使用 PDFBox 查找剪切路径
我使用 PDFBox 找到了上面提到的剪切路径。我使用了正在开发的 2.0.0 版本的当前 SNAPSHOT,因为与当前版本 1.8.8 相比,所需的 API 有了很大改进。
我扩展了 PDFGraphicsStreamEngine,得到 ClipPathFinder 类:
/**
 * Graphics stream engine that collects every path a page's content stream uses as a clipping path.
 *
 * Path construction callbacks build up {@link #currentPath}; {@link #clip(int)} stores the path
 * built so far in {@link #paths}; every painting callback discards the path under construction.
 * Iterating over an instance yields the collected clip paths in the order they were recorded.
 *
 * Callbacks that arrive while no path is under construction are handled without dereferencing
 * the missing path, so a content stream with stray operators no longer triggers a
 * NullPointerException.
 */
public class ClipPathFinder extends PDFGraphicsStreamEngine implements Iterable<Path>
{
    public ClipPathFinder(PDPage page)
    {
        super(page);
    }

    /**
     * Processes the page given to the constructor and records its clip paths.
     *
     * @throws IOException if processing the page fails
     */
    public void findClipPaths() throws IOException
    {
        processPage(getPage());
    }

    //
    // PDFGraphicsStreamEngine overrides
    //
    @Override
    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException
    {
        startPathIfNecessary();
        currentPath.appendRectangle(toFloat(p0), toFloat(p1), toFloat(p2), toFloat(p3));
    }

    @Override
    public void drawImage(PDImage pdImage) throws IOException
    {
        // Images do not contribute to clip paths.
    }

    @Override
    public void clip(int windingRule) throws IOException
    {
        if (currentPath == null)
        {
            // Clip requested with no path under construction: nothing to record.
            return;
        }
        currentPath.complete(windingRule);
        paths.add(currentPath);
        currentPath = null;
    }

    @Override
    public void moveTo(float x, float y) throws IOException
    {
        startPathIfNecessary();
        currentPath.moveTo(x, y);
    }

    @Override
    public void lineTo(float x, float y) throws IOException
    {
        if (currentPath == null)
        {
            // No path and hence no current point: begin a path at the target instead of failing.
            moveTo(x, y);
            return;
        }
        currentPath.lineTo(x, y);
    }

    @Override
    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException
    {
        if (currentPath == null)
        {
            // No path and hence no current point: begin a path at the curve's end point.
            moveTo(x3, y3);
            return;
        }
        currentPath.curveTo(x1, y1, x2, y2, x3, y3);
    }

    /**
     * @return the current point of the path under construction, or {@code null} when no path
     *         is under construction
     */
    @Override
    public Point2D.Float getCurrentPoint() throws IOException
    {
        return currentPath == null ? null : currentPath.getCurrentPoint();
    }

    @Override
    public void closePath() throws IOException
    {
        if (currentPath != null)
        {
            currentPath.closePath();
        }
    }

    @Override
    public void endPath() throws IOException
    {
        currentPath = null;
    }

    @Override
    public void strokePath() throws IOException
    {
        currentPath = null;
    }

    @Override
    public void fillPath(int windingRule) throws IOException
    {
        currentPath = null;
    }

    @Override
    public void fillAndStrokePath(int windingRule) throws IOException
    {
        currentPath = null;
    }

    @Override
    public void shadingFill(COSName shadingName) throws IOException
    {
        currentPath = null;
    }

    /** Creates a fresh path if none is under construction. */
    void startPathIfNecessary()
    {
        if (currentPath == null)
        {
            currentPath = new Path();
        }
    }

    /**
     * Converts a point to {@code Point2D.Float}; {@code null} and points that already are
     * {@code Point2D.Float} are passed through unchanged.
     */
    Point2D.Float toFloat(Point2D p)
    {
        if (p == null || (p instanceof Point2D.Float))
        {
            return (Point2D.Float) p;
        }
        return new Point2D.Float((float) p.getX(), (float) p.getY());
    }

    //
    // Iterable<Path> implementation
    //
    public Iterator<Path> iterator()
    {
        return paths.iterator();
    }

    /** Path under construction, or {@code null} between paths. */
    Path currentPath = null;
    /** Clip paths recorded so far, in recording order. */
    final List<Path> paths = new ArrayList<Path>();
}
它使用下面这个辅助类来表示路径:
/**
 * A path made of subpaths, together with the winding rule it was completed with.
 *
 * Subpaths are started by {@link #moveTo(float, float)} or
 * {@link #appendRectangle(Point2D.Float, Point2D.Float, Point2D.Float, Point2D.Float)} and are
 * moved into the subpath list when they are finished. The path keeps a current point, which the
 * inner {@link SubPath} instances update as segments are added.
 *
 * Once a subpath has been closed (or a rectangle appended) no subpath is open any more, yet the
 * current point is still defined. A following {@code lineTo}/{@code curveTo} therefore begins a
 * new subpath at the current point, and a following {@code closePath} is a no-op; previously
 * both threw a NullPointerException.
 */
public class Path implements Iterable<Path.SubPath>
{
    /** A single segment running from {@code start} to {@code end}. */
    public static class Segment
    {
        Segment(Point2D.Float start, Point2D.Float end)
        {
            this.start = start;
            this.end = end;
        }

        public Point2D.Float getStart()
        {
            return start;
        }

        public Point2D.Float getEnd()
        {
            return end;
        }

        final Point2D.Float start, end;
    }

    /** A start point followed by line and curve segments, optionally closed. */
    public class SubPath implements Iterable<Segment>
    {
        /** Straight segment. */
        public class Line extends Segment
        {
            Line(Point2D.Float start, Point2D.Float end)
            {
                super(start, end);
            }

            //
            // Object override
            //
            @Override
            public String toString()
            {
                StringBuilder builder = new StringBuilder();
                builder.append(" Line to: ")
                       .append(end.getX())
                       .append(", ")
                       .append(end.getY())
                       .append('\n');
                return builder.toString();
            }
        }

        /** Curve segment with two control points. */
        public class Curve extends Segment
        {
            Curve(Point2D.Float start, Point2D.Float control1, Point2D.Float control2, Point2D.Float end)
            {
                super(start, end);
                this.control1 = control1;
                this.control2 = control2;
            }

            public Point2D getControl1()
            {
                return control1;
            }

            public Point2D getControl2()
            {
                return control2;
            }

            //
            // Object override
            //
            @Override
            public String toString()
            {
                StringBuilder builder = new StringBuilder();
                builder.append(" Curve to: ")
                       .append(end.getX())
                       .append(", ")
                       .append(end.getY())
                       .append(" with Control1: ")
                       .append(control1.getX())
                       .append(", ")
                       .append(control1.getY())
                       .append(" and Control2: ")
                       .append(control2.getX())
                       .append(", ")
                       .append(control2.getY())
                       .append('\n');
                return builder.toString();
            }

            final Point2D control1, control2;
        }

        /** Starts a subpath at {@code start} and makes it the enclosing path's current point. */
        SubPath(Point2D.Float start)
        {
            this.start = start;
            currentPoint = start;
        }

        public Point2D getStart()
        {
            return start;
        }

        void lineTo(float x, float y)
        {
            Point2D.Float end = new Point2D.Float(x, y);
            segments.add(new Line(currentPoint, end));
            currentPoint = end;
        }

        void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
        {
            Point2D.Float control1 = new Point2D.Float(x1, y1);
            Point2D.Float control2 = new Point2D.Float(x2, y2);
            Point2D.Float end = new Point2D.Float(x3, y3);
            segments.add(new Curve(currentPoint, control1, control2, end));
            currentPoint = end;
        }

        /** Marks the subpath closed and moves the current point back to its start. */
        void closePath()
        {
            closed = true;
            currentPoint = start;
        }

        //
        // Iterable<Segment> implementation
        //
        public Iterator<Segment> iterator()
        {
            return segments.iterator();
        }

        /** Renders the subpath after the given header text; shared by the toString variants. */
        String describe(String header)
        {
            StringBuilder builder = new StringBuilder();
            builder.append(header)
                   .append(start.getX())
                   .append(", ")
                   .append(start.getY())
                   .append('\n');
            for (Segment segment : segments)
            {
                builder.append(segment);
            }
            if (closed)
            {
                builder.append(" Closed\n");
            }
            builder.append(" }\n");
            return builder.toString();
        }

        //
        // Object override
        //
        @Override
        public String toString()
        {
            return describe(" {\n Start at: ");
        }

        boolean closed = false;
        final Point2D.Float start;
        final List<Segment> segments = new ArrayList<Path.Segment>();
    }

    /** Closed subpath through the four corner points p0, p1, p2, p3. */
    public class Rectangle extends SubPath
    {
        Rectangle(Point2D.Float p0, Point2D.Float p1, Point2D.Float p2, Point2D.Float p3)
        {
            super(p0);
            lineTo((float) p1.getX(), (float) p1.getY());
            lineTo((float) p2.getX(), (float) p2.getY());
            lineTo((float) p3.getX(), (float) p3.getY());
            closePath();
        }

        //
        // Object override
        //
        @Override
        public String toString()
        {
            return describe(" {\n Rectangle\n Start at: ");
        }
    }

    /** @return the winding rule passed to {@link #complete(int)}, or -1 if not completed yet */
    public int getWindingRule()
    {
        return windingRule;
    }

    /** Finishes any open subpath and records the winding rule. */
    void complete(int windingRule)
    {
        finishSubPath();
        this.windingRule = windingRule;
    }

    /** Appends a complete rectangle subpath; afterwards no subpath is open. */
    void appendRectangle(Point2D.Float p0, Point2D.Float p1, Point2D.Float p2, Point2D.Float p3) throws IOException
    {
        finishSubPath();
        currentSubPath = new Rectangle(p0, p1, p2, p3);
        finishSubPath();
    }

    /** Finishes any open subpath and opens a new one at the given point. */
    void moveTo(float x, float y) throws IOException
    {
        finishSubPath();
        currentSubPath = new SubPath(new Point2D.Float(x, y));
    }

    /**
     * Adds a line from the current point.
     *
     * @throws IllegalStateException if the path has no current point yet
     */
    void lineTo(float x, float y) throws IOException
    {
        openSubPath("lineTo").lineTo(x, y);
    }

    /**
     * Adds a curve from the current point.
     *
     * @throws IllegalStateException if the path has no current point yet
     */
    void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException
    {
        openSubPath("curveTo").curveTo(x1, y1, x2, y2, x3, y3);
    }

    /** @return the current point, or {@code null} if no subpath has been started yet */
    Point2D.Float getCurrentPoint() throws IOException
    {
        return currentPoint;
    }

    /** Closes and finishes the open subpath; does nothing when no subpath is open. */
    void closePath() throws IOException
    {
        if (currentSubPath == null)
        {
            // Already closed (e.g. right after a rectangle) or nothing started yet.
            return;
        }
        currentSubPath.closePath();
        finishSubPath();
    }

    /** Moves the open subpath, if any, into the list of finished subpaths. */
    void finishSubPath()
    {
        if (currentSubPath != null)
        {
            subPaths.add(currentSubPath);
            currentSubPath = null;
        }
    }

    /**
     * Returns the open subpath, beginning a new one at the current point when the previous
     * subpath has already been finished.
     */
    private SubPath openSubPath(String operation)
    {
        if (currentSubPath == null)
        {
            if (currentPoint == null)
            {
                throw new IllegalStateException(operation + " requires a current point; call moveTo first");
            }
            currentSubPath = new SubPath(currentPoint);
        }
        return currentSubPath;
    }

    //
    // Iterable<Path.SubPath> implementation
    //
    public Iterator<SubPath> iterator()
    {
        return subPaths.iterator();
    }

    //
    // Object override
    //
    @Override
    public String toString()
    {
        StringBuilder builder = new StringBuilder();
        builder.append("{\n Winding: ")
               .append(windingRule)
               .append('\n');
        for (SubPath subPath : subPaths)
        {
            builder.append(subPath);
        }
        builder.append("}\n");
        return builder.toString();
    }

    /** Current point of the path; written by the inner SubPath instances. */
    Point2D.Float currentPoint = null;
    /** Subpath still accepting segments, or {@code null} when none is open. */
    SubPath currentSubPath = null;
    int windingRule = -1;
    final List<SubPath> subPaths = new ArrayList<Path.SubPath>();
}
ClipPathFinder 类的用法如下:
// Load the document, print every clip path found on one page, and always release the document.
PDDocument document = PDDocument.load(PDFRESOURCE, null);
try
{
    PDPage page = document.getPage(PAGENUMBER);
    ClipPathFinder finder = new ClipPathFinder(page);
    finder.findClipPaths();
    for (Path path : finder)
    {
        System.out.println(path);
    }
}
finally
{
    // Close even when page processing throws, so the document is not leaked.
    document.close();
}