pdf小丑-不突出显示特定的搜索关键字

Question

我正在使用 pdf-clown 和 pdfclown-0.2.0-HEAD.jar.I 已经编写了下面的代码来突出显示在中文 pdf 文件中搜索关键字，同样的代码在英文 pdf 文件中工作正常。

import java.awt.Color;
import java.awt.Desktop;
import java.awt.geom.Rectangle2D;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.io.BufferedInputStream;
import java.io.File;
import org.pdfclown.documents.Page;
import org.pdfclown.documents.contents.ITextString;
import org.pdfclown.documents.contents.TextChar;
import org.pdfclown.documents.contents.colorSpaces.DeviceRGBColor;
import org.pdfclown.documents.interaction.annotations.TextMarkup;
import org.pdfclown.documents.interaction.annotations.TextMarkup.MarkupTypeEnum;

import org.pdfclown.files.SerializationModeEnum;
import org.pdfclown.util.math.Interval;
import org.pdfclown.util.math.geom.Quad;
import org.pdfclown.tools.TextExtractor;

public class pdfclown2 {
    private static int count;

    public static void main(String[] args) throws IOException {

        highlight("ebook.pdf","C:\Users\Downloads\6.pdf");
        System.out.println("OK");
    }
    private static void highlight(String inputPath, String outputPath) throws IOException {

        URL url = new URL(inputPath);
        InputStream in = new BufferedInputStream(url.openStream());
        org.pdfclown.files.File file = null;

        try {
            file = new org.pdfclown.files.File("C:\Users\Desktop\pdf\test123.pdf");

        Map<String, String> m = new HashMap<String, String>();
            m.put("亿元或","hi");
            m.put("收入亿来","hi");



        System.out.println("map size"+m.size());
         long startTime = System.currentTimeMillis();




            // 2. Iterating through the document pages...
            TextExtractor textExtractor = new TextExtractor(true, true);
            for (final Page page : file.getDocument().getPages()) {
                Map<Rectangle2D, List<ITextString>> textStrings = textExtractor.extract(page);
                for (Map.Entry<String, String> entry : m.entrySet()) {

                    Pattern pattern;
                    String serachKey =  entry.getKey();
                    final String translationKeyword = entry.getValue();
                /*
                        if ((serachKey.contains(")") && serachKey.contains("("))
                                || (serachKey.contains("(") && !serachKey.contains(")"))
                                || (serachKey.contains(")") && !serachKey.contains("(")) || serachKey.contains("?")
                                || serachKey.contains("*") || serachKey.contains("+")) {s
                            pattern = Pattern.compile(Pattern.quote(serachKey), Pattern.CASE_INSENSITIVE);
                        }
                        else*/
                             pattern = Pattern.compile(serachKey, Pattern.CASE_INSENSITIVE);
                // 2.1. Extract the page text!

            //System.out.println(textStrings.toString().indexOf(entry.getKey()));

                // 2.2. Find the text pattern matches!
                final Matcher matcher = pattern.matcher(TextExtractor.toString(textStrings));
                // 2.3. Highlight the text pattern matches!
                textExtractor.filter(textStrings, new TextExtractor.IIntervalFilter() {
                    public boolean hasNext() {
                        // System.out.println(matcher.find());
                        // if(key.getMatchCriteria() == 1){
                        if (matcher.find()) {
                            return true;
                        }
                        /*
                         * } else if(key.getMatchCriteria() == 2) { if
                         * (matcher.hitEnd()) { count++; return true; } }
                         */
                        return false;

                    }

                    public Interval<Integer> next() {
                        return new Interval<Integer>(matcher.start(), matcher.end());
                    }

                    public void process(Interval<Integer> interval, ITextString match) {
                        // Defining the highlight box of the text pattern
                        // match...
                        System.out.println(match);
                    /*  List<Quad> highlightQuads = new ArrayList<Quad>();
                        {
                            Rectangle2D textBox = null;
                            for (TextChar textChar : match.getTextChars()) {
                                Rectangle2D textCharBox = textChar.getBox();
                                if (textBox == null) {
                                    textBox = (Rectangle2D) textCharBox.clone();
                                } else {
                                    if (textCharBox.getY() > textBox.getMaxY()) {
                                        highlightQuads.add(Quad.get(textBox));
                                        textBox = (Rectangle2D) textCharBox.clone();
                                    } else {
                                        textBox.add(textCharBox);
                                    }
                                }
                            }
                            textBox.setRect(textBox.getX(), textBox.getY(), textBox.getWidth(), textBox.getHeight());
                            highlightQuads.add(Quad.get(textBox));
                        }*/
                        List<Quad> highlightQuads = new ArrayList<Quad>();
                        List<TextChar> textChars = match.getTextChars();
                        Rectangle2D firstRect = textChars.get(0).getBox();
                        Rectangle2D lastRect = textChars.get(textChars.size()-1).getBox();
                        Rectangle2D rect = firstRect.createUnion(lastRect);
                        highlightQuads.add(Quad.get(rect).get(rect));
                        // subtype can be Highlight, Underline, StrikeOut, Squiggly


                        new TextMarkup(page, highlightQuads, translationKeyword, MarkupTypeEnum.Highlight);

                    }

                    public void remove() {
                        throw new UnsupportedOperationException();
                    }

                });
            }

        }

        SerializationModeEnum serializationMode = SerializationModeEnum.Standard;

            file.save(new java.io.File(outputPath), serializationMode);

            System.out.println("file created");
            long endTime = System.currentTimeMillis();

             System.out.println("seconds take for execution is:"+(endTime-startTime)/1000);

        } catch (Exception e) {
               e.printStackTrace();
        }
        finally{
            in.close();
        }


    }
}

请提供您的输入以突出显示非英语 pdf 文件的特定搜索关键字。

我正在搜索下面的中文关键字。

普双套习近平修宪普京利用双套车绕开宪法装班要走普京

enter image description here

Answer 1

您的 PDF 小丑版本

您检索到的 PDF Clown 版本 here from Tymate's maven repository on github has been pushed there April 23rd, 2015. The final (as of now) check-in to the PDF Clown subversion 另一方面，sourceforge 上的源代码存储库 TRUNK 是从 2015 年 5 月 27 日开始的。在 2015 年 4 月 23 日之后实际上有大约 30 个签入。因此，您绝对不要使用这个明显死掉的 PDF 库项目的最新版本。

使用当前的 0.2.0 快照

我用那个主干编译的0.2.0开发版测试了你的代码，结果确实不一样：

最好是高亮部分有搜索字符的宽度，并且离实际字符位置更近。但是仍然存在一个错误，因为第二和第三场比赛的亮点有点不对劲。

修复错误

剩下的问题其实和文章的语言无关。它只是在处理一种类型的 PDF 文本绘图命令时出现的错误，因此可以在包含任意语言文本的文档中观察到它。然而，由于这些命令如今很少使用，因此几乎从未观察到该错误，更不用说报告了。另一方面，您的 PDF 使用了那种文本绘图命令。

错误在 ShowText class（包 org.pdfclown.documents.contents.objects）中。在 scan 方法结束时，如果 ShowText 实例实际上是从它派生的 ShowTextToNextLine 实例，则图形状态中的文本行矩阵将像这样更新：

if(textScanner == null)
{
  state.setTm(tm);

  if(this instanceof ShowTextToNextLine)
  {state.setTlm((AffineTransform)tm.clone());}
}

这里的文本行矩阵设置为移动到下一行并绘制文本后的文本矩阵。这是错误的，它必须在移动到下一行之后在绘制文本之前立即设置为文本矩阵。

这可以解决，例如像这样：

if(textScanner == null)
{
  state.setTm(tm);

  if(this instanceof ShowTextToNextLine)
    state.getTlm().concatenate(new AffineTransform(1, 0, 0, 1, 0, -state.getLead()));
}

进行此更改后，结果如下所示：

pdf小丑-不突出显示特定的搜索关键字

pdf clown- not highlighting specific search keyword

java

pdfclown

您的 PDF 小丑版本

使用当前的 0.2.0 快照

修复错误