Apache POI 找不到突出显示的文本

Apache POI doesn't find highlighted text

我有一个以 doc 格式保存的文件,我需要提取突出显示的文本。 我有如下代码:

// Fragment from the question: `fis` is an input stream opened outside this snippet.
HWPFDocument document = new HWPFDocument(fis);
        // The range returned by getRange() is used below to reach individual character runs.
        Range r = document.getRange();
        // Only indices 0..4 are inspected; assumes the range has at least five
        // character runs — TODO confirm against the actual document.
        for (int i=0;i<5;i++) {
            CharacterRun t = r.getCharacterRun(i);
            // Highlight state and highlight color as reported by the run itself.
            System.out.println(t.isHighlighted());
            System.out.println(t.getHighlightedColor());
            // NOTE(review): SPRM_HIGHLIGHT is read through an instance expression; it is
            // presumably a static opcode constant of CharacterRun, in which case this prints
            // the same value for every run and says nothing about this run — verify.
            System.out.println(r.getCharacterRun(i).SPRM_HIGHLIGHT);
            // Prints the run's toString() representation.
            System.out.println(r.getCharacterRun(i));
        }

上面的方法都没有显示文本被突出显示,但我打开文件时文本确实是突出显示的。可能是什么原因,以及如何判断文本是否被突出显示?

在 Word 中可以用两种不同的方法突出显示文本。第一种是对文本运行(text run)应用突出显示(highlighting);第二种是对单词或段落应用底纹(shading)。

对于第一种方式,在使用 *.doc(Word 二进制文件格式)时,Apache POI 在 CharacterRun 中提供了相应的方法。对于第二种方式,Apache POI 提供了 Paragraph.getShading 方法,但只有当底纹应用于整个段落时才会设置该值。如果底纹仅应用于单个文本运行,Apache POI 不提供任何现成的方法,因此需要使用底层的 SprmOperation。

Microsoft 文档的 2.6.1 Character Properties 一节将 sprmCShd80 (0x4866) 描述为 "A Shd80 structure that specifies the background shading for the text."(指定文本背景底纹的 Shd80 结构)。所以我们需要在字符运行的 SPRM 中查找它。

示例:

import java.io.FileInputStream;

import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.*;

import org.apache.poi.hwpf.sprm.*;

import java.lang.reflect.Field;
import java.lang.reflect.Method;

/**
 * Prints, for every paragraph and character run of a Word binary (*.doc) file, the
 * paragraph shading, the run highlight color and the run-level Shd80 shading SPRM.
 *
 * <p>Run-level shading is read reflectively from the private {@code _chpx} field of
 * {@link CharacterRun} and the private {@code _value} field of {@link SprmOperation}.
 */
public class HWPFInspectBgColor {

  /**
   * Opcode of sprmCShd80: "A Shd80 structure that specifies the background shading for
   * the text", see https://msdn.microsoft.com/en-us/library/dd947480(v=office.12).aspx
   */
  private static final short SPRM_C_SHD80 = (short) 0x4866;

  /**
   * Reads the private {@code _chpx} field of the given run via reflection.
   *
   * @param run the character run to inspect
   * @return the value of the run's {@code _chpx} field, cast to {@link SprmBuffer}
   * @throws Exception if the field cannot be found or accessed
   */
  private static SprmBuffer sprmBufferOf(CharacterRun run) throws Exception {
    Field chpxField = CharacterRun.class.getDeclaredField("_chpx");
    chpxField.setAccessible(true);
    return (SprmBuffer) chpxField.get(run);
  }

  /**
   * Debugging helper: prints every SPRM operation stored in the run's SPRM buffer.
   *
   * @param run the character run to dump
   * @throws Exception if the reflective access to the run's buffer fails
   */
  private static void showCharacterRunInternals(CharacterRun run) throws Exception {
    for (SprmIterator it = sprmBufferOf(run).iterator(); it.hasNext(); ) {
      System.out.println(it.next());
    }
  }

  /**
   * Searches the run's SPRM buffer for a sprmCShd80 (0x4866) operation.
   *
   * @param run the character run to inspect
   * @return the last sprmCShd80 operation found in the buffer, or {@code null} if the
   *     buffer contains none
   * @throws Exception if the reflective access to the private fields fails
   */
  static SprmOperation getCharacterRunShading(CharacterRun run) throws Exception {
    Field valueField = SprmOperation.class.getDeclaredField("_value");
    valueField.setAccessible(true);

    SprmOperation shd80Operation = null;
    for (SprmIterator it = sprmBufferOf(run).iterator(); it.hasNext(); ) {
      SprmOperation operation = it.next();
      // No early exit on purpose: when the opcode occurs more than once, the last one is returned.
      if (valueField.getShort(operation) == SPRM_C_SHD80) {
        shd80Operation = operation;
      }
    }
    return shd80Operation;
  }

  /**
   * Dumps shading and highlighting information of a *.doc file to standard output.
   *
   * @param args optional; {@code args[0]} is the path of the file to inspect, defaulting
   *     to {@code sample.doc} when no argument is given
   * @throws Exception if the file cannot be read or the reflective access fails
   */
  public static void main(String[] args) throws Exception {
    String path = args.length > 0 ? args[0] : "sample.doc";

    // try-with-resources so the file handle is released even if parsing or inspection throws.
    try (FileInputStream in = new FileInputStream(path)) {
      HWPFDocument document = new HWPFDocument(in);
      Range range = document.getRange();

      for (int p = 0; p < range.numParagraphs(); p++) {
        Paragraph paragraph = range.getParagraph(p);
        System.out.println(paragraph);
        if (!paragraph.getShading().isEmpty()) {
          System.out.println("Paragraph's shading: " + paragraph.getShading());
        }

        for (int r = 0; r < paragraph.numCharacterRuns(); r++) {
          CharacterRun run = paragraph.getCharacterRun(r);
          System.out.println(run);
          if (run.isHighlighted()) {
            System.out.println("Run's highlighted color: " + run.getHighlightedColor());
          }
          // Look the SPRM up once per run; each lookup is a reflective scan of the buffer.
          SprmOperation shading = getCharacterRunShading(run);
          if (shading != null) {
            System.out.println("Run's Shd80 structure: " + shading);
          }
        }
      }
    }
  }
}