Apache POI - 检索 .doc 文件中关键字之间的文本内容并有条件地呈现它
Apache POI - Retrieve text content between keywords in .doc file and conditionally render it
我想在 .doc 文件中找到两个关键字之间的文本内容,并有条件地呈现该文本内容或隐藏它。例如:
Lorem Ipsum is simply dummy text ${if condition}
of the printing and
typesetting industry. Lorem Ipsum has been the industry's standard
dummy text ever since the 1500s ${endif}
当我使用 Apache POI 解析文档时,我希望能够以某种方式找出文档中 ${if condition} 与 ${endif} 这两个标记之间的全部内容,
并在我要生成的下一个文档中有条件地渲染它或不渲染它。
所以经过我解析后的上述文字应该有以下两种不同的形式:
1)如果满足条件
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s
或
2) 如果不满足条件
Lorem Ipsum is simply dummy text
我尝试通过使用 XWPFParagraph
对象然后使用 XWPFRun
来做到这一点,但这种方法并不可靠,因为 run(文本片段)可能在不可预测的情况下从单词中间被随机拆分。
您能否提出任何可靠的方法来实现我的用例?提前致谢。
以此为例(代码已测试):
class ParagraphModifier {
private final Pattern pIf = Pattern.compile("\$\{if\s+(\w+)\}");
private final Pattern pEIf = Pattern.compile("\$\{endif\}");
private final Function<String, Boolean> processor;
public ParagraphModifier(Function<String, Boolean> processor) {
this.processor = processor;
}
// Process
static class Pair<K, V> {
public K key;
public V value;
public Pair(K key, V value) {
this.key = key;
this.value = value;
}
}
//
public static void cloneRun(XWPFRun clone, XWPFRun source) {
CTRPr rPr = clone.getCTR().isSetRPr() ? clone.getCTR().getRPr() : clone.getCTR().addNewRPr();
rPr.set(source.getCTR().getRPr());
clone.setText(source.getText(0));
}
// Split runs in paragraph at a specific text offset and returns the run index
int splitAtTextPosition(XWPFParagraph paragraph, int position) {
List<XWPFRun> runs = paragraph.getRuns();
int offset = 0;
for (int i = 0; i < runs.size(); i++) {
XWPFRun run = runs.get(i);
String text = run.getText(0);
int length = text.length();
if (position >= (offset + length)) {
offset += length;
continue;
}
// Do split
XWPFRun run2 = paragraph.insertNewRun(i + 1);
cloneRun(run2, run);
run.setText(text.substring(0, position - offset), 0);
run2.setText(text.substring(position - offset), 0);
return i + 1;
}
return -1;
}
String getParagraphText(XWPFParagraph paragraph) {
StringBuilder sb = new StringBuilder("");
for (XWPFRun run : paragraph.getRuns()) sb.append(run.getText(0));
return sb.toString();
}
void removeRunsRange(XWPFParagraph paragraph, int from, int to) {
int runs = paragraph.getRuns().size();
to = Math.min(to, runs);
for (int i = (to - 1); i >= from; i--) {
paragraph.removeRun(i);
}
}
Pair<Integer, String> extractToken(Pattern pattern, XWPFParagraph paragraph) {
String text = getParagraphText(paragraph);
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
int rStart = splitAtTextPosition(paragraph, matcher.start());
int rEnd = splitAtTextPosition(paragraph, matcher.end());
removeRunsRange(paragraph, rStart, rEnd);
return new Pair<>(rStart, matcher.group());
} else {
return new Pair<>(-1, "");
}
}
void applyParagraph(XWPFParagraph paragraph) {
int lastIf = -1;
while (true) {
var tIf = extractToken(pIf, paragraph);
if (tIf.key == -1) {
break;
}
if (tIf.key < lastIf) {
throw new IllegalStateException("If conditions can not be nested");
}
var tEIf = extractToken(pEIf, paragraph);
if (tEIf.key == -1) {
throw new IllegalStateException("If condition missing endif");
}
var m = pIf.matcher(tIf.value);
var keep = m.find() && processor.apply(m.group(1));
if (!keep) {
removeRunsRange(paragraph, tIf.key, tEIf.key);
}
lastIf = tEIf.key;
}
}
void apply(Iterable<XWPFParagraph> paragraphs) {
for (XWPFParagraph p : paragraphs) {
applyParagraph(p);
}
}
}
用法:
/** Example driver: loads a template from the classpath, resolves its if-blocks and saves the result. */
class Main {
    /**
     * Opens a .docx document stored as a classpath resource.
     *
     * @param name resource name, resolved through this class's class loader
     * @throws IOException if the resource does not exist or cannot be read
     * @throws InvalidFormatException if the resource is not a valid OOXML package
     */
    private static XWPFDocument loadDoc(String name) throws IOException, InvalidFormatException {
        // Read through a stream (closed on exit) rather than URL.getPath(), which is
        // URL-encoded and unusable for resources packaged inside a jar.
        try (var in = Main.class.getClassLoader().getResourceAsStream(name)) {
            if (in == null) {
                throw new IOException("Classpath resource not found: " + name);
            }
            return new XWPFDocument(OPCPackage.open(in));
        }
    }

    /** Writes the document to the given file path, closing the output stream afterwards. */
    private static void saveDoc(String path, XWPFDocument doc) throws IOException {
        try (var fos = new FileOutputStream(path)) {
            doc.write(fos);
        }
    }

    public static void main(String[] args) throws Exception {
        // The document holds the underlying package open, so release it when done.
        try (var xdoc = loadDoc("test.docx")) {
            // A block is kept only when its condition name is "true" (case-insensitive).
            var pm = new ParagraphModifier(str -> "true".equalsIgnoreCase(str));
            pm.apply(xdoc.getParagraphs());
            saveDoc("test.out.docx", xdoc);
        }
    }
}
此解决方案不支持跨越多个段落的 ${if } 块,不支持嵌套的 if,也不支持表格(Table)结构。扩展该方案以支持这些情况应该很简单。
我想在 .doc 文件中找到两个关键字之间的文本内容,并有条件地呈现该文本内容或隐藏它。例如:
Lorem Ipsum is simply dummy text
${if condition}
of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s${endif}
当我使用 Apache POI 解析文档时,我希望能够以某种方式找出文档中 ${if condition} 与 ${endif} 这两个标记之间的全部内容,
并在我要生成的下一个文档中有条件地渲染它或不渲染它。
所以经过我解析后的上述文字应该有以下两种不同的形式:
1)如果满足条件
Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s
或
2) 如果不满足条件
Lorem Ipsum is simply dummy text
我尝试通过使用 XWPFParagraph
对象然后使用 XWPFRun
来做到这一点,但这种方法并不可靠,因为 run(文本片段)可能在不可预测的情况下从单词中间被随机拆分。
您能否提出任何可靠的方法来实现我的用例?提前致谢。
以此为例(代码已测试):
class ParagraphModifier {
private final Pattern pIf = Pattern.compile("\$\{if\s+(\w+)\}");
private final Pattern pEIf = Pattern.compile("\$\{endif\}");
private final Function<String, Boolean> processor;
public ParagraphModifier(Function<String, Boolean> processor) {
this.processor = processor;
}
// Process
static class Pair<K, V> {
public K key;
public V value;
public Pair(K key, V value) {
this.key = key;
this.value = value;
}
}
//
public static void cloneRun(XWPFRun clone, XWPFRun source) {
CTRPr rPr = clone.getCTR().isSetRPr() ? clone.getCTR().getRPr() : clone.getCTR().addNewRPr();
rPr.set(source.getCTR().getRPr());
clone.setText(source.getText(0));
}
// Split runs in paragraph at a specific text offset and returns the run index
int splitAtTextPosition(XWPFParagraph paragraph, int position) {
List<XWPFRun> runs = paragraph.getRuns();
int offset = 0;
for (int i = 0; i < runs.size(); i++) {
XWPFRun run = runs.get(i);
String text = run.getText(0);
int length = text.length();
if (position >= (offset + length)) {
offset += length;
continue;
}
// Do split
XWPFRun run2 = paragraph.insertNewRun(i + 1);
cloneRun(run2, run);
run.setText(text.substring(0, position - offset), 0);
run2.setText(text.substring(position - offset), 0);
return i + 1;
}
return -1;
}
String getParagraphText(XWPFParagraph paragraph) {
StringBuilder sb = new StringBuilder("");
for (XWPFRun run : paragraph.getRuns()) sb.append(run.getText(0));
return sb.toString();
}
void removeRunsRange(XWPFParagraph paragraph, int from, int to) {
int runs = paragraph.getRuns().size();
to = Math.min(to, runs);
for (int i = (to - 1); i >= from; i--) {
paragraph.removeRun(i);
}
}
Pair<Integer, String> extractToken(Pattern pattern, XWPFParagraph paragraph) {
String text = getParagraphText(paragraph);
Matcher matcher = pattern.matcher(text);
if (matcher.find()) {
int rStart = splitAtTextPosition(paragraph, matcher.start());
int rEnd = splitAtTextPosition(paragraph, matcher.end());
removeRunsRange(paragraph, rStart, rEnd);
return new Pair<>(rStart, matcher.group());
} else {
return new Pair<>(-1, "");
}
}
void applyParagraph(XWPFParagraph paragraph) {
int lastIf = -1;
while (true) {
var tIf = extractToken(pIf, paragraph);
if (tIf.key == -1) {
break;
}
if (tIf.key < lastIf) {
throw new IllegalStateException("If conditions can not be nested");
}
var tEIf = extractToken(pEIf, paragraph);
if (tEIf.key == -1) {
throw new IllegalStateException("If condition missing endif");
}
var m = pIf.matcher(tIf.value);
var keep = m.find() && processor.apply(m.group(1));
if (!keep) {
removeRunsRange(paragraph, tIf.key, tEIf.key);
}
lastIf = tEIf.key;
}
}
void apply(Iterable<XWPFParagraph> paragraphs) {
for (XWPFParagraph p : paragraphs) {
applyParagraph(p);
}
}
}
用法:
/** Example driver: loads a template from the classpath, resolves its if-blocks and saves the result. */
class Main {
    /**
     * Opens a .docx document stored as a classpath resource.
     *
     * @param name resource name, resolved through this class's class loader
     * @throws IOException if the resource does not exist or cannot be read
     * @throws InvalidFormatException if the resource is not a valid OOXML package
     */
    private static XWPFDocument loadDoc(String name) throws IOException, InvalidFormatException {
        // Read through a stream (closed on exit) rather than URL.getPath(), which is
        // URL-encoded and unusable for resources packaged inside a jar.
        try (var in = Main.class.getClassLoader().getResourceAsStream(name)) {
            if (in == null) {
                throw new IOException("Classpath resource not found: " + name);
            }
            return new XWPFDocument(OPCPackage.open(in));
        }
    }

    /** Writes the document to the given file path, closing the output stream afterwards. */
    private static void saveDoc(String path, XWPFDocument doc) throws IOException {
        try (var fos = new FileOutputStream(path)) {
            doc.write(fos);
        }
    }

    public static void main(String[] args) throws Exception {
        // The document holds the underlying package open, so release it when done.
        try (var xdoc = loadDoc("test.docx")) {
            // A block is kept only when its condition name is "true" (case-insensitive).
            var pm = new ParagraphModifier(str -> "true".equalsIgnoreCase(str));
            pm.apply(xdoc.getParagraphs());
            saveDoc("test.out.docx", xdoc);
        }
    }
}
此解决方案不支持跨越多个段落的 ${if } 块,不支持嵌套的 if,也不支持表格(Table)结构。扩展该方案以支持这些情况应该很简单。