使用 Google Apps Script 检测 Google 文档中的 PAGE_BREAK 元素
Detecting PAGE_BREAK element in Google docs using app script
问题陈述:
我有一份 google 文档,有 N 页。要求是将每个页面转换为单独的 Google 文档。
尝试过的解决方案:
试图通过解析正文找到 PAGE_BREAK 元素,当检测到 PAGE_BREAK 时,创建一个范围,复制内容,然后创建一个新的 Google 文档。
问题:
创建了一个示例文档,其中第 1 页有一行文本,第 2 页有一行文本。解析文档时,无法检测到 PAGE_BREAK 元素。我期待当内容流向第 2 页时,应该有 PAGE_BREAK,在这种情况下应该是 PARAGRAPH 元素的子元素。
下面是我试过的 Google Apps Script 示例代码片段:
// The currently active document, fetched once when the script file is loaded.
var activeDocument = DocumentApp.getActiveDocument();
// Body of that document; read as a shared global by resetDoc, init and findBreak.
var body = activeDocument.getBody();
/**
 * Empties the shared document body.
 *
 * The disabled lines inside rebuild a two-page sample whose page break is
 * inserted from script. Per the author's note, a break added that way is the
 * only kind that findElement was able to detect.
 */
function resetDoc() {
  body.clear();
  // Sample content with a scripted page break (intentionally left disabled):
  // body.appendParagraph("Page 1");
  // body.appendPageBreak();
  // body.appendParagraph("Page 2");
}
/**
 * Entry point: builds the element tree for the whole document body and
 * writes that tree to the script log.
 */
function init() {
  Logger.log(extractTree(body));
}
/**
 * Recursively mirrors a document element and its descendants as plain nodes.
 *
 * Every node has the shape { element: <the element> }. Elements that expose
 * getNumChildren additionally receive a `children` array holding one node per
 * child, in document order. While walking, each child is probed with
 * findBreak, and the child's type is logged once its own subtree is done.
 *
 * @param {Object} element Document element to start from (e.g. the body).
 * @return {Object} Tree node describing `element`.
 */
function extractTree(element) {
  var node = { element: element };

  // Elements without getNumChildren are leaves and get no `children` key.
  if (typeof element.getNumChildren !== 'function') {
    return node;
  }

  var numChildren = element.getNumChildren();
  var children = [];
  for (var i = 0; i < numChildren; i++) {
    var child = element.getChild(i);
    // Probe the child being visited. Probing the parent here (as before) gave
    // the same answer for every index, so the logged position was meaningless.
    if (findBreak(child)) {
      Logger.log("Found page break at " + i);
    }
    var childNode = extractTree(child);
    Logger.log(child.getType());
    children.push(childNode);
  }
  node.children = children;
  return node;
}
/**
 * Reports whether an element is, or contains, a page break, and logs the
 * outcome.
 *
 * @param {Object} [element] Element to inspect. When omitted, the shared
 *     document body is searched instead, which is what this function used to
 *     do regardless of the argument.
 * @return {boolean} true if a PAGE_BREAK was found, false otherwise.
 */
function findBreak(element) {
  var searchType = DocumentApp.ElementType.PAGE_BREAK;
  // Search the element that was passed in rather than always the whole body.
  var scope = element || body;
  var found = false;

  if (typeof scope.getType === 'function' && scope.getType() === searchType) {
    // The element itself is the page break.
    found = true;
  } else if (typeof scope.findElement === 'function') {
    // Only some element kinds offer findElement; others cannot contain a break.
    found = Boolean(scope.findElement(searchType));
  }

  if (found) {
    Logger.log("Found page break");
    return true;
  }
  Logger.log("No page break");
  return false;
}
对于如何解决这个问题,有什么建议吗?
日志:
[19-04-12 15:46:32:636 IST] TEXT
[19-04-12 15:46:32:637 IST] PARAGRAPH
[19-04-12 15:46:32:638 IST] PARAGRAPH
[19-04-12 15:46:32:640 IST] PARAGRAPH
[19-04-12 15:46:32:642 IST] PARAGRAPH
[19-04-12 15:46:32:643 IST] PARAGRAPH
[19-04-12 15:46:32:645 IST] PARAGRAPH
[19-04-12 15:46:32:647 IST] PARAGRAPH
[19-04-12 15:46:32:648 IST] PARAGRAPH
[19-04-12 15:46:32:650 IST] PARAGRAPH
[19-04-12 15:46:32:651 IST] PARAGRAPH
[19-04-12 15:46:32:653 IST] PARAGRAPH
[19-04-12 15:46:32:655 IST] PARAGRAPH
[19-04-12 15:46:32:656 IST] PARAGRAPH
[19-04-12 15:46:32:658 IST] PARAGRAPH
[19-04-12 15:46:32:660 IST] PARAGRAPH
[19-04-12 15:46:32:662 IST] PARAGRAPH
[19-04-12 15:46:32:663 IST] PARAGRAPH
[19-04-12 15:46:32:665 IST] PARAGRAPH
[19-04-12 15:46:32:666 IST] PARAGRAPH
[19-04-12 15:46:32:668 IST] PARAGRAPH
[19-04-12 15:46:32:670 IST] PARAGRAPH
[19-04-12 15:46:32:671 IST] PARAGRAPH
[19-04-12 15:46:32:673 IST] PARAGRAPH
[19-04-12 15:46:32:675 IST] PARAGRAPH
[19-04-12 15:46:32:676 IST] PARAGRAPH
[19-04-12 15:46:32:678 IST] PARAGRAPH
[19-04-12 15:46:32:680 IST] PARAGRAPH
[19-04-12 15:46:32:682 IST] PARAGRAPH
[19-04-12 15:46:32:684 IST] PARAGRAPH
[19-04-12 15:46:32:685 IST] PARAGRAPH
[19-04-12 15:46:32:687 IST] PARAGRAPH
[19-04-12 15:46:32:689 IST] PARAGRAPH
[19-04-12 15:46:32:690 IST] PARAGRAPH
[19-04-12 15:46:32:692 IST] PARAGRAPH
[19-04-12 15:46:32:693 IST] PARAGRAPH
[19-04-12 15:46:32:695 IST] PARAGRAPH
[19-04-12 15:46:32:697 IST] PARAGRAPH
[19-04-12 15:46:32:699 IST] PARAGRAPH
[19-04-12 15:46:32:701 IST] PARAGRAPH
[19-04-12 15:46:32:702 IST] PARAGRAPH
[19-04-12 15:46:32:704 IST] PARAGRAPH
[19-04-12 15:46:32:705 IST] PARAGRAPH
[19-04-12 15:46:32:706 IST] PARAGRAPH
[19-04-12 15:46:32:708 IST] TEXT
[19-04-12 15:46:32:709 IST] PARAGRAPH
[19-04-12 15:46:32:710 IST] PARAGRAPH
[19-04-12 15:46:32:711 IST] PARAGRAPH
[19-04-12 15:46:32:712 IST] {children=[{children=[{element=Text}], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[{element=Text}], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}], element=DocumentBodySection}
[19-04-12 15:46:32:706 IST] PARAGRAPH 应该是 PAGE_BREAK,但它作为 PARAGRAPH 出现。
示例 google 文档:
https://docs.google.com/document/d/1bs_Jcfs-n1VEx65Ew5buBpsf_JCHgX0A7NHYIY8mAqw/edit?usp=sharing
参考链接:
1. Google Apps Script 文档
https://developers.google.com/apps-script/reference/document/page-break
首先,我不确定是否正确理解了任务,因为一般情况下有N个可视页面并不意味着有N-1个显式分页符。我建议您只使用明确的分页符,因为您已经尝试过找到它们。
在这种情况下,复制文档片段最有用的单位(对象)是Paragraph。以下函数获取所有文档段落并检查每个段落是否包含 PAGE_BREAK 元素。如果找到 PAGE_BREAK,则意味着一页的结尾和另一页的开始。当然这时候我们应该新建一个目标文档才能继续复制。
/**
 * Splits the active document into separate documents at explicit page breaks.
 *
 * Paragraphs are copied in order into a document named 'PageBreak.<n>', with
 * n starting at 0. A paragraph containing a PAGE_BREAK element is the last one
 * copied into the current target; the paragraph after it starts the next one.
 * Only PAGE_BREAK elements are looked for, so page boundaries that come from
 * text simply flowing onto the next page do not cause a split.
 */
function copyPartsByPageBreaks() {
  var pageBreakType = DocumentApp.ElementType.PAGE_BREAK;
  var pars = DocumentApp.getActiveDocument().getBody().getParagraphs();
  var pageIndex = 0;
  var targetBody = null;

  for (var i = 0; i < pars.length; i++) {
    var p = pars[i];
    // Create the target lazily: creating it right after a break would leave an
    // empty trailing document when the source ends with a page break.
    if (targetBody === null) {
      targetBody = DocumentApp.create('PageBreak.' + pageIndex).getBody();
    }
    targetBody.appendParagraph(p.copy());
    if (p.findElement(pageBreakType) != null) {
      // This page is complete; the next paragraph goes into a new document.
      pageIndex++;
      targetBody = null;
    }
  }
}
问题陈述:
我有一份 google 文档,有 N 页。要求是将每个页面转换为单独的 Google 文档。
尝试过的解决方案:
试图通过解析正文找到 PAGE_BREAK 元素,当检测到 PAGE_BREAK 时,创建一个范围,复制内容,然后创建一个新的 Google 文档。
问题:
创建了一个示例文档,其中第 1 页有一行文本,第 2 页有一行文本。解析文档时,无法检测到 PAGE_BREAK 元素。我期待当内容流向第 2 页时,应该有 PAGE_BREAK,在这种情况下应该是 PARAGRAPH 元素的子元素。 下面是我试过的示例 Google 应用程序脚本代码片段:
// The currently active document, fetched once when the script file is loaded.
var activeDocument = DocumentApp.getActiveDocument();
// Body of that document; read as a shared global by resetDoc, init and findBreak.
var body = activeDocument.getBody();
/**
 * Empties the shared document body.
 *
 * The disabled lines inside rebuild a two-page sample whose page break is
 * inserted from script. Per the author's note, a break added that way is the
 * only kind that findElement was able to detect.
 */
function resetDoc() {
  body.clear();
  // Sample content with a scripted page break (intentionally left disabled):
  // body.appendParagraph("Page 1");
  // body.appendPageBreak();
  // body.appendParagraph("Page 2");
}
/**
 * Entry point: builds the element tree for the whole document body and
 * writes that tree to the script log.
 */
function init() {
  Logger.log(extractTree(body));
}
/**
 * Recursively mirrors a document element and its descendants as plain nodes.
 *
 * Every node has the shape { element: <the element> }. Elements that expose
 * getNumChildren additionally receive a `children` array holding one node per
 * child, in document order. While walking, each child is probed with
 * findBreak, and the child's type is logged once its own subtree is done.
 *
 * @param {Object} element Document element to start from (e.g. the body).
 * @return {Object} Tree node describing `element`.
 */
function extractTree(element) {
  var node = { element: element };

  // Elements without getNumChildren are leaves and get no `children` key.
  if (typeof element.getNumChildren !== 'function') {
    return node;
  }

  var numChildren = element.getNumChildren();
  var children = [];
  for (var i = 0; i < numChildren; i++) {
    var child = element.getChild(i);
    // Probe the child being visited. Probing the parent here (as before) gave
    // the same answer for every index, so the logged position was meaningless.
    if (findBreak(child)) {
      Logger.log("Found page break at " + i);
    }
    var childNode = extractTree(child);
    Logger.log(child.getType());
    children.push(childNode);
  }
  node.children = children;
  return node;
}
/**
 * Reports whether an element is, or contains, a page break, and logs the
 * outcome.
 *
 * @param {Object} [element] Element to inspect. When omitted, the shared
 *     document body is searched instead, which is what this function used to
 *     do regardless of the argument.
 * @return {boolean} true if a PAGE_BREAK was found, false otherwise.
 */
function findBreak(element) {
  var searchType = DocumentApp.ElementType.PAGE_BREAK;
  // Search the element that was passed in rather than always the whole body.
  var scope = element || body;
  var found = false;

  if (typeof scope.getType === 'function' && scope.getType() === searchType) {
    // The element itself is the page break.
    found = true;
  } else if (typeof scope.findElement === 'function') {
    // Only some element kinds offer findElement; others cannot contain a break.
    found = Boolean(scope.findElement(searchType));
  }

  if (found) {
    Logger.log("Found page break");
    return true;
  }
  Logger.log("No page break");
  return false;
}
对于如何解决这个问题,有什么建议吗?
日志:
[19-04-12 15:46:32:636 IST] TEXT
[19-04-12 15:46:32:637 IST] PARAGRAPH
[19-04-12 15:46:32:638 IST] PARAGRAPH
[19-04-12 15:46:32:640 IST] PARAGRAPH
[19-04-12 15:46:32:642 IST] PARAGRAPH
[19-04-12 15:46:32:643 IST] PARAGRAPH
[19-04-12 15:46:32:645 IST] PARAGRAPH
[19-04-12 15:46:32:647 IST] PARAGRAPH
[19-04-12 15:46:32:648 IST] PARAGRAPH
[19-04-12 15:46:32:650 IST] PARAGRAPH
[19-04-12 15:46:32:651 IST] PARAGRAPH
[19-04-12 15:46:32:653 IST] PARAGRAPH
[19-04-12 15:46:32:655 IST] PARAGRAPH
[19-04-12 15:46:32:656 IST] PARAGRAPH
[19-04-12 15:46:32:658 IST] PARAGRAPH
[19-04-12 15:46:32:660 IST] PARAGRAPH
[19-04-12 15:46:32:662 IST] PARAGRAPH
[19-04-12 15:46:32:663 IST] PARAGRAPH
[19-04-12 15:46:32:665 IST] PARAGRAPH
[19-04-12 15:46:32:666 IST] PARAGRAPH
[19-04-12 15:46:32:668 IST] PARAGRAPH
[19-04-12 15:46:32:670 IST] PARAGRAPH
[19-04-12 15:46:32:671 IST] PARAGRAPH
[19-04-12 15:46:32:673 IST] PARAGRAPH
[19-04-12 15:46:32:675 IST] PARAGRAPH
[19-04-12 15:46:32:676 IST] PARAGRAPH
[19-04-12 15:46:32:678 IST] PARAGRAPH
[19-04-12 15:46:32:680 IST] PARAGRAPH
[19-04-12 15:46:32:682 IST] PARAGRAPH
[19-04-12 15:46:32:684 IST] PARAGRAPH
[19-04-12 15:46:32:685 IST] PARAGRAPH
[19-04-12 15:46:32:687 IST] PARAGRAPH
[19-04-12 15:46:32:689 IST] PARAGRAPH
[19-04-12 15:46:32:690 IST] PARAGRAPH
[19-04-12 15:46:32:692 IST] PARAGRAPH
[19-04-12 15:46:32:693 IST] PARAGRAPH
[19-04-12 15:46:32:695 IST] PARAGRAPH
[19-04-12 15:46:32:697 IST] PARAGRAPH
[19-04-12 15:46:32:699 IST] PARAGRAPH
[19-04-12 15:46:32:701 IST] PARAGRAPH
[19-04-12 15:46:32:702 IST] PARAGRAPH
[19-04-12 15:46:32:704 IST] PARAGRAPH
[19-04-12 15:46:32:705 IST] PARAGRAPH
[19-04-12 15:46:32:706 IST] PARAGRAPH
[19-04-12 15:46:32:708 IST] TEXT
[19-04-12 15:46:32:709 IST] PARAGRAPH
[19-04-12 15:46:32:710 IST] PARAGRAPH
[19-04-12 15:46:32:711 IST] PARAGRAPH
[19-04-12 15:46:32:712 IST] {children=[{children=[{element=Text}], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}, {children=[{element=Text}], element=Paragraph}, {children=[], element=Paragraph}, {children=[], element=Paragraph}], element=DocumentBodySection}
[19-04-12 15:46:32:706 IST] PARAGRAPH 应该是 PAGE_BREAK,但它作为 PARAGRAPH 出现。
示例 google 文档:
https://docs.google.com/document/d/1bs_Jcfs-n1VEx65Ew5buBpsf_JCHgX0A7NHYIY8mAqw/edit?usp=sharing
参考链接:
1. Google Apps Script 文档
https://developers.google.com/apps-script/reference/document/page-break
首先,我不确定是否正确理解了任务,因为一般情况下有N个可视页面并不意味着有N-1个显式分页符。我建议您只使用明确的分页符,因为您已经尝试过找到它们。
在这种情况下,复制文档片段最有用的单位(对象)是Paragraph。以下函数获取所有文档段落并检查每个段落是否包含 PAGE_BREAK 元素。如果找到 PAGE_BREAK,则意味着一页的结尾和另一页的开始。当然这时候我们应该新建一个目标文档才能继续复制。
/**
 * Splits the active document into separate documents at explicit page breaks.
 *
 * Paragraphs are copied in order into a document named 'PageBreak.<n>', with
 * n starting at 0. A paragraph containing a PAGE_BREAK element is the last one
 * copied into the current target; the paragraph after it starts the next one.
 * Only PAGE_BREAK elements are looked for, so page boundaries that come from
 * text simply flowing onto the next page do not cause a split.
 */
function copyPartsByPageBreaks() {
  var pageBreakType = DocumentApp.ElementType.PAGE_BREAK;
  var pars = DocumentApp.getActiveDocument().getBody().getParagraphs();
  var pageIndex = 0;
  var targetBody = null;

  for (var i = 0; i < pars.length; i++) {
    var p = pars[i];
    // Create the target lazily: creating it right after a break would leave an
    // empty trailing document when the source ends with a page break.
    if (targetBody === null) {
      targetBody = DocumentApp.create('PageBreak.' + pageIndex).getBody();
    }
    targetBody.appendParagraph(p.copy());
    if (p.findElement(pageBreakType) != null) {
      // This page is complete; the next paragraph goes into a new document.
      pageIndex++;
      targetBody = null;
    }
  }
}