使用 Google Apps 脚本从 Google 幻灯片演示文稿中提取所有 URL 链接
Extract all URL links out of Google Slides presentation with Google Apps Script
我正在尝试创建一个函数:传入 Google 幻灯片演示文稿 ID 后,它会解析该演示文稿,并将找到的所有 URL 链接写入 Google Sheet。我已经基于 @Yuval 的这个回答(this answer)构建了以下函数,它对 Google Docs 文档输入执行相同的操作:
/**
 * Collects every hyperlink found in a Google Docs document and appends one
 * row per link (document name, URL) to the 'Extracted Links' sheet of the
 * active spreadsheet.
 *
 * @param {string} docId ID of the document, passed to DocumentApp.openById().
 * @param {boolean} mergeAdjacent When truthy, consecutive styling segments of
 *     a text element that carry the same URL are reported as a single link
 *     (the earlier record's end offset is extended instead of adding a new
 *     record / row).
 * @return {Object[]} One record per link, each with the keys `section`,
 *     `isFirstPageSection`, `paragraph`, `textEl`, `startOffset`,
 *     `endOffsetInclusive` and `url`. `endOffsetInclusive` is null when the
 *     link is in the last styling segment of its text element.
 * @throws {Error} If a section handed over by iterateSections() has no
 *     getParagraphs() method.
 */
function getAllLinks(docId, mergeAdjacent) {
  const links = [];
  const doc = DocumentApp.openById(docId);
  const parentDocName = doc.getName();
  const sh = SpreadsheetApp.getActive().getSheetByName('Extracted Links');

  iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
    if (!("getParagraphs" in section)) {
      // as we're using some undocumented API, adding this to avoid cryptic
      // messages upon possible API changes.
      throw new Error("An API change has caused this script to stop " +
          "working.\n" +
          "Section #" + sectionIndex + " of type " +
          section.getType() + " has no .getParagraphs() method. " +
          "Stopping script.");
    }

    section.getParagraphs().forEach(function(par) {
      // skip empty paragraphs
      if (par.getNumChildren() == 0) {
        return;
      }

      // go over all text elements in paragraph / list-item
      for (let el = par.getChild(0); el != null; el = el.getNextSibling()) {
        if (el.getType() != DocumentApp.ElementType.TEXT) {
          continue;
        }

        // go over all styling segments in text element
        const attributeIndices = el.getTextAttributeIndices();
        // Most recent link record of this text element; used to detect a
        // link that continues across several styling segments.
        let lastLink = null;

        attributeIndices.forEach(function(startOffset, i) {
          const url = el.getLinkUrl(startOffset);
          if (url == null) {
            return;
          }

          // we hit a link; it ends right before the next styling segment,
          // or has no known end (null) if this is the last segment
          const endOffsetInclusive = (i + 1 < attributeIndices.length ?
              attributeIndices[i + 1] - 1 : null);

          // check if this and the last found link are continuous
          if (mergeAdjacent && lastLink != null && lastLink.url == url &&
              lastLink.endOffsetInclusive == startOffset - 1) {
            // this and the previous style segment are continuous
            lastLink.endOffsetInclusive = endOffsetInclusive;
            return;
          }

          lastLink = {
            "section": section,
            "isFirstPageSection": isFirstPageSection,
            "paragraph": par,
            "textEl": el,
            "startOffset": startOffset,
            "endOffsetInclusive": endOffsetInclusive,
            "url": url
          };

          // One spreadsheet call per link instead of getLastRow() plus two
          // separate setValue() calls.
          sh.appendRow([parentDocName, url]);
          Logger.log(parentDocName);
          Logger.log(url);
          links.push(lastLink);
        });
      }
    });
  });

  return links;
}
/**
 * Calls the given function for each section of the document (body, header,
 * etc.). Sections are children of the DocumentElement object.
 *
 * @param {Document} doc The Document object (such as the one obtained via
 *     a call to DocumentApp.getActiveDocument()) with the sections to iterate
 *     over.
 * @param {Function} func A callback function which will be called, for each
 *     section, with the following arguments (in order):
 *       - {ContainerElement} section - the section element
 *       - {Number} sectionIndex - the child index of the section, such that
 *         doc.getBody().getParent().getChild(sectionIndex) == section.
 *       - {Boolean} isFirstPageSection - whether the section is a first-page
 *         header/footer section, i.e. a header/footer-typed section that is
 *         not the one returned by doc.getHeader() / doc.getFooter().
 */
function iterateSections(doc, func) {
  // get the DocumentElement interface to iterate over all sections
  // this bit is undocumented API
  const docEl = doc.getBody().getParent();

  // Look up the regular header/footer once each; -1 means "not present" and
  // can never equal a child index.
  const regularHeader = doc.getHeader();
  const regularFooter = doc.getFooter();
  const regularHeaderSectionIndex =
      (regularHeader == null ? -1 : docEl.getChildIndex(regularHeader));
  const regularFooterSectionIndex =
      (regularFooter == null ? -1 : docEl.getChildIndex(regularFooter));

  for (let i = 0; i < docEl.getNumChildren(); ++i) {
    const section = docEl.getChild(i);
    const sectionType = section.getType();
    const isHeaderOrFooter = (
        sectionType == DocumentApp.ElementType.HEADER_SECTION ||
        sectionType == DocumentApp.ElementType.FOOTER_SECTION);
    const isFirstPageSection = (
        i != regularHeaderSectionIndex &&
        i != regularFooterSectionIndex &&
        isHeaderOrFooter);
    func(section, i, isFirstPageSection);
  }
}
当我尝试以 Google 幻灯片演示文稿作为输入来实现同样的功能时,我卡在了如何解析演示文稿并提取所有文本片段这一步(以便检查它们是否带有链接)。似乎我需要先使用 getSlides(),再使用 getPageElements() 并遍历其结果,但我不清楚如何获取幻灯片上的实际文本。任何关于如何遍历幻灯片上的实际文本(以及如果文本带有链接,如何从中提取链接 URL)的提示都将不胜感激。谢谢!
如果您只是想从幻灯片中获取链接,请参阅下面的代码:
代码:
/**
 * Logs the URL of every hyperlink found in the text of the shapes on each
 * slide of the active presentation, and returns the URLs that were found.
 *
 * Only shapes (slide.getShapes()) are inspected, and only text ranges that
 * carry a link (TextRange.getLinks()); plain-text URLs that are not
 * hyperlinked are not reported.
 *
 * @return {string[]} The link URLs in slide / shape / link order. Every link
 *     is logged as before, but links whose getUrl() is null are left out of
 *     the returned array.
 */
function getLinksFromSlides() {
  const urls = [];
  const slides = SlidesApp.getActivePresentation().getSlides();

  // traverse each slide
  for (const slide of slides) {
    // traverse each shape
    for (const shape of slide.getShapes()) {
      // each entry is a text range that carries one link
      for (const linkRange of shape.getText().getLinks()) {
        const url = linkRange.getTextStyle().getLink().getUrl();
        // print all links found
        Logger.log(url);
        if (url != null) {
          urls.push(url);
        }
      }
    }
  }

  return urls;
}
样本:
输出:
注:
- 这只会提取超链接。它不会提取示例数据中显示的任何未超链接的 links/url。 (例如
https://www.facebook.com
)
- 如果您想要非超链接 url,那么您可能需要尝试正则表达式。
我正在尝试创建一个函数,当传递 Google 幻灯片演示文稿 ID 时,它可以解析演示文稿并将它找到的所有 URL link 写入 Google Sheet。我构建了以下函数,对基于来自@Yuval
的 this answer 的 Google Docs 文档输入执行相同的操作function getAllLinks(docId, mergeAdjacent) {
// Purpose: open the Google Docs document with ID `docId`, find every linked
// text segment in each of its sections, write one (document name, URL) row
// per link to the 'Extracted Links' sheet of the active spreadsheet, and
// return the collected link records.
// `mergeAdjacent`: when truthy, back-to-back styling segments that carry the
// same URL extend the previous record instead of producing a new one.
var links = [];
// Alternative kept from the original author: use the active document instead
// of opening one by ID.
//var doc = DocumentApp.getActiveDocument();
var doc = DocumentApp.openById(docId);
var parentDocName = doc.getName();
var ss=SpreadsheetApp.getActive();
// NOTE(review): there is no check that this sheet exists — confirm that
// 'Extracted Links' is always present in the active spreadsheet.
var sh=ss.getSheetByName('Extracted Links');
// The callback below runs once per section supplied by iterateSections().
iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
if (!("getParagraphs" in section)) {
// as we're using some undocumented API, adding this to avoid cryptic
// messages upon possible API changes.
throw new Error("An API change has caused this script to stop " +
"working.\n" +
"Section #" + sectionIndex + " of type " +
section.getType() + " has no .getParagraphs() method. " +
"Stopping script.");
}
section.getParagraphs().forEach(function(par) {
// skip empty paragraphs
if (par.getNumChildren() == 0) {
return;
}
// go over all text elements in paragraph / list-item
for (var el=par.getChild(0); el!=null; el=el.getNextSibling()) {
// non-text children are ignored
if (el.getType() != DocumentApp.ElementType.TEXT) {
continue;
}
// go over all styling segments in text element
var attributeIndices = el.getTextAttributeIndices();
// most recent link record of this text element; reset for every element
var lastLink = null;
attributeIndices.forEach(function(startOffset, i, attributeIndices) {
var url = el.getLinkUrl(startOffset);
if (url != null) {
// we hit a link
// the segment ends right before the next styling segment starts; null
// when this is the last segment of the text element
var endOffsetInclusive = (i+1 < attributeIndices.length?
attributeIndices[i+1]-1 : null);
// check if this and the last found link are continuous
if (mergeAdjacent && lastLink != null && lastLink.url == url &&
lastLink.endOffsetInclusive == startOffset - 1) {
// this and the previous style segment are continuous
// extend the earlier record; no new row is written and nothing is pushed
lastLink.endOffsetInclusive = endOffsetInclusive;
return;
}
// record describing where the link was found and what it points to
lastLink = {
"section": section,
"isFirstPageSection": isFirstPageSection,
"paragraph": par,
"textEl": el,
"startOffset": startOffset,
"endOffsetInclusive": endOffsetInclusive,
"url": url
};
// write document name (column 1) and URL (column 2) into the row that
// follows the sheet's current last row
var row = sh.getLastRow() + 1;
var r1=sh.getRange(row, 1);
r1.setValue(parentDocName);
var r2=sh.getRange(row, 2);
r2.setValue(url);
Logger.log(parentDocName)
Logger.log(url)
links.push(lastLink);
}
});
}
});
});
// one record per (possibly merged) link, in the order encountered
return links;
}
/**
 * Calls the given function for each section of the document (body, header,
 * etc.). Sections are children of the DocumentElement object.
 *
 * @param {Document} doc The Document object (such as the one obtained via
 *     a call to DocumentApp.getActiveDocument()) with the sections to iterate
 *     over.
 * @param {Function} func A callback function which will be called, for each
 *     section, with the following arguments (in order):
 *       - {ContainerElement} section - the section element
 *       - {Number} sectionIndex - the child index of the section, such that
 *         doc.getBody().getParent().getChild(sectionIndex) == section.
 *       - {Boolean} isFirstPageSection - whether the section is a first-page
 *         header/footer section, i.e. a header/footer-typed section that is
 *         not the one returned by doc.getHeader() / doc.getFooter().
 */
function iterateSections(doc, func) {
  // get the DocumentElement interface to iterate over all sections
  // this bit is undocumented API
  const docEl = doc.getBody().getParent();

  // Look up the regular header/footer once each; -1 means "not present" and
  // can never equal a child index.
  const regularHeader = doc.getHeader();
  const regularFooter = doc.getFooter();
  const regularHeaderSectionIndex =
      (regularHeader == null ? -1 : docEl.getChildIndex(regularHeader));
  const regularFooterSectionIndex =
      (regularFooter == null ? -1 : docEl.getChildIndex(regularFooter));

  for (let i = 0; i < docEl.getNumChildren(); ++i) {
    const section = docEl.getChild(i);
    const sectionType = section.getType();
    const isHeaderOrFooter = (
        sectionType == DocumentApp.ElementType.HEADER_SECTION ||
        sectionType == DocumentApp.ElementType.FOOTER_SECTION);
    const isFirstPageSection = (
        i != regularHeaderSectionIndex &&
        i != regularFooterSectionIndex &&
        isHeaderOrFooter);
    func(section, i, isFirstPageSection);
  }
}
当我尝试以 Google 幻灯片演示文稿作为输入来实现同样的功能时,我卡在了如何解析演示文稿并提取所有文本片段这一步(以便检查它们是否带有链接)。似乎我需要先使用 getSlides(),再使用 getPageElements() 并遍历其结果,但我不清楚如何获取幻灯片上的实际文本。任何关于如何遍历幻灯片上的实际文本(以及如果文本带有链接,如何从中提取链接 URL)的提示都将不胜感激。谢谢!
如果您只是想从幻灯片中获取链接,请参阅下面的代码:
代码:
/**
 * Logs the URL of every hyperlink found in the text of the shapes on each
 * slide of the active presentation, and returns the URLs that were found.
 *
 * Only shapes (slide.getShapes()) are inspected, and only text ranges that
 * carry a link (TextRange.getLinks()); plain-text URLs that are not
 * hyperlinked are not reported.
 *
 * @return {string[]} The link URLs in slide / shape / link order. Every link
 *     is logged as before, but links whose getUrl() is null are left out of
 *     the returned array.
 */
function getLinksFromSlides() {
  const urls = [];
  const slides = SlidesApp.getActivePresentation().getSlides();

  // traverse each slide
  for (const slide of slides) {
    // traverse each shape
    for (const shape of slide.getShapes()) {
      // each entry is a text range that carries one link
      for (const linkRange of shape.getText().getLinks()) {
        const url = linkRange.getTextStyle().getLink().getUrl();
        // print all links found
        Logger.log(url);
        if (url != null) {
          urls.push(url);
        }
      }
    }
  }

  return urls;
}
样本:
输出:
注:
- 这只会提取超链接。它不会提取示例数据中显示的任何未超链接的 links/url。 (例如
https://www.facebook.com
) - 如果您想要非超链接 url,那么您可能需要尝试正则表达式。