使用 Google Apps 脚本从 Google 幻灯片演示文稿中提取所有 URL 链接

Extract all URL links out of Google Slides presentation with Google Apps Script

我正在尝试创建一个函数:传入 Google 幻灯片演示文稿的 ID 后,它会解析该演示文稿,并将找到的所有 URL 链接写入 Google 表格。我基于 @Yuval 的这个回答(this answer)构建了以下函数,它对 Google Docs 文档输入执行相同的操作:

/**
 * Collects every hyperlink found in the text of a Google Docs document and
 * records each one in the "Extracted Links" sheet of the active spreadsheet.
 *
 * For every link found, one row is appended to the sheet with the document
 * name in column 1 and the URL in column 2, and both values are logged.
 * When `mergeAdjacent` is truthy, a styling segment that directly continues
 * the previous link (same URL, contiguous offsets) only extends the previous
 * entry; it produces no extra row, log line, or returned element.
 *
 * @param {string} docId ID of the document, passed to DocumentApp.openById().
 * @param {boolean} mergeAdjacent Whether contiguous styling segments that
 *     share a URL are reported as a single link.
 * @return {Object[]} One descriptor per link with the keys `section`,
 *     `isFirstPageSection`, `paragraph`, `textEl`, `startOffset`,
 *     `endOffsetInclusive` and `url`. `endOffsetInclusive` is null when the
 *     link's segment is the last attribute run of its text element.
 * @throws {Error} If the active spreadsheet has no "Extracted Links" sheet,
 *     or if a document section does not expose getParagraphs().
 */
function getAllLinks(docId, mergeAdjacent) {
  const doc = DocumentApp.openById(docId);
  const parentDocName = doc.getName();

  // Fail early with a readable message instead of a TypeError on a null
  // sheet at the first link that has to be written.
  const sheet = SpreadsheetApp.getActive().getSheetByName('Extracted Links');
  if (sheet == null) {
    throw new Error('Sheet "Extracted Links" was not found in the active ' +
                    'spreadsheet.');
  }

  const links = [];

  iterateSections(doc, function(section, sectionIndex, isFirstPageSection) {
    if (!("getParagraphs" in section)) {
      // iterateSections relies on undocumented API, so report a possible API
      // change explicitly rather than failing with a cryptic message.
      throw new Error("An API change has caused this script to stop " +
                      "working.\n" +
                      "Section #" + sectionIndex + " of type " +
                      section.getType() + " has no .getParagraphs() method. " +
                      "Stopping script.");
    }

    section.getParagraphs().forEach(function(par) {
      // Skip empty paragraphs.
      if (par.getNumChildren() === 0) {
        return;
      }

      // Walk all children of the paragraph / list item; only text elements
      // are inspected.
      for (let el = par.getChild(0); el != null; el = el.getNextSibling()) {
        // Loose comparison kept from the original for the host-provided enum.
        if (el.getType() != DocumentApp.ElementType.TEXT) {
          continue;
        }

        // Each attribute index marks the start of a styling segment.
        const attributeIndices = el.getTextAttributeIndices();
        let lastLink = null;

        attributeIndices.forEach(function(startOffset, i) {
          const url = el.getLinkUrl(startOffset);
          if (url == null) {
            return;
          }

          // The segment ends right before the next attribute index, if any.
          const endOffsetInclusive = (i + 1 < attributeIndices.length ?
                                      attributeIndices[i + 1] - 1 : null);

          // This segment continues the previous link: extend it and stop.
          if (mergeAdjacent && lastLink != null && lastLink.url === url &&
              lastLink.endOffsetInclusive === startOffset - 1) {
            lastLink.endOffsetInclusive = endOffsetInclusive;
            return;
          }

          lastLink = {
            "section": section,
            "isFirstPageSection": isFirstPageSection,
            "paragraph": par,
            "textEl": el,
            "startOffset": startOffset,
            "endOffsetInclusive": endOffsetInclusive,
            "url": url
          };

          // One call writes both cells in the row after the last row with
          // content, replacing getLastRow() plus two setValue() calls.
          sheet.appendRow([parentDocName, url]);
          Logger.log(parentDocName);
          Logger.log(url);
          links.push(lastLink);
        });
      }
    });
  });

  return links;
}

/**
 * Calls the given function once for each section of the document (body,
 * header, etc.). Sections are the children of the element returned by
 * doc.getBody().getParent().
 *
 * @param {Document} doc The Document object (such as the one obtained via
 *     DocumentApp.getActiveDocument() or DocumentApp.openById()) whose
 *     sections are iterated.
 * @param {Function} func Callback invoked for each section, in child order,
 *     with the following arguments:
 *       - {ContainerElement} section - the section element
 *       - {Number} sectionIndex - the child index of the section, such that
 *         doc.getBody().getParent().getChild(sectionIndex) == section.
 *       - {Boolean} isFirstPageSection - true when the section is a header or
 *         footer section other than the one returned by doc.getHeader() /
 *         doc.getFooter().
 */
function iterateSections(doc, func) {
  // Parent of the body, used to reach every section. The original author
  // notes that this relies on undocumented API.
  const docEl = doc.getBody().getParent();

  // Look up the regular header/footer once each; -1 means "not present".
  const header = doc.getHeader();
  const footer = doc.getFooter();
  const regularHeaderSectionIndex =
      (header == null ? -1 : docEl.getChildIndex(header));
  const regularFooterSectionIndex =
      (footer == null ? -1 : docEl.getChildIndex(footer));

  for (let i = 0; i < docEl.getNumChildren(); ++i) {
    const section = docEl.getChild(i);
    const sectionType = section.getType();

    // A header/footer section that is not the regular one is reported as a
    // first-page section. Loose enum comparison kept from the original.
    const isHeaderOrFooter =
        (sectionType == DocumentApp.ElementType.HEADER_SECTION ||
         sectionType == DocumentApp.ElementType.FOOTER_SECTION);
    const isFirstPageSection = (
      i != regularHeaderSectionIndex &&
      i != regularFooterSectionIndex &&
      isHeaderOrFooter);

    func(section, i, isFirstPageSection);
  }
}

当我尝试以 Google 幻灯片演示文稿作为输入来实现同样的功能时,我卡在了如何解析演示文稿并提取所有文本片段这一步(以便检查它们是否包含链接)。看起来我需要先调用 getSlides(),再调用 getPageElements() 并遍历其结果,但我不清楚如何获取幻灯片上的实际文本。如果能提示如何遍历幻灯片上的实际文本(以及在文本带有链接时如何提取其 URL),我将不胜感激。谢谢!

如果您只是想从幻灯片中获取链接,请参阅下面的代码:

代码:

/**
 * Logs and returns the URL of every hyperlink found in the text of the
 * shapes on each slide of a presentation.
 *
 * Only the shapes returned by slide.getShapes() are inspected, and only text
 * ranges that carry a link are reported; plain-text URLs that are not
 * hyperlinked are not detected.
 *
 * @param {string=} presentationId ID of the presentation to open via
 *     SlidesApp.openById(). When omitted (or null), the active presentation
 *     is used, as in the original implementation.
 * @return {Array} The values logged, in slide / shape / link order. Each is
 *     whatever Link.getUrl() returned for that link.
 *     NOTE(review): per the Slides service docs getUrl() is null for links
 *     that are not external URLs; confirm if callers need those filtered.
 * @throws {Error} If no ID is given and there is no active presentation.
 */
function getLinksFromSlides(presentationId) {
  const presentation = (presentationId == null ?
                        SlidesApp.getActivePresentation() :
                        SlidesApp.openById(presentationId));
  if (presentation == null) {
    // Report a missing presentation explicitly instead of failing with a
    // TypeError on getSlides().
    throw new Error('There is no active presentation; pass a presentation ID.');
  }

  const urls = [];

  // Traverse each slide, each shape on it, and each linked text range.
  presentation.getSlides().forEach(function (slide) {
    slide.getShapes().forEach(function (shape) {
      shape.getText().getLinks().forEach(function (linkRange) {
        const url = linkRange.getTextStyle().getLink().getUrl();
        Logger.log(url);
        urls.push(url);
      });
    });
  });

  return urls;
}

样本:

输出:

注:

  • 这只会提取超链接。它不会提取示例数据中显示的任何未设置超链接的纯文本 URL(例如 https://www.facebook.com)。
  • 如果您想要非超链接 url,那么您可能需要尝试正则表达式。