在 javascript 中使用正则表达式删除 html 标签

Question

我想使用以下代码从文档中删除除 <a> <img> 和 <iframe> 之外的所有 html 标签：

// NOTE(review): the pattern is written as a string literal, so the "\s" and
// "\S" escapes are consumed by the string parser and the character class ends
// up as [sS] (only the letters s/S). A regex literal (/.../) or doubled
// backslashes ("[\\s\\S]") would keep the intended "any character" meaning.
// Also note the lookaheads only inspect the text right after "<", so closing
// tags such as </a> (which start with "/") are not excluded by them.
var regex = "<(?!a )(?!img )(?!iframe )([\s\S]*?)>";
var temp;
// Keep removing the first remaining match until the pattern no longer matches.
// Presumably `source` is the HTML string supplied by the enclosing function.
while (source.match(regex)) {
    temp = source.match(regex)[0];
    source = source.replace(temp, "");
}
return source;

它在在线正则表达式测试工具中可以正常工作，但出于某种原因，在我的页面上却不起作用。例如，当输入为以下内容时，它返回的仍是原始字符串：

    "<p class="MsoNormal" style="margin-left:202.5pt;line-height:200%;background:white"><b><span style="font-size: 16pt; line-height: 200%; color: rgb(131, 60, 11); background-image: initial; background-attachment: initial; background-size: initial; background-origin: initial; background-clip: initial; background-position: initial; background-repeat: initial;">test</span></b><span style="font-size:16.0pt;
line-height:200%;color:#833C0B;letter-spacing:-.15pt;mso-ansi-language:EN-US"><o:p></o:p></span></p>"

请帮忙！

Answer 1

这是我能想到的最好的！

<((?!a)|a\w)(?!\/a)(?!img)(?!iframe)(?!\/iframe)+([\s\S]*?)>

第一个捕获组匹配“后面不是 a”或“a 后跟一个单词字符”的情况，这样 audio、abbr、address 等以 a 开头的其他标签也都能被匹配到。

只需将上述正则表达式中的匹配项替换为空即可。

请看：http://regexr.com/3a5hp

Answer 2

不用正则表达式也可以。尝试使用正则表达式解析 HTML 通常不是一个好主意，除非用例非常简单...

按照我对 stripHtmlElementsMatching 的实现方式，您可以向它传入任意 CSS 选择器，它会去除所有匹配的元素。

因此，要删除 a, img, iframe 以外的任何内容，您可以传递 :not(a):not(img):not(iframe).

PS：使用 htmlstripping-root 自定义标签只是为了避免作为解析容器的元素干扰您传入的选择器。例如，如果我使用 div 作为解析容器，而您传入选择器 div > div，那么即使 html 字符串中的 div 并未相互嵌套，所有 div 也都会被删除。

/**
 * Removes every element matching a CSS selector from an HTML string while
 * keeping the removed elements' child nodes in their place ("unwrapping").
 *
 * @param {string} text - HTML markup to process.
 * @param {string} [selector] - CSS selector of the elements to strip. When it
 *   is not a string, ':not(*)' is used, which matches nothing, so the markup
 *   is returned as re-serialized by the browser.
 * @returns {string} The serialized HTML after stripping.
 */
var stripHtmlElementsMatching = (function(doc) {

  // A dedicated tag name is used for the parsing container so that the
  // container itself cannot take part in the caller's selector.
  var ROOT_TAG = 'htmlstripping-root';

  // document.registerElement is part of the legacy Custom Elements v0 API and
  // is absent from current browsers; calling it unconditionally throws a
  // TypeError there. createElement() accepts the tag name without any
  // registration, so registration is attempted only where the API exists.
  if (typeof doc.registerElement === 'function') {
    doc.registerElement(ROOT_TAG);
  }

  return function(text, selector) {

    var parser = doc.createElement(ROOT_TAG),
        matchingEls, i, len, el;

    selector = typeof selector === 'string' ? selector : ':not(*)';

    // NOTE(review): innerHTML parses the markup with the live document as
    // owner; confirm this is acceptable if `text` can be untrusted.
    parser.innerHTML = text;

    // querySelectorAll returns a static list, so replacing nodes while
    // iterating does not disturb the iteration.
    matchingEls = parser.querySelectorAll(selector);

    for (i = 0, len = matchingEls.length; i < len; i++) {
      el = matchingEls[i];
      // Swap the element for a fragment holding all of its children.
      el.parentNode.replaceChild(newFragFrom(el.childNodes), el);
    }

    return parser.innerHTML;
  };

  /**
   * Moves all nodes of a live node list into a new DocumentFragment.
   *
   * @param {NodeList} nodes - Live child-node list to drain.
   * @returns {DocumentFragment} Fragment containing the moved nodes in order.
   */
  function newFragFrom(nodes) {
    var frag = doc.createDocumentFragment();

    // appendChild moves the node out of its old parent, which shrinks the
    // live list, so index 0 always refers to the next remaining node.
    while (nodes.length) frag.appendChild(nodes[0]);

    return frag;
  }

})(document);


// Sample markup (a Word-style paragraph) used to demonstrate the stripper.
var text = '<p class="MsoNormal" style="margin-left:202.5pt;line-height:200%;background:white"><b><span style="font-size: 16pt; line-height: 200%; color: rgb(131, 60, 11); background-image: initial; background-attachment: initial; background-size: initial; background-origin: initial; background-clip: initial; background-position: initial; background-repeat: initial;">test</span></b><span style="font-size:16.0pt; line-height:200%;color:#833C0B;letter-spacing:-.15pt;mso-ansi-language:EN-US"><o:p></o:p></span></p>';

// Tag names that must be kept; every other element gets stripped.
var tagsToKeep = ['a', 'img', 'iframe'];

// Chain one ":not(<tag>)" per kept tag, yielding ":not(a):not(img):not(iframe)".
var sanitizeSelector = tagsToKeep.reduce(function(selectorSoFar, tag) {
  return selectorSoFar + ':not(' + tag + ')';
}, '');

var sanitizedText = stripHtmlElementsMatching(text, sanitizeSelector);

// Append the result as a text node so the remaining markup is shown literally
// instead of being rendered.
document.body.appendChild(
  document.createTextNode(sanitizedText)
);

在 javascript 中使用正则表达式删除 html 标签

Remove html tags using regex in javascript

html

javascript

regex