
Remove part of the string between the comments along with the comments

这是一个字符串。我想删除其中的 C 风格注释(连同注释标记本身),并且不使用正则表达式。

# Sample input from the question: plain text followed by one C-style comment.
a = "word234 /*12aaa12*/"



这里有一个简单的算法:它跨相邻两个字符保存状态(记住前一个字符),并用一个标志来决定是否保留当前字符。

# Strip (non-nested) C-style comments from `a` with a single character scan.
# The result is left in `s2`.
a = "word234 /*12aaa12*/ word123 /*xx*xx*/ end"

out = []      # characters kept so far
add = True    # True while we are outside a comment
prev = None   # previous character; None right after a consumed delimiter
for c in a:
    if add and c == '*' and prev == '/':
        # "/*" opens a comment. The '/' was already emitted on the previous
        # iteration, so take it back out of the output.
        del out[-1]
        add = False
        # Forget the '*' so that "/*/" is not mistaken for an open+close.
        prev = None
        continue
    if not add and c == '/' and prev == '*':
        # "*/" closes the comment; the '/' itself must not be emitted.
        add = True
        # Forget the '/' so that "*/*" does not immediately reopen a comment.
        prev = None
        continue
    prev = c
    if add:
        out.append(c)
s2 = ''.join(out)


word234  word123  end


# Strip C-style comments from `a`, allowing comments to nest.
# The result is left in `s2`.
a = "word234 /*12aaa12*/ word123 /*xx/*yy*/xx*/ end"

out = []      # characters kept so far
lvl = 0       # current comment nesting depth; 0 means outside any comment
prev = None   # previous character; None right after a consumed delimiter
for c in a:
    if c == '*' and prev == '/':
        # "/*" opens a comment. Only at depth 0 was the '/' already emitted,
        # so only then does it have to be removed from the output.
        if lvl == 0:
            del out[-1]
        lvl += 1
        # Forget the '*' so that "/*/" is not mistaken for an open+close.
        prev = None
        continue
    if lvl > 0 and c == '/' and prev == '*':
        # "*/" closes the innermost open comment.
        lvl -= 1
        # Forget the '/' so that "*/*" does not immediately reopen a comment.
        prev = None
        continue
    prev = c
    if lvl == 0:
        out.append(c)
s2 = ''.join(out)

您可以使用 str.find 在字符串中搜索 /* 和 */ 出现的位置。

str.find 返回 /* 或 */ 的索引;如果在字符串中找不到 /*,str.find 返回 -1。我们可以把它用作循环的停止条件:不断搜索下一个注释,直到没有注释为止。

然后,我们可以利用这些索引和 str.join,把所有不属于注释的子字符串连接成一个字符串。

def indices_c_comments(s):
    """Yield the slice bounds of the non-comment parts of *s*.

    The values come out as a flat, even-length sequence
    ``start0, stop0, start1, stop1, ...`` such that every ``s[start:stop]``
    is text lying outside a ``/* ... */`` comment. The first value is always
    ``0`` and the last is always ``len(s)``.

    Comments do not nest. An opening ``/*`` with no matching ``*/`` is not
    treated as a comment: the text from there to the end is kept unchanged.
    """
    yield 0
    i = s.find('/*')
    while i != -1:
        # Look for the closer only after the opener, so "/*/" is not read
        # as a complete comment.
        j = s.find('*/', i + 2)
        if j == -1:
            # Unterminated comment: stop scanning and keep the tail as text.
            break
        yield i
        yield j + 2
        # Resume after the closer so its '/' cannot start a new "/*".
        i = s.find('/*', j + 2)
    yield len(s)

def strip_c_comments(s):
    """Return *s* with every ``/* ... */`` comment removed.

    The bounds produced by ``indices_c_comments`` are consumed two at a time
    as (start, stop) pairs, and the corresponding slices of *s* are joined.
    """
    marks = list(indices_c_comments(s))
    starts = marks[0::2]
    stops = marks[1::2]
    pieces = [s[begin:end] for begin, end in zip(starts, stops)]
    return ''.join(pieces)

# Demo: print each sample next to the result of stripping its comments.
samples = (
    'text/*comment*/text/*comment*/text',
    'text/*comment*//*comment*/text',
    'text/*comment*/',
    '/*comment*/',
)
for s in samples:
    stripped = strip_c_comments(s)
    print('"{}"  -->  "{}"'.format(s, stripped))
# Expected output:
# "text/*comment*/text/*comment*/text"  -->  "texttexttext"
# "text/*comment*//*comment*/text"  -->  "texttext"
# "text/*comment*/"  -->  "text"
# "/*comment*/"  -->  ""

增强来自@mozway 的建议答案:


def remove_comments(s, start_tok='/*', end_tok='*/', nested=True, raise_on_imbalance=True):
    """Return *s* with comments delimited by *start_tok* / *end_tok* removed.

    Args:
        s: Text to process.
        start_tok: Token that opens a comment.
        end_tok: Token that closes a comment.
        nested: If true, comments may nest and each opener needs its own
            closer. If false, a comment ends at the first closer; an opener
            found inside a comment is copied to the output and counted as
            an imbalance.
        raise_on_imbalance: If true, raise when the tokens are not balanced.
            If false, return whatever text was collected.

    Returns:
        The text outside comments. A closer found outside any comment is
        copied to the output unchanged.

    Raises:
        ValueError: If either token is empty, or if *raise_on_imbalance* is
            true and a stray opener/closer or an unterminated comment is found.
    """
    if not start_tok or not end_tok:
        # An empty token matches at every position without advancing.
        raise ValueError("Comment tokens must be non-empty")

    lvl = 0              # current comment depth (0 or 1 when not nested)
    kept = []            # output pieces, joined once at the end
    imbalanced = False   # tracked explicitly rather than by scanning the
                         # output, where kept text may join into a token
    i = 0
    n = len(s)
    while i < n:
        if s.startswith(start_tok, i):
            if nested:
                lvl += 1
            elif lvl == 0:
                lvl = 1
            else:
                # Non-nested mode: opener inside a comment is leaked as text.
                kept.append(start_tok)
                imbalanced = True
            i += len(start_tok)
        elif s.startswith(end_tok, i):
            if lvl > 0:
                lvl = lvl - 1 if nested else 0
            else:
                # Closer with no open comment: keep it as text, never let
                # the depth go negative.
                kept.append(end_tok)
                imbalanced = True
            i += len(end_tok)
        else:
            # Ordinary character: keep it outside comments, and always
            # advance by exactly one position.
            if lvl == 0:
                kept.append(s[i])
            i += 1

    if lvl != 0:
        # The input ended inside a comment.
        imbalanced = True
    if raise_on_imbalance and imbalanced:
        raise ValueError("Imbalanced comment tokens")
    return ''.join(kept)
