在 Python 中删除字符串里的所有 HTML 内容
Remove all the Html content from a string in python
我想从字符串中删除所有 HTML 内容。
我有一个字符串
str= "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
我想要最后的字符串
str= "I am happy with 3333 your code"
我编写了这段代码来完成上述任务。
def removetags(input_str):
    """Return ``input_str`` with HTML elements (tags and their content) removed.

    The string is scanned once while counting how many elements are
    currently open; a character is kept only while that count is zero.

    Known limitation: every ``<`` that is not immediately followed by ``/``
    is counted as an opening tag, so a literal ``<`` in ordinary text
    (for example ``"1 < 2"``) can suppress the text that follows it.

    Args:
        input_str: The text that may contain HTML markup.

    Returns:
        The characters of ``input_str`` that lie outside every element.
    """
    kept = []
    open_count = 0
    # True from an opening '<' until the next '>' is seen, so that the '>'
    # which merely finishes an opening tag does not close the element.
    in_opening_tag = True
    last_index = len(input_str) - 1
    for index, ch in enumerate(input_str):
        if ch == '<':
            # A '<' at the very end, or one starting '</', opens nothing.
            if index != last_index and input_str[index + 1] != '/':
                in_opening_tag = True
                open_count += 1
        elif ch == '>' and open_count:
            if not in_opening_tag:
                open_count -= 1
            in_opening_tag = False
        elif not open_count:
            kept.append(ch)
    return ''.join(kept)
# Demo: run removetags on the sample string bound to `str` and print the result.
print(removetags(str))
这段代码可以正常工作,但如果文本中含有 `<`,输出就不正确。所以我想改用 HTML 解析的方式来删除。有什么办法吗?我找到了这个库,但没找到用它实现的方法。提前谢谢。
您可以为此使用正则表达式库,
import re
# Sample input; named `text` so the builtin `str` is not shadowed.
text = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
# Match an opening tag (optionally with attributes), the shortest possible
# content, and the closing tag of the SAME name: the \1 backreference repeats
# the tag name captured by the first group.
comp = re.compile(r'<(\w+)[^>]*>.*?</\1>')
# Delete every matched element, leaving only the text outside of them.
data = comp.sub('', text)
print(data)
也许这个能帮到你:
from html.parser import HTMLParser
str = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 > <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
class MyHTMLParser(HTMLParser):
    """Collect the text that lies outside every HTML element.

    After ``feed()`` (and ``close()``), ``html_free_text`` holds the text
    chunks that were seen while no element was open; join them to obtain
    the stripped string.

    Nesting is tracked with a counter rather than a single flag, so text
    inside an outer element is still dropped after an inner element closes.
    """

    # Elements that never take a closing tag; they must not open a nesting
    # level, otherwise everything after e.g. ``<br>`` would be discarded.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    })

    def reset(self):
        """Reset parser state; HTMLParser calls this on instantiation."""
        super().reset()
        # Per-instance state: a class-level list would be shared by all parsers.
        self._open_tags = 0
        self.html_free_text = []

    @property
    def got_html_in_tags(self):
        """True while the parser is inside at least one open element."""
        return self._open_tags > 0

    def handle_starttag(self, tag, attrs):
        """Enter one nesting level for every non-void opening tag."""
        if tag not in self._VOID_TAGS:
            self._open_tags += 1

    def handle_endtag(self, tag):
        """Leave one nesting level; stray closing tags are ignored."""
        if tag not in self._VOID_TAGS and self._open_tags:
            self._open_tags -= 1

    def handle_data(self, data):
        """Keep text only when it is outside every element."""
        if not self._open_tags:
            self.html_free_text.append(data)
# Demo: feed the sample string bound to `str` through the parser, then join
# the collected text chunks and print the result.
parser = MyHTMLParser()
parser.feed(str)
print("".join(parser.html_free_text))
这将打印 I am happy with 3333 your code
,即使文本中包含“>”或“<”
让我们递归地做这个 ;)
基本情况 1:当文本为空字符串时,
return 一个空字符串
基本情况 2:当文本的第一个字符是左尖括号 `<` 时,
查找与之对应的右尖括号 `>`,然后对该标签之后的剩余文本递归调用本函数并返回其结果。
def remove_tags(text, tags=None):
    """Return ``text`` with simple HTML elements and their content removed.

    A tag is recognised only in the bare forms ``<name>`` and ``</name>``,
    where ``name`` is alphanumeric and starts with a letter (so ``<h1>``
    counts, while ``<=1 ...>`` or ``<a happy with <body>`` do not). Anything
    else that starts with ``<`` is treated as ordinary text.

    Args:
        text: The string to clean.
        tags: Optional stack of currently open tags. It is mutated in place;
            when omitted a fresh stack is created for this call, so state
            never leaks from one call to the next.

    Returns:
        The characters of ``text`` that are outside every recognised element.
        Text following an opening tag that is never closed is dropped.
    """
    if tags is None:
        tags = []

    kept = []
    pos = 0
    length = len(text)
    # Iterative scan: one recursion per character would exceed the
    # interpreter's recursion limit on inputs of roughly a thousand chars.
    while pos < length:
        if text[pos] == '<':
            close_pos = text.find('>', pos)
            tag = text[pos:close_pos + 1] if close_pos != -1 else ''
            name = tag[1:-1]
            is_close_tag = name.startswith('/')
            if is_close_tag:
                name = name[1:]
            # An empty name fails isalnum(), so name[0] is never reached then.
            is_valid_tag = name.isalnum() and name[0].isalpha()

            if is_valid_tag:
                if not is_close_tag:
                    tags.append(tag)
                elif tags:
                    # A stray closing tag with nothing open is just skipped.
                    tags.pop()
                pos += len(tag)
                continue

        # While an element is open, keep looking and discard its content.
        if not tags:
            kept.append(text[pos])
        pos += 1

    return ''.join(kept)
测试运行:
text = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
print(remove_tags(text))
>
I am happy with 3333 your code
text = "x<=1 <div> cookies </div>"
print(remove_tags(text))
>
x<=1
text = "I am <a happy with <body> </body> lal"
print(remove_tags(text))
>
I am <a happy with lal
另一个基于 re 的解决方案:
re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
测试:
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am happy with 3333 your code'
>>> string = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 > <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am happy with 3333 > your code'
>>> string = "I am <a happy with <body> </body> lal"
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am <a happy with lal'
我想从字符串中删除所有 HTML 内容。
我有一个字符串
str= "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
我想要最后的字符串
str= "I am happy with 3333 your code"
我编写了这段代码来完成上述任务。
def removetags(input_str):
    """Return ``input_str`` with HTML elements (tags and their content) removed.

    The string is scanned once while counting how many elements are
    currently open; a character is kept only while that count is zero.

    Known limitation: every ``<`` that is not immediately followed by ``/``
    is counted as an opening tag, so a literal ``<`` in ordinary text
    (for example ``"1 < 2"``) can suppress the text that follows it.

    Args:
        input_str: The text that may contain HTML markup.

    Returns:
        The characters of ``input_str`` that lie outside every element.
    """
    kept = []
    open_count = 0
    # True from an opening '<' until the next '>' is seen, so that the '>'
    # which merely finishes an opening tag does not close the element.
    in_opening_tag = True
    last_index = len(input_str) - 1
    for index, ch in enumerate(input_str):
        if ch == '<':
            # A '<' at the very end, or one starting '</', opens nothing.
            if index != last_index and input_str[index + 1] != '/':
                in_opening_tag = True
                open_count += 1
        elif ch == '>' and open_count:
            if not in_opening_tag:
                open_count -= 1
            in_opening_tag = False
        elif not open_count:
            kept.append(ch)
    return ''.join(kept)
# Demo: run removetags on the sample string bound to `str` and print the result.
print(removetags(str))
这段代码可以正常工作,但如果文本中含有 `<`,输出就不正确。所以我想改用 HTML 解析的方式来删除。有什么办法吗?我找到了这个库,但没找到用它实现的方法。提前谢谢。
您可以为此使用正则表达式库,
import re
# Sample input; named `text` so the builtin `str` is not shadowed.
text = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
# Match an opening tag (optionally with attributes), the shortest possible
# content, and the closing tag of the SAME name: the \1 backreference repeats
# the tag name captured by the first group.
comp = re.compile(r'<(\w+)[^>]*>.*?</\1>')
# Delete every matched element, leaving only the text outside of them.
data = comp.sub('', text)
print(data)
也许这个能帮到你:
from html.parser import HTMLParser
str = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 > <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
class MyHTMLParser(HTMLParser):
    """Collect the text that lies outside every HTML element.

    After ``feed()`` (and ``close()``), ``html_free_text`` holds the text
    chunks that were seen while no element was open; join them to obtain
    the stripped string.

    Nesting is tracked with a counter rather than a single flag, so text
    inside an outer element is still dropped after an inner element closes.
    """

    # Elements that never take a closing tag; they must not open a nesting
    # level, otherwise everything after e.g. ``<br>`` would be discarded.
    _VOID_TAGS = frozenset({
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img',
        'input', 'link', 'meta', 'source', 'track', 'wbr',
    })

    def reset(self):
        """Reset parser state; HTMLParser calls this on instantiation."""
        super().reset()
        # Per-instance state: a class-level list would be shared by all parsers.
        self._open_tags = 0
        self.html_free_text = []

    @property
    def got_html_in_tags(self):
        """True while the parser is inside at least one open element."""
        return self._open_tags > 0

    def handle_starttag(self, tag, attrs):
        """Enter one nesting level for every non-void opening tag."""
        if tag not in self._VOID_TAGS:
            self._open_tags += 1

    def handle_endtag(self, tag):
        """Leave one nesting level; stray closing tags are ignored."""
        if tag not in self._VOID_TAGS and self._open_tags:
            self._open_tags -= 1

    def handle_data(self, data):
        """Keep text only when it is outside every element."""
        if not self._open_tags:
            self.html_free_text.append(data)
# Demo: feed the sample string bound to `str` through the parser, then join
# the collected text chunks and print the result.
parser = MyHTMLParser()
parser.feed(str)
print("".join(parser.html_free_text))
这将打印 I am happy with 3333 your code
,即使文本中包含“>”或“<”
让我们递归地做这个 ;)
基本情况 1:当文本为空字符串时,
return 一个空字符串
基本情况 2:当文本的第一个字符是左尖括号 `<` 时,
查找与之对应的右尖括号 `>`,然后对该标签之后的剩余文本递归调用本函数并返回其结果。
def remove_tags(text, tags=None):
    """Return ``text`` with simple HTML elements and their content removed.

    A tag is recognised only in the bare forms ``<name>`` and ``</name>``,
    where ``name`` is alphanumeric and starts with a letter (so ``<h1>``
    counts, while ``<=1 ...>`` or ``<a happy with <body>`` do not). Anything
    else that starts with ``<`` is treated as ordinary text.

    Args:
        text: The string to clean.
        tags: Optional stack of currently open tags. It is mutated in place;
            when omitted a fresh stack is created for this call, so state
            never leaks from one call to the next.

    Returns:
        The characters of ``text`` that are outside every recognised element.
        Text following an opening tag that is never closed is dropped.
    """
    if tags is None:
        tags = []

    kept = []
    pos = 0
    length = len(text)
    # Iterative scan: one recursion per character would exceed the
    # interpreter's recursion limit on inputs of roughly a thousand chars.
    while pos < length:
        if text[pos] == '<':
            close_pos = text.find('>', pos)
            tag = text[pos:close_pos + 1] if close_pos != -1 else ''
            name = tag[1:-1]
            is_close_tag = name.startswith('/')
            if is_close_tag:
                name = name[1:]
            # An empty name fails isalnum(), so name[0] is never reached then.
            is_valid_tag = name.isalnum() and name[0].isalpha()

            if is_valid_tag:
                if not is_close_tag:
                    tags.append(tag)
                elif tags:
                    # A stray closing tag with nothing open is just skipped.
                    tags.pop()
                pos += len(tag)
                continue

        # While an element is open, keep looking and discard its content.
        if not tags:
            kept.append(text[pos])
        pos += 1

    return ''.join(kept)
测试运行:
text = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
print(remove_tags(text))
>
I am happy with 3333 your code
text = "x<=1 <div> cookies </div>"
print(remove_tags(text))
>
x<=1
text = "I am <a happy with <body> </body> lal"
print(remove_tags(text))
>
I am <a happy with lal
另一个基于 re 的解决方案:
re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
测试:
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am happy with 3333 your code'
>>> string = "I am happy with <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> 3333 > <body> <h1>This is a Heading</h1> <p>This is a paragraph.</p> </body> your code"
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am happy with 3333 > your code'
>>> string = "I am <a happy with <body> </body> lal"
>>> re.sub(r'(<(?P<tag>[a-zA-Z0-9]+)>.*?</(?P=tag)>)', '', string)
'I am <a happy with lal'