HTML 标签验证
HTML tag validation
我想验证一段 HTML 标签是否是有效的 HTML,所以我尝试了以下代码。
def must_have_proper_htmltag(self, filename):
    """Check whether *filename* (an HTML string) contains at least one tag.

    :param filename: HTML markup as a string (not a file path, despite the name).
    :return: True if the parser finds any tag at all, False otherwise.
        Note: this only detects the presence of a tag; it does NOT
        validate that the document is well-formed HTML.
    """
    print(filename)
    soup = BeautifulSoup(filename, 'html.parser')
    first_tag = soup.find()
    return bool(first_tag)
'''BeautifulSoup is a library for pulling data out of HTML or XML documents;
'html.parser' selects the built-in HTML parser, and find() checks for the
occurrence of at least one tag.'''
# Sample inputs: one complete document and one with unclosed tags.
# (Fixed from the original paste: removed a stray backtick, joined the
# string literal that was broken across lines, and renamed `htmltags`
# to `htmltag` so it matches the variable actually used below.)
htmltag = ('<html><head><title>Test</title></head>'
           '<body><h1>Parse me!</h1></body></html>')
nohtmltag = '<html><head><title>Test</title></head><body><h1>Parse me!'
print('html checkers:-', qc.must_have_proper_htmltag(htmltag))
print('html checkers:-', qc.must_have_proper_htmltag(nohtmltag))
这个函数只检查是否存在 HTML 标签,并不验证这些标签是否构成有效的 HTML,对不对?
我的问题是:该如何验证 HTML 标签?我希望输出结果一个是 True、另一个是 False。
虽然不完全符合您的要求,但利用其他人已经完成的工作可能更容易。例如:
它不检查单个标签,而是检查整个 HTML 是否正确,这显然是您所追求的。
也许下面这种方法也能帮到你:
import HTMLParser
import urllib
import sys
import urlparse
##################################################
# config
# Root of the site tree to crawl; only URLs under this prefix are followed.
base_url = 'http://foo.com/bill_reid/'
# Upper bound on crawl passes over the accumulated URL set.
depth = 100
# Base address of the W3C markup validation service.
w3c_validator = 'http://validator.w3.org/'
##################################################
# classes and functions
# HTML parser class
class parseLinks(HTMLParser.HTMLParser):
    """HTML parser that records every in-site <a href> target in the global dict ``l``."""

    def handle_starttag(self, tag, attrs):
        # Only anchor tags carry the links we want to crawl.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    url = url_normalize(value)
                    # Record unseen, accepted URLs as "pending" (True).
                    # `url not in l` replaces dict.has_key(), which was
                    # removed in Python 3; `in` works on both 2 and 3.
                    if url != "" and url not in l:
                        l[url] = True
# HTML parsing function (use the class)
def parse_links(url):
    """Fetch *url* and feed its HTML through parseLinks (best-effort).

    Fetch or parse failures are deliberately ignored so that one bad
    page does not abort the whole crawl.
    """
    try:
        lParser = parseLinks()
        lParser.feed(urllib.urlopen(url).read())
        lParser.close()
    except Exception:
        # The original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; `Exception` keeps the best-effort behaviour
        # without hiding those.
        pass
# clean/normalize/reject url
def url_normalize(url):
    """Clean *url* and decide whether it should be crawled.

    Returns the absolute, fragment-free URL, or "" when the link is a
    mailto:, points outside ``base_url``, or is not an HTML page.
    """
    candidate = url.strip()
    # Reject e-mail links.
    if candidate.startswith('mailto:'):
        return ""
    # Strip any #fragment part.
    candidate = candidate.partition('#')[0]
    # Resolve relative links against the page currently being crawled.
    candidate = urlparse.urljoin(current_url, candidate)
    # Stay inside the configured site tree.
    if not candidate.startswith(base_url):
        return ""
    # Only HTML pages are worth validating.
    if urllib.urlopen(candidate).info().gettype() != 'text/html':
        return ""
    return candidate
# W3C validation
def url_w3c_validate(url):
    """Ask the W3C validator about *url*; True when the page is valid HTML."""
    check = w3c_validator + 'check?uri=' + url
    status = urllib.urlopen(check).info().getheader('x-w3c-validator-status')
    return status == 'Valid'
##################################################
# main
##################################################
l = {base_url: True}
l_error = []
n = 0
for i in range(depth):
for url in l.copy():
if l[url]:
n += 1
current_url = url
print n,
print "-",
print current_url,
print " parsing...",
parse_links(url)
print "done -",
print "validating...",
is_valid = url_w3c_validate(url)
print "done ->",
if is_valid:
print "Valid"
else:
l_error.append(url)
print "Invalid"
l[url] = False
#report
print """
-------------------------------------
URLs parsed: %d
URLS with invalid HTML: %d""" % (len(l), len(l_error))
for url in l_error:
print url
您可以使用 w3c 验证器对其进行验证
from py_w3c.validators.html.validator import HTMLValidator
def must_have_proper_htmltag(self, filename):
    """Validate *filename* (an HTML string) with the W3C validator.

    :param filename: HTML markup to validate (a string, not a file path).
    :return: True when the validator reports no errors, False otherwise.
    """
    print(filename)
    html_validator = HTMLValidator()
    html_validator.validate_fragment(filename)
    # validate_fragment() populates .errors; an empty list means valid,
    # so the if/else returning True/False collapses to one expression.
    return not html_validator.errors
# Fixed from the original paste: the string literal was broken across
# two lines (a syntax error); implicit concatenation joins the parts.
print('html checkers:-', qc.must_have_proper_htmltag(
    '<!DOCTYPE html><html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'))
我想验证一段 HTML 标签是否是有效的 HTML,所以我尝试了以下代码。
def must_have_proper_htmltag(self, filename):
    """Check whether *filename* (an HTML string) contains at least one tag.

    :param filename: HTML markup as a string (not a file path, despite the name).
    :return: True if the parser finds any tag at all, False otherwise.
        Note: this only detects the presence of a tag; it does NOT
        validate that the document is well-formed HTML.
    """
    print(filename)
    soup = BeautifulSoup(filename, 'html.parser')
    first_tag = soup.find()
    return bool(first_tag)
'''BeautifulSoup is a library for pulling data out of HTML or XML documents;
'html.parser' selects the built-in HTML parser, and find() checks for the
occurrence of at least one tag.'''
# Sample inputs: one complete document and one with unclosed tags.
# (Fixed from the original paste: removed a stray backtick, joined the
# string literal that was broken across lines, and renamed `htmltags`
# to `htmltag` so it matches the variable actually used below.)
htmltag = ('<html><head><title>Test</title></head>'
           '<body><h1>Parse me!</h1></body></html>')
nohtmltag = '<html><head><title>Test</title></head><body><h1>Parse me!'
print('html checkers:-', qc.must_have_proper_htmltag(htmltag))
print('html checkers:-', qc.must_have_proper_htmltag(nohtmltag))
这个函数只检查是否存在 HTML 标签,并不验证这些标签是否构成有效的 HTML,对不对?
我的问题是:该如何验证 HTML 标签?我希望输出结果一个是 True、另一个是 False。
虽然不完全符合您的要求,但利用其他人已经完成的工作可能更容易。例如:
它不检查单个标签,而是检查整个 HTML 是否正确,这显然是您所追求的。
也许下面这种方法也能帮到你:
import HTMLParser
import urllib
import sys
import urlparse
##################################################
# config
# Root of the site tree to crawl; only URLs under this prefix are followed.
base_url = 'http://foo.com/bill_reid/'
# Upper bound on crawl passes over the accumulated URL set.
depth = 100
# Base address of the W3C markup validation service.
w3c_validator = 'http://validator.w3.org/'
##################################################
# classes and functions
# HTML parser class
class parseLinks(HTMLParser.HTMLParser):
    """HTML parser that records every in-site <a href> target in the global dict ``l``."""

    def handle_starttag(self, tag, attrs):
        # Only anchor tags carry the links we want to crawl.
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    url = url_normalize(value)
                    # Record unseen, accepted URLs as "pending" (True).
                    # `url not in l` replaces dict.has_key(), which was
                    # removed in Python 3; `in` works on both 2 and 3.
                    if url != "" and url not in l:
                        l[url] = True
# HTML parsing function (use the class)
def parse_links(url):
    """Fetch *url* and feed its HTML through parseLinks (best-effort).

    Fetch or parse failures are deliberately ignored so that one bad
    page does not abort the whole crawl.
    """
    try:
        lParser = parseLinks()
        lParser.feed(urllib.urlopen(url).read())
        lParser.close()
    except Exception:
        # The original bare `except:` also swallowed SystemExit and
        # KeyboardInterrupt; `Exception` keeps the best-effort behaviour
        # without hiding those.
        pass
# clean/normalize/reject url
def url_normalize(url):
    """Clean *url* and decide whether it should be crawled.

    Returns the absolute, fragment-free URL, or "" when the link is a
    mailto:, points outside ``base_url``, or is not an HTML page.
    """
    candidate = url.strip()
    # Reject e-mail links.
    if candidate.startswith('mailto:'):
        return ""
    # Strip any #fragment part.
    candidate = candidate.partition('#')[0]
    # Resolve relative links against the page currently being crawled.
    candidate = urlparse.urljoin(current_url, candidate)
    # Stay inside the configured site tree.
    if not candidate.startswith(base_url):
        return ""
    # Only HTML pages are worth validating.
    if urllib.urlopen(candidate).info().gettype() != 'text/html':
        return ""
    return candidate
# W3C validation
def url_w3c_validate(url):
    """Ask the W3C validator about *url*; True when the page is valid HTML."""
    check = w3c_validator + 'check?uri=' + url
    status = urllib.urlopen(check).info().getheader('x-w3c-validator-status')
    return status == 'Valid'
##################################################
# main
##################################################
l = {base_url: True}
l_error = []
n = 0
for i in range(depth):
for url in l.copy():
if l[url]:
n += 1
current_url = url
print n,
print "-",
print current_url,
print " parsing...",
parse_links(url)
print "done -",
print "validating...",
is_valid = url_w3c_validate(url)
print "done ->",
if is_valid:
print "Valid"
else:
l_error.append(url)
print "Invalid"
l[url] = False
#report
print """
-------------------------------------
URLs parsed: %d
URLS with invalid HTML: %d""" % (len(l), len(l_error))
for url in l_error:
print url
您可以使用 w3c 验证器对其进行验证
from py_w3c.validators.html.validator import HTMLValidator
def must_have_proper_htmltag(self, filename):
    """Validate *filename* (an HTML string) with the W3C validator.

    :param filename: HTML markup to validate (a string, not a file path).
    :return: True when the validator reports no errors, False otherwise.
    """
    print(filename)
    html_validator = HTMLValidator()
    html_validator.validate_fragment(filename)
    # validate_fragment() populates .errors; an empty list means valid,
    # so the if/else returning True/False collapses to one expression.
    return not html_validator.errors
# Fixed from the original paste: the string literal was broken across
# two lines (a syntax error); implicit concatenation joins the parts.
print('html checkers:-', qc.must_have_proper_htmltag(
    '<!DOCTYPE html><html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'))