使用 python 字典解析 HTML table 数据
Parsing HTML table data using python dictionary
我正在从以下格式的 Jive 页面获取 html
table = <table class="test" style="border: 1px solid #c6c6c6;" width="100%"><thead><tr><th style="background-color: #efefef; width: 13%;">Tag</th><th style="background-color: #efefef; width: 23.7965%;">ID</th><th style="background-color: #efefef; width: 59.2035%;">URL</th></tr></thead><tbody><tr><td style="width: 13%;">3.7.3</td><td style="width: 23.7965%;"><p>12345</p><p>232323</p><p>4343454</p><p>5454554</p></td><td style="width: 59.2035%;"><p><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a></p><p><a class="jive-link-external-small" href="http://test123.com" rel="nofollow">http://test123.com</a></p><p><a class="jive-link-external-small" href="http://www.yahoo.com" rel="nofollow">http://www.yahoo.com</a></p><p><a class="jive-link-external-small" href="http://www.test.com" rel="nofollow">http://www.test.com</a></p></td></tr><tr><td style="width: 13%;">3.7.4</td><td style="width: 23.7965%;"><p>456789</p><p>545454</p><p>5454545</p><p>545454</p></td><td style="width: 59.2035%;"><p><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a></p><p><a class="jive-link-external-small" href="http://www.yahoo.com" rel="nofollow">http://www.yahoo.com</a></p><p><a class="jive-link-external-small" href="http://svn.com" rel="nofollow">http://svn.com</a></p><p><a class="jive-link-external-small" href="http://test.com" rel="nofollow">http://test.com</a></p></td></tr></tbody></table>
为了将 HTML 转换为字典,我尝试了以下代码
# First attempt from the question: build a header -> value mapping by walking
# the direct children of the parsed <table>.
# NOTE(review): indentation was lost in this listing; the `for` body presumably
# covers the line(s) that follow it -- confirm against the original post.
# Assumes `s` holds the table markup shown above.
table = ET.XML(s)
# Iterating an Element yields its direct children; for the markup above those
# are <thead> and <tbody>, not the individual <tr> rows.
rows = iter(table)
# next(rows) is therefore <thead>, whose only child is a <tr> with no text of
# its own, so this list comes out as [None] rather than the Tag/ID/URL names.
headers = [col.text for col in next(rows)]
# The only remaining child is <tbody>, so `row` is the whole body and each
# `col` below is an entire <tr>, whose .text is likewise None.
for row in rows:
values = [col.text for col in row]
# zip pairs the two lists positionally; the text nested inside the <td>, <p>
# and <a> elements is never reached, which is why the result is not the
# expected table.
out = dict(zip(headers, values))
上述方法没有给出我预期的输出。我希望根据命令行参数的输入,得到如下所示的输出:
Tag ID URL
3.7.3 121211 http://yahoo.com
323243 http://url.com
这可以使用 Python 的 XML 树(ElementTree,即下面代码中的 ET)来实现。
# Decode the JSON body of the HTTP response. `response` is created outside this
# snippet -- presumably the result of a request to the Jive API; confirm.
data = response.json()
# The HTML to parse is read from the payload's content -> text field.
html_doc = data['content']['text']
def find_version(ver, html=None):
    """Return the table row whose first column equals *ver*.

    The document is parsed as XML and every ``<tr>`` under ``<tbody>`` is
    examined in order. A row is expected to have three cells: a scalar
    version tag, a cell whose children each hold one ID, and a cell whose
    children each hold one URL (either an ``<a>`` directly, or a wrapper
    such as ``<p>`` around an ``<a>``).

    The row is matched on its first cell rather than on a hard-coded
    ``'Release'`` header, so it works whatever that column is called
    (``Tag`` in the sample table).

    Args:
        ver: Version string to look for in the first column.
        html: Markup of the table. When omitted, the module-level
            ``html_doc`` is used, as before.

    Returns:
        An ``OrderedDict`` mapping each header text to the matching row's
        value (a string for the first column, lists of strings for the
        other two), or ``None`` when no row matches.

    Raises:
        IndexError: If an examined row has no cells, or the matching row
            has fewer than three cells.
        Whatever ``ET.XML`` raises for markup that is not well-formed XML
        (``ParseError`` with the standard-library ElementTree).
    """
    if html is None:
        # Backward-compatible default: fall back to the module-level document.
        html = html_doc
    table = ET.XML(html)
    headers = [th.text for th in table.findall('.//th')]

    for tr in table.findall('.//tbody/tr'):
        version = tr[0].text
        if version != ver:
            # Skip early so unrelated rows are never unpacked.
            continue

        ids = [cell.text for cell in tr[1]]

        urls = []
        for node in tr[2]:
            if node.tag == 'a' or len(node) == 0:
                # A bare link, or a wrapper holding plain text with no child.
                urls.append(node.text)
            else:
                # A wrapper such as <p>; the link is its first child.
                urls.append(node[0].text)

        return OrderedDict(zip(headers, [version, ids, urls]))

    return None
# --- main ---
# Look up the requested version and print it as a fixed-width table: one line
# of header names followed by as many lines as the longest column needs.
# `ver` is defined outside this snippet (the question says it comes from the
# command line).
res = find_version(ver)
if res:
    # zip(*items) pivots the mapping into two bands: header names, then values.
    for band in zip(*res.items()):
        # Scalars (e.g. the version tag) become one-element lists so every
        # column can be walked row by row; shorter columns are padded with ''.
        columns = [cell if isinstance(cell, list) else [cell] for cell in band]
        for line in zip_longest(*columns, fillvalue=''):
            # An element without text yields None, which cannot be padded with
            # a width spec, so it is printed as an empty field. Joining per
            # cell also avoids assuming exactly three columns.
            print(''.join('{:15}'.format('' if cell is None else cell)
                          for cell in line))
else:
    print('Version not found')
上述代码将按预期的格式输出结果。
我正在从以下格式的 Jive 页面获取 html
table = <table class="test" style="border: 1px solid #c6c6c6;" width="100%"><thead><tr><th style="background-color: #efefef; width: 13%;">Tag</th><th style="background-color: #efefef; width: 23.7965%;">ID</th><th style="background-color: #efefef; width: 59.2035%;">URL</th></tr></thead><tbody><tr><td style="width: 13%;">3.7.3</td><td style="width: 23.7965%;"><p>12345</p><p>232323</p><p>4343454</p><p>5454554</p></td><td style="width: 59.2035%;"><p><a class="jive-link-external-small" href="http://google.com" rel="nofollow">http://google.com</a></p><p><a class="jive-link-external-small" href="http://test123.com" rel="nofollow">http://test123.com</a></p><p><a class="jive-link-external-small" href="http://www.yahoo.com" rel="nofollow">http://www.yahoo.com</a></p><p><a class="jive-link-external-small" href="http://www.test.com" rel="nofollow">http://www.test.com</a></p></td></tr><tr><td style="width: 13%;">3.7.4</td><td style="width: 23.7965%;"><p>456789</p><p>545454</p><p>5454545</p><p>545454</p></td><td style="width: 59.2035%;"><p><a class="jive-link-external-small" href="http://foo.com" rel="nofollow">http://foo.com</a></p><p><a class="jive-link-external-small" href="http://www.yahoo.com" rel="nofollow">http://www.yahoo.com</a></p><p><a class="jive-link-external-small" href="http://svn.com" rel="nofollow">http://svn.com</a></p><p><a class="jive-link-external-small" href="http://test.com" rel="nofollow">http://test.com</a></p></td></tr></tbody></table>
为了将 HTML 转换为字典,我尝试了以下代码
# First attempt from the question: build a header -> value mapping by walking
# the direct children of the parsed <table>.
# NOTE(review): indentation was lost in this listing; the `for` body presumably
# covers the line(s) that follow it -- confirm against the original post.
# Assumes `s` holds the table markup shown above.
table = ET.XML(s)
# Iterating an Element yields its direct children; for the markup above those
# are <thead> and <tbody>, not the individual <tr> rows.
rows = iter(table)
# next(rows) is therefore <thead>, whose only child is a <tr> with no text of
# its own, so this list comes out as [None] rather than the Tag/ID/URL names.
headers = [col.text for col in next(rows)]
# The only remaining child is <tbody>, so `row` is the whole body and each
# `col` below is an entire <tr>, whose .text is likewise None.
for row in rows:
values = [col.text for col in row]
# zip pairs the two lists positionally; the text nested inside the <td>, <p>
# and <a> elements is never reached, which is why the result is not the
# expected table.
out = dict(zip(headers, values))
上述方法没有给出我预期的输出。我希望根据命令行参数的输入,得到如下所示的输出:
Tag ID URL
3.7.3 121211 http://yahoo.com
323243 http://url.com
这可以使用 Python 的 XML 树(ElementTree,即下面代码中的 ET)来实现。
# Decode the JSON body of the HTTP response. `response` is created outside this
# snippet -- presumably the result of a request to the Jive API; confirm.
data = response.json()
# The HTML to parse is read from the payload's content -> text field.
html_doc = data['content']['text']
def find_version(ver, html=None):
    """Return the table row whose first column equals *ver*.

    The document is parsed as XML and every ``<tr>`` under ``<tbody>`` is
    examined in order. A row is expected to have three cells: a scalar
    version tag, a cell whose children each hold one ID, and a cell whose
    children each hold one URL (either an ``<a>`` directly, or a wrapper
    such as ``<p>`` around an ``<a>``).

    The row is matched on its first cell rather than on a hard-coded
    ``'Release'`` header, so it works whatever that column is called
    (``Tag`` in the sample table).

    Args:
        ver: Version string to look for in the first column.
        html: Markup of the table. When omitted, the module-level
            ``html_doc`` is used, as before.

    Returns:
        An ``OrderedDict`` mapping each header text to the matching row's
        value (a string for the first column, lists of strings for the
        other two), or ``None`` when no row matches.

    Raises:
        IndexError: If an examined row has no cells, or the matching row
            has fewer than three cells.
        Whatever ``ET.XML`` raises for markup that is not well-formed XML
        (``ParseError`` with the standard-library ElementTree).
    """
    if html is None:
        # Backward-compatible default: fall back to the module-level document.
        html = html_doc
    table = ET.XML(html)
    headers = [th.text for th in table.findall('.//th')]

    for tr in table.findall('.//tbody/tr'):
        version = tr[0].text
        if version != ver:
            # Skip early so unrelated rows are never unpacked.
            continue

        ids = [cell.text for cell in tr[1]]

        urls = []
        for node in tr[2]:
            if node.tag == 'a' or len(node) == 0:
                # A bare link, or a wrapper holding plain text with no child.
                urls.append(node.text)
            else:
                # A wrapper such as <p>; the link is its first child.
                urls.append(node[0].text)

        return OrderedDict(zip(headers, [version, ids, urls]))

    return None
# --- main ---
# Look up the requested version and print it as a fixed-width table: one line
# of header names followed by as many lines as the longest column needs.
# `ver` is defined outside this snippet (the question says it comes from the
# command line).
res = find_version(ver)
if res:
    # zip(*items) pivots the mapping into two bands: header names, then values.
    for band in zip(*res.items()):
        # Scalars (e.g. the version tag) become one-element lists so every
        # column can be walked row by row; shorter columns are padded with ''.
        columns = [cell if isinstance(cell, list) else [cell] for cell in band]
        for line in zip_longest(*columns, fillvalue=''):
            # An element without text yields None, which cannot be padded with
            # a width spec, so it is printed as an empty field. Joining per
            # cell also avoids assuming exactly three columns.
            print(''.join('{:15}'.format('' if cell is None else cell)
                          for cell in line))
else:
    print('Version not found')
上述代码将按预期的格式输出结果。