All HTML nodes to XPath
I am trying to convert all HTML nodes into XPath expressions.
Here is a sample input.
Based on the HTML, I am looking for the XPath of every child node.
<html>
  <head>
    <title>
      The Dormouse's story
    </title>
  </head>
  <body>
    <p class="title">
      <b>
        The Dormouse's story
      </b>
    </p>
    <span>Hello</span>
  </body>
</html>
The output I want
html
html/head
html/head/title
html/body
html/body/p
What I currently have
{
    "name": "[document]",
    "attr": {},
    "children": [
        {
            "name": "html",
            "attr": {},
            "children": [
                {
                    "name": "head",
                    "attr": {},
                    "children": [
                        {
                            "name": "title",
                            "attr": {},
                            "children": []
                        }
                    ]
                },
                {
                    "name": "body",
                    "attr": {},
                    "children": [
                        {
                            "name": "p",
                            "attr": {
                                "class": [
                                    "title"
                                ]
                            },
                            "children": [
                                {
                                    "name": "b",
                                    "attr": {},
                                    "children": []
                                }
                            ]
                        },
                        {
                            "name": "span",
                            "attr": {},
                            "children": []
                        }
                    ]
                }
            ]
        }
    ]
}
Code
try:
    import os
    import lxml.etree
    from bs4 import BeautifulSoup
    import json
    import etree
except Exception as e:
    pass

def traverse(soup):
    if soup.name is not None:
        dom_dictionary = {}
        dom_dictionary['name'] = soup.name
        dom_dictionary['attr'] = soup.attrs
        dom_dictionary['children'] = [
            traverse(child)
            for child in soup.children if child.name is not None
        ]
        return dom_dictionary

with open("html.txt", "r") as f:
    data = f.read()

soup = BeautifulSoup(data, 'html.parser')
JsonDom = traverse(soup)
print(json.dumps(JsonDom, indent=4))
Any help would be great.
If you could also point me in the right direction, that would be a big help.
Any ideas or suggestions are welcome.
I did look into lxml, bs4, and Selenium, but unfortunately had no luck.
html_doc = """
<html>
<head>
<title>
The Dormouse's story
</title>
</head>
<body>
<p class="title">
<b>
The Dormouse's story
</b>
</p>
<span>Hello</span>
</body>
</html>
"""
from bs4 import BeautifulSoup

def generate(soup, cur=""):
    # For each direct child tag, yield the accumulated path, then recurse
    # with the child's name appended to the prefix.
    for tag in soup.find_all(recursive=False):
        yield cur + tag.name
        yield from generate(tag, cur=cur + tag.name + "/")

soup = BeautifulSoup(html_doc, "html.parser")  # you can also use "lxml" or "html5lib"

for t in generate(soup):
    print(t)
Prints:
html
html/head
html/head/title
html/body
html/body/p
html/body/p/b
html/body/span
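As a side note, the same paths can also be derived from the nested dictionary that the question's traverse() function builds, so no second pass over the soup is needed. A minimal sketch, assuming JsonDom is the dictionary shown under "What I currently have":

def paths_from_dom(node, prefix=""):
    # Walk the {"name": ..., "attr": ..., "children": [...]} structure
    # returned by traverse() and yield one slash-separated path per element.
    for child in node.get("children", []):
        path = prefix + child["name"]
        yield path
        yield from paths_from_dom(child, path + "/")

# The top-level node is "[document]", so starting there yields
# "html", "html/head", "html/head/title", "html/body", and so on.
for p in paths_from_dom(JsonDom):
    print(p)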
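If genuine XPath expressions are needed rather than plain tag paths (a leading slash, plus a positional predicate wherever an element has same-named siblings), lxml, which the question already mentions, can generate them with getpath(). A minimal sketch, reusing the html_doc string defined above:

import lxml.html

root = lxml.html.fromstring(html_doc)  # html_doc is the sample markup above
tree = root.getroottree()
for element in root.iter():            # the root element plus every descendant
    print(tree.getpath(element))       # e.g. /html/body/p, /html/body/span

For this sample the printed paths should match the list above except for the leading slash, since no element here has a same-named sibling.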