AngleSharp - HTML 属性到字典
AngleSharp - HTML attributes to Dictionary
我想解析网页中的 schema.org 微数据(microdata)HTML,并将其转换为 JSON 供内部使用。为此我试用了 AngleSharp,基本得到了我需要的内容,但输出格式还有一些问题。请对比下面我的实际输出与预期输出:
// Sample markup: an Organization item containing a nested PostalAddress item
// and two Person items that share the "alumni" property name.
var markup = @"<div itemscope itemtype='http://schema.org/Organization'>
<span itemprop='name'>Google.org (GOOG)</span>
<div itemprop='address' itemscope itemtype='http://schema.org/PostalAddress'>
Main address:
<span itemprop='streetAddress'>38 avenue de l'Opera</span>
<span itemprop='postalCode'>F-75002</span>
<span itemprop='addressLocality'>Paris, France</span>
</div>
Tel:<span itemprop='telephone'>( 33 1) 42 68 53 00 </span>,
Fax:<span itemprop='faxNumber'>( 33 1) 42 68 53 01 </span>,
E-mail: <span itemprop='email'>secretariat(at)google.org</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>Jack Dan</span>
</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>John Smith</span>
</span>
</div>";

// Parse the markup and snapshot every element that carries an itemtype attribute.
var parsedDocument = new HtmlParser().ParseDocument(markup);
var typedElements = parsedDocument.All
    .Where(element => element.Attributes.Any(attribute => attribute.Name == "itemtype"))
    .ToList();

// Only the direct children of each typed element are inspected, and everything
// lands in one flat map; the first value seen for a property name is kept.
var propertyElements = typedElements
    .SelectMany(element => element.Children)
    .Where(candidate => candidate.Attributes.Any(attribute => attribute.Name == "itemprop"));

var properties = new Dictionary<string, object>();
foreach (var propertyElement in propertyElements)
{
    var propertyName = propertyElement.GetAttribute("itemprop");
    if (properties.ContainsKey(propertyName))
    {
        continue;
    }

    properties.Add(propertyName, propertyElement.TextContent);
}

// Emit the flat map as indented JSON.
var serialized = JsonConvert.SerializeObject(properties, Newtonsoft.Json.Formatting.Indented);
serialized.Dump();
我的输出
{
"name": "Google.org (GOOG)",
"address": "\n Main address:\n 38 avenue de l'Opera\n F-75002\n Paris, France\n ",
"telephone": "( 33 1) 42 68 53 00 ",
"faxNumber": "( 33 1) 42 68 53 01 ",
"email": "secretariat(at)google.org",
"alumni": "\n Jack Dan\n ",
"streetAddress": "38 avenue de l'Opera",
"postalCode": "F-75002",
"addressLocality": "Paris, France"
}
预期
{
"itemtype": "http://schema.org/Organization",
"name": "Google.org (GOOG)",
"address": {
"itemtype": "PostalAddress",
"addressLocality": "Paris, France",
"postalCode": "F-75002",
"streetAddress": "38 avenue de l'Opera"
},
"email": "secretariat(at)google.org",
"faxNumber": "( 33 1) 42 68 53 01",
"telephone": "( 33 1) 42 68 53 00",
"alumni": [
{
"itemtype": "http://schema.org/Person",
"name": "Jack Dan"
},
{
"itemtype": "http://schema.org/Person",
"name": "John Smith"
}
]
}
`.Children` 并不会遍历所有子级以及更深层的后代元素来获取全部属性,也不会把这些属性附加到它们所属的父级作用域(scope)上。为了得到预期格式的输出,我应该修改什么、在哪里修改?有什么建议吗?
Lars代码输出:
{
"name": "Google.org (GOOG)",
"address": "\n\t Main address:\n\t 38 avenue de l'Opera\n\t F-75002\n\t Paris, France\n\t ",
"telephone": "( 33 1) 42 68 53 00 ",
"faxNumber": "( 33 1) 42 68 53 01 ",
"email": "secretariat(at)google.org",
"alumni": "\n\t Jack Dan\n\t "
}
{
"streetAddress": "38 avenue de l'Opera",
"postalCode": "F-75002",
"addressLocality": "Paris, France"
}
{
"name": "Jack Dan"
}
{
"name": "John Smith"
}
我不会在这里做所有的编码,但是这个想法可以是递归的方式:
/// <summary>
/// Serializes the <c>itemprop</c> properties found among the direct children of
/// <paramref name="scope"/> to indented JSON, descending into nested item scopes.
/// </summary>
/// <param name="scope">Element whose direct children are scanned for <c>itemprop</c>.</param>
/// <returns>Indented JSON text describing the collected properties.</returns>
string GetJsonStringFromItem(ScopeType scope) {
    return JsonConvert.SerializeObject(CollectItemProperties(scope), Newtonsoft.Json.Formatting.Indented);
}

// Builds the property map for a single scope. Nested scopes are returned as nested
// maps (not as pre-serialized JSON strings) so the final serialization produces
// nested JSON objects instead of escaped string literals.
Dictionary<string, object> CollectItemProperties(ScopeType scope) {
    var localDict = new Dictionary<string, object>();
    var childrens = scope.Children.Where(x => x.Attributes.Any(a => a.Name == "itemprop"));
    foreach (var child in childrens)
    {
        string prop = child.GetAttribute("itemprop");
        if (localDict.ContainsKey(prop))
        {
            // The first value seen for a property name is kept; later ones are ignored.
            continue;
        }

        // Declared as object: an implicitly typed local cannot be left uninitialized,
        // and the value is either a nested map or plain text.
        object propValue;
        if (child.HasAttribute("itemscope"))
        {
            // this is the recursion: do the same with the nested scope
            propValue = CollectItemProperties(child);
        }
        else
        {
            propValue = child.TextContent;
        }

        localDict.Add(prop, propValue);
    }
    return localDict;
}
然后在最外层的循环中调用这段代码。
以下是我的解决方法。
/// <summary>
/// Loads the sample markup into an AngleSharp document, converts the first element
/// matching <c>[itemscope]</c> with <c>Parse</c>, and dumps the result as indented JSON.
/// </summary>
async Task Main()
{
    var markup = @"<div itemscope itemtype='http://schema.org/Organization'>
<span itemprop='name'>Google.org (GOOG)</span>
<div itemprop='address' itemscope itemtype='http://schema.org/PostalAddress'>
Main address:
<span itemprop='streetAddress'>38 avenue de l'Opera</span>
<span itemprop='postalCode'>F-75002</span>
<span itemprop='addressLocality'>Paris, France</span>
</div>
Tel:<span itemprop='telephone'>( 33 1) 42 68 53 00 </span>,
Fax:<span itemprop='faxNumber'>( 33 1) 42 68 53 01 </span>,
E-mail: <span itemprop='email'>secretariat(at)google.org</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>Jack Dan</span>
</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>John Smith</span>
</span>
</div>";

    // Feed the markup to a fresh browsing context as a virtual response.
    var browsingContext = BrowsingContext.New();
    var loadedDocument = await browsingContext.OpenAsync(response => response.Content(markup));

    // Only the first itemscope element in document order is converted.
    var rootItem = loadedDocument.QuerySelector("[itemscope]");
    var parsedItem = Parse(rootItem);

    var serialized = JsonConvert.SerializeObject(parsedItem, Newtonsoft.Json.Formatting.Indented);
    serialized.Dump();
}
/// <summary>
/// Walks the children of <paramref name="element"/> and records every
/// <c>itemprop</c> found into <paramref name="result"/>.
/// </summary>
/// <param name="element">Element whose children are scanned.</param>
/// <param name="result">
/// Map that receives the properties. A property name seen more than once ends up
/// holding a list of all its values, in encounter order.
/// </param>
void Populate(IElement element, Dictionary<string, object> result)
{
    foreach (var node in element.Children)
    {
        var name = node.GetAttribute("itemprop");
        if (name == null)
        {
            // Not a property itself: keep searching below it, still writing
            // into the same map.
            Populate(node, result);
            continue;
        }

        // A property that opens its own item scope becomes a nested object;
        // any other property contributes its text content.
        object current;
        if (node.GetAttribute("itemscope") != null)
        {
            current = Parse(node);
        }
        else
        {
            current = node.TextContent;
        }

        if (!result.TryGetValue(name, out var existing))
        {
            // First occurrence: store the value directly.
            result[name] = current;
        }
        else if (existing is List<object> bucket)
        {
            // Third or later occurrence: the list already exists.
            bucket.Add(current);
        }
        else
        {
            // Second occurrence: promote the single value to a list.
            result[name] = new List<object> { existing, current };
        }
    }
}
/// <summary>
/// Converts one item element into a map holding its <c>itemtype</c> attribute value
/// followed by the properties gathered by <c>Populate</c>.
/// </summary>
/// <param name="element">Element to read the <c>itemtype</c> attribute and properties from.</param>
/// <returns>The populated <c>Dictionary&lt;string, object&gt;</c>, typed as <c>Object</c>.</returns>
Object Parse(IElement element)
{
    var item = new Dictionary<string, object>
    {
        // Stored first so it is the first key in the map.
        ["itemtype"] = element.GetAttribute("itemtype"),
    };

    Populate(element, item);
    return item;
}
不确定我是否做对了一切,但我的输出如下所示:
{
"itemtype": "http://schema.org/Organization",
"name": "Google.org (GOOG)",
"address": {
"itemtype": "http://schema.org/PostalAddress",
"streetAddress": "38 avenue de l'Opera",
"postalCode": "F-75002",
"addressLocality": "Paris, France"
},
"telephone": "( 33 1) 42 68 53 00 ",
"faxNumber": "( 33 1) 42 68 53 01 ",
"email": "secretariat(at)google.org",
"alumni": [
{
"itemtype": "http://schema.org/Person",
"name": "Jack Dan"
},
{
"itemtype": "http://schema.org/Person",
"name": "John Smith"
}
]
}
与预期输出相符。`Parse` 的代码可能不够优雅,但至少它只需一次遍历就能得到结果(不需要额外的 `QuerySelector` 调用)。
我不确定预期的输入是什么样的,但你很可能需要添加更多保护措施来应对不规范的 HTML / 输入(例如,检查在设置了 `itemscope` 的情况下 `itemtype` 是否确实存在)。
希望对您有所帮助!
我想解析网页中的 schema.org 微数据(microdata)HTML,并将其转换为 JSON 供内部使用。为此我试用了 AngleSharp,基本得到了我需要的内容,但输出格式还有一些问题。请对比下面我的实际输出与预期输出:
// Sample markup: an Organization item containing a nested PostalAddress item
// and two Person items that share the "alumni" property name.
var markup = @"<div itemscope itemtype='http://schema.org/Organization'>
<span itemprop='name'>Google.org (GOOG)</span>
<div itemprop='address' itemscope itemtype='http://schema.org/PostalAddress'>
Main address:
<span itemprop='streetAddress'>38 avenue de l'Opera</span>
<span itemprop='postalCode'>F-75002</span>
<span itemprop='addressLocality'>Paris, France</span>
</div>
Tel:<span itemprop='telephone'>( 33 1) 42 68 53 00 </span>,
Fax:<span itemprop='faxNumber'>( 33 1) 42 68 53 01 </span>,
E-mail: <span itemprop='email'>secretariat(at)google.org</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>Jack Dan</span>
</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>John Smith</span>
</span>
</div>";

// Parse the markup and snapshot every element that carries an itemtype attribute.
var parsedDocument = new HtmlParser().ParseDocument(markup);
var typedElements = parsedDocument.All
    .Where(element => element.Attributes.Any(attribute => attribute.Name == "itemtype"))
    .ToList();

// Only the direct children of each typed element are inspected, and everything
// lands in one flat map; the first value seen for a property name is kept.
var propertyElements = typedElements
    .SelectMany(element => element.Children)
    .Where(candidate => candidate.Attributes.Any(attribute => attribute.Name == "itemprop"));

var properties = new Dictionary<string, object>();
foreach (var propertyElement in propertyElements)
{
    var propertyName = propertyElement.GetAttribute("itemprop");
    if (properties.ContainsKey(propertyName))
    {
        continue;
    }

    properties.Add(propertyName, propertyElement.TextContent);
}

// Emit the flat map as indented JSON.
var serialized = JsonConvert.SerializeObject(properties, Newtonsoft.Json.Formatting.Indented);
serialized.Dump();
我的输出
{
"name": "Google.org (GOOG)",
"address": "\n Main address:\n 38 avenue de l'Opera\n F-75002\n Paris, France\n ",
"telephone": "( 33 1) 42 68 53 00 ",
"faxNumber": "( 33 1) 42 68 53 01 ",
"email": "secretariat(at)google.org",
"alumni": "\n Jack Dan\n ",
"streetAddress": "38 avenue de l'Opera",
"postalCode": "F-75002",
"addressLocality": "Paris, France"
}
预期
{
"itemtype": "http://schema.org/Organization",
"name": "Google.org (GOOG)",
"address": {
"itemtype": "PostalAddress",
"addressLocality": "Paris, France",
"postalCode": "F-75002",
"streetAddress": "38 avenue de l'Opera"
},
"email": "secretariat(at)google.org",
"faxNumber": "( 33 1) 42 68 53 01",
"telephone": "( 33 1) 42 68 53 00",
"alumni": [
{
"itemtype": "http://schema.org/Person",
"name": "Jack Dan"
},
{
"itemtype": "http://schema.org/Person",
"name": "John Smith"
}
]
}
`.Children` 并不会遍历所有子级以及更深层的后代元素来获取全部属性,也不会把这些属性附加到它们所属的父级作用域(scope)上。为了得到预期格式的输出,我应该修改什么、在哪里修改?有什么建议吗?
Lars代码输出:
{
"name": "Google.org (GOOG)",
"address": "\n\t Main address:\n\t 38 avenue de l'Opera\n\t F-75002\n\t Paris, France\n\t ",
"telephone": "( 33 1) 42 68 53 00 ",
"faxNumber": "( 33 1) 42 68 53 01 ",
"email": "secretariat(at)google.org",
"alumni": "\n\t Jack Dan\n\t "
}
{
"streetAddress": "38 avenue de l'Opera",
"postalCode": "F-75002",
"addressLocality": "Paris, France"
}
{
"name": "Jack Dan"
}
{
"name": "John Smith"
}
我不会在这里做所有的编码,但是这个想法可以是递归的方式:
/// <summary>
/// Serializes the <c>itemprop</c> properties found among the direct children of
/// <paramref name="scope"/> to indented JSON, descending into nested item scopes.
/// </summary>
/// <param name="scope">Element whose direct children are scanned for <c>itemprop</c>.</param>
/// <returns>Indented JSON text describing the collected properties.</returns>
string GetJsonStringFromItem(ScopeType scope) {
    return JsonConvert.SerializeObject(CollectItemProperties(scope), Newtonsoft.Json.Formatting.Indented);
}

// Builds the property map for a single scope. Nested scopes are returned as nested
// maps (not as pre-serialized JSON strings) so the final serialization produces
// nested JSON objects instead of escaped string literals.
Dictionary<string, object> CollectItemProperties(ScopeType scope) {
    var localDict = new Dictionary<string, object>();
    var childrens = scope.Children.Where(x => x.Attributes.Any(a => a.Name == "itemprop"));
    foreach (var child in childrens)
    {
        string prop = child.GetAttribute("itemprop");
        if (localDict.ContainsKey(prop))
        {
            // The first value seen for a property name is kept; later ones are ignored.
            continue;
        }

        // Declared as object: an implicitly typed local cannot be left uninitialized,
        // and the value is either a nested map or plain text.
        object propValue;
        if (child.HasAttribute("itemscope"))
        {
            // this is the recursion: do the same with the nested scope
            propValue = CollectItemProperties(child);
        }
        else
        {
            propValue = child.TextContent;
        }

        localDict.Add(prop, propValue);
    }
    return localDict;
}
然后在最外层的循环中调用这段代码。
以下是我的解决方法。
/// <summary>
/// Loads the sample markup into an AngleSharp document, converts the first element
/// matching <c>[itemscope]</c> with <c>Parse</c>, and dumps the result as indented JSON.
/// </summary>
async Task Main()
{
    var markup = @"<div itemscope itemtype='http://schema.org/Organization'>
<span itemprop='name'>Google.org (GOOG)</span>
<div itemprop='address' itemscope itemtype='http://schema.org/PostalAddress'>
Main address:
<span itemprop='streetAddress'>38 avenue de l'Opera</span>
<span itemprop='postalCode'>F-75002</span>
<span itemprop='addressLocality'>Paris, France</span>
</div>
Tel:<span itemprop='telephone'>( 33 1) 42 68 53 00 </span>,
Fax:<span itemprop='faxNumber'>( 33 1) 42 68 53 01 </span>,
E-mail: <span itemprop='email'>secretariat(at)google.org</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>Jack Dan</span>
</span>
<span itemprop='alumni' itemscope itemtype='http://schema.org/Person'>
<span itemprop='name'>John Smith</span>
</span>
</div>";

    // Feed the markup to a fresh browsing context as a virtual response.
    var browsingContext = BrowsingContext.New();
    var loadedDocument = await browsingContext.OpenAsync(response => response.Content(markup));

    // Only the first itemscope element in document order is converted.
    var rootItem = loadedDocument.QuerySelector("[itemscope]");
    var parsedItem = Parse(rootItem);

    var serialized = JsonConvert.SerializeObject(parsedItem, Newtonsoft.Json.Formatting.Indented);
    serialized.Dump();
}
/// <summary>
/// Walks the children of <paramref name="element"/> and records every
/// <c>itemprop</c> found into <paramref name="result"/>.
/// </summary>
/// <param name="element">Element whose children are scanned.</param>
/// <param name="result">
/// Map that receives the properties. A property name seen more than once ends up
/// holding a list of all its values, in encounter order.
/// </param>
void Populate(IElement element, Dictionary<string, object> result)
{
    foreach (var node in element.Children)
    {
        var name = node.GetAttribute("itemprop");
        if (name == null)
        {
            // Not a property itself: keep searching below it, still writing
            // into the same map.
            Populate(node, result);
            continue;
        }

        // A property that opens its own item scope becomes a nested object;
        // any other property contributes its text content.
        object current;
        if (node.GetAttribute("itemscope") != null)
        {
            current = Parse(node);
        }
        else
        {
            current = node.TextContent;
        }

        if (!result.TryGetValue(name, out var existing))
        {
            // First occurrence: store the value directly.
            result[name] = current;
        }
        else if (existing is List<object> bucket)
        {
            // Third or later occurrence: the list already exists.
            bucket.Add(current);
        }
        else
        {
            // Second occurrence: promote the single value to a list.
            result[name] = new List<object> { existing, current };
        }
    }
}
/// <summary>
/// Converts one item element into a map holding its <c>itemtype</c> attribute value
/// followed by the properties gathered by <c>Populate</c>.
/// </summary>
/// <param name="element">Element to read the <c>itemtype</c> attribute and properties from.</param>
/// <returns>The populated <c>Dictionary&lt;string, object&gt;</c>, typed as <c>Object</c>.</returns>
Object Parse(IElement element)
{
    var item = new Dictionary<string, object>
    {
        // Stored first so it is the first key in the map.
        ["itemtype"] = element.GetAttribute("itemtype"),
    };

    Populate(element, item);
    return item;
}
不确定我是否做对了一切,但我的输出如下所示:
{
"itemtype": "http://schema.org/Organization",
"name": "Google.org (GOOG)",
"address": {
"itemtype": "http://schema.org/PostalAddress",
"streetAddress": "38 avenue de l'Opera",
"postalCode": "F-75002",
"addressLocality": "Paris, France"
},
"telephone": "( 33 1) 42 68 53 00 ",
"faxNumber": "( 33 1) 42 68 53 01 ",
"email": "secretariat(at)google.org",
"alumni": [
{
"itemtype": "http://schema.org/Person",
"name": "Jack Dan"
},
{
"itemtype": "http://schema.org/Person",
"name": "John Smith"
}
]
}
与预期输出相符。`Parse` 的代码可能不够优雅,但至少它只需一次遍历就能得到结果(不需要额外的 `QuerySelector` 调用)。
我不确定预期的输入是什么样的,但你很可能需要添加更多保护措施来应对不规范的 HTML / 输入(例如,检查在设置了 `itemscope` 的情况下 `itemtype` 是否确实存在)。
希望对您有所帮助!