从文本中提取有意义的词
Extract meaningful words from text
我想用 C# 的正则表达式函数,从 HTML 页面内容中提取所有有意义的词来做分词(tokenization)。下面是我目前的做法,但结果里仍然有垃圾内容,我该怎么改进?
// Strips markup from `content` and reduces it to space-separated candidate words.
// Step order matters: every step that inserts spaces runs before the final
// whitespace collapse, so the result never contains runs of spaces.

// Replace each HTML tag with a space so adjacent text nodes are not glued together
content = Regex.Replace(content, @"<.*?>", " ");
// Decode HTML character entities
content = HttpUtility.HtmlDecode(content);
// Remove everything but letters, numbers and whitespace characters
content = Regex.Replace(content, @"[^\w\s]", string.Empty);
// Replace digits with a space (hyphens were already removed by the previous step)
content = Regex.Replace(content, @"[\d-]", " ");
// Remove words shorter than 2 or longer than 20 characters.
// The previous pattern \b\w{2,20}\b did the opposite: it deleted every word of
// 2 to 20 characters, i.e. exactly the words that should be kept.
content = Regex.Replace(content, @"\b(?:\w|\w{21,})\b", string.Empty);
// Collapse whitespace last, after all replacements that can introduce extra spaces
content = Regex.Replace(content, @"\s+", " ");
用正则表达式处理 HTML 通常得不偿失。建议改用 HtmlAgilityPack,通过遍历 HTML DOM 来提取文本节点中的内容。您可以使用类似下面这个类的代码,收集 HTML 字符串中的所有文本块。
/// <summary>
/// Collects the text fragments of an HTML document by walking the DOM that
/// HtmlAgilityPack builds from the markup.
/// </summary>
public sealed class HtmlTextExtractor
{
    private readonly string _html;

    /// <summary>Creates an extractor for the given markup.</summary>
    /// <param name="html">The HTML source to extract text from.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="html"/> is null.</exception>
    public HtmlTextExtractor(string html)
    {
        // Fail at construction time instead of later, when the markup is parsed.
        if (html == null)
        {
            throw new System.ArgumentNullException("html");
        }

        _html = html;
    }

    /// <summary>
    /// Parses the markup and returns every non-empty text fragment in document order.
    /// </summary>
    /// <returns>
    /// The entity-decoded, trimmed text of each text node; comments and the contents of
    /// script and style elements are excluded.
    /// </returns>
    public IEnumerable<string> GetTextBlocks()
    {
        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(_html);

        var text = new List<string>();
        WalkNode(htmlDocument.DocumentNode, text);
        return text;
    }

    /// <summary>Returns true for elements whose content is code rather than readable text.</summary>
    private static bool IsNonContentElement(HtmlNode node)
    {
        return string.Equals(node.Name, "script", System.StringComparison.OrdinalIgnoreCase)
            || string.Equals(node.Name, "style", System.StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Recursively visits <paramref name="node"/> and appends the text it contains
    /// to <paramref name="text"/>.
    /// </summary>
    private static void WalkNode(HtmlNode node, List<string> text)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // Comments are not part of the readable content.
                break;

            case HtmlNodeType.Document:
            case HtmlNodeType.Element:
                // Script and style bodies are code, not words; skipping the whole
                // element keeps them out of the extracted text.
                if (node.NodeType == HtmlNodeType.Element && IsNonContentElement(node))
                {
                    break;
                }

                if (node.HasChildNodes)
                {
                    foreach (var childNode in node.ChildNodes)
                    {
                        WalkNode(childNode, text);
                    }
                }

                break;

            case HtmlNodeType.Text:
                var rawText = ((HtmlTextNode)node).Text;
                if (string.IsNullOrEmpty(rawText))
                {
                    break;
                }

                // Decode entities first so that entity-encoded whitespace is trimmed too.
                var cleanText = HtmlEntity.DeEntitize(rawText).Trim();
                if (!string.IsNullOrEmpty(cleanText))
                {
                    text.Add(cleanText);
                }

                break;
        }
    }
}
然后您就可以专注于对提取出的文本进行拆分/分词(splitting/tokenizing)。
// Extract the text blocks of the page, break them into tokens, clean each token
// and keep the distinct words of an acceptable length in sorted order.
var extractor = new HtmlTextExtractor(html);
var textBlocks = extractor.GetTextBlocks();

// A null separator array makes Split break on every white-space character.
// Text blocks may contain tabs and line breaks, so splitting on ' ' alone
// would leave words glued together.
var words = textBlocks
    .SelectMany(textBlock => textBlock.Split((char[])null, StringSplitOptions.RemoveEmptyEntries))
    .ToList();

var distinctWords = words
    // CleanWord replaces digits with spaces, so one token can turn into several words.
    .SelectMany(word => CleanWord(word).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
    // Keep words of 2 to 20 characters, matching the stated requirement
    // ("remove words less than 2 and more than 20 length").
    .Where(word => word.Length >= 2 && word.Length <= 20)
    .Distinct()
    .OrderBy(word => word);
最后清理单个单词或标记。
/// <summary>
/// Normalizes a single token: removes punctuation, replaces digits with a space
/// and collapses whitespace.
/// </summary>
/// <param name="word">The raw token to clean.</param>
/// <returns>
/// The cleaned token without leading or trailing whitespace. The result is empty when
/// nothing but punctuation, digits or whitespace was present, and it contains single
/// spaces where digits separated letters (for example "ab12cd" becomes "ab cd").
/// </returns>
public string CleanWord(string word)
{
    // Remove everything but letters, numbers and whitespace characters
    word = Regex.Replace(word, @"[^\w\s]", string.Empty);

    // Turn every run of digits and/or whitespace into one space. Doing both in a
    // single pass avoids the double spaces that appeared when digits were replaced
    // after whitespace had already been collapsed.
    word = Regex.Replace(word, @"[\d\s]+", " ");

    return word.Trim();
}
显然,这是能想到的最简单的实现。它非常原始:对于不以空格分词的非英语语言效果不好,对标点符号等的处理也不完善,但它应该能让您了解各个组成部分。如果想改进实现,可以看看 Lucene.NET 之类的库来改进分词,可能还有更多可用的库。
我想用 C# 的正则表达式函数,从 HTML 页面内容中提取所有有意义的词来做分词(tokenization)。下面是我目前的做法,但结果里仍然有垃圾内容,我该怎么改进?
// Strips markup from `content` and reduces it to space-separated candidate words.
// Step order matters: every step that inserts spaces runs before the final
// whitespace collapse, so the result never contains runs of spaces.

// Replace each HTML tag with a space so adjacent text nodes are not glued together
content = Regex.Replace(content, @"<.*?>", " ");
// Decode HTML character entities
content = HttpUtility.HtmlDecode(content);
// Remove everything but letters, numbers and whitespace characters
content = Regex.Replace(content, @"[^\w\s]", string.Empty);
// Replace digits with a space (hyphens were already removed by the previous step)
content = Regex.Replace(content, @"[\d-]", " ");
// Remove words shorter than 2 or longer than 20 characters.
// The previous pattern \b\w{2,20}\b did the opposite: it deleted every word of
// 2 to 20 characters, i.e. exactly the words that should be kept.
content = Regex.Replace(content, @"\b(?:\w|\w{21,})\b", string.Empty);
// Collapse whitespace last, after all replacements that can introduce extra spaces
content = Regex.Replace(content, @"\s+", " ");
用正则表达式处理 HTML 通常得不偿失。建议改用 HtmlAgilityPack,通过遍历 HTML DOM 来提取文本节点中的内容。您可以使用类似下面这个类的代码,收集 HTML 字符串中的所有文本块。
/// <summary>
/// Collects the text fragments of an HTML document by walking the DOM that
/// HtmlAgilityPack builds from the markup.
/// </summary>
public sealed class HtmlTextExtractor
{
    private readonly string _html;

    /// <summary>Creates an extractor for the given markup.</summary>
    /// <param name="html">The HTML source to extract text from.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="html"/> is null.</exception>
    public HtmlTextExtractor(string html)
    {
        // Fail at construction time instead of later, when the markup is parsed.
        if (html == null)
        {
            throw new System.ArgumentNullException("html");
        }

        _html = html;
    }

    /// <summary>
    /// Parses the markup and returns every non-empty text fragment in document order.
    /// </summary>
    /// <returns>
    /// The entity-decoded, trimmed text of each text node; comments and the contents of
    /// script and style elements are excluded.
    /// </returns>
    public IEnumerable<string> GetTextBlocks()
    {
        var htmlDocument = new HtmlDocument();
        htmlDocument.LoadHtml(_html);

        var text = new List<string>();
        WalkNode(htmlDocument.DocumentNode, text);
        return text;
    }

    /// <summary>Returns true for elements whose content is code rather than readable text.</summary>
    private static bool IsNonContentElement(HtmlNode node)
    {
        return string.Equals(node.Name, "script", System.StringComparison.OrdinalIgnoreCase)
            || string.Equals(node.Name, "style", System.StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Recursively visits <paramref name="node"/> and appends the text it contains
    /// to <paramref name="text"/>.
    /// </summary>
    private static void WalkNode(HtmlNode node, List<string> text)
    {
        switch (node.NodeType)
        {
            case HtmlNodeType.Comment:
                // Comments are not part of the readable content.
                break;

            case HtmlNodeType.Document:
            case HtmlNodeType.Element:
                // Script and style bodies are code, not words; skipping the whole
                // element keeps them out of the extracted text.
                if (node.NodeType == HtmlNodeType.Element && IsNonContentElement(node))
                {
                    break;
                }

                if (node.HasChildNodes)
                {
                    foreach (var childNode in node.ChildNodes)
                    {
                        WalkNode(childNode, text);
                    }
                }

                break;

            case HtmlNodeType.Text:
                var rawText = ((HtmlTextNode)node).Text;
                if (string.IsNullOrEmpty(rawText))
                {
                    break;
                }

                // Decode entities first so that entity-encoded whitespace is trimmed too.
                var cleanText = HtmlEntity.DeEntitize(rawText).Trim();
                if (!string.IsNullOrEmpty(cleanText))
                {
                    text.Add(cleanText);
                }

                break;
        }
    }
}
然后您就可以专注于对提取出的文本进行拆分/分词(splitting/tokenizing)。
// Extract the text blocks of the page, break them into tokens, clean each token
// and keep the distinct words of an acceptable length in sorted order.
var extractor = new HtmlTextExtractor(html);
var textBlocks = extractor.GetTextBlocks();

// A null separator array makes Split break on every white-space character.
// Text blocks may contain tabs and line breaks, so splitting on ' ' alone
// would leave words glued together.
var words = textBlocks
    .SelectMany(textBlock => textBlock.Split((char[])null, StringSplitOptions.RemoveEmptyEntries))
    .ToList();

var distinctWords = words
    // CleanWord replaces digits with spaces, so one token can turn into several words.
    .SelectMany(word => CleanWord(word).Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
    // Keep words of 2 to 20 characters, matching the stated requirement
    // ("remove words less than 2 and more than 20 length").
    .Where(word => word.Length >= 2 && word.Length <= 20)
    .Distinct()
    .OrderBy(word => word);
最后清理单个单词或标记。
/// <summary>
/// Normalizes a single token: removes punctuation, replaces digits with a space
/// and collapses whitespace.
/// </summary>
/// <param name="word">The raw token to clean.</param>
/// <returns>
/// The cleaned token without leading or trailing whitespace. The result is empty when
/// nothing but punctuation, digits or whitespace was present, and it contains single
/// spaces where digits separated letters (for example "ab12cd" becomes "ab cd").
/// </returns>
public string CleanWord(string word)
{
    // Remove everything but letters, numbers and whitespace characters
    word = Regex.Replace(word, @"[^\w\s]", string.Empty);

    // Turn every run of digits and/or whitespace into one space. Doing both in a
    // single pass avoids the double spaces that appeared when digits were replaced
    // after whitespace had already been collapsed.
    word = Regex.Replace(word, @"[\d\s]+", " ");

    return word.Trim();
}
显然,这是能想到的最简单的实现。它非常原始:对于不以空格分词的非英语语言效果不好,对标点符号等的处理也不完善,但它应该能让您了解各个组成部分。如果想改进实现,可以看看 Lucene.NET 之类的库来改进分词,可能还有更多可用的库。