link 使用 HtmlAgilityPack 和 c# 提取
link extraction using HtmlAgilityPack and c#
我想提取 google 结果 links
我的代码有效,它确实提取了 links,但这些 links 不是我期望提取的。
我的程序会在 "a href" 标签内提取 links,但搜索结果中的所有 links 都不合适 links,广告 link,googles link 也包括在内
我该怎么办?
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.ServiceModel.Syndication;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;
namespace Search
{
public partial class Form1 : Form
{
// load snippet
HtmlAgilityPack.HtmlDocument htmlSnippet = new HtmlAgilityPack.HtmlDocument();
public Form1()
{
InitializeComponent();
}
private void btn1_Click(object sender, EventArgs e)
{
listBox1.Items.Clear();
StringBuilder sb = new StringBuilder();
byte[] ResultsBuffer = new byte[8192];
string SearchResults = "http://google.com/search?q=" + txtKeyWords.Text.Trim();
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream resStream = response.GetResponseStream();
string tempString = null;
int count = 0;
do
{
count = resStream.Read(ResultsBuffer, 0, ResultsBuffer.Length);
if (count != 0)
{
tempString = Encoding.ASCII.GetString(ResultsBuffer, 0, count);
sb.Append(tempString);
}
}
while (count > 0);
string sbb = sb.ToString();
HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
html.OptionOutputAsXml = true;
html.LoadHtml(sbb);
HtmlNode doc = html.DocumentNode;
foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
{
//HtmlAttribute att = link.Attributes["href"];
string hrefValue = link.GetAttributeValue("href", string.Empty);
// if ()
{
int index = hrefValue.IndexOf("&");
if (index > 0)
{
hrefValue = hrefValue.Substring(0, index);
listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
}
}
}
}
}
}
如果我想使用 "a href" 标签,我必须在 If 中添加一些条件
但我不知道我应该在这里使用什么条件:
if ()
我在某个地方读到过提取引用标签而不是 ahref 标签
有人可以帮忙吗?
要获取包含在 cite
元素中的链接,只需访问它们的内部文本,例如:
HtmlWeb w = new HtmlWeb();
var hd = w.Load("http://www.google.com/search?q=veverke");
var cites = hd.DocumentNode.SelectNodes("//cite");
foreach (var cite in cites)
Console.WriteLine(cite.InnerText);
我想提取 google 结果 links
我的代码有效,它确实提取了 links,但这些 links 不是我期望提取的。
我的程序会在 "a href" 标签内提取 links,但搜索结果中的所有 links 都不合适 links,广告 link,googles link 也包括在内
我该怎么办?
using HtmlAgilityPack;
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.ServiceModel.Syndication;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
using System.Xml;
namespace Search
{
public partial class Form1 : Form
{
// load snippet
HtmlAgilityPack.HtmlDocument htmlSnippet = new HtmlAgilityPack.HtmlDocument();
public Form1()
{
InitializeComponent();
}
private void btn1_Click(object sender, EventArgs e)
{
listBox1.Items.Clear();
StringBuilder sb = new StringBuilder();
byte[] ResultsBuffer = new byte[8192];
string SearchResults = "http://google.com/search?q=" + txtKeyWords.Text.Trim();
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(SearchResults);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream resStream = response.GetResponseStream();
string tempString = null;
int count = 0;
do
{
count = resStream.Read(ResultsBuffer, 0, ResultsBuffer.Length);
if (count != 0)
{
tempString = Encoding.ASCII.GetString(ResultsBuffer, 0, count);
sb.Append(tempString);
}
}
while (count > 0);
string sbb = sb.ToString();
HtmlAgilityPack.HtmlDocument html = new HtmlAgilityPack.HtmlDocument();
html.OptionOutputAsXml = true;
html.LoadHtml(sbb);
HtmlNode doc = html.DocumentNode;
foreach (HtmlNode link in doc.SelectNodes("//a[@href]"))
{
//HtmlAttribute att = link.Attributes["href"];
string hrefValue = link.GetAttributeValue("href", string.Empty);
// if ()
{
int index = hrefValue.IndexOf("&");
if (index > 0)
{
hrefValue = hrefValue.Substring(0, index);
listBox1.Items.Add(hrefValue.Replace("/url?q=", ""));
}
}
}
}
}
}
如果我想使用 "a href" 标签,我必须在 If 中添加一些条件
但我不知道我应该在这里使用什么条件:
if ()
我在某个地方读到过提取引用标签而不是 ahref 标签
有人可以帮忙吗?
要获取包含在 cite
元素中的链接,只需访问它们的内部文本,例如:
HtmlWeb w = new HtmlWeb();
var hd = w.Load("http://www.google.com/search?q=veverke");
var cites = hd.DocumentNode.SelectNodes("//cite");
foreach (var cite in cites)
Console.WriteLine(cite.InnerText);