如何在 Html Agility Pack 中获取重定向后的 URL
How to get redirected URL in Html Agility pack
我想解析给定 URL 所指向页面中的所有链接。我找到了以下方法:
/// <summary>
/// Downloads the page at <paramref name="urlToCrawl"/> and returns the distinct targets of its
/// <c>&lt;a href="..."&gt;</c> elements as absolute URL strings.
/// </summary>
/// <param name="urlToCrawl">Address of the page to download; also used as the base for relative links.</param>
/// <returns>The de-duplicated absolute link targets; an empty list when the page has no anchors with an href.</returns>
public static List<string> ParseLinks(string urlToCrawl)
{
    byte[] data;
    // WebClient is IDisposable; release it as soon as the download is done.
    using (WebClient webClient = new WebClient())
    {
        data = webClient.DownloadData(urlToCrawl);
    }

    // NOTE(review): ASCII decoding replaces every non-ASCII byte with '?'; confirm this is
    // acceptable for pages whose hrefs contain non-ASCII characters.
    string download = Encoding.ASCII.GetString(data);

    HashSet<string> list = new HashSet<string>();
    var doc = new HtmlDocument();
    doc.LoadHtml(download);

    // SelectNodes returns null (not an empty collection) when nothing matches the XPath.
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");
    if (nodes == null)
    {
        return new List<string>();
    }

    foreach (var n in nodes)
    {
        string href = n.Attributes["href"].Value;
        // Relative links are resolved against the URL that was requested.
        list.Add(GetAbsoluteUrlString(urlToCrawl, href));
    }

    return list.ToList();
}
/// <summary>
/// Returns <paramref name="url"/> as an absolute URL string, resolving it against
/// <paramref name="baseUrl"/> when it is relative.
/// </summary>
/// <param name="baseUrl">Absolute URL used as the base for relative values.</param>
/// <param name="url">An absolute or relative URL.</param>
/// <returns>The string form of the resulting absolute URI.</returns>
static string GetAbsoluteUrlString(string baseUrl, string url)
{
    Uri candidate = new Uri(url, UriKind.RelativeOrAbsolute);

    // The base URL is only parsed when the candidate actually needs resolving.
    Uri resolved = candidate.IsAbsoluteUri
        ? candidate
        : new Uri(new Uri(baseUrl), candidate);

    return resolved.ToString();
}
一切都很好,但在某些网站上,链接会先指向它们自己的站点,然后再重定向到目标网站。例如我有这样一个链接:https://www.houzz.com/trk/aHR0cHM6Ly9nb2xkbWFuYXJjaGl0ZWN0LmNvbS8/d76eaa05cc284c9f987d1d30948a6295/ue/MjgxNzk3OTg/84045ba5f6a5f8aa2c25d89b4e18c788 。当我用上面的方法提取链接时,ParseLinks 方法给出了错误的 URL,例如 https://www.houzz.com/contact 、https://www.houzz.com/site-map/ 等。我期望得到的是 https://goldmanarchitect.com/contact 、https://goldmanarchitect.com/site-map/ 等,因为访问上面的链接时,它会重定向到 https://goldmanarchitect.com/ 。那么,如何获取当前 URL 重定向后的页面地址呢?请给我一些解决方案来解决我的问题。
我用几个关键词做了一些研究,找到了解决我的问题的方法。以下方法解决了我的问题:
/// <summary>
/// Follows HTTP redirects manually, issuing one HEAD request per hop with automatic
/// redirection disabled, and returns the last URL that was reached.
/// </summary>
/// <param name="url">The URL to start from.</param>
/// <returns>
/// <paramref name="url"/> unchanged when it is null, empty or whitespace. Otherwise the URL of
/// the first response that is not a followed redirect, or the most recently resolved URL when a
/// request fails with a <see cref="WebException"/> or the hop limit is reached. Returns
/// <c>null</c> when any other exception occurs (for example, a malformed URL).
/// </returns>
public static string GetFinalRedirect(string url)
{
    if (string.IsNullOrWhiteSpace(url))
        return url;

    // Prevents infinite loops; the do/while below issues at most maxRedirCount + 1 requests.
    int maxRedirCount = 8;
    string newUrl = url;
    do
    {
        HttpWebResponse resp = null;
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "HEAD";
            req.AllowAutoRedirect = false;
            resp = (HttpWebResponse)req.GetResponse();
            switch (resp.StatusCode)
            {
                case HttpStatusCode.OK:
                    return newUrl;
                case HttpStatusCode.Redirect:
                case HttpStatusCode.MovedPermanently:
                case HttpStatusCode.RedirectKeepVerb:
                case HttpStatusCode.RedirectMethod:
                case (HttpStatusCode)308: // Permanent Redirect
                {
                    string location = resp.Headers["Location"];
                    if (location == null)
                        return url;

                    // Decide absolute vs. relative by parsing rather than searching for "://":
                    // a relative target such as "/login?next=https://example.com" contains
                    // "://" but must still be resolved against the current URL. File URIs are
                    // excluded because a rooted path can parse as an absolute file URI on some
                    // platforms.
                    Uri absolute;
                    bool isAbsolute = Uri.TryCreate(location, UriKind.Absolute, out absolute)
                        && !absolute.IsFile;
                    newUrl = isAbsolute
                        ? location
                        : new Uri(new Uri(url), location).ToString();
                    break;
                }
                default:
                    return newUrl;
            }
            url = newUrl;
        }
        catch (WebException)
        {
            // Return the last known good URL
            return newUrl;
        }
        catch (Exception)
        {
            // Any other failure (invalid URL, non-HTTP scheme, ...) is reported as null.
            return null;
        }
        finally
        {
            if (resp != null)
                resp.Close();
        }
    } while (maxRedirCount-- > 0);

    return newUrl;
}
我想解析给定 URL 所指向页面中的所有链接。我找到了以下方法:
/// <summary>
/// Downloads the page at <paramref name="urlToCrawl"/> and returns the distinct targets of its
/// <c>&lt;a href="..."&gt;</c> elements as absolute URL strings.
/// </summary>
/// <param name="urlToCrawl">Address of the page to download; also used as the base for relative links.</param>
/// <returns>The de-duplicated absolute link targets; an empty list when the page has no anchors with an href.</returns>
public static List<string> ParseLinks(string urlToCrawl)
{
    byte[] data;
    // WebClient is IDisposable; release it as soon as the download is done.
    using (WebClient webClient = new WebClient())
    {
        data = webClient.DownloadData(urlToCrawl);
    }

    // NOTE(review): ASCII decoding replaces every non-ASCII byte with '?'; confirm this is
    // acceptable for pages whose hrefs contain non-ASCII characters.
    string download = Encoding.ASCII.GetString(data);

    HashSet<string> list = new HashSet<string>();
    var doc = new HtmlDocument();
    doc.LoadHtml(download);

    // SelectNodes returns null (not an empty collection) when nothing matches the XPath.
    HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes("//a[@href]");
    if (nodes == null)
    {
        return new List<string>();
    }

    foreach (var n in nodes)
    {
        string href = n.Attributes["href"].Value;
        // Relative links are resolved against the URL that was requested.
        list.Add(GetAbsoluteUrlString(urlToCrawl, href));
    }

    return list.ToList();
}
/// <summary>
/// Returns <paramref name="url"/> as an absolute URL string, resolving it against
/// <paramref name="baseUrl"/> when it is relative.
/// </summary>
/// <param name="baseUrl">Absolute URL used as the base for relative values.</param>
/// <param name="url">An absolute or relative URL.</param>
/// <returns>The string form of the resulting absolute URI.</returns>
static string GetAbsoluteUrlString(string baseUrl, string url)
{
    Uri candidate = new Uri(url, UriKind.RelativeOrAbsolute);

    // The base URL is only parsed when the candidate actually needs resolving.
    Uri resolved = candidate.IsAbsoluteUri
        ? candidate
        : new Uri(new Uri(baseUrl), candidate);

    return resolved.ToString();
}
一切都很好,但在某些网站上,链接会先指向它们自己的站点,然后再重定向到目标网站。例如我有这样一个链接:https://www.houzz.com/trk/aHR0cHM6Ly9nb2xkbWFuYXJjaGl0ZWN0LmNvbS8/d76eaa05cc284c9f987d1d30948a6295/ue/MjgxNzk3OTg/84045ba5f6a5f8aa2c25d89b4e18c788 。当我用上面的方法提取链接时,ParseLinks 方法给出了错误的 URL,例如 https://www.houzz.com/contact 、https://www.houzz.com/site-map/ 等。我期望得到的是 https://goldmanarchitect.com/contact 、https://goldmanarchitect.com/site-map/ 等,因为访问上面的链接时,它会重定向到 https://goldmanarchitect.com/ 。那么,如何获取当前 URL 重定向后的页面地址呢?请给我一些解决方案来解决我的问题。
我用几个关键词做了一些研究,找到了解决我的问题的方法。以下方法解决了我的问题:
/// <summary>
/// Follows HTTP redirects manually, issuing one HEAD request per hop with automatic
/// redirection disabled, and returns the last URL that was reached.
/// </summary>
/// <param name="url">The URL to start from.</param>
/// <returns>
/// <paramref name="url"/> unchanged when it is null, empty or whitespace. Otherwise the URL of
/// the first response that is not a followed redirect, or the most recently resolved URL when a
/// request fails with a <see cref="WebException"/> or the hop limit is reached. Returns
/// <c>null</c> when any other exception occurs (for example, a malformed URL).
/// </returns>
public static string GetFinalRedirect(string url)
{
    if (string.IsNullOrWhiteSpace(url))
        return url;

    // Prevents infinite loops; the do/while below issues at most maxRedirCount + 1 requests.
    int maxRedirCount = 8;
    string newUrl = url;
    do
    {
        HttpWebResponse resp = null;
        try
        {
            HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
            req.Method = "HEAD";
            req.AllowAutoRedirect = false;
            resp = (HttpWebResponse)req.GetResponse();
            switch (resp.StatusCode)
            {
                case HttpStatusCode.OK:
                    return newUrl;
                case HttpStatusCode.Redirect:
                case HttpStatusCode.MovedPermanently:
                case HttpStatusCode.RedirectKeepVerb:
                case HttpStatusCode.RedirectMethod:
                case (HttpStatusCode)308: // Permanent Redirect
                {
                    string location = resp.Headers["Location"];
                    if (location == null)
                        return url;

                    // Decide absolute vs. relative by parsing rather than searching for "://":
                    // a relative target such as "/login?next=https://example.com" contains
                    // "://" but must still be resolved against the current URL. File URIs are
                    // excluded because a rooted path can parse as an absolute file URI on some
                    // platforms.
                    Uri absolute;
                    bool isAbsolute = Uri.TryCreate(location, UriKind.Absolute, out absolute)
                        && !absolute.IsFile;
                    newUrl = isAbsolute
                        ? location
                        : new Uri(new Uri(url), location).ToString();
                    break;
                }
                default:
                    return newUrl;
            }
            url = newUrl;
        }
        catch (WebException)
        {
            // Return the last known good URL
            return newUrl;
        }
        catch (Exception)
        {
            // Any other failure (invalid URL, non-HTTP scheme, ...) is reported as null.
            return null;
        }
        finally
        {
            if (resp != null)
                resp.Close();
        }
    } while (maxRedirCount-- > 0);

    return newUrl;
}