即使设置了 headers,某些网站也会拒绝 HttpClient 请求
Some websites rejecting HttpClient requests even with headers set
我已经编写了一些代码来检查我数据库中的所有网站是否仍在托管和在线。
问题是这些网站中的一些似乎有机器人保护,每当我尝试通过 HttpClient 请求时,它们都会引发错误而不是显示页面。
我看到其他类似的问题建议添加浏览器发送的那些 headers,所以我已经这样做了,但这没有帮助。相同的站点仍然拒绝 HttpClient 的请求,但当我在浏览器中查看它们时一切正常。
我的代码有没有做错或者我需要一些额外的步骤?
这是我的代码:
/// <summary>
/// Requests <paramref name="url"/> and stores on the RootDomains row found by
/// <paramref name="id"/> whether the response body was at least 50 characters long.
/// </summary>
/// <param name="url">Address passed to HttpClient.GetAsync.</param>
/// <param name="id">Key passed to db.RootDomains.Find.</param>
public static async Task CheckSite(string url, int id)
{
try
{
using(var db = new PlaceDBContext())
using (HttpClient client = new HttpClient(new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
}))
// The request is sent and awaited here, before the block below adds any headers.
using (HttpResponseMessage response = await client.GetAsync(url))
using (HttpContent content = response.Content)
{
// NOTE(review): these four headers are added only after GetAsync above has
// already completed, so the request went out without them.
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate");
client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
var rd = db.RootDomains.Find(id);
string result = await content.ReadAsStringAsync();
// A body of 50 or more characters is treated as "site is online".
if (result != null && result.Length >= 50)
{
Console.WriteLine("fine");
rd.LastCheckOnline = true;
}
else
{
Console.WriteLine("There was empty or short result");
rd.LastCheckOnline = false;
}
db.SaveChanges();
// semaphore is declared outside this method; it is released once per call on this path.
semaphore.Release();
}
}
catch(Exception ex)
{
// Any exception from the request or the database work above marks the site as offline.
Console.WriteLine(ex.Message);
using(var db = new PlaceDBContext())
{
var rd = db.RootDomains.Find(id);
rd.LastCheckOnline = false;
db.SaveChanges();
// NOTE(review): if the statements above throw, this release is never reached.
semaphore.Release();
}
}
}
请在发送请求之前设置 headers。您现在是在收到响应之后才设置它们的。
/// <summary>
/// Requests <paramref name="url"/> with browser-like headers and stores on the
/// RootDomains row found by <paramref name="id"/> whether the site answered.
/// </summary>
/// <param name="url">Address passed to HttpClient.GetAsync.</param>
/// <param name="id">Key passed to db.RootDomains.Find.</param>
/// <remarks>
/// A response body of at least 50 characters is recorded as online; a shorter body
/// or any exception is recorded as offline. The externally declared semaphore is
/// released exactly once per call, whichever path is taken.
/// </remarks>
public static async Task CheckSite(string url, int id) {
    try {
        using (var db = new PlaceDBContext())
        using (var client = new HttpClient(new HttpClientHandler() {
            AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
        })) {
            // Every default header has to be in place before GetAsync is called;
            // a header added afterwards is not part of the request already sent.
            client.DefaultRequestHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml");
            client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate");
            client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0");
            client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
            using (var response = await client.GetAsync(url))
            using (var content = response.Content) {
                var rd = db.RootDomains.Find(id);
                string result = await content.ReadAsStringAsync();
                // A body of 50 or more characters is treated as "site is online".
                if (result != null && result.Length >= 50) {
                    Console.WriteLine("fine");
                    rd.LastCheckOnline = true;
                } else {
                    Console.WriteLine("There was empty or short result");
                    rd.LastCheckOnline = false;
                }
                db.SaveChanges();
            }
        }
    } catch (Exception ex) {
        // Any failure of the request or the database work above marks the site as offline.
        Console.WriteLine(ex.Message);
        using (var db = new PlaceDBContext()) {
            var rd = db.RootDomains.Find(id);
            rd.LastCheckOnline = false;
            db.SaveChanges();
        }
    } finally {
        // Released here rather than in each branch so the slot is returned even
        // when the offline update in the catch block itself throws.
        semaphore.Release();
    }
}
我已经编写了一些代码来检查我数据库中的所有网站是否仍在托管和在线。
问题是这些网站中的一些似乎有机器人保护,每当我尝试通过 HttpClient 请求时,它们都会引发错误而不是显示页面。
我看到其他类似的问题建议添加浏览器发送的那些 headers,所以我已经这样做了,但这没有帮助。相同的站点仍然拒绝 HttpClient 的请求,但当我在浏览器中查看它们时一切正常。
我的代码有没有做错或者我需要一些额外的步骤?
这是我的代码:
/// <summary>
/// Requests <paramref name="url"/> and stores on the RootDomains row found by
/// <paramref name="id"/> whether the response body was at least 50 characters long.
/// </summary>
/// <param name="url">Address passed to HttpClient.GetAsync.</param>
/// <param name="id">Key passed to db.RootDomains.Find.</param>
public static async Task CheckSite(string url, int id)
{
try
{
using(var db = new PlaceDBContext())
using (HttpClient client = new HttpClient(new HttpClientHandler()
{
AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
}))
// The request is sent and awaited here, before the block below adds any headers.
using (HttpResponseMessage response = await client.GetAsync(url))
using (HttpContent content = response.Content)
{
// NOTE(review): these four headers are added only after GetAsync above has
// already completed, so the request went out without them.
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate");
client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0");
client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
var rd = db.RootDomains.Find(id);
string result = await content.ReadAsStringAsync();
// A body of 50 or more characters is treated as "site is online".
if (result != null && result.Length >= 50)
{
Console.WriteLine("fine");
rd.LastCheckOnline = true;
}
else
{
Console.WriteLine("There was empty or short result");
rd.LastCheckOnline = false;
}
db.SaveChanges();
// semaphore is declared outside this method; it is released once per call on this path.
semaphore.Release();
}
}
catch(Exception ex)
{
// Any exception from the request or the database work above marks the site as offline.
Console.WriteLine(ex.Message);
using(var db = new PlaceDBContext())
{
var rd = db.RootDomains.Find(id);
rd.LastCheckOnline = false;
db.SaveChanges();
// NOTE(review): if the statements above throw, this release is never reached.
semaphore.Release();
}
}
}
请在发送请求之前设置 headers。您现在是在收到响应之后才设置它们的。
/// <summary>
/// Requests <paramref name="url"/> with browser-like headers and stores on the
/// RootDomains row found by <paramref name="id"/> whether the site answered.
/// </summary>
/// <param name="url">Address passed to HttpClient.GetAsync.</param>
/// <param name="id">Key passed to db.RootDomains.Find.</param>
/// <remarks>
/// A response body of at least 50 characters is recorded as online; a shorter body
/// or any exception is recorded as offline. The externally declared semaphore is
/// released exactly once per call, whichever path is taken.
/// </remarks>
public static async Task CheckSite(string url, int id) {
    try {
        using (var db = new PlaceDBContext())
        using (var client = new HttpClient(new HttpClientHandler() {
            AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip
        })) {
            // Every default header has to be in place before GetAsync is called;
            // a header added afterwards is not part of the request already sent.
            client.DefaultRequestHeaders.TryAddWithoutValidation("Accept", "text/html,application/xhtml+xml,application/xml");
            client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Encoding", "gzip, deflate");
            client.DefaultRequestHeaders.TryAddWithoutValidation("User-Agent", "Mozilla/5.0 (Windows NT 6.2; WOW64; rv:19.0) Gecko/20100101 Firefox/19.0");
            client.DefaultRequestHeaders.TryAddWithoutValidation("Accept-Charset", "ISO-8859-1");
            using (var response = await client.GetAsync(url))
            using (var content = response.Content) {
                var rd = db.RootDomains.Find(id);
                string result = await content.ReadAsStringAsync();
                // A body of 50 or more characters is treated as "site is online".
                if (result != null && result.Length >= 50) {
                    Console.WriteLine("fine");
                    rd.LastCheckOnline = true;
                } else {
                    Console.WriteLine("There was empty or short result");
                    rd.LastCheckOnline = false;
                }
                db.SaveChanges();
            }
        }
    } catch (Exception ex) {
        // Any failure of the request or the database work above marks the site as offline.
        Console.WriteLine(ex.Message);
        using (var db = new PlaceDBContext()) {
            var rd = db.RootDomains.Find(id);
            rd.LastCheckOnline = false;
            db.SaveChanges();
        }
    } finally {
        // Released here rather than in each branch so the slot is returned even
        // when the offline update in the catch block itself throws.
        semaphore.Release();
    }
}