C#:将 WebBrowser 会话转移到 WebRequest
C# WebBrowser Session to WebRequest
我正在尝试编写一个从 Google 搜索结果中抓取 URL 的程序:当 Google 要求输入验证码时,程序会打开一个窗体让用户输入验证码,然后继续运行。在出现验证码之前,程序工作正常。窗体可以正常打开并允许用户输入验证码,WebBrowser 控件也能正常加载下一页,但会话没有转移到 WebRequest,结果程序陷入循环:不断打开 WebBrowser 窗体要求用户输入验证码。我曾尝试将 cookie 从 WebBrowser 复制到 WebRequest 的 CookieContainer,但无济于事。
// Copy the cookies exposed by the WebBrowser document into the shared CookieContainer.
// Document.Cookie is a single "name=value; name2=value2" string; it does not include
// HttpOnly cookies, so cookies of that kind are not transferred by this loop.
string rawCookies = f2.webForm.Document != null ? f2.webForm.Document.Cookie : null;
if (!string.IsNullOrEmpty(rawCookies))
{
    foreach (string cookie in rawCookies.Split(';'))
    {
        // Split on the FIRST '=' only: cookie values may themselves contain '='.
        int separator = cookie.IndexOf('=');
        if (separator < 0)
        {
            continue; // empty fragment (e.g. after a trailing ';') or a token without a value
        }

        string name = cookie.Substring(0, separator).Trim();
        if (name.Length == 0)
        {
            continue; // the Cookie constructor rejects an empty name
        }

        string value = cookie.Substring(separator + 1).Trim();
        string path = "/";
        // NOTE(review): the domain is hard-coded; confirm it matches the host whose
        // cookies the browser document actually holds after the captcha is solved.
        string domain = "ipv4.google.com";
        cookieJar.Add(new Cookie(name, value, path, domain));
    }
}
这是完整的代码。请注意,它写得比较粗糙,请不要见怪 :P
// Scrapes Google result pages (start = 0, 10, 20, 30) and lists the result URLs in boxUrl.
// When a request fails with a WebException that carries a response (e.g. the captcha
// page), the response URL is opened in Form2 so the user can solve the captcha, and the
// browser's cookies are then copied into the CookieContainer shared by every request.
CookieContainer cookieJar = new CookieContainer();
for (int i = 0; i <= 30; i += 10)
{
    string url = "https://www.google.com/search?newwindow=1&q=inurl:test.php" + "&start=" + i;
    HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
    webRequest.CookieContainer = cookieJar;
    Thread.Sleep(1000); // throttle between result pages
    try
    {
        webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246";
        webRequest.ProtocolVersion = HttpVersion.Version11;
        webRequest.Method = "GET";
        webRequest.KeepAlive = false;
        webRequest.ContentType = "text/html";
        webRequest.Timeout = 20000;

        List<string> lLines = new List<string>();
        List<string> lUrls = new List<string>();

        // Second throttle kept from the original, but moved ahead of the request so
        // that no open connection is held idle while sleeping.
        Thread.Sleep(1000);

        // Fetch the response once and release response, stream and reader when done.
        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        using (Stream objStream = webResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(objStream))
        {
            boxUrl.AppendText("test: " + webResponse.StatusCode + "\n");

            // Get Google's web search and store each line in "lLines"
            string sLine;
            while ((sLine = streamReader.ReadLine()) != null)
            {
                boxDorks.AppendText(sLine);
                lLines.Add(sLine);
            }
        }

        // Loop through the lines and collect all the URLs
        foreach (string s in lLines)
        {
            string[] findhttp = s.Split(new string[] { "href=\"http" }, StringSplitOptions.None);

            // Fragment 0 is the text BEFORE the first href="http, so it can never be a
            // URL; every later fragment starts with the remainder of one URL.
            for (int f = 1; f < findhttp.Length; f++)
            {
                string find = findhttp[f];
                int endIndex = find.IndexOf("\" onmousedown", StringComparison.Ordinal); // closing quote of the href
                if (endIndex > 0 &&
                    find.IndexOf("webcache.googleusercontent.com", StringComparison.Ordinal) < 0 &&
                    find.IndexOf("support.google.com", StringComparison.Ordinal) < 0 &&
                    find.IndexOf("robots.txt", StringComparison.Ordinal) < 0 &&
                    find.IndexOf("translate.google.com", StringComparison.Ordinal) < 0) // we don't want these!
                {
                    lUrls.Add("http" + find.Substring(0, endIndex));
                }
            }
        }

        // Output URLs
        foreach (string s in lUrls)
        {
            boxUrl.AppendText("test: " + s + "\n");
        }
    }
    catch (WebException we)
    {
        boxUrl.AppendText("exception: " + we);

        // Failures such as timeouts or DNS errors carry no response, so there is no
        // captcha page to show; the error is already logged, move on to the next page.
        if (we.Response == null)
        {
            continue;
        }

        string captchaUrl = we.Response.ResponseUri.ToString();
        we.Response.Close(); // release the connection held by the error response

        // Open form to show google captcha
        Form2 f2 = new Form2(captchaUrl); // workaround to get webform.Navigate to work properly
        f2.ShowDialog();

        // Copy cookies from the WebBrowser into the shared CookieContainer.
        // Document.Cookie does not include HttpOnly cookies, so cookies of that kind
        // are not transferred by this loop.
        string rawCookies = f2.webForm.Document != null ? f2.webForm.Document.Cookie : null;
        if (!string.IsNullOrEmpty(rawCookies))
        {
            foreach (string cookie in rawCookies.Split(';'))
            {
                // Split on the FIRST '=' only: cookie values may themselves contain '='.
                int separator = cookie.IndexOf('=');
                if (separator < 0)
                {
                    continue; // empty fragment or a token without a value
                }

                string name = cookie.Substring(0, separator).Trim();
                if (name.Length == 0)
                {
                    continue; // the Cookie constructor rejects an empty name
                }

                string value = cookie.Substring(separator + 1).Trim();
                string path = "/";
                // NOTE(review): the domain is hard-coded; confirm it matches the host
                // whose cookies the browser document holds after the captcha is solved.
                string domain = "ipv4.google.com";
                cookieJar.Add(new Cookie(name, value, path, domain));
            }
        }
    }
}
提前致谢!
经过大量搜索,我找到了解决方案。事实证明,如果按照我上面贴出的方式(通过 Document.Cookie)从 WebBrowser 获取 cookie,是拿不到 HttpOnly cookie 的。下面是我找到的解决方法,感谢 Yoni Couriel!
https://ycouriel.blogspot.com/2010/07/webbrowser-and-httpwebrequest-cookies.html
我正在尝试编写一个从 Google 搜索结果中抓取 URL 的程序:当 Google 要求输入验证码时,程序会打开一个窗体让用户输入验证码,然后继续运行。在出现验证码之前,程序工作正常。窗体可以正常打开并允许用户输入验证码,WebBrowser 控件也能正常加载下一页,但会话没有转移到 WebRequest,结果程序陷入循环:不断打开 WebBrowser 窗体要求用户输入验证码。我曾尝试将 cookie 从 WebBrowser 复制到 WebRequest 的 CookieContainer,但无济于事。
// Copy the cookies exposed by the WebBrowser document into the shared CookieContainer.
// Document.Cookie is a single "name=value; name2=value2" string; it does not include
// HttpOnly cookies, so cookies of that kind are not transferred by this loop.
string rawCookies = f2.webForm.Document != null ? f2.webForm.Document.Cookie : null;
if (!string.IsNullOrEmpty(rawCookies))
{
    foreach (string cookie in rawCookies.Split(';'))
    {
        // Split on the FIRST '=' only: cookie values may themselves contain '='.
        int separator = cookie.IndexOf('=');
        if (separator < 0)
        {
            continue; // empty fragment (e.g. after a trailing ';') or a token without a value
        }

        string name = cookie.Substring(0, separator).Trim();
        if (name.Length == 0)
        {
            continue; // the Cookie constructor rejects an empty name
        }

        string value = cookie.Substring(separator + 1).Trim();
        string path = "/";
        // NOTE(review): the domain is hard-coded; confirm it matches the host whose
        // cookies the browser document actually holds after the captcha is solved.
        string domain = "ipv4.google.com";
        cookieJar.Add(new Cookie(name, value, path, domain));
    }
}
这是完整的代码。请注意,它写得比较粗糙,请不要见怪 :P
// Scrapes Google result pages (start = 0, 10, 20, 30) and lists the result URLs in boxUrl.
// When a request fails with a WebException that carries a response (e.g. the captcha
// page), the response URL is opened in Form2 so the user can solve the captcha, and the
// browser's cookies are then copied into the CookieContainer shared by every request.
CookieContainer cookieJar = new CookieContainer();
for (int i = 0; i <= 30; i += 10)
{
    string url = "https://www.google.com/search?newwindow=1&q=inurl:test.php" + "&start=" + i;
    HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
    webRequest.CookieContainer = cookieJar;
    Thread.Sleep(1000); // throttle between result pages
    try
    {
        webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246";
        webRequest.ProtocolVersion = HttpVersion.Version11;
        webRequest.Method = "GET";
        webRequest.KeepAlive = false;
        webRequest.ContentType = "text/html";
        webRequest.Timeout = 20000;

        List<string> lLines = new List<string>();
        List<string> lUrls = new List<string>();

        // Second throttle kept from the original, but moved ahead of the request so
        // that no open connection is held idle while sleeping.
        Thread.Sleep(1000);

        // Fetch the response once and release response, stream and reader when done.
        using (HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse())
        using (Stream objStream = webResponse.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(objStream))
        {
            boxUrl.AppendText("test: " + webResponse.StatusCode + "\n");

            // Get Google's web search and store each line in "lLines"
            string sLine;
            while ((sLine = streamReader.ReadLine()) != null)
            {
                boxDorks.AppendText(sLine);
                lLines.Add(sLine);
            }
        }

        // Loop through the lines and collect all the URLs
        foreach (string s in lLines)
        {
            string[] findhttp = s.Split(new string[] { "href=\"http" }, StringSplitOptions.None);

            // Fragment 0 is the text BEFORE the first href="http, so it can never be a
            // URL; every later fragment starts with the remainder of one URL.
            for (int f = 1; f < findhttp.Length; f++)
            {
                string find = findhttp[f];
                int endIndex = find.IndexOf("\" onmousedown", StringComparison.Ordinal); // closing quote of the href
                if (endIndex > 0 &&
                    find.IndexOf("webcache.googleusercontent.com", StringComparison.Ordinal) < 0 &&
                    find.IndexOf("support.google.com", StringComparison.Ordinal) < 0 &&
                    find.IndexOf("robots.txt", StringComparison.Ordinal) < 0 &&
                    find.IndexOf("translate.google.com", StringComparison.Ordinal) < 0) // we don't want these!
                {
                    lUrls.Add("http" + find.Substring(0, endIndex));
                }
            }
        }

        // Output URLs
        foreach (string s in lUrls)
        {
            boxUrl.AppendText("test: " + s + "\n");
        }
    }
    catch (WebException we)
    {
        boxUrl.AppendText("exception: " + we);

        // Failures such as timeouts or DNS errors carry no response, so there is no
        // captcha page to show; the error is already logged, move on to the next page.
        if (we.Response == null)
        {
            continue;
        }

        string captchaUrl = we.Response.ResponseUri.ToString();
        we.Response.Close(); // release the connection held by the error response

        // Open form to show google captcha
        Form2 f2 = new Form2(captchaUrl); // workaround to get webform.Navigate to work properly
        f2.ShowDialog();

        // Copy cookies from the WebBrowser into the shared CookieContainer.
        // Document.Cookie does not include HttpOnly cookies, so cookies of that kind
        // are not transferred by this loop.
        string rawCookies = f2.webForm.Document != null ? f2.webForm.Document.Cookie : null;
        if (!string.IsNullOrEmpty(rawCookies))
        {
            foreach (string cookie in rawCookies.Split(';'))
            {
                // Split on the FIRST '=' only: cookie values may themselves contain '='.
                int separator = cookie.IndexOf('=');
                if (separator < 0)
                {
                    continue; // empty fragment or a token without a value
                }

                string name = cookie.Substring(0, separator).Trim();
                if (name.Length == 0)
                {
                    continue; // the Cookie constructor rejects an empty name
                }

                string value = cookie.Substring(separator + 1).Trim();
                string path = "/";
                // NOTE(review): the domain is hard-coded; confirm it matches the host
                // whose cookies the browser document holds after the captcha is solved.
                string domain = "ipv4.google.com";
                cookieJar.Add(new Cookie(name, value, path, domain));
            }
        }
    }
}
提前致谢!
经过大量搜索,我找到了解决方案。事实证明,如果按照我上面贴出的方式(通过 Document.Cookie)从 WebBrowser 获取 cookie,是拿不到 HttpOnly cookie 的。下面是我找到的解决方法,感谢 Yoni Couriel! https://ycouriel.blogspot.com/2010/07/webbrowser-and-httpwebrequest-cookies.html