C# WebBrowser 会话到 WebRequest

C# WebBrowser Session to WebRequest

我正在尝试编写一个从 Google 抓取 URL 的程序:当 Google 要求输入验证码时,程序会打开一个窗体让用户输入验证码,然后继续运行。在遇到验证码之前,程序工作正常。窗体会打开并允许用户输入验证码,WebBrowser 也能正常加载下一页,但会话不会转移到 WebRequest,于是程序陷入不断打开 WebBrowser 窗体、要求用户输入验证码的循环。我曾尝试将 cookie 从 WebBrowser 复制到 WebRequest 的 CookieContainer,但无济于事。

            // Copy the script-visible cookies of the WebBrowser document into the shared
            // CookieContainer. Document.Cookie is a single "name=value; name=value" string.
            // NOTE: Document.Cookie does not include HttpOnly cookies, so a session that
            // relies on one will not be carried over by this loop; reading those requires
            // the WinINet InternetGetCookieEx API with the INTERNET_COOKIE_HTTPONLY flag.
            string documentCookies = f2.webForm.Document != null ? f2.webForm.Document.Cookie : null;
            if (!string.IsNullOrEmpty(documentCookies))
            {
                foreach (string cookie in documentCookies.Split(';'))
                {
                    // Split on the FIRST '=' only: cookie values may themselves contain '='.
                    int separator = cookie.IndexOf('=');
                    if (separator <= 0)
                    {
                        // Empty segment (e.g. after a trailing ';') or a segment without a
                        // name: nothing usable to add, and Substring/new Cookie would throw.
                        continue;
                    }

                    string name = cookie.Substring(0, separator).Trim();
                    string value = cookie.Substring(separator + 1).Trim();
                    if (name.Length == 0)
                    {
                        continue;
                    }

                    string path = "/";
                    string domain = "ipv4.google.com";
                    cookieJar.Add(new Cookie(name, value, path, domain));
                }
            }

这是完整的代码。请注意代码写得比较粗糙,所以请不要太苛刻 :P

    // Scrapes result URLs from Google search pages with HttpWebRequest. When a request
    // fails with a WebException, a Form2 dialog is opened on the failing response's URI
    // (the captcha page, per the inline comment below) and the browser's cookies are
    // then copied into the shared cookie container for the following requests.

    // Shared cookie container: every request created in the loop uses this same jar.
    CookieContainer cookieJar = new CookieContainer();
    // Request the search results at start offsets 0, 10, 20 and 30.
    for (int i = 0; i <= 30; i += 10)
    {
        string url = "https://www.google.com/search?newwindow=1&q=inurl:test.php" + "&start=" + i;
        HttpWebRequest webRequest = (HttpWebRequest)WebRequest.Create(url);
        webRequest.CookieContainer = cookieJar;
        // Pause one second before each request.
        Thread.Sleep(1000);
        try
        {
            webRequest.UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246";
            //webRequest.CookieContainer = new CookieContainer();
            webRequest.ProtocolVersion = HttpVersion.Version11;
            webRequest.Method = "GET";
            webRequest.KeepAlive = false;
            webRequest.ContentType = "text/html";
            webRequest.Timeout = 20000;
            //webRequest.UseDefaultCredentials = true;
            // NOTE(review): the response, stream and reader created here are never
            // disposed in the visible code - consider wrapping them in using blocks.
            Stream objStream = webRequest.GetResponse().GetResponseStream();
            StreamReader streamReader = new StreamReader(objStream);
            String sLine = "";
            List<string> lLines = new List<string>();
            List<string> lUrls = new List<string>();
            string[] findhttp;
            int endIndex = 0;
            Thread.Sleep(1000);
            // Second GetResponse() call on the same request, used only to log the status
            // code. NOTE(review): per the HttpWebRequest documentation this should return
            // the same response object rather than re-issuing the request - confirm.
            HttpWebResponse webResponse = (HttpWebResponse)webRequest.GetResponse();
            boxUrl.AppendText("test: " + webResponse.StatusCode + "\n");

            // Read the response line by line, echoing each line to boxDorks and storing
            // it in lLines. The first iteration appends the initial empty string before
            // anything has been read; the loop ends when ReadLine() returns null.
            while (sLine != null)
            {
                boxDorks.AppendText(sLine);
                lLines.Add(sLine);
                sLine = streamReader.ReadLine();
            }

            // Lets loop through and get all the URLs
            foreach (string s in lLines)
            {
                // Split the line on every occurrence of href="http; each fragment after
                // the first one begins right after that marker.
                findhttp = s.Split(new string[] { "href=\"http" }, StringSplitOptions.None);

                // Parse URL
                foreach (string find in findhttp)
                {
                    // Only lines containing the marker are considered. NOTE(review): "> 0"
                    // skips a line where the marker is at index 0, and the fragment that
                    // precedes the first marker is examined like any other fragment.
                    if (s.IndexOf("href=\"http") > 0)
                    {
                        endIndex = find.IndexOf("\" onmousedown"); // Find position of quote

                        if (endIndex > 0 && find.IndexOf("webcache.googleusercontent.com") < 0 &&
                                            find.IndexOf("support.google.com") < 0 &&
                                            find.IndexOf("robots.txt") < 0 &&
                                            find.IndexOf("translate.google.com") < 0) // we don't want these!
                        {
                            // Restore the "http" prefix that the split removed.
                            lUrls.Add("http" + find.Substring(0, endIndex));
                        }
                    }
                }
            }

            // Output URLs
            foreach (string s in lUrls)
            {
                boxUrl.AppendText("test: " + s + "\n");
            }
        }
        // Every WebException is handled as if it were the captcha page; the status code
        // is not inspected.
        catch (WebException we)
        {
            boxUrl.AppendText("exception: " + we);
            //using (var sr = new StreamReader(we.Response.GetResponseStream()))
            // {
            //var html = sr.ReadToEnd();
              //}
            // Open form to show google captcha
            // NOTE(review): we.Response may be null when no response was received (for
            // example on a timeout), which would make this line throw - confirm and guard.
            Form2 f2 = new Form2(we.Response.ResponseUri.ToString());//workaround to get webform.Navigate to work properly
            f2.ShowDialog();

            // Copy cookies from webbrowser to webrequest cookies
            // Document.Cookie is split on ';' and each segment on its first '='.
            // NOTE(review): a null Document.Cookie, or a segment without '=' (such as an
            // empty one after a trailing ';'), would make this loop throw.
            // NOTE(review): Document.Cookie does not expose HttpOnly cookies, so a session
            // that depends on one is not transferred by this loop.
            // NOTE(review): cookies are stored for "ipv4.google.com" while the requests
            // above target www.google.com - confirm the jar sends them to that host.
            foreach (string cookie in f2.webForm.Document.Cookie.Split(';'))
            {
                string name = cookie.Split('=')[0];
                string value = cookie.Substring(name.Length + 1);
                string path = "/";
                string domain = "ipv4.google.com";
                //webRequest.CookieContainer.Add(new Cookie(name.Trim(), value.Trim(), path, domain));
                cookieJar.Add(new Cookie(name.Trim(), value.Trim(), path, domain));
            }

提前致谢!

经过大量搜索,我找到了解决方案。事实证明,如果您按照我发布的方式从 WebBrowser 获取 cookie,它不会返回 HTTP-Only cookie。这是我找到的解决方法,归功于 Yoni Couriel! https://ycouriel.blogspot.com/2010/07/webbrowser-and-httpwebrequest-cookies.html