C# 使用 Web 浏览器控件读取 ajax 数据时遇到问题
C# Facing problem to read ajax data using web browser control
这是一个网站 https://www.wsj.com/news/types/newsplus,其数据在运行时通过 ajax 加载。我必须读取所有文章的标题文本。从早上开始,我尝试了很多代码,但都无法工作,因为数据是通过 ajax 加载的。
这是我试过的代码。
// Load the page through the WebBrowser-based helper (300 is passed as its
// AjaxTimeLoadTimeOut argument), then hand the document to ParseData.
HtmlDocument hd = GetHtmlAjax(new Uri("https://www.wsj.com/news/types/newsplus"), 300, true);
ParseData(hd);
// Headlines are searched for as <h3> elements carrying the exact class string below.
HtmlElementCollection main_element = hd.GetElementsByTagName("h3");
if (main_element != null)
{
    foreach (HtmlElement element in main_element)
    {
        // "className" is the DOM property name exposed for the HTML class attribute.
        string cls = element.GetAttribute("className");
        if (String.IsNullOrEmpty(cls) || !cls.Equals("WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E"))
            continue;
        // Anchors have to be looked up by TAG name. The previous
        // Children.GetElementsByName("a") matched elements whose name attribute
        // is "a", so it never returned the headline links.
        HtmlElementCollection childDivs = element.GetElementsByTagName("a");
        foreach (HtmlElement childElement in childDivs)
        {
            //grab links and other stuff same way
            string linktxt = childElement.InnerText;
        }
    }
}
// Browser instance reused between calls when loadurl is false.
WebBrowser wb = null;
/// <summary>
/// Navigates a <see cref="WebBrowser"/> to <paramref name="uri"/> (when
/// <paramref name="loadurl"/> is true), waits until the control reports
/// <see cref="WebBrowserReadyState.Complete"/>, keeps the message loop running for
/// a further <paramref name="AjaxTimeLoadTimeOut"/> milliseconds, and returns the document.
/// </summary>
/// <param name="uri">Page to load; only used when <paramref name="loadurl"/> is true.</param>
/// <param name="AjaxTimeLoadTimeOut">Extra time, in milliseconds, to keep pumping messages after the page is complete.</param>
/// <param name="loadurl">True to create a new browser and navigate; false to re-read the existing browser.</param>
/// <returns>The value of <c>wb.Document</c> after the wait.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="loadurl"/> is false and no browser has been created yet.
/// </exception>
public HtmlDocument GetHtmlAjax(Uri uri, int AjaxTimeLoadTimeOut, bool loadurl)
{
    if (loadurl)
    {
        wb = new WebBrowser();
        wb.ScriptErrorsSuppressed = true;
        wb.Navigate(uri);
    }
    if (wb == null)
    {
        // Previously this path failed with a NullReferenceException on wb.ReadyState.
        throw new InvalidOperationException("GetHtmlAjax must be called with loadurl = true before it can be called with loadurl = false.");
    }
    while (wb.ReadyState != WebBrowserReadyState.Complete)
    {
        Application.DoEvents();
    }
    // A single Thread.Sleep(AjaxTimeLoadTimeOut) here blocked the UI thread, which is
    // the thread the WebBrowser control needs in order to process its messages. While
    // it slept, nothing on the page could progress. Keep pumping messages until the
    // deadline instead, yielding briefly each round so the loop does not spin a core.
    DateTime deadline = DateTime.UtcNow.AddMilliseconds(AjaxTimeLoadTimeOut);
    while (DateTime.UtcNow < deadline)
    {
        Application.DoEvents();
        Thread.Sleep(1);
    }
    Application.DoEvents();
    return wb.Document;
}
我参考了许多链接来处理这个问题,但都失败了。以下是我参考过的链接。
htmlagilitypack and dynamic content issue
Retrieve ajax/JavaScript return results from webpage in c#
How to extract dynamic ajax content from a web page
请告诉我要在我的代码中更改什么以解析标题 link 文本。谢谢
下面贴出来自 @aepot 的代码:
// One HttpClient shared by every request made through GetJsonPageAsync.
private static HttpClient client = new HttpClient();
/// <summary>
/// Issues a GET request to <paramref name="url"/>, throws if the status code is not a
/// success code, and deserializes the response body from JSON into <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type handed to <c>JsonConvert.DeserializeObject</c>.</typeparam>
/// <param name="url">Address to request.</param>
/// <returns>The deserialized response body.</returns>
private static async Task<T> GetJsonPageAsync<T>(string url)
{
    HttpResponseMessage reply = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead);
    try
    {
        reply.EnsureSuccessStatusCode();
        string json = await reply.Content.ReadAsStringAsync();
        T parsed = JsonConvert.DeserializeObject<T>(json);
        return parsed;
    }
    finally
    {
        // Same clean-up the using statement performed.
        if (reply != null)
        {
            reply.Dispose();
        }
    }
}
/// <summary>
/// Click handler: downloads the NewsPlus collection, then downloads every article in
/// it concurrently and appends each headline, followed by a separator line, to txtData.
/// </summary>
private async void button1_Click(object sender, EventArgs e)
{
    try
    {
        // The id parameter is URL-encoded JSON (%22 is a double quote). The inner quotes
        // around NewsPlus are JSON-escaped with a backslash, which must be written as
        // "\\" in a C# string literal. A bare "\%" is an invalid escape sequence and
        // does not compile.
        dynamic newsList = await GetJsonPageAsync<dynamic>("https://www.wsj.com/news/types/newsplus?id={%22query%22:%22type:=\\%22NewsPlus\\%22%22,%22db%22:%22wsjie,blog,interactivemedia%22}&type=search_collection");
        List<Task<dynamic>> tasks = new List<Task<dynamic>>();
        foreach (dynamic item in newsList.collection)
        {
            // Plain concatenation instead of string interpolation ($"...").
            string strUrl = "https://www.wsj.com/news/types/newsplus?id=" + item.id + "&type=article";
            tasks.Add(GetJsonPageAsync<dynamic>(strUrl));
            //tasks.Add(GetJsonPageAsync<dynamic>($"https://www.wsj.com/news/types/newsplus?id={item.id}&type=article"));
        }
        // All article requests were started above; wait for every one of them.
        dynamic[] newsDataList = await Task.WhenAll(tasks);
        foreach (dynamic newItem in newsDataList)
        {
            //Console.WriteLine(newItem.data.headline);
            //Console.WriteLine(newItem.data.url);
            txtData.Text += newItem.data.headline + System.Environment.NewLine;
            // A stray ';' after new string('-', 200) split this statement in two and
            // left "+ System.Environment.NewLine;" as an invalid statement.
            txtData.Text += new string('-', 200) + System.Environment.NewLine;
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
AJAX 是简单的 GET 或 POST 请求。
使用浏览器自带的开发者工具,我发现该页面发送简单的 GET 请求并接收 JSON 数据。JSON 可以反序列化,也可以通过 reader 逐步读取。
对于 JSON 解析,我使用了 Newtonsoft.Json NuGet 包。
这是基于 WinForms 应用程序的简单示例。
/// <summary>
/// WinForms form that, on a button click, downloads the WSJ NewsPlus collection as
/// JSON, downloads every article in it, and lists the headlines in textBox1.
/// </summary>
public partial class Form1 : Form
{
    // One HttpClient shared by every request made by this form.
    private static readonly HttpClient client = new HttpClient();

    /// <summary>
    /// Issues a GET request to <paramref name="url"/>, throws if the status code is not a
    /// success code, and deserializes the response body from JSON into <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Type handed to <c>JsonConvert.DeserializeObject</c>.</typeparam>
    /// <param name="url">Address to request.</param>
    /// <returns>The deserialized response body.</returns>
    private async Task<T> GetJsonPageAsync<T>(string url)
    {
        using (HttpResponseMessage response = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead))
        {
            response.EnsureSuccessStatusCode();
            string text = await response.Content.ReadAsStringAsync();
            return JsonConvert.DeserializeObject<T>(text);
        }
    }

    /// <summary>
    /// Initializes the form and configures the process-wide ServicePointManager settings.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
        ServicePointManager.DefaultConnectionLimit = 10; // to make it faster
        // NOTE(review): presumably the ".NET Framework 4.5.2" fix, forcing TLS 1.2 - confirm.
        ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
    }

    /// <summary>
    /// Click handler: downloads the collection, then every article concurrently, and
    /// appends each headline plus a separator line to textBox1. On any exception the
    /// text box content is replaced by the exception message.
    /// </summary>
    private async void button1_Click(object sender, EventArgs e)
    {
        try
        {
            // The id parameter is URL-encoded JSON (%22 is a double quote). The inner
            // quotes around NewsPlus are JSON-escaped with a backslash, which must be
            // written as "\\" in a C# string literal. A bare "\%" is an invalid escape
            // sequence and does not compile.
            dynamic newsList = await GetJsonPageAsync<dynamic>("https://www.wsj.com/news/types/newsplus?id={%22query%22:%22type:=\\%22NewsPlus\\%22%22,%22db%22:%22wsjie,blog,interactivemedia%22}&type=search_collection");
            List<Task<dynamic>> tasks = new List<Task<dynamic>>();
            foreach (dynamic item in newsList.collection)
            {
                tasks.Add(GetJsonPageAsync<dynamic>($"https://www.wsj.com/news/types/newsplus?id={item.id}&type=article"));
            }
            // All article requests were started above; wait for every one of them.
            dynamic[] newsDataList = await Task.WhenAll(tasks);
            foreach (dynamic newItem in newsDataList)
            {
                textBox1.Text += newItem.data.headline + Environment.NewLine;
                textBox1.Text += new string('-', 200) + Environment.NewLine;
            }
        }
        catch (Exception ex)
        {
            textBox1.Text = ex.Message;
        }
    }
}
更新: 为 .NET Framework 4.5.2 添加了修复
这是一个网站 https://www.wsj.com/news/types/newsplus,其数据在运行时通过 ajax 加载。我必须读取所有文章的标题文本。从早上开始,我尝试了很多代码,但都无法工作,因为数据是通过 ajax 加载的。
这是我试过的代码。
// Load the page through the WebBrowser-based helper (300 is passed as its
// AjaxTimeLoadTimeOut argument), then hand the document to ParseData.
HtmlDocument hd = GetHtmlAjax(new Uri("https://www.wsj.com/news/types/newsplus"), 300, true);
ParseData(hd);
// Headlines are searched for as <h3> elements carrying the exact class string below.
HtmlElementCollection main_element = hd.GetElementsByTagName("h3");
if (main_element != null)
{
    foreach (HtmlElement element in main_element)
    {
        // "className" is the DOM property name exposed for the HTML class attribute.
        string cls = element.GetAttribute("className");
        if (String.IsNullOrEmpty(cls) || !cls.Equals("WSJTheme--headline--unZqjb45 undefined WSJTheme--heading-3--2z_phq5h typography--serif-display--ZXeuhS5E"))
            continue;
        // Anchors have to be looked up by TAG name. The previous
        // Children.GetElementsByName("a") matched elements whose name attribute
        // is "a", so it never returned the headline links.
        HtmlElementCollection childDivs = element.GetElementsByTagName("a");
        foreach (HtmlElement childElement in childDivs)
        {
            //grab links and other stuff same way
            string linktxt = childElement.InnerText;
        }
    }
}
// Browser instance reused between calls when loadurl is false.
WebBrowser wb = null;
/// <summary>
/// Navigates a <see cref="WebBrowser"/> to <paramref name="uri"/> (when
/// <paramref name="loadurl"/> is true), waits until the control reports
/// <see cref="WebBrowserReadyState.Complete"/>, keeps the message loop running for
/// a further <paramref name="AjaxTimeLoadTimeOut"/> milliseconds, and returns the document.
/// </summary>
/// <param name="uri">Page to load; only used when <paramref name="loadurl"/> is true.</param>
/// <param name="AjaxTimeLoadTimeOut">Extra time, in milliseconds, to keep pumping messages after the page is complete.</param>
/// <param name="loadurl">True to create a new browser and navigate; false to re-read the existing browser.</param>
/// <returns>The value of <c>wb.Document</c> after the wait.</returns>
/// <exception cref="InvalidOperationException">
/// <paramref name="loadurl"/> is false and no browser has been created yet.
/// </exception>
public HtmlDocument GetHtmlAjax(Uri uri, int AjaxTimeLoadTimeOut, bool loadurl)
{
    if (loadurl)
    {
        wb = new WebBrowser();
        wb.ScriptErrorsSuppressed = true;
        wb.Navigate(uri);
    }
    if (wb == null)
    {
        // Previously this path failed with a NullReferenceException on wb.ReadyState.
        throw new InvalidOperationException("GetHtmlAjax must be called with loadurl = true before it can be called with loadurl = false.");
    }
    while (wb.ReadyState != WebBrowserReadyState.Complete)
    {
        Application.DoEvents();
    }
    // A single Thread.Sleep(AjaxTimeLoadTimeOut) here blocked the UI thread, which is
    // the thread the WebBrowser control needs in order to process its messages. While
    // it slept, nothing on the page could progress. Keep pumping messages until the
    // deadline instead, yielding briefly each round so the loop does not spin a core.
    DateTime deadline = DateTime.UtcNow.AddMilliseconds(AjaxTimeLoadTimeOut);
    while (DateTime.UtcNow < deadline)
    {
        Application.DoEvents();
        Thread.Sleep(1);
    }
    Application.DoEvents();
    return wb.Document;
}
我参考了许多链接来处理这个问题,但都失败了。以下是我参考过的链接。
htmlagilitypack and dynamic content issue
How to extract dynamic ajax content from a web page
请告诉我要在我的代码中更改什么以解析标题 link 文本。谢谢
下面贴出来自 @aepot 的代码:
// One HttpClient shared by every request made through GetJsonPageAsync.
private static HttpClient client = new HttpClient();
/// <summary>
/// Issues a GET request to <paramref name="url"/>, throws if the status code is not a
/// success code, and deserializes the response body from JSON into <typeparamref name="T"/>.
/// </summary>
/// <typeparam name="T">Type handed to <c>JsonConvert.DeserializeObject</c>.</typeparam>
/// <param name="url">Address to request.</param>
/// <returns>The deserialized response body.</returns>
private static async Task<T> GetJsonPageAsync<T>(string url)
{
    HttpResponseMessage reply = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead);
    try
    {
        reply.EnsureSuccessStatusCode();
        string json = await reply.Content.ReadAsStringAsync();
        T parsed = JsonConvert.DeserializeObject<T>(json);
        return parsed;
    }
    finally
    {
        // Same clean-up the using statement performed.
        if (reply != null)
        {
            reply.Dispose();
        }
    }
}
/// <summary>
/// Click handler: downloads the NewsPlus collection, then downloads every article in
/// it concurrently and appends each headline, followed by a separator line, to txtData.
/// </summary>
private async void button1_Click(object sender, EventArgs e)
{
    try
    {
        // The id parameter is URL-encoded JSON (%22 is a double quote). The inner quotes
        // around NewsPlus are JSON-escaped with a backslash, which must be written as
        // "\\" in a C# string literal. A bare "\%" is an invalid escape sequence and
        // does not compile.
        dynamic newsList = await GetJsonPageAsync<dynamic>("https://www.wsj.com/news/types/newsplus?id={%22query%22:%22type:=\\%22NewsPlus\\%22%22,%22db%22:%22wsjie,blog,interactivemedia%22}&type=search_collection");
        List<Task<dynamic>> tasks = new List<Task<dynamic>>();
        foreach (dynamic item in newsList.collection)
        {
            // Plain concatenation instead of string interpolation ($"...").
            string strUrl = "https://www.wsj.com/news/types/newsplus?id=" + item.id + "&type=article";
            tasks.Add(GetJsonPageAsync<dynamic>(strUrl));
            //tasks.Add(GetJsonPageAsync<dynamic>($"https://www.wsj.com/news/types/newsplus?id={item.id}&type=article"));
        }
        // All article requests were started above; wait for every one of them.
        dynamic[] newsDataList = await Task.WhenAll(tasks);
        foreach (dynamic newItem in newsDataList)
        {
            //Console.WriteLine(newItem.data.headline);
            //Console.WriteLine(newItem.data.url);
            txtData.Text += newItem.data.headline + System.Environment.NewLine;
            // A stray ';' after new string('-', 200) split this statement in two and
            // left "+ System.Environment.NewLine;" as an invalid statement.
            txtData.Text += new string('-', 200) + System.Environment.NewLine;
        }
    }
    catch (Exception ex)
    {
        Console.WriteLine(ex.Message);
    }
}
AJAX 是简单的 GET 或 POST 请求。
使用浏览器自带的开发者工具,我发现该页面发送简单的 GET 请求并接收 JSON 数据。JSON 可以反序列化,也可以通过 reader 逐步读取。
对于 JSON 解析,我使用了 Newtonsoft.Json NuGet 包。
这是基于 WinForms 应用程序的简单示例。
/// <summary>
/// WinForms form that, on a button click, downloads the WSJ NewsPlus collection as
/// JSON, downloads every article in it, and lists the headlines in textBox1.
/// </summary>
public partial class Form1 : Form
{
    // One HttpClient shared by every request made by this form.
    private static readonly HttpClient client = new HttpClient();

    /// <summary>
    /// Issues a GET request to <paramref name="url"/>, throws if the status code is not a
    /// success code, and deserializes the response body from JSON into <typeparamref name="T"/>.
    /// </summary>
    /// <typeparam name="T">Type handed to <c>JsonConvert.DeserializeObject</c>.</typeparam>
    /// <param name="url">Address to request.</param>
    /// <returns>The deserialized response body.</returns>
    private async Task<T> GetJsonPageAsync<T>(string url)
    {
        using (HttpResponseMessage response = await client.GetAsync(url, HttpCompletionOption.ResponseHeadersRead))
        {
            response.EnsureSuccessStatusCode();
            string text = await response.Content.ReadAsStringAsync();
            return JsonConvert.DeserializeObject<T>(text);
        }
    }

    /// <summary>
    /// Initializes the form and configures the process-wide ServicePointManager settings.
    /// </summary>
    public Form1()
    {
        InitializeComponent();
        ServicePointManager.DefaultConnectionLimit = 10; // to make it faster
        // NOTE(review): presumably the ".NET Framework 4.5.2" fix, forcing TLS 1.2 - confirm.
        ServicePointManager.SecurityProtocol = SecurityProtocolType.Tls12;
    }

    /// <summary>
    /// Click handler: downloads the collection, then every article concurrently, and
    /// appends each headline plus a separator line to textBox1. On any exception the
    /// text box content is replaced by the exception message.
    /// </summary>
    private async void button1_Click(object sender, EventArgs e)
    {
        try
        {
            // The id parameter is URL-encoded JSON (%22 is a double quote). The inner
            // quotes around NewsPlus are JSON-escaped with a backslash, which must be
            // written as "\\" in a C# string literal. A bare "\%" is an invalid escape
            // sequence and does not compile.
            dynamic newsList = await GetJsonPageAsync<dynamic>("https://www.wsj.com/news/types/newsplus?id={%22query%22:%22type:=\\%22NewsPlus\\%22%22,%22db%22:%22wsjie,blog,interactivemedia%22}&type=search_collection");
            List<Task<dynamic>> tasks = new List<Task<dynamic>>();
            foreach (dynamic item in newsList.collection)
            {
                tasks.Add(GetJsonPageAsync<dynamic>($"https://www.wsj.com/news/types/newsplus?id={item.id}&type=article"));
            }
            // All article requests were started above; wait for every one of them.
            dynamic[] newsDataList = await Task.WhenAll(tasks);
            foreach (dynamic newItem in newsDataList)
            {
                textBox1.Text += newItem.data.headline + Environment.NewLine;
                textBox1.Text += new string('-', 200) + Environment.NewLine;
            }
        }
        catch (Exception ex)
        {
            textBox1.Text = ex.Message;
        }
    }
}
更新: 为 .NET Framework 4.5.2 添加了修复