使用 Html Agility Pack 从 LinkedIn 中提取搜索结果
Pull search result from LinkedIn using html agility pack
我想提取 LinkedIn 查询的热门搜索结果。
在此 fiddle 中:https://dotnetfiddle.net/Vtwi7g
将以下链接传给变量 'html':
https://www.linkedin.com/search/results/index/?keywords=firstname%3Ajohn%20AND%20lastname%3Adoe%20AND%20company%3Amicrosoft%20AND%20title%3Aceo&origin=GLOBAL_SEARCH_HEADER
我想得到第一个结果:
https://www.linkedin.com/in/john-doe-63803769/
我想该程序需要一些凭据才能先登录 LinkedIn - 我该如何传递这些凭据?
我尝试检查元素以查看其位置 - 如何遍历 DOM 以获得第一个结果?
LinkedIn 的搜索要更复杂一些:他们不向未登录(未授权)的用户开放搜索。
首先,您需要使用浏览器登录,然后获取您的会话 cookie:li_at 和 _lipt。
LinkedIn 不会将结果列表直接呈现为 HTML 标记,而是先把一个很大的 JSON 对象输出到 <code> 元素中,然后再用 JS 进行渲染。
你的控制台应用应该是这样的:
/// <summary>
/// Loads a LinkedIn search-results page with the session cookies attached by
/// <c>OnPreRequest2</c>, decodes the JSON held in the last matching
/// <c>&lt;code id="bpr-guid..."&gt;</c> element, and prints one line per profile
/// (an entry of <c>included</c> whose <c>firstName</c> is not null).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var html = @"https://www.linkedin.com/search/results/index/?keywords=firstname%3Ajohn%20AND%20lastname%3Adoe%20AND%20company%3Amicrosoft%20AND%20title%3Aceo&origin=GLOBAL_SEARCH_HEADER";
    HtmlWeb web = new HtmlWeb();
    web.PreRequest = new HtmlWeb.PreRequestHandler(OnPreRequest2);
    var htmlDoc = web.Load(html);

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // e.g. when the response is a login page; guard before touching the result.
    var codeElement = htmlDoc.DocumentNode.SelectNodes("//code[starts-with(@id,'bpr-guid')][last()]");

    Rootobject obj = null;
    if (codeElement != null && codeElement.Count > 0)
    {
        // [last()] is evaluated per parent node, so several nodes may match;
        // the final one in document order is the one that gets parsed.
        var json = WebUtility.HtmlDecode(codeElement.Last().InnerText);
        obj = JsonConvert.DeserializeObject<Rootobject>(json);
    }

    if (obj == null || obj.included == null)
    {
        Console.Error.WriteLine("No search result data was found in the response. Check that the li_at and _lipt cookies are valid.");
    }
    else
    {
        // Skip null array entries as well as entries without a first name.
        var profiles = obj.included.Where(i => i != null && i.firstName != null);
        foreach (var profile in profiles)
        {
            Console.WriteLine("Profile Name: " + profile.firstName + ";" + profile.lastName + ";" + profile.occupation + ";https://www.linkedin.com/in/" + profile.publicIdentifier);
        }
    }

    // Keep the console window open in every case, as the original did.
    Console.ReadKey();
}
/// <summary>
/// Pre-request callback that adds a <c>cookie</c> header carrying the
/// <c>li_at</c> and <c>_lipt</c> session cookies to the outgoing request.
/// </summary>
/// <param name="request">The request about to be sent; its header collection is modified.</param>
/// <returns>Always <c>true</c> (presumably tells HtmlWeb to proceed with the request; confirm against the HtmlWeb docs).</returns>
public static bool OnPreRequest2(HttpWebRequest request)
{
    // Placeholders: substitute the values copied from a logged-in browser session.
    string sessionCookies = string.Concat("li_at={YOURCOOKIEHERE};", "_lipt={YOURCOOKIEHERE}");

    // Single-string overload: the text is passed in "name:value" form.
    WebHeaderCollection headers = request.Headers;
    headers.Add(string.Concat(@"cookie:", sessionCookies));

    return true;
}
/// <summary>
/// Deserialization target for the JSON text extracted from the page; Main passes
/// this type to <c>JsonConvert.DeserializeObject</c>.
/// </summary>
public class Rootobject
{
/// <summary>
/// Entries that Main filters by non-null <c>firstName</c> to obtain profiles.
/// Presumably bound to the JSON key of the same name; keep the spelling as-is.
/// </summary>
public Included[] included { get; set; }
}
/// <summary>
/// One element of <c>Rootobject.included</c>. Main treats an element as a profile
/// when <c>firstName</c> is not null. Property names presumably mirror the JSON
/// keys, so they should not be renamed.
/// </summary>
public class Included
{
/// <summary>First name; printed by Main and used as the "is a profile" filter.</summary>
public string firstName { get; set; }
/// <summary>Last name; printed by Main.</summary>
public string lastName { get; set; }
/// <summary>Occupation text; printed by Main.</summary>
public string occupation { get; set; }
/// <summary>Not read by the code shown here.</summary>
public string objectUrn { get; set; }
/// <summary>Appended by Main to "https://www.linkedin.com/in/" to form the profile URL.</summary>
public string publicIdentifier { get; set; }
}
最后会打印
Profile Name: John;Doe;ceo at Microsoft;https://www.linkedin.com/in/john-doe-8102b868
Profile Name: John;Doe;Ceo at Microsoft;https://www.linkedin.com/in/john-doe-63803769
Profile Name: John;Doe;CEO at Microsoft;https://www.linkedin.com/in/john-doe-2151b69b
我想提取 LinkedIn 查询的热门搜索结果。
在此 fiddle 中:https://dotnetfiddle.net/Vtwi7g
将以下链接传给变量 'html':
https://www.linkedin.com/search/results/index/?keywords=firstname%3Ajohn%20AND%20lastname%3Adoe%20AND%20company%3Amicrosoft%20AND%20title%3Aceo&origin=GLOBAL_SEARCH_HEADER
我想得到第一个结果: https://www.linkedin.com/in/john-doe-63803769/
我想该程序需要一些凭据才能先登录 LinkedIn - 我该如何传递这些凭据?
我尝试检查元素以查看其位置 - 如何遍历 DOM 以获得第一个结果?
LinkedIn 的搜索要更复杂一些:他们不向未登录(未授权)的用户开放搜索。
首先,您需要使用浏览器登录,然后获取您的会话 cookie:li_at 和 _lipt。
LinkedIn 不会将结果列表直接呈现为 HTML 标记,而是先把一个很大的 JSON 对象输出到 <code> 元素中,然后再用 JS 进行渲染。
你的控制台应用应该是这样的:
/// <summary>
/// Loads a LinkedIn search-results page with the session cookies attached by
/// <c>OnPreRequest2</c>, decodes the JSON held in the last matching
/// <c>&lt;code id="bpr-guid..."&gt;</c> element, and prints one line per profile
/// (an entry of <c>included</c> whose <c>firstName</c> is not null).
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var html = @"https://www.linkedin.com/search/results/index/?keywords=firstname%3Ajohn%20AND%20lastname%3Adoe%20AND%20company%3Amicrosoft%20AND%20title%3Aceo&origin=GLOBAL_SEARCH_HEADER";
    HtmlWeb web = new HtmlWeb();
    web.PreRequest = new HtmlWeb.PreRequestHandler(OnPreRequest2);
    var htmlDoc = web.Load(html);

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // e.g. when the response is a login page; guard before touching the result.
    var codeElement = htmlDoc.DocumentNode.SelectNodes("//code[starts-with(@id,'bpr-guid')][last()]");

    Rootobject obj = null;
    if (codeElement != null && codeElement.Count > 0)
    {
        // [last()] is evaluated per parent node, so several nodes may match;
        // the final one in document order is the one that gets parsed.
        var json = WebUtility.HtmlDecode(codeElement.Last().InnerText);
        obj = JsonConvert.DeserializeObject<Rootobject>(json);
    }

    if (obj == null || obj.included == null)
    {
        Console.Error.WriteLine("No search result data was found in the response. Check that the li_at and _lipt cookies are valid.");
    }
    else
    {
        // Skip null array entries as well as entries without a first name.
        var profiles = obj.included.Where(i => i != null && i.firstName != null);
        foreach (var profile in profiles)
        {
            Console.WriteLine("Profile Name: " + profile.firstName + ";" + profile.lastName + ";" + profile.occupation + ";https://www.linkedin.com/in/" + profile.publicIdentifier);
        }
    }

    // Keep the console window open in every case, as the original did.
    Console.ReadKey();
}
/// <summary>
/// Pre-request callback that adds a <c>cookie</c> header carrying the
/// <c>li_at</c> and <c>_lipt</c> session cookies to the outgoing request.
/// </summary>
/// <param name="request">The request about to be sent; its header collection is modified.</param>
/// <returns>Always <c>true</c> (presumably tells HtmlWeb to proceed with the request; confirm against the HtmlWeb docs).</returns>
public static bool OnPreRequest2(HttpWebRequest request)
{
    // Placeholders: substitute the values copied from a logged-in browser session.
    string sessionCookies = string.Concat("li_at={YOURCOOKIEHERE};", "_lipt={YOURCOOKIEHERE}");

    // Single-string overload: the text is passed in "name:value" form.
    WebHeaderCollection headers = request.Headers;
    headers.Add(string.Concat(@"cookie:", sessionCookies));

    return true;
}
/// <summary>
/// Deserialization target for the JSON text extracted from the page; Main passes
/// this type to <c>JsonConvert.DeserializeObject</c>.
/// </summary>
public class Rootobject
{
/// <summary>
/// Entries that Main filters by non-null <c>firstName</c> to obtain profiles.
/// Presumably bound to the JSON key of the same name; keep the spelling as-is.
/// </summary>
public Included[] included { get; set; }
}
/// <summary>
/// One element of <c>Rootobject.included</c>. Main treats an element as a profile
/// when <c>firstName</c> is not null. Property names presumably mirror the JSON
/// keys, so they should not be renamed.
/// </summary>
public class Included
{
/// <summary>First name; printed by Main and used as the "is a profile" filter.</summary>
public string firstName { get; set; }
/// <summary>Last name; printed by Main.</summary>
public string lastName { get; set; }
/// <summary>Occupation text; printed by Main.</summary>
public string occupation { get; set; }
/// <summary>Not read by the code shown here.</summary>
public string objectUrn { get; set; }
/// <summary>Appended by Main to "https://www.linkedin.com/in/" to form the profile URL.</summary>
public string publicIdentifier { get; set; }
}
最后会打印
Profile Name: John;Doe;ceo at Microsoft;https://www.linkedin.com/in/john-doe-8102b868
Profile Name: John;Doe;Ceo at Microsoft;https://www.linkedin.com/in/john-doe-63803769
Profile Name: John;Doe;CEO at Microsoft;https://www.linkedin.com/in/john-doe-2151b69b