我在 Delphi 中从 IHTMLDocument2 获取超链接时遇到问题

I have a problem getting hyperlinks from IHTMLDocument2 in Delphi

我在 Delphi 中从 IHTMLDocument2 获取 hyperlinks 时遇到问题。 例如,IHTMLDocument2 returns "about:/explanatory-online",而不是返回完整的 link“http://ena.ge/explanatory-online”。 "about" 与根 URL 的简单替换并不适用于所有情况。

这是我使用的代码:

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings; var MyInnerText,MyInnerHTML:widestring);
var
  resp: TMemoryStream;
  IdHTTP: TidHTTP;
  v: Variant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref: string;
  i: integer;

begin
  resp := TMemoryStream.Create;
  IdHTTP := TidHTTP.Create(nil);
  iDoc := coHTMLDocument.Create as IHTMLDocument2;

  try
    IdHTTP.Get(MyURL, resp);

    resp.Position := 0;
    MyHTML.LoadFromStream(resp,TEncoding.UTF8);

  finally
    resp.Free;
    IdHTTP.Free;
  end;

  v := VarArrayCreate([0, 0], VarVariant);
  v[0] := MyHTML.text;
  iDoc.write(PSafeArray(System.TVarData(v).VArray));
  iDoc.designMode := 'off';

  while iDoc.readyState <> 'complete' do
    Application.ProcessMessages;

  showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;

      MyHyperlinks.Add(aHref);
    end;
  end;

end;


查看页面的源代码,您会看到链接的样子,例如:href="/explanatory-online" 如果下载IdHttp 页面,IHTMLDocument2 没有原始页面地址。 您可以使用 TWebBrowser 或手动替换字符串或使用 IHTMLDocument4。

示例 1 (TWebBrowser):

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
    var MyInnerText,MyInnerHTML:widestring);
var
  Flags: System.OleVariant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref: string;
  i: integer;
begin
  Flags := Flags or navNoReadFromCache or navNoWriteToCache;
  Form1.WebBrowser1.Silent := True;
  Form1.WebBrowser1.Navigate(MyURL, Flags);
  while Form1.WebBrowser1.ReadyState <> READYSTATE_COMPLETE do
    Application.ProcessMessages;
  iDoc := Form1.WebBrowser1.Document as IHTMLDocument2;
  //showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;
      MyHyperlinks.Add(aHref);
    end;
  end;
end;

示例 2(替换字符串):

function GetDomain(URL: String): String;
var
  Pos1, Pos2: Integer;
begin
  Result := '';
  URL := Trim(URL);
  Pos1 := LastDelimiter('/', URL);
  Pos2 := Pos('/', URL, Pos1 + 1);
  if (Pos2 = 0) then
    Result := URL + '/'
  else if (Pos1 > 0) then
    Result := Copy(Url, 1, Pos1);
end;

procedure process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
  var MyInnerText, MyInnerHTML: WideString);
var
  resp: TMemoryStream;
  IdHTTP: TidHTTP;
  v: Variant;
  iDoc: IHTMLDocument2;
  links: OleVariant;
  MyHyperlink, aHref, Domain: string;
  I, J: Integer;
begin
  resp := TMemoryStream.Create;
  IdHTTP := TidHTTP.Create(nil);
  iDoc := coHTMLDocument.Create as IHTMLDocument2;
  try
    IdHTTP.Get(MyURL, resp);
    resp.Position := 0;
    MyHTML.LoadFromStream(resp,TEncoding.UTF8);
  finally
    resp.Free;
    IdHTTP.Free;
  end;
  v := VarArrayCreate([0, 0], VarVariant);
  v[0] := MyHTML.text;
  iDoc.write(PSafeArray(System.TVarData(v).VArray));
  iDoc.designMode := 'off';
  while iDoc.readyState <> 'complete' do
    Application.ProcessMessages;
  //showmessage(idoc.url);
  MyInnerText:=idoc.body.innerText;
  MyInnerHTML:=idoc.body.innerHTML;
  links := iDoc.all.tags('A');
  Domain := GetDomain(MyURL);
  if links.Length > 0 then
  begin
    for i := 0 to -1 + links.Length do
    begin
      aHref := links.Item(i).href;
      if (Copy(aHref, 1, 6) = 'about:') and (Length(Domain) > 0) then
      begin
        J := Pos('/', aHref);
        if (J > 0) then
        begin
          Delete(aHref, 1, J);
          aHref := Domain + aHref;
        end;
      end;
      MyHyperlinks.Add(aHref);
    end;
  end;
end;

示例 3 (IHTMLDocument4):

function process_url(MyURL: string; var MyHTML, MyHyperlinks: TStrings;
    var MyInnerText,MyInnerHTML:widestring): Integer;
const
  RS_COMPLETE = 'complete';
  WaitMs1     = 3000;
  WaitMs2     = 8000;
var
  IDoc : IHTMLDocument2;
  IDoc4: IHTMLDocument4;
  Links: OleVariant;
  AHref: String;
  I    : Integer;
  Ms   : Int64;
begin
  Result := 1;
  try
    iDoc := coHTMLDocument.Create as IHTMLDocument2;
    if (iDoc = nil) then
      Exit(2);
    Result := 3;
    iDoc.Set_designMode('off');
    Ms := GetTickCount64;
    while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs1) do
    begin
      Sleep(10);
      Application.ProcessMessages;
    end;
    if not (iDoc.ReadyState = RS_COMPLETE) then
      Exit(4);
    Result := 5;
    iDoc4 := iDoc as IHTMLDocument4;
    iDoc := iDoc4.CreateDocumentFromUrl(MyUrl, 'null');
    Ms := GetTickCount64;
    while not (iDoc.ReadyState = RS_COMPLETE) and (GetTickCount64 - Ms < WaitMs2) do
    begin
      Sleep(20);
      Application.ProcessMessages;
    end;
    if not (iDoc.ReadyState = RS_COMPLETE) then
      Exit(6);
    Result := 7;
    MyInnerText := iDoc.Body.InnerText;
    MyInnerHTML := iDoc.Body.InnerHTML;
    Links := iDoc.All.Tags('A');
    for I := 0 to Links.Length - 1 do
    begin
      aHref := links.Item(i).href;
      MyHyperlinks.Add(aHref);
    end;
    Result := 0;
  except
     on E : Exception do
     begin //ShowMessage('Exception: ' + E.ClassName + ',' + E.Message);
       Result := 8;
     end;
  end;
end;