如何从给定的 html 文件源中检索所有图像 src 详细信息

how to retrive all image src details from given html file source

我编写了一个小程序来从 HTML 内容中检索所有图像。我在 LINQPad 中编写了这样的程序用于一般测试:

这是我的第一个特定要求的程序:

void Main()
{
    string body = @"<p></p><p></p><p><title></title></p><tablecellpadding='0'cellspacing='0'style='width:100%;'width='100%'><tbody><tr><tdalign='center'style='vertical-align:top;text-align:center;'valign='top'><tablecellpadding='0'cellspacing='0'style='width:600px;'width='600px'><tbody><tr><tdalign='left'background='#CBE8F8'colspan='2'height='143px'style='background-color:#cbe8f8;vertical-align:top;text-align:left;border-width:1px1px0px;border-style:solidsolidnone;border-color:#a7a7a7;'valign='top'><imgalt=''height='143'src='http://www.Newsletterservices.in/Templates/HNY_003/images/greet_header.jpg'width='600'/></td></tr><tr><tdalign='left'background='#CBE8F8'style='background-color:#cbe8f8;width:50%;text-align:left;vertical-align:top;border-left:1pxsolid#a7a7a7;'valign='middle'><imgalt=''height='146'src='http://www.Newsletterservices.in/Templates/HNY_003/images/left_happy_banner.jpg'width='289'/></td><tdalign='left'background='#CBE8F8'style='background-color:#cbe8f8;width:50%;border-right:1pxsolid#a7a7a7;text-align:left;vertical-align:top;'valign='top'><spanstyle='font-family:verdana,'mssansserif';color:#024e9b;font-size:11px;'>DearAllUsers,<br/><br/>Asweallwelcomethenewyearaheadandprayforpeaceandhappiness,wesendyouourwarmestwishes.<br/><br/>Maythenewyearbethebeginningofabettertomorrow,thejourneyduringtheyearfilledwithjoyandmayeachdayintheyearaheadbefilledwithreasonstocelebrate.<br/><br/>WewishyouandyourfamilyaverywonderfulNewYear.<br/><br/>Love,<br/>ElectrocomSoftwarePVT.LTD</span></td></tr><tr><tdalign='left'background='#CBE8F8'colspan='2'height='161px'style='background-color:#cbe8f8;vertical-align:top;text-align:left;height:161px;border-width:0px1px1px;border-style:nonesolidsolid;'valign='top'><imgalt=''height='161'src='http://www.Newsletterservices.in/Templates/HNY_003/images/greet_footer.jpg'width='600'/></td></tr><tr><tdalign='left'colspan='2'height='5px'style='vertical-align:top;text-align:left;height:5px;'valign='top'><imgalt=''height='5'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer.gif'width='600'/></td></tr><tr><tdalign='left'colspan='2'height='30px'style='vertical-align:top;text-align:left;height:30px;background-color:#ffffff;border:1pxsolid#a7a7a7;padding:5px;'valign='top'><tablecellpadding='0'cellspacing='0'style='width:100%;'width='100%'><tbody><tr><tdstyle='border-right:1pxsolid#a7a7a7;width:50%;'width='50%'><divstyle='margin:5px5px5px8px;'><spanstyle='font-size:22px;'><spanstyle='color:rgb(0,88,132);font-family:trebuchetms,verdana,'mssansserif';font-weight:bold;'>ELECTROCOMSOFTWAREPVT.LTD</span></span></div></td><tdstyle='width:50%;'width='50%'><divstyle='margin:5px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Address:</b>505,EASYOFFICE,SUKHSAGARCOMPLEX,NEAR.FORTUNELANFMARKHOTEL</span><br/>ASHRAMROAD</div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Phone:</b><imgalt=''height='1'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer12px.gif'style='width:12px;height:1px;'width='12'/>01234567890</span></div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>FAX:</b><imgalt=''height='1'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer26px.gif'style='width:26px;height:1px;'width='26'/>##UserFax##</span></div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Email:</b><imgalt=''height='1'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer18px.gif'style='width:18px;height:1px;'width='18'/>info@electrocom.in</span></div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Visitus:</b>www.electrocom.in</span></div></td></tr></tbody></table></td></tr></tbody></table></td></tr></tbody></table><p></p>";
    string fbody =  Regex.Replace(body, @"\s+", string.Empty);
    do
    {
     int src = fbody.IndexOf("src");
          if (src != -1)
            {
              fbody = fbody.Remove(0, src + 5);
               //var dot = Regex.Match(fbody,@"\.(jpg|gif|doc|pdf)$");
               int dot = fbody.IndexOf(".jpg");
               if (dot != -1)
               {
                  Console.WriteLine("IMG PATH : {0} \nIMG OLD NAME : {1} \nIMG EXT : {2}",fbody.Substring(0, dot + 4),Path.GetFileName(fbody.Substring(0, dot + 4)), Path.GetExtension(fbody.Substring(0, dot + 4)));
               }
            fbody = fbody.Remove(0,dot + 4);
        }
    }
    while(fbody.Contains("src"));
}

这是我的成功结果:

IMG PATH : http://www.Newsletterservices.in/Templates/HNY_003/images/greet_header.jpg 
IMG OLD NAME : greet_header.jpg 
IMG EXT : .jpg
IMG PATH : http://www.Newsletterservices.in/Templates/HNY_003/images/left_happy_banner.jpg 
IMG OLD NAME : left_happy_banner.jpg 
IMG EXT : .jpg
IMG PATH : http://www.Newsletterservices.in/Templates/HNY_003/images/greet_footer.jpg 
IMG OLD NAME : greet_footer.jpg 
IMG EXT : .jpg

现在我用更多的文件扩展名进行复杂的操作来检索那些东西:

void Main()
{
    string body = @"<p></p><p></p><p><title></title></p><tablecellpadding='0'cellspacing='0'style='width:100%;'width='100%'><tbody><tr><tdalign='center'style='vertical-align:top;text-align:center;'valign='top'><tablecellpadding='0'cellspacing='0'style='width:600px;'width='600px'><tbody><tr><tdalign='left'background='#CBE8F8'colspan='2'height='143px'style='background-color:#cbe8f8;vertical-align:top;text-align:left;border-width:1px1px0px;border-style:solidsolidnone;border-color:#a7a7a7;'valign='top'><imgalt=''height='143'src='http://www.Newsletterservices.in/Templates/HNY_003/images/greet_header.jpg'width='600'/></td></tr><tr><tdalign='left'background='#CBE8F8'style='background-color:#cbe8f8;width:50%;text-align:left;vertical-align:top;border-left:1pxsolid#a7a7a7;'valign='middle'><imgalt=''height='146'src='http://www.Newsletterservices.in/Templates/HNY_003/images/left_happy_banner.jpg'width='289'/></td><tdalign='left'background='#CBE8F8'style='background-color:#cbe8f8;width:50%;border-right:1pxsolid#a7a7a7;text-align:left;vertical-align:top;'valign='top'><spanstyle='font-family:verdana,'mssansserif';color:#024e9b;font-size:11px;'>DearAllUsers,<br/><br/>Asweallwelcomethenewyearaheadandprayforpeaceandhappiness,wesendyouourwarmestwishes.<br/><br/>Maythenewyearbethebeginningofabettertomorrow,thejourneyduringtheyearfilledwithjoyandmayeachdayintheyearaheadbefilledwithreasonstocelebrate.<br/><br/>WewishyouandyourfamilyaverywonderfulNewYear.<br/><br/>Love,<br/>ElectrocomSoftwarePVT.LTD</span></td></tr><tr><tdalign='left'background='#CBE8F8'colspan='2'height='161px'style='background-color:#cbe8f8;vertical-align:top;text-align:left;height:161px;border-width:0px1px1px;border-style:nonesolidsolid;'valign='top'><imgalt=''height='161'src='http://www.Newsletterservices.in/Templates/HNY_003/images/greet_footer.jpg'width='600'/></td></tr><tr><tdalign='left'colspan='2'height='5px'style='vertical-align:top;text-align:left;height:5px;'valign='top'><imgalt=''height='5'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer.gif'width='600'/></td></tr><tr><tdalign='left'colspan='2'height='30px'style='vertical-align:top;text-align:left;height:30px;background-color:#ffffff;border:1pxsolid#a7a7a7;padding:5px;'valign='top'><tablecellpadding='0'cellspacing='0'style='width:100%;'width='100%'><tbody><tr><tdstyle='border-right:1pxsolid#a7a7a7;width:50%;'width='50%'><divstyle='margin:5px5px5px8px;'><spanstyle='font-size:22px;'><spanstyle='color:rgb(0,88,132);font-family:trebuchetms,verdana,'mssansserif';font-weight:bold;'>ELECTROCOMSOFTWAREPVT.LTD</span></span></div></td><tdstyle='width:50%;'width='50%'><divstyle='margin:5px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Address:</b>505,EASYOFFICE,SUKHSAGARCOMPLEX,NEAR.FORTUNELANFMARKHOTEL</span><br/>ASHRAMROAD</div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Phone:</b><imgalt=''height='1'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer12px.gif'style='width:12px;height:1px;'width='12'/>01234567890</span></div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>FAX:</b><imgalt=''height='1'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer26px.gif'style='width:26px;height:1px;'width='26'/>##UserFax##</span></div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Email:</b><imgalt=''height='1'src='http://www.Newsletterservices.in/Templates/HNY_003/images/spacer18px.gif'style='width:18px;height:1px;'width='18'/>info@electrocom.in</span></div><divstyle='margin:6px5px5px8px;'><spanstyle='font-family:verdana,'mssansserif';color:#333333;font-size:11px;white-space:normal;'><b>Visitus:</b>www.electrocom.in</span></div></td></tr></tbody></table></td></tr></tbody></table></td></tr></tbody></table><p></p>";
    string fbody =  Regex.Replace(body, @"\s+", string.Empty);
    do
    {
     int src = fbody.IndexOf("src");
          if (src != -1)
            {
              fbody = fbody.Remove(0, src + 5);
               var dot = Regex.Match(fbody,@"\.(jpg|gif|doc|pdf)$");
               //int dot = fbody.IndexOf(".jpg");
               if (dot.Success)
               {
                  Console.WriteLine("IMG PATH : {0} \nIMG OLD NAME : {1} \nIMG EXT : {2}",fbody.Substring(0, dot.Index + 4),Path.GetFileName(fbody.Substring(0, dot.Index + 4)), Path.GetExtension(fbody.Substring(0, dot.Index + 4)));
               }
            fbody = fbody.Remove(0,dot.Index + 4);
        }
    }
    while(fbody.Contains("src"));
}

在这里,我的 LINQPad 结果中什么也没有。有谁知道我是如何通过正则表达式验证的多文件扩展名验证的第二个程序结果得到第一个程序结果的?

尝试从正则表达式的末尾删除 $ 符号,因为这意味着只有当正文 .jpg 或 .gif 或 .doc 或 .pdf[ 结尾时才会匹配=10=]