如何访问网页中嵌入的pdf文件
how to access embedded pdf file in web page
当我下载一个包含嵌入式 pdf 文件的 url 时,我可以访问所有页面 html,但不能访问 pdf 文件本身。我尝试过 HttpWebRequest、WebClient、HtmlAgilityPack、memorystreams 等。不确定什么路径可以工作。这是我得到的最接近的。感谢您的帮助。
// Fetch the raw response body for the page URL and write it to disk unchanged.
// Whatever the server returns for this URL is what ends up in the file.
string url = "http://emaps.emapsplus.com/rdl/MadisonCoAl/MadisonCoAl.aspx?showimg=yes&pid=1701013003029000";
byte[] result;
byte[] buffer = new byte[4096];
WebRequest wr = WebRequest.Create(url);
using (WebResponse pageResponse = wr.GetResponse())
using (Stream body = pageResponse.GetResponseStream())
using (MemoryStream collected = new MemoryStream())
{
    // Read returns 0 once the end of the response stream has been reached.
    int bytesRead;
    while ((bytesRead = body.Read(buffer, 0, buffer.Length)) > 0)
    {
        collected.Write(buffer, 0, bytesRead);
    }

    result = collected.ToArray();
    File.WriteAllBytes(@"C:\testpdf.pdf", result);
}
这确实很棘手,因为您下载到的实际上并不是 pdf 文件。如果它是普通的 pdf,您的代码就可以工作。但这是一个会运行一些 javascript 的网页,它通过向自身回发(post back)来生成 pdf。
我有一个能解决您当前问题的答案,但如果您需要对许多文件执行此操作,可能还有很长的路要走。为了让它工作,我在 Fiddler 下运行该页面,获取它回发(post)给自身的 post 字符串,然后使用 C# 模拟该过程并将结果保存为 pdf。
这种方法效果很好,但问题是:如果省去通过 Fiddler 获取 post 字符串这一手动步骤,您基本上必须自己实现一个网络浏览器,让它理解并执行所有 javascript 代码,才能弄清楚该字符串是如何生成的。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Form that, when loaded, downloads the report page, replays the postback the
    /// page performs against itself, and saves the response body as a PDF file.
    /// </summary>
    public partial class Form1 : Form
    {
        // Verbatim literal on purpose: in a regular literal the "\t" in "C:\testpdf.pdf"
        // is a TAB escape, which produced an invalid path and a garbled message.
        private const string sOutputPath = @"C:\testpdf.pdf";

        // If the values are not being found it could be because the markup is being
        // output with single quotes; change this constant from "\"" to "'" in that case.
        private const string sQuoteString = "\"";
        private const string sValueString = "value=" + sQuoteString;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page, extracts __VIEWSTATE and __EVENTVALIDATION from its markup,
        /// POSTs them back to the same URL and writes the response to <c>C:\testpdf.pdf</c>.
        /// Each failure path shows a message box and stops.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            string sURL = "http://emaps.emapsplus.com/rdl/MadisonCoAl/MadisonCoAl.aspx?showimg=yes&pid=1701013003029000";

            string sSource = DownloadPageSource(sURL);
            if (string.IsNullOrEmpty(sSource))
            {
                MessageBox.Show("Cannot find source code for original url.");
                return;
            }

            // Without both of these values we cannot continue.
            string sViewState = ExtractHiddenFieldValue(sSource, "__VIEWSTATE");
            string sEventValidation = sViewState == null
                ? null
                : ExtractHiddenFieldValue(sSource, "__EVENTVALIDATION");
            if (sViewState == null || sEventValidation == null)
            {
                MessageBox.Show("Cannot find the __VIEWSTATE and/or __EVENTVALIDATION variables.");
                return;
            }

            const int nTimeoutSeconds = 30;
            // Direct casts instead of "as": an unexpected type fails here, not later as a null reference.
            HttpWebRequest oRequest = (HttpWebRequest)WebRequest.Create(new Uri(sURL).AbsoluteUri);
            // Only the two extracted values vary; the rest is the fixed form data captured from the browser.
            string sPostData = "__EVENTTARGET=btnPageLoad&__EVENTARGUMENT=&__VIEWSTATE=" + System.Web.HttpUtility.UrlEncode(sViewState) + "&__EVENTVALIDATION=" + System.Web.HttpUtility.UrlEncode(sEventValidation) + "&hdnLoaded=false&ReportViewer1%24ctl03%24ctl00=&ReportViewer1%24ctl03%24ctl01=&ReportViewer1%24ctl11=&ReportViewer1%24ctl12=standards&ReportViewer1%24AsyncWait%24HiddenCancelField=False&ReportViewer1%24ToggleParam%24store=&ReportViewer1%24ToggleParam%24collapse=false&ReportViewer1%24ctl09%24ClientClickedId=&ReportViewer1%24ctl08%24store=&ReportViewer1%24ctl08%24collapse=false&ReportViewer1%24ctl10%24VisibilityState%24ctl00=Error&ReportViewer1%24ctl10%24ScrollPosition=&ReportViewer1%24ctl10%24ReportControl%24ctl02=&ReportViewer1%24ctl10%24ReportControl%24ctl03=&ReportViewer1%24ctl10%24ReportControl%24ctl04=100";
            byte[] oPostDataBuffer = System.Text.Encoding.ASCII.GetBytes(sPostData);

            oRequest.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0";
            oRequest.Timeout = nTimeoutSeconds * 1000; // Timeout is expressed in milliseconds.
            oRequest.Method = "POST";
            oRequest.ContentType = "application/x-www-form-urlencoded";
            oRequest.ContentLength = oPostDataBuffer.Length;

            using (Stream oRequestStream = oRequest.GetRequestStream())
            {
                oRequestStream.Write(oPostDataBuffer, 0, oPostDataBuffer.Length);
            }

            byte[] oFile;
            // Dispose the response so the underlying connection is released.
            using (HttpWebResponse oResponse = (HttpWebResponse)oRequest.GetResponse())
            {
                if (oResponse.StatusCode != HttpStatusCode.OK)
                {
                    // Error: report the status and do NOT claim that a PDF was written.
                    MessageBox.Show(oResponse.StatusCode.ToString());
                    return;
                }

                // Buffer the whole body first so a failed download never leaves a partial file.
                using (Stream oResponseStream = oResponse.GetResponseStream())
                using (MemoryStream oMemoryStream = new MemoryStream())
                {
                    oResponseStream.CopyTo(oMemoryStream);
                    oFile = oMemoryStream.ToArray();
                }
            }

            File.WriteAllBytes(sOutputPath, oFile);
            MessageBox.Show("PDF downloaded to " + sOutputPath);
        }

        /// <summary>
        /// Issues a request for <paramref name="sUrl"/> and returns the response body decoded as UTF-8.
        /// </summary>
        private static string DownloadPageSource(string sUrl)
        {
            WebRequest oRequest = WebRequest.Create(sUrl);
            using (WebResponse oResponse = oRequest.GetResponse())
            using (Stream oResponseStream = oResponse.GetResponseStream())
            using (MemoryStream oMemoryStream = new MemoryStream())
            {
                oResponseStream.CopyTo(oMemoryStream);
                return System.Text.Encoding.UTF8.GetString(oMemoryStream.ToArray());
            }
        }

        /// <summary>
        /// Finds the first occurrence of <paramref name="sFieldName"/> in the markup and returns
        /// the text between the next <c>value="</c> and its closing quote.
        /// </summary>
        /// <returns>The attribute value, or <c>null</c> when any of the three markers is missing.</returns>
        private static string ExtractHiddenFieldValue(string sSource, string sFieldName)
        {
            // Ordinal: these are markup tokens, not linguistic text.
            int nFieldIndex = sSource.IndexOf(sFieldName, StringComparison.Ordinal);
            if (nFieldIndex < 0)
            {
                return null;
            }

            int nValueIndex = sSource.IndexOf(sValueString, nFieldIndex, StringComparison.Ordinal);
            if (nValueIndex < 0)
            {
                return null;
            }

            int nStart = nValueIndex + sValueString.Length;
            int nEnd = sSource.IndexOf(sQuoteString, nStart, StringComparison.Ordinal);
            if (nEnd < 0)
            {
                return null;
            }

            return sSource.Substring(nStart, nEnd - nStart);
        }
    }
}
更新:
post 数据很可能涉及某种会话,并且在您能够对其进行测试之前该会话已经超时。因此,这意味着我们必须更有创意一些。我承认这是反复试验的结果,此代码确实是为这个 url 量身定制的,可能适用于也可能不适用于同一网站上的其他 pdf。通过将我之前发布的 sPostData 字符串与我刚刚在 Fiddler 代理下运行该网站时抓取的新字符串进行比较,我发现在提交(post)的众多变量中只有 2 个发生了变化。这两个变量都可以在 html 源代码中找到,而该源代码可以用您原来的 C# 代码获取。所以我们要做的只是进行一些字符串操作,取出这两个变量的值,然后再执行我之前发布的代码。看到了吗?我们现在是在一起合作了!更新后的代码现在应该每次都能正常工作,而不会再返回 500 内部服务器错误消息。
注意:因为我们提交(post)的数据本身没有针对网络进行正确编码,所以必须添加对 System.Web 的引用,才能使用其中的 UrlEncode 方法。为此,您需要:
- 右键单击 "Solution Explorer" 中的 "References" 并选择 "Add Reference"
- 单击左侧的 "Assemblies" 并在 "Framework" 部分中找到 "System.Web" 或使用最右侧的搜索框
- 勾选 "System.Web" 并点击 "OK"
这是简化了 post 回发数据捕获过程的代码:
// Build a form-encoded postback body from every hidden <input> on the page at `url`.
var postData = new System.Collections.Generic.List<string>();
var document = new HtmlWeb().Load(url);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating.
var hiddenInputs = document.DocumentNode.SelectNodes("//input[@type='hidden']");
if (hiddenInputs != null)
{
    foreach (var input in hiddenInputs)
    {
        var name = Uri.EscapeDataString(input.GetAttributeValue("name", ""));
        var value = Uri.EscapeDataString(input.GetAttributeValue("value", ""));

        // The postback must name the control that triggers the report load.
        if (name == "__EVENTTARGET")
        {
            value = "btnPageLoad";
        }

        postData.Add(string.Format("{0}={1}", name, value));
    }
}

// string.Join yields an empty body for an empty list instead of throwing on postData[0].
var postBody = string.Join("&", postData);
// Download the PDF bytes from sPDFPath and write them to the current response.
// NOTE(review): `Response` is presumably the ASP.NET page's HttpResponse; it is not defined in this snippet.
string sPDFPath = "FULL PATH";
Byte[] FileBuffer;

// WebClient is IDisposable; release it as soon as the download has finished.
using (WebClient User = new WebClient())
{
    FileBuffer = User.DownloadData(sPDFPath);
}

if (FileBuffer != null)
{
    Response.ContentType = "application/pdf";
    Response.AddHeader("content-length", FileBuffer.Length.ToString());
    Response.BinaryWrite(FileBuffer);
}
当我下载一个包含嵌入式 pdf 文件的 url 时,我可以访问所有页面 html,但不能访问 pdf 文件本身。我尝试过 HttpWebRequest、WebClient、HtmlAgilityPack、memorystreams 等。不确定什么路径可以工作。这是我得到的最接近的。感谢您的帮助。
// Fetch the raw response body for the page URL and write it to disk unchanged.
// Whatever the server returns for this URL is what ends up in the file.
string url = "http://emaps.emapsplus.com/rdl/MadisonCoAl/MadisonCoAl.aspx?showimg=yes&pid=1701013003029000";
byte[] result;
byte[] buffer = new byte[4096];
WebRequest wr = WebRequest.Create(url);
using (WebResponse pageResponse = wr.GetResponse())
using (Stream body = pageResponse.GetResponseStream())
using (MemoryStream collected = new MemoryStream())
{
    // Read returns 0 once the end of the response stream has been reached.
    int bytesRead;
    while ((bytesRead = body.Read(buffer, 0, buffer.Length)) > 0)
    {
        collected.Write(buffer, 0, bytesRead);
    }

    result = collected.ToArray();
    File.WriteAllBytes(@"C:\testpdf.pdf", result);
}
这确实很棘手,因为您下载到的实际上并不是 pdf 文件。如果它是普通的 pdf,您的代码就可以工作。但这是一个会运行一些 javascript 的网页,它通过向自身回发(post back)来生成 pdf。
我有一个能解决您当前问题的答案,但如果您需要对许多文件执行此操作,可能还有很长的路要走。为了让它工作,我在 Fiddler 下运行该页面,获取它回发(post)给自身的 post 字符串,然后使用 C# 模拟该过程并将结果保存为 pdf。
这种方法效果很好,但问题是:如果省去通过 Fiddler 获取 post 字符串这一手动步骤,您基本上必须自己实现一个网络浏览器,让它理解并执行所有 javascript 代码,才能弄清楚该字符串是如何生成的。
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Form that, when loaded, downloads the report page, replays the postback the
    /// page performs against itself, and saves the response body as a PDF file.
    /// </summary>
    public partial class Form1 : Form
    {
        // Verbatim literal on purpose: in a regular literal the "\t" in "C:\testpdf.pdf"
        // is a TAB escape, which produced an invalid path and a garbled message.
        private const string sOutputPath = @"C:\testpdf.pdf";

        // If the values are not being found it could be because the markup is being
        // output with single quotes; change this constant from "\"" to "'" in that case.
        private const string sQuoteString = "\"";
        private const string sValueString = "value=" + sQuoteString;

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Downloads the page, extracts __VIEWSTATE and __EVENTVALIDATION from its markup,
        /// POSTs them back to the same URL and writes the response to <c>C:\testpdf.pdf</c>.
        /// Each failure path shows a message box and stops.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            string sURL = "http://emaps.emapsplus.com/rdl/MadisonCoAl/MadisonCoAl.aspx?showimg=yes&pid=1701013003029000";

            string sSource = DownloadPageSource(sURL);
            if (string.IsNullOrEmpty(sSource))
            {
                MessageBox.Show("Cannot find source code for original url.");
                return;
            }

            // Without both of these values we cannot continue.
            string sViewState = ExtractHiddenFieldValue(sSource, "__VIEWSTATE");
            string sEventValidation = sViewState == null
                ? null
                : ExtractHiddenFieldValue(sSource, "__EVENTVALIDATION");
            if (sViewState == null || sEventValidation == null)
            {
                MessageBox.Show("Cannot find the __VIEWSTATE and/or __EVENTVALIDATION variables.");
                return;
            }

            const int nTimeoutSeconds = 30;
            // Direct casts instead of "as": an unexpected type fails here, not later as a null reference.
            HttpWebRequest oRequest = (HttpWebRequest)WebRequest.Create(new Uri(sURL).AbsoluteUri);
            // Only the two extracted values vary; the rest is the fixed form data captured from the browser.
            string sPostData = "__EVENTTARGET=btnPageLoad&__EVENTARGUMENT=&__VIEWSTATE=" + System.Web.HttpUtility.UrlEncode(sViewState) + "&__EVENTVALIDATION=" + System.Web.HttpUtility.UrlEncode(sEventValidation) + "&hdnLoaded=false&ReportViewer1%24ctl03%24ctl00=&ReportViewer1%24ctl03%24ctl01=&ReportViewer1%24ctl11=&ReportViewer1%24ctl12=standards&ReportViewer1%24AsyncWait%24HiddenCancelField=False&ReportViewer1%24ToggleParam%24store=&ReportViewer1%24ToggleParam%24collapse=false&ReportViewer1%24ctl09%24ClientClickedId=&ReportViewer1%24ctl08%24store=&ReportViewer1%24ctl08%24collapse=false&ReportViewer1%24ctl10%24VisibilityState%24ctl00=Error&ReportViewer1%24ctl10%24ScrollPosition=&ReportViewer1%24ctl10%24ReportControl%24ctl02=&ReportViewer1%24ctl10%24ReportControl%24ctl03=&ReportViewer1%24ctl10%24ReportControl%24ctl04=100";
            byte[] oPostDataBuffer = System.Text.Encoding.ASCII.GetBytes(sPostData);

            oRequest.UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10; rv:33.0) Gecko/20100101 Firefox/33.0";
            oRequest.Timeout = nTimeoutSeconds * 1000; // Timeout is expressed in milliseconds.
            oRequest.Method = "POST";
            oRequest.ContentType = "application/x-www-form-urlencoded";
            oRequest.ContentLength = oPostDataBuffer.Length;

            using (Stream oRequestStream = oRequest.GetRequestStream())
            {
                oRequestStream.Write(oPostDataBuffer, 0, oPostDataBuffer.Length);
            }

            byte[] oFile;
            // Dispose the response so the underlying connection is released.
            using (HttpWebResponse oResponse = (HttpWebResponse)oRequest.GetResponse())
            {
                if (oResponse.StatusCode != HttpStatusCode.OK)
                {
                    // Error: report the status and do NOT claim that a PDF was written.
                    MessageBox.Show(oResponse.StatusCode.ToString());
                    return;
                }

                // Buffer the whole body first so a failed download never leaves a partial file.
                using (Stream oResponseStream = oResponse.GetResponseStream())
                using (MemoryStream oMemoryStream = new MemoryStream())
                {
                    oResponseStream.CopyTo(oMemoryStream);
                    oFile = oMemoryStream.ToArray();
                }
            }

            File.WriteAllBytes(sOutputPath, oFile);
            MessageBox.Show("PDF downloaded to " + sOutputPath);
        }

        /// <summary>
        /// Issues a request for <paramref name="sUrl"/> and returns the response body decoded as UTF-8.
        /// </summary>
        private static string DownloadPageSource(string sUrl)
        {
            WebRequest oRequest = WebRequest.Create(sUrl);
            using (WebResponse oResponse = oRequest.GetResponse())
            using (Stream oResponseStream = oResponse.GetResponseStream())
            using (MemoryStream oMemoryStream = new MemoryStream())
            {
                oResponseStream.CopyTo(oMemoryStream);
                return System.Text.Encoding.UTF8.GetString(oMemoryStream.ToArray());
            }
        }

        /// <summary>
        /// Finds the first occurrence of <paramref name="sFieldName"/> in the markup and returns
        /// the text between the next <c>value="</c> and its closing quote.
        /// </summary>
        /// <returns>The attribute value, or <c>null</c> when any of the three markers is missing.</returns>
        private static string ExtractHiddenFieldValue(string sSource, string sFieldName)
        {
            // Ordinal: these are markup tokens, not linguistic text.
            int nFieldIndex = sSource.IndexOf(sFieldName, StringComparison.Ordinal);
            if (nFieldIndex < 0)
            {
                return null;
            }

            int nValueIndex = sSource.IndexOf(sValueString, nFieldIndex, StringComparison.Ordinal);
            if (nValueIndex < 0)
            {
                return null;
            }

            int nStart = nValueIndex + sValueString.Length;
            int nEnd = sSource.IndexOf(sQuoteString, nStart, StringComparison.Ordinal);
            if (nEnd < 0)
            {
                return null;
            }

            return sSource.Substring(nStart, nEnd - nStart);
        }
    }
}
更新:
post 数据很可能涉及某种会话,并且在您能够对其进行测试之前该会话已经超时。因此,这意味着我们必须更有创意一些。我承认这是反复试验的结果,此代码确实是为这个 url 量身定制的,可能适用于也可能不适用于同一网站上的其他 pdf。通过将我之前发布的 sPostData 字符串与我刚刚在 Fiddler 代理下运行该网站时抓取的新字符串进行比较,我发现在提交(post)的众多变量中只有 2 个发生了变化。这两个变量都可以在 html 源代码中找到,而该源代码可以用您原来的 C# 代码获取。所以我们要做的只是进行一些字符串操作,取出这两个变量的值,然后再执行我之前发布的代码。看到了吗?我们现在是在一起合作了!更新后的代码现在应该每次都能正常工作,而不会再返回 500 内部服务器错误消息。
注意:因为我们提交(post)的数据本身没有针对网络进行正确编码,所以必须添加对 System.Web 的引用,才能使用其中的 UrlEncode 方法。为此,您需要:
- 右键单击 "Solution Explorer" 中的 "References" 并选择 "Add Reference"
- 单击左侧的 "Assemblies" 并在 "Framework" 部分中找到 "System.Web" 或使用最右侧的搜索框
- 勾选 "System.Web" 并点击 "OK"
这是简化了 post 回发数据捕获过程的代码:
var postData = new System.Collections.Generic.List<string>();
// Fill postData with one form-encoded "name=value" pair per hidden <input> on the page at `url`.
var document = new HtmlWeb().Load(url);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating.
var hiddenInputs = document.DocumentNode.SelectNodes("//input[@type='hidden']");
if (hiddenInputs != null)
{
    foreach (var input in hiddenInputs)
    {
        var name = Uri.EscapeDataString(input.GetAttributeValue("name", ""));
        var value = Uri.EscapeDataString(input.GetAttributeValue("value", ""));

        // The postback must name the control that triggers the report load.
        if (name == "__EVENTTARGET")
        {
            value = "btnPageLoad";
        }

        postData.Add(string.Format("{0}={1}", name, value));
    }
}

// string.Join yields an empty body for an empty list instead of throwing on postData[0].
var postBody = string.Join("&", postData);
// Download the PDF bytes from sPDFPath and write them to the current response.
// NOTE(review): `Response` is presumably the ASP.NET page's HttpResponse; it is not defined in this snippet.
string sPDFPath = "FULL PATH";
Byte[] FileBuffer;

// WebClient is IDisposable; release it as soon as the download has finished.
using (WebClient User = new WebClient())
{
    FileBuffer = User.DownloadData(sPDFPath);
}

if (FileBuffer != null)
{
    Response.ContentType = "application/pdf";
    Response.AddHeader("content-length", FileBuffer.Length.ToString());
    Response.BinaryWrite(FileBuffer);
}