C# 中的 iText:GetPage returns 从第一页开始的所有页面
iText in C#: GetPage returns all pages from first
我在 C#/Net5 (5.0.1) 中使用 iText 库。该库使用 NuGet 安装,版本为 7.1.13。
我想阅读PDF文档并在文中搜索
我的问题来自 API GetPage(n)
。我假设它读取第 n 页,但事实是返回从 1 到 n 的所有页面。
这是我获取PDF内容的代码
public PdfDocument? GetPdfContent() {
PdfDocument? document = null;
HttpWebRequest? request = null;
HttpWebResponse? response = null;
Stream? responseStream = null;
MemoryStream? memoryStream = null;
try {
request = WebRequest.CreateHttp(_contentUrl);
} catch (ArgumentNullException e) {
Log.Logger.LogError(e, "Null address/URL in WebContent.GetPdfContent");
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Null address/URL", e);
} catch (UriFormatException e) {
Log.Logger.LogError(e, "Invalid address/URL in WebContent.GetPdfContent " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid address/URL", e);
} catch (NotSupportedException e) {
Log.Logger.LogError(e, "Invalid protocol URL in WebContent.GetPdfContent. Only http and https are supported. " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid protocol URL", e);
} catch (SecurityException e) {
Log.Logger.LogError(e, "Cannot contect to uri. Invalid user/password provided. " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid user/password", e);
}
if (request != null) {
// Configure request
request.Method = "GET";
// Automatic redirection enabled
request.AllowAutoRedirect = true;
// acept-encoding: deflate, gzip
request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
if (_accept != null) {
request.Accept = _accept;
}
request.Headers.Add(HttpRequestHeader.UserAgent, "sample/0.0.0");
if (_authorization != null) {
request.Headers.Add(HttpRequestHeader.Authorization, _authorization);
}
try {
using (response = (HttpWebResponse)request.GetResponse()) {
if (response.StatusCode != HttpStatusCode.OK) {
if (response.StatusCode == HttpStatusCode.NotFound) {
throw new ContentSearchException(ContentSearchErrorCode.ContentNotFound, $"Error topic not found: {response.StatusCode} {response.StatusDescription}.");
} else {
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error returned by server: {response.StatusCode} {response.StatusDescription}.");
}
} else if (String.IsNullOrEmpty(response.ContentType) || response.ContentType.Split(";")[0] != "application/pdf") {
throw new ContentSearchException(ContentSearchErrorCode.InvalidContentType, $"Error invalid content type {response.ContentType}.");
} else {
try {
using (responseStream = response.GetResponseStream()) {
memoryStream = new MemoryStream();
responseStream.CopyTo(memoryStream);
// memoryStream remains open!
memoryStream.Position = 0;
document = new PdfDocument(new PdfReader(memoryStream));
responseStream.Close();
memoryStream.Close();
}
} catch (Exception e) {
// Error in GetResponseStream
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error reading response: {e.Message}", e);
}
}
response.Close();
}
} catch (Exception e) {
// Error in GetResponse
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error getting response: {e.Message}", e);
}
}
return document;
}
这是 GetPage 的失败代码
private List<string> GetStringPdfContent() {
List<string> ret = null;
// iText
PdfDocument pdfContent;
PdfPage page;
ITextExtractionStrategy strategy;
string strPage;
pdfContent = (PdfDocument)GetContent();
if (pdfContent != null) {
ret = new List<string>();
// Code for iText
strategy = new SimpleTextExtractionStrategy();
for (int i = 1; i <= pdfContent.GetNumberOfPages(); i++) {
page = pdfContent.GetPage(i);
strPage = PdfTextExtractor.GetTextFromPage(page, strategy);
Log.Logger.LogDebug($"[GetStringPdfContent] Extracted page {i} with length {strPage.Length}.");
ret.Add(strPage);
}
}
return ret;
}
这是一个示例输出。如您所见,我们得到第 1、1-2、1-3 页等...
dbug: Spider.Program[0]
[23/12/2020 17:47:59.793]: [GetStringPdfContent] Extracted page 1 with length 615.
dbug: Spider.Program[0]
[23/12/2020 17:48:10.207]: [GetStringPdfContent] Extracted page 2 with length 2659.
dbug: Spider.Program[0]
[23/12/2020 17:48:12.112]: [GetStringPdfContent] Extracted page 3 with length 4609.
dbug: Spider.Program[0]
[23/12/2020 17:48:13.255]: [GetStringPdfContent] Extracted page 4 with length 7273.
dbug: Spider.Program[0]
[23/12/2020 17:48:16.155]: [GetStringPdfContent] Extracted page 5 with length 9245.
My problem comes from the API GetPage(n). I assumed it reads the page n, but the fact is that is returning all the pages from 1 to n.
这不可能是真的,GetPage(n)
毕竟 returns 一个 PdfPage
代表单个页面的对象。
错误是您的代码在所有页面上重复使用相同的 SimpleTextExtractionStrategy
对象。 SimpleTextExtractionStrategy
收集所有给定的文本,因此如果您首先将它用于第 1 页,然后用于第 2 页,它包含两页的文本。
因此,每页实例化一个单独的文本提取策略对象:
for (int i = 1; i <= pdfContent.GetNumberOfPages(); i++) {
strategy = new SimpleTextExtractionStrategy();
page = pdfContent.GetPage(i);
strPage = PdfTextExtractor.GetTextFromPage(page, strategy);
Log.Logger.LogDebug($"[GetStringPdfContent] Extracted page {i} with length {strPage.Length}.");
ret.Add(strPage);
}
我在 C#/Net5 (5.0.1) 中使用 iText 库。该库使用 NuGet 安装,版本为 7.1.13。 我想阅读PDF文档并在文中搜索
我的问题来自 API GetPage(n)
。我假设它读取第 n 页,但事实是返回从 1 到 n 的所有页面。
这是我获取PDF内容的代码
public PdfDocument? GetPdfContent() {
PdfDocument? document = null;
HttpWebRequest? request = null;
HttpWebResponse? response = null;
Stream? responseStream = null;
MemoryStream? memoryStream = null;
try {
request = WebRequest.CreateHttp(_contentUrl);
} catch (ArgumentNullException e) {
Log.Logger.LogError(e, "Null address/URL in WebContent.GetPdfContent");
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Null address/URL", e);
} catch (UriFormatException e) {
Log.Logger.LogError(e, "Invalid address/URL in WebContent.GetPdfContent " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid address/URL", e);
} catch (NotSupportedException e) {
Log.Logger.LogError(e, "Invalid protocol URL in WebContent.GetPdfContent. Only http and https are supported. " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid protocol URL", e);
} catch (SecurityException e) {
Log.Logger.LogError(e, "Cannot contect to uri. Invalid user/password provided. " + _contentUrl);
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, "Invalid user/password", e);
}
if (request != null) {
// Configure request
request.Method = "GET";
// Automatic redirection enabled
request.AllowAutoRedirect = true;
// acept-encoding: deflate, gzip
request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
if (_accept != null) {
request.Accept = _accept;
}
request.Headers.Add(HttpRequestHeader.UserAgent, "sample/0.0.0");
if (_authorization != null) {
request.Headers.Add(HttpRequestHeader.Authorization, _authorization);
}
try {
using (response = (HttpWebResponse)request.GetResponse()) {
if (response.StatusCode != HttpStatusCode.OK) {
if (response.StatusCode == HttpStatusCode.NotFound) {
throw new ContentSearchException(ContentSearchErrorCode.ContentNotFound, $"Error topic not found: {response.StatusCode} {response.StatusDescription}.");
} else {
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error returned by server: {response.StatusCode} {response.StatusDescription}.");
}
} else if (String.IsNullOrEmpty(response.ContentType) || response.ContentType.Split(";")[0] != "application/pdf") {
throw new ContentSearchException(ContentSearchErrorCode.InvalidContentType, $"Error invalid content type {response.ContentType}.");
} else {
try {
using (responseStream = response.GetResponseStream()) {
memoryStream = new MemoryStream();
responseStream.CopyTo(memoryStream);
// memoryStream remains open!
memoryStream.Position = 0;
document = new PdfDocument(new PdfReader(memoryStream));
responseStream.Close();
memoryStream.Close();
}
} catch (Exception e) {
// Error in GetResponseStream
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error reading response: {e.Message}", e);
}
}
response.Close();
}
} catch (Exception e) {
// Error in GetResponse
throw new ContentSearchException(ContentSearchErrorCode.ContentNotAccessible, $"Error getting response: {e.Message}", e);
}
}
return document;
}
这是 GetPage 的失败代码
private List<string> GetStringPdfContent() {
List<string> ret = null;
// iText
PdfDocument pdfContent;
PdfPage page;
ITextExtractionStrategy strategy;
string strPage;
pdfContent = (PdfDocument)GetContent();
if (pdfContent != null) {
ret = new List<string>();
// Code for iText
strategy = new SimpleTextExtractionStrategy();
for (int i = 1; i <= pdfContent.GetNumberOfPages(); i++) {
page = pdfContent.GetPage(i);
strPage = PdfTextExtractor.GetTextFromPage(page, strategy);
Log.Logger.LogDebug($"[GetStringPdfContent] Extracted page {i} with length {strPage.Length}.");
ret.Add(strPage);
}
}
return ret;
}
这是一个示例输出。如您所见,我们得到第 1、1-2、1-3 页等...
dbug: Spider.Program[0]
[23/12/2020 17:47:59.793]: [GetStringPdfContent] Extracted page 1 with length 615.
dbug: Spider.Program[0]
[23/12/2020 17:48:10.207]: [GetStringPdfContent] Extracted page 2 with length 2659.
dbug: Spider.Program[0]
[23/12/2020 17:48:12.112]: [GetStringPdfContent] Extracted page 3 with length 4609.
dbug: Spider.Program[0]
[23/12/2020 17:48:13.255]: [GetStringPdfContent] Extracted page 4 with length 7273.
dbug: Spider.Program[0]
[23/12/2020 17:48:16.155]: [GetStringPdfContent] Extracted page 5 with length 9245.
My problem comes from the API GetPage(n). I assumed it reads the page n, but the fact is that is returning all the pages from 1 to n.
这不可能是真的,GetPage(n)
毕竟 returns 一个 PdfPage
代表单个页面的对象。
错误是您的代码在所有页面上重复使用相同的 SimpleTextExtractionStrategy
对象。 SimpleTextExtractionStrategy
收集所有给定的文本,因此如果您首先将它用于第 1 页,然后用于第 2 页,它包含两页的文本。
因此,每页实例化一个单独的文本提取策略对象:
for (int i = 1; i <= pdfContent.GetNumberOfPages(); i++) {
strategy = new SimpleTextExtractionStrategy();
page = pdfContent.GetPage(i);
strPage = PdfTextExtractor.GetTextFromPage(page, strategy);
Log.Logger.LogDebug($"[GetStringPdfContent] Extracted page {i} with length {strPage.Length}.");
ret.Add(strPage);
}