编码 UTF-16 后,如果我想在 iTextSharp 中使用,字符串会被破坏
After encoding UTF-16, the string is broken if I want to use in iTextSharp
首先,我从一个文本文件中读取一些信息,然后把这些信息添加到 pdf 文件的元数据中。在 "Producer" 字段中,土耳其语字符 ğ、ş 显示不正确。我通过使用 UTF-16 解决了这个问题:
write.Info.Put(new PdfName("Producer"), new PdfString("Ankara Üniversitesi Hukuk Fakültesi Dergisi (AÜHFD), C.59, S.2, y.2010, s.309-334.", "UTF-16"));
截图如下:
然后,我使用 foreach 循环遍历所有 pdf 文件,读取其元数据并插入到 SQLite 数据库文件中。问题就出在这里:当我从 pdf 文件中读取以 UTF-16 编码的字符串(Producer 字段的数据)并写入数据库文件时,会出现这样的奇怪字符:
我不明白,为什么会出现错误。
编辑:这是我的全部代码。以下代码从文本文件获取元数据,并写入 pdf 文件的元数据部分:
// Rewrites every PDF found in FILE_PATH with document-info metadata taken from
// article_meta_data.txt, then replaces the original file with the rewritten copy.
var articles = Directory.GetFiles(FILE_PATH, "*.pdf");
// Article file names like, 1.pdf, 2.pdf, 3.pdf....
// article_meta_data.txt file content like this:
//1@Article 1 Tag Number@Article 1 first - last page number@Article 1 Title@Article 1 Author@Article 1 Subject@Article 1 Keywords
//2@Article 2 Tag Number@Article 2 first - last page number@Article 2 Title@Article 2 Author@Article 2 Subject@Article 2 Keywords
//3@Article 3 Tag Number@Article 3 first - last page number@Article 3 Title@Article 3 Author@Article 3 Subject@Article 3 Keywords
// The metadata file is not modified by the loop, so it is read once instead of
// once per PDF (and not at all when there is nothing to process).
var lines = articles.Length > 0
    ? File.ReadAllLines(FILE_PATH + @"article_meta_data.txt")
    : new string[0];
foreach (var article in articles)
{
    var file_name = Path.GetFileName(article);
    // Resolve the metadata before opening any file, so a malformed file name or
    // metadata line cannot leave a reader or a half-created temp file behind.
    // N.pdf maps to line N of the metadata file, i.e. zero-based index N - 1.
    var pdf_file_name = Convert.ToInt32(Path.GetFileNameWithoutExtension(article)) - 1;
    var info = lines[pdf_file_name].Split('@');
    var producer = Kunye(info); // It returns like: Ankara Üniversitesi Hukuk Fakültesi Dergisi (AÜHFD), C.59, S.2, y.2010, s.309-334.
    // Keywords are the 7th field; a line that omits it yields an empty string.
    var keywords = info.Length > 6 ? info[6] : "";

    var read = new PdfReader(article);
    try
    {
        var size = read.GetPageSizeWithRotation(1);
        var doc = new Document(size);
        // The using block guarantees the temp file handle is released even when
        // the copy fails, and before File.Delete / File.Move run.
        using (var output = new FileStream(TEMP_PATH + file_name, FileMode.Create, FileAccess.Write))
        {
            var write = PdfWriter.GetInstance(doc, output);
            doc.AddTitle(info[3]);
            doc.AddSubject(info[5]);
            doc.AddCreator("UzPDF");
            doc.AddAuthor(info[4]);
            // NOTE(review): "UTF-16" is the encoding this question is about. The value
            // stored this way comes back garbled when read through PdfReader.Info;
            // the accompanying answer recommends PdfString.TEXT_UNICODE instead.
            write.Info.Put(new PdfName("Producer"), new PdfString(producer, "UTF-16"));
            doc.AddKeywords(keywords);
            doc.Open();
            var cb = write.DirectContent;
            // Copy every page of the source PDF onto a fresh page of the new document.
            for (var page_number = 1; page_number <= read.NumberOfPages; page_number++)
            {
                doc.NewPage();
                var page = write.GetImportedPage(read, page_number);
                cb.AddTemplate(page, 0, 0);
            }
            doc.Close();
        }
    }
    finally
    {
        // Always release the source file, otherwise File.Delete below can fail.
        read.Close();
    }
    // Swap the rewritten copy in for the original.
    File.Delete(article);
    File.Move(TEMP_PATH + file_name, FILE_PATH + file_name);
}
下面的代码从文件中获取数据并插入 SQLite 数据库文件。对于数据库操作,我使用 Devart - dotConnect for SQLite.
// Reads the document-info metadata of every PDF in FILE_PATH and queues one
// `article` row per file on the data context; all rows are submitted at the end.
var files = Directory.GetFiles(FILE_PATH, "*.pdf");
var connection = new Linq2SQLiteDataContext();
TruncateTable(connection);

// Returns the trimmed Info entry, or "" when the PDF has no such entry
// (the plain indexer throws KeyNotFoundException for a missing key).
System.Func<PdfReader, string, string> field = (reader, key) =>
{
    string value;
    return reader.Info.TryGetValue(key, out value) && value != null ? value.Trim() : "";
};
// Cuts a value down to the given maximum length; null is passed through unchanged.
System.Func<string, int, string> clip = (text, max) =>
    text != null && text.Length > max ? text.Substring(0, max) : text;

var i = 1;
foreach (var file in files)
{
    string title, author, producer, subject, keywords;
    var read = new PdfReader(file);
    try
    {
        title = field(read, "Title");
        author = field(read, "Author");
        producer = field(read, "Producer");
        subject = field(read, "Subject");
        keywords = field(read, "Keywords");
    }
    finally
    {
        // Release the reader as soon as the metadata has been copied out.
        read.Close();
    }
    var file_name = Path.GetFileName(file)?.Trim();
    // The length limits are presumably the column sizes of the table - TODO confirm.
    var art = new article
    {
        id = i,
        title = clip(title, 255),
        author = clip(author, 100),
        producer = clip(producer, 255),
        filename = clip(file_name, 50),
        subject = clip(subject, 50),
        keywords = clip(keywords, 500),
        createdate = File.GetCreationTime(file),
        update = File.GetLastWriteTime(file)
    };
    connection.articles.InsertOnSubmit(art);
    i++;
}
connection.SubmitChanges();
而不是:
new PdfString(producer, "UTF-16")
使用:
new PdfString(producer, PdfString.TEXT_UNICODE)
UTF-16 是一种存储 Unicode 值的特定方式,但您无需担心,iText 会为您处理一切。
首先,我从一个文本文件中读取一些信息,然后把这些信息添加到 pdf 文件的元数据中。在 "Producer" 字段中,土耳其语字符 ğ、ş 显示不正确。我通过使用 UTF-16 解决了这个问题:
write.Info.Put(new PdfName("Producer"), new PdfString("Ankara Üniversitesi Hukuk Fakültesi Dergisi (AÜHFD), C.59, S.2, y.2010, s.309-334.", "UTF-16"));
截图如下:
然后,我使用 foreach 循环遍历所有 pdf 文件,读取其元数据并插入到 SQLite 数据库文件中。问题就出在这里:当我从 pdf 文件中读取以 UTF-16 编码的字符串(Producer 字段的数据)并写入数据库文件时,会出现这样的奇怪字符:
我不明白,为什么会出现错误。
编辑:这是我的全部代码。以下代码从文本文件获取元数据,并写入 pdf 文件的元数据部分:
// Rewrites every PDF found in FILE_PATH with document-info metadata taken from
// article_meta_data.txt, then replaces the original file with the rewritten copy.
var articles = Directory.GetFiles(FILE_PATH, "*.pdf");
// Article file names like, 1.pdf, 2.pdf, 3.pdf....
// article_meta_data.txt file content like this:
//1@Article 1 Tag Number@Article 1 first - last page number@Article 1 Title@Article 1 Author@Article 1 Subject@Article 1 Keywords
//2@Article 2 Tag Number@Article 2 first - last page number@Article 2 Title@Article 2 Author@Article 2 Subject@Article 2 Keywords
//3@Article 3 Tag Number@Article 3 first - last page number@Article 3 Title@Article 3 Author@Article 3 Subject@Article 3 Keywords
// The metadata file is not modified by the loop, so it is read once instead of
// once per PDF (and not at all when there is nothing to process).
var lines = articles.Length > 0
    ? File.ReadAllLines(FILE_PATH + @"article_meta_data.txt")
    : new string[0];
foreach (var article in articles)
{
    var file_name = Path.GetFileName(article);
    // Resolve the metadata before opening any file, so a malformed file name or
    // metadata line cannot leave a reader or a half-created temp file behind.
    // N.pdf maps to line N of the metadata file, i.e. zero-based index N - 1.
    var pdf_file_name = Convert.ToInt32(Path.GetFileNameWithoutExtension(article)) - 1;
    var info = lines[pdf_file_name].Split('@');
    var producer = Kunye(info); // It returns like: Ankara Üniversitesi Hukuk Fakültesi Dergisi (AÜHFD), C.59, S.2, y.2010, s.309-334.
    // Keywords are the 7th field; a line that omits it yields an empty string.
    var keywords = info.Length > 6 ? info[6] : "";

    var read = new PdfReader(article);
    try
    {
        var size = read.GetPageSizeWithRotation(1);
        var doc = new Document(size);
        // The using block guarantees the temp file handle is released even when
        // the copy fails, and before File.Delete / File.Move run.
        using (var output = new FileStream(TEMP_PATH + file_name, FileMode.Create, FileAccess.Write))
        {
            var write = PdfWriter.GetInstance(doc, output);
            doc.AddTitle(info[3]);
            doc.AddSubject(info[5]);
            doc.AddCreator("UzPDF");
            doc.AddAuthor(info[4]);
            // NOTE(review): "UTF-16" is the encoding this question is about. The value
            // stored this way comes back garbled when read through PdfReader.Info;
            // the accompanying answer recommends PdfString.TEXT_UNICODE instead.
            write.Info.Put(new PdfName("Producer"), new PdfString(producer, "UTF-16"));
            doc.AddKeywords(keywords);
            doc.Open();
            var cb = write.DirectContent;
            // Copy every page of the source PDF onto a fresh page of the new document.
            for (var page_number = 1; page_number <= read.NumberOfPages; page_number++)
            {
                doc.NewPage();
                var page = write.GetImportedPage(read, page_number);
                cb.AddTemplate(page, 0, 0);
            }
            doc.Close();
        }
    }
    finally
    {
        // Always release the source file, otherwise File.Delete below can fail.
        read.Close();
    }
    // Swap the rewritten copy in for the original.
    File.Delete(article);
    File.Move(TEMP_PATH + file_name, FILE_PATH + file_name);
}
下面的代码从文件中获取数据并插入 SQLite 数据库文件。对于数据库操作,我使用 Devart - dotConnect for SQLite.
// Reads the document-info metadata of every PDF in FILE_PATH and queues one
// `article` row per file on the data context; all rows are submitted at the end.
var files = Directory.GetFiles(FILE_PATH, "*.pdf");
var connection = new Linq2SQLiteDataContext();
TruncateTable(connection);

// Returns the trimmed Info entry, or "" when the PDF has no such entry
// (the plain indexer throws KeyNotFoundException for a missing key).
System.Func<PdfReader, string, string> field = (reader, key) =>
{
    string value;
    return reader.Info.TryGetValue(key, out value) && value != null ? value.Trim() : "";
};
// Cuts a value down to the given maximum length; null is passed through unchanged.
System.Func<string, int, string> clip = (text, max) =>
    text != null && text.Length > max ? text.Substring(0, max) : text;

var i = 1;
foreach (var file in files)
{
    string title, author, producer, subject, keywords;
    var read = new PdfReader(file);
    try
    {
        title = field(read, "Title");
        author = field(read, "Author");
        producer = field(read, "Producer");
        subject = field(read, "Subject");
        keywords = field(read, "Keywords");
    }
    finally
    {
        // Release the reader as soon as the metadata has been copied out.
        read.Close();
    }
    var file_name = Path.GetFileName(file)?.Trim();
    // The length limits are presumably the column sizes of the table - TODO confirm.
    var art = new article
    {
        id = i,
        title = clip(title, 255),
        author = clip(author, 100),
        producer = clip(producer, 255),
        filename = clip(file_name, 50),
        subject = clip(subject, 50),
        keywords = clip(keywords, 500),
        createdate = File.GetCreationTime(file),
        update = File.GetLastWriteTime(file)
    };
    connection.articles.InsertOnSubmit(art);
    i++;
}
connection.SubmitChanges();
而不是:
new PdfString(producer, "UTF-16")
使用:
new PdfString(producer, PdfString.TEXT_UNICODE)
UTF-16 是一种存储 Unicode 值的特定方式,但您无需担心,iText 会为您处理一切。