C# 托管代码优化
C# Managed Code optimization
我有一个在我的 C# 应用程序中使用的托管 C++ DLL。 DLL 正在处理大量图像(数千张)并使用 OCR 从中提取文本;尽管我知道 OCR 处理会消耗很多 CPU,但我想知道是否可以优化代码以获得更好的性能。
目前解析大约 15 页 PNG 图像需要一分钟左右。我希望能把这个时间缩短到 30-40 秒左右。
C++代码:
char* OCRWrapper::GetUTF8Text(char* path, char* lang, char* imgPath)
{
char* imageText;
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
if (api->Init(path, lang)) {
fprintf(stderr, "Could not initialize tesseract. Incorrect datapath or incorrect lanauge\n"); /*This should throw an error to the caller*/
exit(1);
}
/*Open a reference to the imagepath*/
Pix *image = pixRead(imgPath);
/*Read the image object;*/
api->SetImage(image);
// Get OCR result
imageText = api->GetUTF8Text();
/*writeToFile(outText);*/
/*printf("OCR output:\n%s", imageText);*/
/*Destroy the text*/
api->End();
pixDestroy(&image);
/*std::string x = std::string(imageText);*/
return imageText;
}
下面是创建 OCRObject 类实例的 C# 方法。OCRObject 是实际调用 DLL 的类,其定义见该方法之后。
/// <summary>
/// Runs OCR over every image path in list order. Each page's text is added to
/// <c>m_pdfDictionary</c> under its 1-based position, and the concatenation of
/// all pages is assigned to <c>AllPageText</c>.
/// Any exception raised along the way is logged through <c>Logger.Log</c> and not rethrown.
/// </summary>
/// <param name="imagesPath">Paths of the saved page images, in page order.</param>
private void GetTextFromSavedImages(List<string> imagesPath)
{
    try
    {
        StringBuilder combinedText = new StringBuilder();

        // Both OCR settings are read from the application settings of the current profile.
        string language = this.dbHandler.GetApplicationSetting(this.m_ProfileName, "TesseractLanguage").ApplicationSettingValue;
        string dataPath = this.dbHandler.GetApplicationSetting(this.m_ProfileName, "TesseractConfigurationDataPath").ApplicationSettingValue;
        OCRObject engine = new OCRObject(language, dataPath);

        for (int pageNumber = 1; pageNumber <= imagesPath.Count; pageNumber++)
        {
            string recognized = engine.GetOCRText(imagesPath[pageNumber - 1]);
            this.m_pdfDictionary.Add(pageNumber, recognized);
            combinedText.Append(recognized);
        }

        this.AllPageText = combinedText.ToString();
    }
    catch (Exception ex)
    {
        Logger.Log(ex.ToString(), LogInformationType.Error);
    }
}
最后是 OCRObject 类:
/// <summary>
/// Managed wrapper around the <c>GetUTF8Text</c> export of the native OCR DLL.
/// Holds the Tesseract language and data path and passes them on every call.
/// </summary>
public class OCRObject
{
    // Assigned once in the constructor and never changed afterwards.
    private readonly string m_tessLanguage;
    private readonly string m_tessConfPath;

    [DllImport(@"\OCR\OCR.dll", EntryPoint = "GetUTF8Text", CallingConvention = CallingConvention.Cdecl)]
    private static extern IntPtr GetUTF8Text(string path, string lang, string imgPath);

    /// <summary>
    /// Creates a wrapper bound to the given language and data directory.
    /// </summary>
    /// <param name="language">Tesseract language; must not be null or empty.</param>
    /// <param name="tessConfPath">Tesseract data directory; must exist.</param>
    /// <exception cref="ArgumentException"><paramref name="language"/> is null or empty.</exception>
    /// <exception cref="DirectoryNotFoundException"><paramref name="tessConfPath"/> does not exist.</exception>
    public OCRObject(string language, string tessConfPath)
    {
        if (string.IsNullOrEmpty(language))
        {
            throw new ArgumentException("Tesseract language is null or empty.");
        }

        if (!System.IO.Directory.Exists(tessConfPath))
        {
            throw new DirectoryNotFoundException("Could not find directory => " + tessConfPath);
        }

        this.m_tessLanguage = language;
        this.m_tessConfPath = tessConfPath;
    }

    /// <summary>
    /// Calls the native OCR routine for one image and returns its text.
    /// </summary>
    /// <param name="imagePath">Path of the image to process.</param>
    /// <returns>The decoded text, or an empty string when the native call returns a null pointer.</returns>
    public string GetOCRText(string imagePath)
    {
        // NOTE(review): the buffer behind the returned pointer is never released here;
        // presumably the native side allocates it per call - confirm ownership and
        // expose a native free function if so.
        IntPtr nativeText = GetUTF8Text(this.m_tessConfPath, this.m_tessLanguage, imagePath);
        return StringFromNativeUtf8(nativeText);
    }

    /// <summary>
    /// Decodes a NUL-terminated UTF-8 buffer into a managed string.
    /// </summary>
    /// <param name="nativeUtf8">Pointer to the native buffer; may be <see cref="IntPtr.Zero"/>.</param>
    /// <returns>The decoded string, or an empty string for a null pointer or when decoding fails.</returns>
    private static string StringFromNativeUtf8(IntPtr nativeUtf8)
    {
        try
        {
            if (nativeUtf8 == IntPtr.Zero)
            {
                return string.Empty;
            }

            // Find the terminating NUL to learn the byte length.
            int len = 0;
            while (Marshal.ReadByte(nativeUtf8, len) != 0)
            {
                ++len;
            }

            byte[] buffer = new byte[len];
            Marshal.Copy(nativeUtf8, buffer, 0, buffer.Length);
            return Encoding.UTF8.GetString(buffer);
        }
        catch
        {
            // Best effort: a failed read or decode yields an empty page rather than an exception.
            return string.Empty;
        }
    }
}
如果您需要更多详细信息,请告诉我。
Tesseract 的 FAQ 建议以并行方式运行它(这也暗示它本身是单线程的)。
您可以尝试用 Parallel.For 替换您的 for 循环,看看能否借此快速取得立竿见影的效果。
(注意:并行化之后,需要保证对 m_pdfDictionary 的写入是线程安全的,并在最后按页码顺序拼接各页文本。)
编辑:他们已移至 GitHub,新的常见问题解答建议
You will get better results having Tesseract produce one page PDF
files in parallel, then splicing them together at the end
我有一个在我的 C# 应用程序中使用的托管 C++ DLL。 DLL 正在处理大量图像(数千张)并使用 OCR 从中提取文本;尽管我知道 OCR 处理会消耗很多 CPU,但我想知道是否可以优化代码以获得更好的性能。
目前解析大约 15 页 PNG 图像需要一分钟左右。我希望能把这个时间缩短到 30-40 秒左右。
C++代码:
char* OCRWrapper::GetUTF8Text(char* path, char* lang, char* imgPath)
{
char* imageText;
tesseract::TessBaseAPI *api = new tesseract::TessBaseAPI();
if (api->Init(path, lang)) {
fprintf(stderr, "Could not initialize tesseract. Incorrect datapath or incorrect lanauge\n"); /*This should throw an error to the caller*/
exit(1);
}
/*Open a reference to the imagepath*/
Pix *image = pixRead(imgPath);
/*Read the image object;*/
api->SetImage(image);
// Get OCR result
imageText = api->GetUTF8Text();
/*writeToFile(outText);*/
/*printf("OCR output:\n%s", imageText);*/
/*Destroy the text*/
api->End();
pixDestroy(&image);
/*std::string x = std::string(imageText);*/
return imageText;
}
下面是创建 OCRObject 类实例的 C# 方法。OCRObject 是实际调用 DLL 的类,其定义见该方法之后。
/// <summary>
/// Runs OCR over every image path in list order. Each page's text is added to
/// <c>m_pdfDictionary</c> under its 1-based position, and the concatenation of
/// all pages is assigned to <c>AllPageText</c>.
/// Any exception raised along the way is logged through <c>Logger.Log</c> and not rethrown.
/// </summary>
/// <param name="imagesPath">Paths of the saved page images, in page order.</param>
private void GetTextFromSavedImages(List<string> imagesPath)
{
    try
    {
        StringBuilder combinedText = new StringBuilder();

        // Both OCR settings are read from the application settings of the current profile.
        string language = this.dbHandler.GetApplicationSetting(this.m_ProfileName, "TesseractLanguage").ApplicationSettingValue;
        string dataPath = this.dbHandler.GetApplicationSetting(this.m_ProfileName, "TesseractConfigurationDataPath").ApplicationSettingValue;
        OCRObject engine = new OCRObject(language, dataPath);

        for (int pageNumber = 1; pageNumber <= imagesPath.Count; pageNumber++)
        {
            string recognized = engine.GetOCRText(imagesPath[pageNumber - 1]);
            this.m_pdfDictionary.Add(pageNumber, recognized);
            combinedText.Append(recognized);
        }

        this.AllPageText = combinedText.ToString();
    }
    catch (Exception ex)
    {
        Logger.Log(ex.ToString(), LogInformationType.Error);
    }
}
最后是 OCRObject 类:
/// <summary>
/// Managed wrapper around the <c>GetUTF8Text</c> export of the native OCR DLL.
/// Holds the Tesseract language and data path and passes them on every call.
/// </summary>
public class OCRObject
{
    // Assigned once in the constructor and never changed afterwards.
    private readonly string m_tessLanguage;
    private readonly string m_tessConfPath;

    [DllImport(@"\OCR\OCR.dll", EntryPoint = "GetUTF8Text", CallingConvention = CallingConvention.Cdecl)]
    private static extern IntPtr GetUTF8Text(string path, string lang, string imgPath);

    /// <summary>
    /// Creates a wrapper bound to the given language and data directory.
    /// </summary>
    /// <param name="language">Tesseract language; must not be null or empty.</param>
    /// <param name="tessConfPath">Tesseract data directory; must exist.</param>
    /// <exception cref="ArgumentException"><paramref name="language"/> is null or empty.</exception>
    /// <exception cref="DirectoryNotFoundException"><paramref name="tessConfPath"/> does not exist.</exception>
    public OCRObject(string language, string tessConfPath)
    {
        if (string.IsNullOrEmpty(language))
        {
            throw new ArgumentException("Tesseract language is null or empty.");
        }

        if (!System.IO.Directory.Exists(tessConfPath))
        {
            throw new DirectoryNotFoundException("Could not find directory => " + tessConfPath);
        }

        this.m_tessLanguage = language;
        this.m_tessConfPath = tessConfPath;
    }

    /// <summary>
    /// Calls the native OCR routine for one image and returns its text.
    /// </summary>
    /// <param name="imagePath">Path of the image to process.</param>
    /// <returns>The decoded text, or an empty string when the native call returns a null pointer.</returns>
    public string GetOCRText(string imagePath)
    {
        // NOTE(review): the buffer behind the returned pointer is never released here;
        // presumably the native side allocates it per call - confirm ownership and
        // expose a native free function if so.
        IntPtr nativeText = GetUTF8Text(this.m_tessConfPath, this.m_tessLanguage, imagePath);
        return StringFromNativeUtf8(nativeText);
    }

    /// <summary>
    /// Decodes a NUL-terminated UTF-8 buffer into a managed string.
    /// </summary>
    /// <param name="nativeUtf8">Pointer to the native buffer; may be <see cref="IntPtr.Zero"/>.</param>
    /// <returns>The decoded string, or an empty string for a null pointer or when decoding fails.</returns>
    private static string StringFromNativeUtf8(IntPtr nativeUtf8)
    {
        try
        {
            if (nativeUtf8 == IntPtr.Zero)
            {
                return string.Empty;
            }

            // Find the terminating NUL to learn the byte length.
            int len = 0;
            while (Marshal.ReadByte(nativeUtf8, len) != 0)
            {
                ++len;
            }

            byte[] buffer = new byte[len];
            Marshal.Copy(nativeUtf8, buffer, 0, buffer.Length);
            return Encoding.UTF8.GetString(buffer);
        }
        catch
        {
            // Best effort: a failed read or decode yields an empty page rather than an exception.
            return string.Empty;
        }
    }
}
如果您需要更多详细信息,请告诉我。
Tesseract 的 FAQ 建议以并行方式运行它(这也暗示它本身是单线程的)。
您可以尝试用 Parallel.For 替换您的 for 循环,看看能否借此快速取得立竿见影的效果。
(注意:并行化之后,需要保证对 m_pdfDictionary 的写入是线程安全的,并在最后按页码顺序拼接各页文本。)
编辑:他们已移至 GitHub,新的常见问题解答建议
You will get better results having Tesseract produce one page PDF files in parallel, then splicing them together at the end