Tesseract 识别 android 中的阿拉伯文本
Tesseract recognizing Arabic text in android
我正在开发一个应用程序,我使用 Tesseract OCR 来识别图像中的文本。我针对英语和日语对其进行了测试,并且运行良好,但是当我尝试使用阿拉伯语时,应用程序甚至在启动之前就崩溃了!为什么?
阿拉伯语和 Tesseract OCR 有什么问题?有人可以告诉我吗?
代码:
/**
 * Single-screen demo that runs Tesseract OCR over a bundled Arabic test image
 * and shows the recognized text in a {@code TextView}.
 */
public class MainActivity extends AppCompatActivity {
    /** Tesseract language code; it must match the name of the bundled {@code .traineddata} file. */
    private static final String LANGUAGE = "ara";
    /** Folder name, used both inside the assets and below {@code datapath}, holding the trained data. */
    private static final String TESSDATA_DIR = "tessdata";

    Bitmap image;
    private TessBaseAPI mTess;
    String datapath = "";
    /** Set only when the trained data is present and {@code init} reported success. */
    private boolean tessReady;

    /**
     * Loads the test image, makes sure the trained data is on the file system,
     * and initializes the OCR engine.
     *
     * @param savedInstanceState state handed over by the framework; passed to the superclass
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // Image that will be fed to the OCR engine.
        image = BitmapFactory.decodeResource(getResources(), R.drawable.test_ara);

        // The engine is pointed at a directory, so the asset is copied there first.
        datapath = getFilesDir() + "/tesseract/";
        checkFile(new File(datapath, TESSDATA_DIR));

        mTess = new TessBaseAPI();
        // Skip init when the data file is missing, and remember whether init succeeded,
        // so that recognition is never attempted on an uninitialized engine.
        // NOTE(review): older Tesseract 3.x builds reportedly need the Cube engine mode and
        // Cube data files for Arabic - confirm against the bundled library version.
        tessReady = trainedDataFile().exists() && mTess.init(datapath, LANGUAGE);
    }

    /**
     * Click handler: runs OCR on the loaded image and displays the result.
     * Does nothing when the engine failed to initialize or the image could not be decoded.
     *
     * @param view the view that triggered the call (unused)
     */
    public void processImage(View view) {
        if (!tessReady || image == null) {
            return;
        }
        mTess.setImage(image);
        String recognized = mTess.getUTF8Text();
        TextView resultView = (TextView) findViewById(R.id.OCRTextView);
        resultView.setText(recognized);
    }

    /**
     * Ensures {@code dir} exists and that the trained data file has been copied out of the assets.
     *
     * @param dir the {@code tessdata} directory below {@code datapath}
     */
    private void checkFile(File dir) {
        if (!dir.exists() && !dir.mkdirs()) {
            // Directory could not be created, so there is nowhere to copy the data to.
            return;
        }
        if (!trainedDataFile().exists()) {
            copyFiles();
        }
    }

    /**
     * Copies the trained data from the APK assets to {@link #trainedDataFile()}.
     * On failure the partially written file is removed so a later launch retries the copy.
     */
    private void copyFiles() {
        File target = trainedDataFile();
        String assetName = TESSDATA_DIR + "/" + LANGUAGE + ".traineddata";
        // try-with-resources closes both streams even when the copy fails half-way.
        try (InputStream in = getAssets().open(assetName);
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[1024];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Best effort: a truncated file would otherwise pass the exists() check forever.
            //noinspection ResultOfMethodCallIgnored
            target.delete();
        }
    }

    /** Returns the on-device location of the trained data file for {@link #LANGUAGE}. */
    private File trainedDataFile() {
        return new File(new File(datapath, TESSDATA_DIR), LANGUAGE + ".traineddata");
    }
}
我得到的错误:
04-16 18:37:08.451 7405-7405/com.imperialsoupgmail.tesseractexample A/libc: Fatal signal 11 (SIGSEGV), code 1, fault addr 0x0 in tid 7405 (esseractexample)
对于阿拉伯语,您需要使用 Cube:调用 init() 时使用 OEM_CUBE_ONLY 引擎模式,并使用 Cube 数据文件。
我正在开发一个应用程序,我使用 Tesseract OCR 来识别图像中的文本。我针对英语和日语对其进行了测试,并且运行良好,但是当我尝试使用阿拉伯语时,应用程序甚至在启动之前就崩溃了!为什么?
阿拉伯语和 Tesseract OCR 有什么问题?有人可以告诉我吗?
代码:
/**
 * Single-screen demo that runs Tesseract OCR over a bundled Arabic test image
 * and shows the recognized text in a {@code TextView}.
 */
public class MainActivity extends AppCompatActivity {
    /** Tesseract language code; it must match the name of the bundled {@code .traineddata} file. */
    private static final String LANGUAGE = "ara";
    /** Folder name, used both inside the assets and below {@code datapath}, holding the trained data. */
    private static final String TESSDATA_DIR = "tessdata";

    Bitmap image;
    private TessBaseAPI mTess;
    String datapath = "";
    /** Set only when the trained data is present and {@code init} reported success. */
    private boolean tessReady;

    /**
     * Loads the test image, makes sure the trained data is on the file system,
     * and initializes the OCR engine.
     *
     * @param savedInstanceState state handed over by the framework; passed to the superclass
     */
    @Override
    protected void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.activity_main);

        // Image that will be fed to the OCR engine.
        image = BitmapFactory.decodeResource(getResources(), R.drawable.test_ara);

        // The engine is pointed at a directory, so the asset is copied there first.
        datapath = getFilesDir() + "/tesseract/";
        checkFile(new File(datapath, TESSDATA_DIR));

        mTess = new TessBaseAPI();
        // Skip init when the data file is missing, and remember whether init succeeded,
        // so that recognition is never attempted on an uninitialized engine.
        // NOTE(review): older Tesseract 3.x builds reportedly need the Cube engine mode and
        // Cube data files for Arabic - confirm against the bundled library version.
        tessReady = trainedDataFile().exists() && mTess.init(datapath, LANGUAGE);
    }

    /**
     * Click handler: runs OCR on the loaded image and displays the result.
     * Does nothing when the engine failed to initialize or the image could not be decoded.
     *
     * @param view the view that triggered the call (unused)
     */
    public void processImage(View view) {
        if (!tessReady || image == null) {
            return;
        }
        mTess.setImage(image);
        String recognized = mTess.getUTF8Text();
        TextView resultView = (TextView) findViewById(R.id.OCRTextView);
        resultView.setText(recognized);
    }

    /**
     * Ensures {@code dir} exists and that the trained data file has been copied out of the assets.
     *
     * @param dir the {@code tessdata} directory below {@code datapath}
     */
    private void checkFile(File dir) {
        if (!dir.exists() && !dir.mkdirs()) {
            // Directory could not be created, so there is nowhere to copy the data to.
            return;
        }
        if (!trainedDataFile().exists()) {
            copyFiles();
        }
    }

    /**
     * Copies the trained data from the APK assets to {@link #trainedDataFile()}.
     * On failure the partially written file is removed so a later launch retries the copy.
     */
    private void copyFiles() {
        File target = trainedDataFile();
        String assetName = TESSDATA_DIR + "/" + LANGUAGE + ".traineddata";
        // try-with-resources closes both streams even when the copy fails half-way.
        try (InputStream in = getAssets().open(assetName);
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[1024];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
            // Best effort: a truncated file would otherwise pass the exists() check forever.
            //noinspection ResultOfMethodCallIgnored
            target.delete();
        }
    }

    /** Returns the on-device location of the trained data file for {@link #LANGUAGE}. */
    private File trainedDataFile() {
        return new File(new File(datapath, TESSDATA_DIR), LANGUAGE + ".traineddata");
    }
}
我得到的错误:
04-16 18:37:08.451 7405-7405/com.imperialsoupgmail.tesseractexample A/libc: Fatal signal 11 (SIGSEGV), code 1, fault addr 0x0 in tid 7405 (esseractexample)
对于阿拉伯语,您需要使用 Cube:调用 init() 时使用 OEM_CUBE_ONLY 引擎模式,并使用 Cube 数据文件。