Pytesseract-OCR日文错误检测
Pytesseract-OCR wrong japanese detection
我从Here安装了Pystesseract 5.0,安装时标明了日文。
它没有正确检测日语字母。
注意:同样的代码检测英文字符没问题
这是我的代码:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
#################################################################################################
# Read your file
file = 'removed.png'
img = cv2.imread(file, 0)
img.shape
# thresholding the image to a binary image
thresh, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
# inverting the image
img_bin = 255 - img_bin
# Plotting the image to see the output
plotting = plt.imshow(img_bin, cmap='gray')
plt.show()
# Define a kernel to detect rectangular boxes
# Length(width) of kernel as 100th of total width
kernel_len = np.array(img).shape[1] // 100
# Defining a vertical kernel to detect all vertical lines of image
ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
# Defining a horizontal kernel to detect all horizontal lines of image
hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
# A kernel of 2x2
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
#### Vertical LINES ####
# Use vertical kernel to detect and save the vertical lines in a jpg
image_1 = cv2.erode(img_bin, ver_kernel, iterations=5)
vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=45)
# Plot the generated image
plotting = plt.imshow(image_1, cmap='gray')
plt.show()
#### HORTIZONAL LINES ####
# Use horizontal kernel to detect and save the horizontal lines in a jpg
image_2 = cv2.erode(img_bin, hor_kernel, iterations=5)
horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=45)
# Plot the generated image
plotting = plt.imshow(image_2, cmap='gray')
plt.show()
# Combining both H and V
# Combine horizontal and vertical lines in a new third image, with both having same weight.
img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
# Eroding and thesholding the image
img_vh = cv2.erode(~img_vh, kernel, iterations=1)
thresh, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
plotting = plt.imshow(img_vh, cmap='gray')
plt.show()
##
bitxor = cv2.bitwise_xor(img, img_vh)
bitnot = cv2.bitwise_not(bitxor)
# Plotting the generated image
plotting = plt.imshow(bitnot, cmap='gray')
plt.show()
# Detect contours for following box detection
contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
def sort_contours(cnts, method="left-to-right"):
# initialize the reverse flag and sort index
reverse = False
i = 0
# handle if we need to sort in reverse
if method == "right-to-left" or method == "bottom-to-top":
reverse = True
# handle if we are sorting against the y-coordinate rather than
# the x-coordinate of the bounding box
if method == "top-to-bottom" or method == "bottom-to-top":
i = 1
# construct the list of bounding boxes and sort them from top to
# bottom
boundingBoxes = [cv2.boundingRect(c) for c in cnts]
(cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
key=lambda b: b[1][i], reverse=reverse))
# return the list of sorted contours and bounding boxes
return (cnts, boundingBoxes)
# Sort all the contours by top to bottom.
contours, boundingBoxes = sort_contours(contours, method="bottom-to-top")
################
# Creating a list of heights for all detected boxes
heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]
# Get mean of heights
mean = np.mean(heights)
# Create list box to store all boxes in
box = []
# Get position (x,y), width and height for every contour and show the contour on image
for c in contours:
x, y, w, h = cv2.boundingRect(c)
if (w > 110 and h < 85):
image = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
box.append([x, y, w, h])
plotting = plt.imshow(image, cmap="cividis")
plt.show()
# Creating two lists to define row and column in which cell is located
row = []
column = []
j = 0
# Sorting the boxes to their respective row and column
for i in range(len(box)):
if (i == 0):
column.append(box[i])
previous = box[i]
else:
if (box[i][1] <= previous[1] + mean / 2):
column.append(box[i])
previous = box[i]
if (i == len(box) - 1):
row.append(column)
else:
row.append(column)
column = []
previous = box[i]
column.append(box[i])
print("Columns:", column)
print("Rows: ", row)
# calculating maximum number of cells
countcol = 0
for i in range(len(row)):
countcol = len(row[i])
if countcol > countcol:
countcol = countcol
print("Max cells: ", countcol)
# Retrieving the center of each column
center = [int(row[i][j][0] + row[i][j][2] / 2) for j in range(len(row[i])) if row[0]]
center = np.array(center)
center.sort()
# Regarding the distance to the columns center, the boxes are arranged in respective order
finalboxes = []
for i in range(len(row)):
lis = []
for k in range(countcol):
lis.append([])
for j in range(len(row[i])):
diff = abs(center - (row[i][j][0] + row[i][j][2] / 4))
minimum = min(diff)
indexing = list(diff).index(minimum)
lis[indexing].append(row[i][j])
finalboxes.append(lis)
# from every single image-based cell/box the strings are extracted via pytesseract and stored in a list
outer = []
for i in range(len(finalboxes)):
for j in range(len(finalboxes[i])):
inner = ''
if (len(finalboxes[i][j]) == 0):
outer.append(' ')
else:
for k in range(len(finalboxes[i][j])):
y, x, w, h = finalboxes[i][j][k][0], finalboxes[i][j][k][1], finalboxes[i][j][k][2], \
finalboxes[i][j][k][3]
finalimg = bitnot[x:x + h, y:y + w]
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
border = cv2.copyMakeBorder(finalimg, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
dilation = cv2.dilate(resizing, kernel, iterations=1)
erosion = cv2.erode(dilation, kernel, iterations=1)
out = pytesseract.image_to_string(erosion)
if (len(out) == 0):
out = pytesseract.image_to_string(erosion, config='--psm 3')
inner = inner + " " + out
outer.append(inner)
##Printing all detected cells
for element in outer:
if len(element) > 10:
for line in element.splitlines():
if len(line)>2:
print("##", line.strip())
这是最后一段代码的输出:
## LotNo. |
## fo
## 900-1829-B
## 2021/09/17
## 1, 000
## Wes FE —Cu-ESEIRNI
## LCH2
## 4X17. 05
## TYYY GISA-5AIB
## /448642-2 B
如您所见,它很难检测日文字母
tesseract
默认仅使用 English
,您可能必须将其他语言设置为参数。
在控制台你可以测试它
tesseract.exe image.png output.txt -l jpn
甚至有多种语言
tesseract.exe image.png output.txt -l jpn+eng
(而不是 output.txt
你可以使用 -
直接在控制台中显示文本)
在代码中可以是
pytesseract.image_to_string('image.png', config='-l jpn')
pytesseract.image_to_string('image.png', config='-l jpn+eng')
或
pytesseract.image_to_string('image.png', lang='jpn')
pytesseract.image_to_string('image.png', lang='jpn+eng')
我从Here安装了Pystesseract 5.0,安装时标明了日文。 它没有正确检测日语字母。 注意:同样的代码检测英文字符没问题
这是我的代码:
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
try:
from PIL import Image
except ImportError:
import Image
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
#################################################################################################
# Read your file
file = 'removed.png'
img = cv2.imread(file, 0)
img.shape
# thresholding the image to a binary image
thresh, img_bin = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
# inverting the image
img_bin = 255 - img_bin
# Plotting the image to see the output
plotting = plt.imshow(img_bin, cmap='gray')
plt.show()
# Define a kernel to detect rectangular boxes
# Length(width) of kernel as 100th of total width
kernel_len = np.array(img).shape[1] // 100
# Defining a vertical kernel to detect all vertical lines of image
ver_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel_len))
# Defining a horizontal kernel to detect all horizontal lines of image
hor_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (kernel_len, 1))
# A kernel of 2x2
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))
#### Vertical LINES ####
# Use vertical kernel to detect and save the vertical lines in a jpg
image_1 = cv2.erode(img_bin, ver_kernel, iterations=5)
vertical_lines = cv2.dilate(image_1, ver_kernel, iterations=45)
# Plot the generated image
plotting = plt.imshow(image_1, cmap='gray')
plt.show()
#### HORTIZONAL LINES ####
# Use horizontal kernel to detect and save the horizontal lines in a jpg
image_2 = cv2.erode(img_bin, hor_kernel, iterations=5)
horizontal_lines = cv2.dilate(image_2, hor_kernel, iterations=45)
# Plot the generated image
plotting = plt.imshow(image_2, cmap='gray')
plt.show()
# Combining both H and V
# Combine horizontal and vertical lines in a new third image, with both having same weight.
img_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
# Eroding and thesholding the image
img_vh = cv2.erode(~img_vh, kernel, iterations=1)
thresh, img_vh = cv2.threshold(img_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
plotting = plt.imshow(img_vh, cmap='gray')
plt.show()
##
bitxor = cv2.bitwise_xor(img, img_vh)
bitnot = cv2.bitwise_not(bitxor)
# Plotting the generated image
plotting = plt.imshow(bitnot, cmap='gray')
plt.show()
# Detect contours for following box detection
contours, hierarchy = cv2.findContours(img_vh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
def sort_contours(cnts, method="left-to-right"):
# initialize the reverse flag and sort index
reverse = False
i = 0
# handle if we need to sort in reverse
if method == "right-to-left" or method == "bottom-to-top":
reverse = True
# handle if we are sorting against the y-coordinate rather than
# the x-coordinate of the bounding box
if method == "top-to-bottom" or method == "bottom-to-top":
i = 1
# construct the list of bounding boxes and sort them from top to
# bottom
boundingBoxes = [cv2.boundingRect(c) for c in cnts]
(cnts, boundingBoxes) = zip(*sorted(zip(cnts, boundingBoxes),
key=lambda b: b[1][i], reverse=reverse))
# return the list of sorted contours and bounding boxes
return (cnts, boundingBoxes)
# Sort all the contours by top to bottom.
contours, boundingBoxes = sort_contours(contours, method="bottom-to-top")
################
# Creating a list of heights for all detected boxes
heights = [boundingBoxes[i][3] for i in range(len(boundingBoxes))]
# Get mean of heights
mean = np.mean(heights)
# Create list box to store all boxes in
box = []
# Get position (x,y), width and height for every contour and show the contour on image
for c in contours:
x, y, w, h = cv2.boundingRect(c)
if (w > 110 and h < 85):
image = cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
box.append([x, y, w, h])
plotting = plt.imshow(image, cmap="cividis")
plt.show()
# Creating two lists to define row and column in which cell is located
row = []
column = []
j = 0
# Sorting the boxes to their respective row and column
for i in range(len(box)):
if (i == 0):
column.append(box[i])
previous = box[i]
else:
if (box[i][1] <= previous[1] + mean / 2):
column.append(box[i])
previous = box[i]
if (i == len(box) - 1):
row.append(column)
else:
row.append(column)
column = []
previous = box[i]
column.append(box[i])
print("Columns:", column)
print("Rows: ", row)
# calculating maximum number of cells
countcol = 0
for i in range(len(row)):
countcol = len(row[i])
if countcol > countcol:
countcol = countcol
print("Max cells: ", countcol)
# Retrieving the center of each column
center = [int(row[i][j][0] + row[i][j][2] / 2) for j in range(len(row[i])) if row[0]]
center = np.array(center)
center.sort()
# Regarding the distance to the columns center, the boxes are arranged in respective order
finalboxes = []
for i in range(len(row)):
lis = []
for k in range(countcol):
lis.append([])
for j in range(len(row[i])):
diff = abs(center - (row[i][j][0] + row[i][j][2] / 4))
minimum = min(diff)
indexing = list(diff).index(minimum)
lis[indexing].append(row[i][j])
finalboxes.append(lis)
# from every single image-based cell/box the strings are extracted via pytesseract and stored in a list
outer = []
for i in range(len(finalboxes)):
for j in range(len(finalboxes[i])):
inner = ''
if (len(finalboxes[i][j]) == 0):
outer.append(' ')
else:
for k in range(len(finalboxes[i][j])):
y, x, w, h = finalboxes[i][j][k][0], finalboxes[i][j][k][1], finalboxes[i][j][k][2], \
finalboxes[i][j][k][3]
finalimg = bitnot[x:x + h, y:y + w]
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 1))
border = cv2.copyMakeBorder(finalimg, 2, 2, 2, 2, cv2.BORDER_CONSTANT, value=[255, 255])
resizing = cv2.resize(border, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
dilation = cv2.dilate(resizing, kernel, iterations=1)
erosion = cv2.erode(dilation, kernel, iterations=1)
out = pytesseract.image_to_string(erosion)
if (len(out) == 0):
out = pytesseract.image_to_string(erosion, config='--psm 3')
inner = inner + " " + out
outer.append(inner)
##Printing all detected cells
for element in outer:
if len(element) > 10:
for line in element.splitlines():
if len(line)>2:
print("##", line.strip())
这是最后一段代码的输出:
## LotNo. |
## fo
## 900-1829-B
## 2021/09/17
## 1, 000
## Wes FE —Cu-ESEIRNI
## LCH2
## 4X17. 05
## TYYY GISA-5AIB
## /448642-2 B
如您所见,它很难检测日文字母
tesseract
默认仅使用 English
,您可能必须将其他语言设置为参数。
在控制台你可以测试它
tesseract.exe image.png output.txt -l jpn
甚至有多种语言
tesseract.exe image.png output.txt -l jpn+eng
(而不是 output.txt
你可以使用 -
直接在控制台中显示文本)
在代码中可以是
pytesseract.image_to_string('image.png', config='-l jpn')
pytesseract.image_to_string('image.png', config='-l jpn+eng')
或
pytesseract.image_to_string('image.png', lang='jpn')
pytesseract.image_to_string('image.png', lang='jpn+eng')