OpenCV 和 Tesseract 在门标签检测上的应用
OpenCV and Tesseract on door label detection
我对 OpenCV 和 Tesseract 还很陌生。我最近正在构建一个使用计算机视觉检测门标签的项目,希望能对视障人士有所帮助。
程序的思路是先对输入图像进行预处理,将其转换为二值图像,然后使用 Canny 边缘检测找出门标签的轮廓,再对 Canny 的结果进行膨胀。之后把图像交给 Tesseract,并尝试用矩形框标出检测到的文本。
预期结果是在文本周围绘制绿色矩形,同时打印出文本本身。
问题是缺少矩形和文本检测失败。
我试过这些:
- Recognize Text in images using Canny Edge detection in Opencv
这些问题和解决方案要么过于简单,要么与我的情况无关,有些也不是用 Python 写的。
下面附上我对代码的尝试:
import pytesseract as pytess
import cv2 as cv
import numpy as np
from PIL import Image
from pytesseract import Output
# Path of the door-label photo to analyse.
image_path = r"C:\Users\User\Desktop\dataset\p\Image_31.jpg"

# Read the photo as a single-channel grayscale image (flag 0).
img = cv.imread(image_path, 0)
# cv.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside cv.Canny.
if img is None:
    raise FileNotFoundError(f"could not read image: {image_path}")

# edges stores the Canny version of img (thresholds 100 and 200)
edges = cv.Canny(img, 100, 200)

# ker as in kernel: a (3, 3) matrix of ones, uint8 is the datatype
ker = np.ones((3, 3), np.uint8)

# dil as in dilation
# edges as the src, ker is the kernel we set above, one dilation pass
dil = cv.dilate(edges, ker, iterations=1)

# setup pytesseract parameters
configs = r'--oem 3 --psm 6'

# feed image to tesseract; Output.DICT gives one list per column
result = pytess.image_to_data(dil, output_type=Output.DICT, config=configs, lang='eng')
print(result.keys())

boxes = len(result['text'])

# dil has a single channel, so the colour (0, 255, 0) would be drawn as the
# value 0 (black) and be invisible on the black background. Convert the copy
# to 3-channel BGR so the rectangles really are green.
new_item = cv.cvtColor(dil, cv.COLOR_GRAY2BGR)

for sequence_number in range(boxes):
    # float() instead of int(): int() raises ValueError on a fractional
    # string such as "96.5", while float() accepts ints, floats and
    # numeric strings alike.
    if float(result['conf'][sequence_number]) > 30:
        (x, y, w, h) = (result['left'][sequence_number], result['top'][sequence_number],
                        result['width'][sequence_number], result['height'][sequence_number])
        # cv.rectangle draws in place on new_item
        cv.rectangle(new_item, (x, y), (x + w, y + h), (0, 255, 0), 2)

# detect sentence with tesseract
# pending as rectangle not achieved

cv.imshow("original", img)
cv.imshow("canny", edges)
cv.imshow("dilation", dil)
cv.imshow("capturedText", new_item)

#ignore below this line, it is only for testing
#testobj = Image.fromarray(dil)
#testtext = pytess.image_to_string(testobj, lang='eng')
#print(testtext)

# Keep the windows open until a key is pressed, then close them all.
cv.waitKey(0)
cv.destroyAllWindows()
结果图像:
代码的测试部分返回的结果如下:
a)
Meets
这显然没有达到目标。
编辑
发布问题后,我意识到我一开始的做法可能就错了。我应该先使用 OpenCV 检测门标签的轮廓并隔离出包含文本的部分,然后再把矩形中的内容送去做 OCR 识别。
EDIT2
多亏了 Stack Overflow 的成员,我现在已经确定了问题所在。我正在尝试加入图像校正(image rectification)/图像变形(image warping)技术来获得正面视图,从而使系统更准确。稍后更新。
EDIT3
经过一定的错误修复,减少约束,同时允许函数在原始图像上绘制,我得到了以下结果。同时附上更新后的代码。
import cv2 as cv
import numpy as np
import pytesseract as pytess
from pytesseract import Output
# input of img source
image_path = r"C:\Users\User\Desktop\dataset\p\Image_31.jpg"
img = cv.imread(image_path)
# cv.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside cv.cvtColor.
if img is None:
    raise FileNotFoundError(f"could not read image: {image_path}")

# necessary image color conversion (Canny below is run on the grayscale copy)
img2 = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

# edges store the canny version of img2 (thresholds 100 and 200)
edges = cv.Canny(img2, 100, 200)

# ker as in kernel: a (3, 3) matrix of ones, uint8 is the datatype
ker = np.ones((3, 3), np.uint8)

# dil as in dilation
# edges as the src, ker is the kernel we set above, one dilation pass
dil = cv.dilate(edges, ker, iterations=1)

# setup pytesseract parameters
configs = r'--oem 3 --psm 6'

# feed image to tesseract; Output.DICT gives one list per column
result = pytess.image_to_data(dil, output_type=Output.DICT, config=configs, lang='eng')

# number of entries returned by tesseract
boxes = len(result['text'])

# make a new copy of the dilated edges; crops for OCR are taken from it so
# the rectangles drawn on img never end up inside a crop
new_item = dil.copy()

for sequence_number in range(boxes):
    # float() instead of int(): int() raises ValueError on a fractional
    # string such as "96.5", while float() accepts ints, floats and
    # numeric strings alike.
    if float(result['conf'][sequence_number]) > 0:  # removed constraints
        (x, y, w, h) = (result['left'][sequence_number], result['top'][sequence_number],
                        result['width'][sequence_number], result['height'][sequence_number])
        # draw rectangle boxes on the original img (3-channel, so green shows)
        cv.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 3)
        # Crop the image
        crp = new_item[y:y + h, x:x + w]
        # OCR
        txt = pytess.image_to_string(crp, config=configs)
        # returns recognised text
        print(txt)
        # Show this crop and wait for a key press before the next box.
        cv.imshow("capturedText", crp)
        cv.waitKey(0)

# cv.imshow("original", img)
# cv.imshow("canny", edges)
# cv.imshow("dilation", dil)

# Show the original image with every accepted box drawn on it.
cv.imshow("results", img)
cv.waitKey(0)
cv.destroyAllWindows()
我认为您在这里要找的是 Python 中的图像校正(image rectification,即对图像进行变形,使其看起来像是从另一个视角拍摄的),而且似乎已有现成的工具。但是,问题会更复杂一些,因为在您的情况下,还需要检测出应当如何校正。我不确定您应该怎么做。
您已在图片中找到所有检测到的文字:
# Quoted from the question: walk over every entry returned by image_to_data.
# NOTE: the loop and if bodies have lost their indentation in this copy.
for sequence_number in range(boxes):
# Keep only entries whose reported confidence is above 30.
if int(result['conf'][sequence_number]) > 30:
# Unpack this entry's bounding box: left, top, width, height.
(x, y, w, h) = (result['left'][sequence_number], result['top'][sequence_number],
result['width'][sequence_number], result['height'][sequence_number])
# Draw a 2-pixel rectangle with colour (0, 255, 0) around the box on new_item.
new_item = cv.rectangle(new_item, (x, y), (x + w, y + h), (0, 255, 0), 2)
但是你也说现在的置信度应该在70%以上。
- 如果我们移除约束
- 如果我们对每个新项目进行 OCR
结果将是:
现在如果你阅读:
# Run OCR on new_item with page segmentation mode 6 and print the recognised text.
txt = pytesseract.image_to_string(new_item, config="--psm 6")
print(txt)
OCR 将为:
Meeting Room §
以上是当前 pytesseract 版本(0.3.7)的输出。
代码:
# Load the libraries
import cv2
import pytesseract
# Load the image
img = cv2.imread("fsUSw.png")
# cv2.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside cv2.cvtColor.
if img is None:
    raise FileNotFoundError("could not read image: fsUSw.png")

# Convert it to the gray-scale
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# OCR detection; Output.DICT gives one list per column
d = pytesseract.image_to_data(gry, config="--psm 6", output_type=pytesseract.Output.DICT)

# Number of entries returned by the detection
n_boxes = len(d['level'])

# Only entry 1 is processed, as before. The min() guard avoids an IndexError
# on d['left'][1] when fewer than two entries are returned.
# NOTE(review): entry 0 is presumably the page-level box - confirm.
for i in range(1, min(2, n_boxes)):
    # Get the localized region
    (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])

    # Draw rectangle to the detected region (red in BGR, 5 px thick)
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 5)

    # Crop the image; taken from gry, so the rectangle drawn on img is not in it
    crp = gry[y:y + h, x:x + w]

    # OCR
    txt = pytesseract.image_to_string(crp, config="--psm 6")
    print(txt)

    # Display the cropped image
    cv2.imshow("crp", crp)
    cv2.waitKey(0)

# Display
cv2.imshow("img", img)
cv2.waitKey(0)
我对 OpenCV 和 Tesseract 还很陌生。我最近正在构建一个使用计算机视觉检测门标签的项目,希望能对视障人士有所帮助。
程序的思路是先对输入图像进行预处理,将其转换为二值图像,然后使用 Canny 边缘检测找出门标签的轮廓,再对 Canny 的结果进行膨胀。之后把图像交给 Tesseract,并尝试用矩形框标出检测到的文本。
预期结果是在文本周围绘制绿色矩形,同时打印出文本本身。
问题是缺少矩形和文本检测失败。
我试过这些:
- Recognize Text in images using Canny Edge detection in Opencv
这些问题和解决方案要么过于简单,要么与我的情况无关,有些也不是用 Python 写的。
下面附上我对代码的尝试:
import pytesseract as pytess
import cv2 as cv
import numpy as np
from PIL import Image
from pytesseract import Output
# Path of the door-label photo to analyse.
image_path = r"C:\Users\User\Desktop\dataset\p\Image_31.jpg"

# Read the photo as a single-channel grayscale image (flag 0).
img = cv.imread(image_path, 0)
# cv.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside cv.Canny.
if img is None:
    raise FileNotFoundError(f"could not read image: {image_path}")

# edges stores the Canny version of img (thresholds 100 and 200)
edges = cv.Canny(img, 100, 200)

# ker as in kernel: a (3, 3) matrix of ones, uint8 is the datatype
ker = np.ones((3, 3), np.uint8)

# dil as in dilation
# edges as the src, ker is the kernel we set above, one dilation pass
dil = cv.dilate(edges, ker, iterations=1)

# setup pytesseract parameters
configs = r'--oem 3 --psm 6'

# feed image to tesseract; Output.DICT gives one list per column
result = pytess.image_to_data(dil, output_type=Output.DICT, config=configs, lang='eng')
print(result.keys())

boxes = len(result['text'])

# dil has a single channel, so the colour (0, 255, 0) would be drawn as the
# value 0 (black) and be invisible on the black background. Convert the copy
# to 3-channel BGR so the rectangles really are green.
new_item = cv.cvtColor(dil, cv.COLOR_GRAY2BGR)

for sequence_number in range(boxes):
    # float() instead of int(): int() raises ValueError on a fractional
    # string such as "96.5", while float() accepts ints, floats and
    # numeric strings alike.
    if float(result['conf'][sequence_number]) > 30:
        (x, y, w, h) = (result['left'][sequence_number], result['top'][sequence_number],
                        result['width'][sequence_number], result['height'][sequence_number])
        # cv.rectangle draws in place on new_item
        cv.rectangle(new_item, (x, y), (x + w, y + h), (0, 255, 0), 2)

# detect sentence with tesseract
# pending as rectangle not achieved

cv.imshow("original", img)
cv.imshow("canny", edges)
cv.imshow("dilation", dil)
cv.imshow("capturedText", new_item)

#ignore below this line, it is only for testing
#testobj = Image.fromarray(dil)
#testtext = pytess.image_to_string(testobj, lang='eng')
#print(testtext)

# Keep the windows open until a key is pressed, then close them all.
cv.waitKey(0)
cv.destroyAllWindows()
结果图像:
代码的测试部分返回的结果如下:
a)
Meets
这显然没有达到目标。
编辑
发布问题后,我意识到我一开始的做法可能就错了。我应该先使用 OpenCV 检测门标签的轮廓并隔离出包含文本的部分,然后再把矩形中的内容送去做 OCR 识别。
EDIT2
多亏了 Stack Overflow 的成员,我现在已经确定了问题所在。我正在尝试加入图像校正(image rectification)/图像变形(image warping)技术来获得正面视图,从而使系统更准确。稍后更新。
EDIT3
经过一定的错误修复,减少约束,同时允许函数在原始图像上绘制,我得到了以下结果。同时附上更新后的代码。
import cv2 as cv
import numpy as np
import pytesseract as pytess
from pytesseract import Output
# input of img source
image_path = r"C:\Users\User\Desktop\dataset\p\Image_31.jpg"
img = cv.imread(image_path)
# cv.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside cv.cvtColor.
if img is None:
    raise FileNotFoundError(f"could not read image: {image_path}")

# necessary image color conversion (Canny below is run on the grayscale copy)
img2 = cv.cvtColor(img, cv.COLOR_BGR2GRAY)

# edges store the canny version of img2 (thresholds 100 and 200)
edges = cv.Canny(img2, 100, 200)

# ker as in kernel: a (3, 3) matrix of ones, uint8 is the datatype
ker = np.ones((3, 3), np.uint8)

# dil as in dilation
# edges as the src, ker is the kernel we set above, one dilation pass
dil = cv.dilate(edges, ker, iterations=1)

# setup pytesseract parameters
configs = r'--oem 3 --psm 6'

# feed image to tesseract; Output.DICT gives one list per column
result = pytess.image_to_data(dil, output_type=Output.DICT, config=configs, lang='eng')

# number of entries returned by tesseract
boxes = len(result['text'])

# make a new copy of the dilated edges; crops for OCR are taken from it so
# the rectangles drawn on img never end up inside a crop
new_item = dil.copy()

for sequence_number in range(boxes):
    # float() instead of int(): int() raises ValueError on a fractional
    # string such as "96.5", while float() accepts ints, floats and
    # numeric strings alike.
    if float(result['conf'][sequence_number]) > 0:  # removed constraints
        (x, y, w, h) = (result['left'][sequence_number], result['top'][sequence_number],
                        result['width'][sequence_number], result['height'][sequence_number])
        # draw rectangle boxes on the original img (3-channel, so green shows)
        cv.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 3)
        # Crop the image
        crp = new_item[y:y + h, x:x + w]
        # OCR
        txt = pytess.image_to_string(crp, config=configs)
        # returns recognised text
        print(txt)
        # Show this crop and wait for a key press before the next box.
        cv.imshow("capturedText", crp)
        cv.waitKey(0)

# cv.imshow("original", img)
# cv.imshow("canny", edges)
# cv.imshow("dilation", dil)

# Show the original image with every accepted box drawn on it.
cv.imshow("results", img)
cv.waitKey(0)
cv.destroyAllWindows()
我认为您在这里要找的是 Python 中的图像校正(image rectification,即对图像进行变形,使其看起来像是从另一个视角拍摄的),而且似乎已有现成的工具。但是,问题会更复杂一些,因为在您的情况下,还需要检测出应当如何校正。我不确定您应该怎么做。
您已在图片中找到所有检测到的文字:
# Quoted from the question: walk over every entry returned by image_to_data.
# NOTE: the loop and if bodies have lost their indentation in this copy.
for sequence_number in range(boxes):
# Keep only entries whose reported confidence is above 30.
if int(result['conf'][sequence_number]) > 30:
# Unpack this entry's bounding box: left, top, width, height.
(x, y, w, h) = (result['left'][sequence_number], result['top'][sequence_number],
result['width'][sequence_number], result['height'][sequence_number])
# Draw a 2-pixel rectangle with colour (0, 255, 0) around the box on new_item.
new_item = cv.rectangle(new_item, (x, y), (x + w, y + h), (0, 255, 0), 2)
但是你也说现在的置信度应该在70%以上。
- 如果我们移除约束
- 如果我们对每个新项目进行 OCR
结果将是:
现在如果你阅读:
# Run OCR on new_item with page segmentation mode 6 and print the recognised text.
txt = pytesseract.image_to_string(new_item, config="--psm 6")
print(txt)
OCR 将为:
Meeting Room §
以上是当前 pytesseract 版本(0.3.7)的输出。
代码:
# Load the libraries
import cv2
import pytesseract
# Load the image
img = cv2.imread("fsUSw.png")
# cv2.imread returns None instead of raising when the file cannot be read,
# so fail here with a clear message rather than later inside cv2.cvtColor.
if img is None:
    raise FileNotFoundError("could not read image: fsUSw.png")

# Convert it to the gray-scale
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

# OCR detection; Output.DICT gives one list per column
d = pytesseract.image_to_data(gry, config="--psm 6", output_type=pytesseract.Output.DICT)

# Number of entries returned by the detection
n_boxes = len(d['level'])

# Only entry 1 is processed, as before. The min() guard avoids an IndexError
# on d['left'][1] when fewer than two entries are returned.
# NOTE(review): entry 0 is presumably the page-level box - confirm.
for i in range(1, min(2, n_boxes)):
    # Get the localized region
    (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])

    # Draw rectangle to the detected region (red in BGR, 5 px thick)
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 0, 255), 5)

    # Crop the image; taken from gry, so the rectangle drawn on img is not in it
    crp = gry[y:y + h, x:x + w]

    # OCR
    txt = pytesseract.image_to_string(crp, config="--psm 6")
    print(txt)

    # Display the cropped image
    cv2.imshow("crp", crp)
    cv2.waitKey(0)

# Display
cv2.imshow("img", img)
cv2.waitKey(0)