如何使用 pytesseract 检测渐变背景上的彩色文本
How to detect colored text on gradient background with pytesseract
我目前正在开发一个小型 OCR 机器人。我几乎完成了所有工作,现在正在努力改进 OCR。具体来说,它有两个问题:相同颜色渐变上的 orange/red-ish 文本和出于某种原因的“1/1”的第一个 1。可悲的是,我还没有找到对我的情况有用的东西。我制作了一张小测试图,由多张图片组成,如下:
Source Image
Results
Adaptive Threshold
如您所见,渐变会产生一个斑点,该斑点有时大到足以与第一个单词(参见“学徒”)重叠,从而产生垃圾。
我尝试了很多变体,尝试了阈值、模糊、侵蚀、膨胀、使用膨胀方法进行框检测等,但没有一个效果很好。我摆脱斑点的唯一方法是使用自适应阈值。但遗憾的是我无法使用输出图像获得好的结果。
如果有人知道如何使 OCR 更强大、提高准确性并消除斑点,我将非常感谢您的帮助。谢谢
下面的代码是我'playground'想出来的更好的方法:
import cv2
import pytesseract
import numpy as np
pytesseract.pytesseract.tesseract_cmd = YOUR_PATH
def resize(img, scale_percent=300):
# use this instead?
# resize = image = imutils.resize(image, width=300)
# automatically resizes it about 300% by default
width = int(img.shape[1] * scale_percent / 100)
height = int(img.shape[0] * scale_percent / 100)
dim = (width, height)
resized = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
return resized
def preprocessImage(img, scale=300, threshhold=127):
""" input RGB colour space """
# makes results more accurate - inspired from
# another resource to improve accuracy - https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html
# converts from rgb to grayscale then enlarges it
# applies gaussian blur
# convert to b&w
# invert black and white colours (white background, black text)
grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
cv2.imshow('grayscale', grayscale)
resized = resize(grayscale, scale)
cv2.imshow('resized', resized)
blurred = cv2.medianBlur(resized, 5)
#cv2.imshow('median', blurred)
blurred = cv2.GaussianBlur(resized, (5, 5), 5)
cv2.imshow('1', blurred)
cv2.waitKey()
blackAndWhite = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
cv2.imshow('blackAndWhite', blackAndWhite)
th3 = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
cv2.floodFill(th3, None, (0, 0), 255)
cv2.imshow('th3', th3)
#kernel = np.ones((3, 3), np.uint8)
#erode = cv2.erode(th3, kernel)
kernel = np.ones((5, 5), np.uint8)
#opening = cv2.morphologyEx(blackAndWhite, cv2.MORPH_OPEN, kernel)
invertedColours = cv2.bitwise_not(blackAndWhite)
return invertedColours
# excerpt from https://www.youtube.com/watch?v=6DjFscX4I_c
def imageToText(img):
# returns item name from image, preprocess if needed
boxes = pytesseract.image_to_data(img)
num = []
for count, box in enumerate(boxes.splitlines()):
if (count != 0):
box = box.split()
if (len(box) == 12):
text = box[11].strip('@®')
if (text != ''):
num.append(text)
text = ' '.join(num)
## Alternate method
# text = pytesseract.image_to_string(img)
# print("Name:", text)
return text
if __name__ == "__main__":
img = cv2.imread("test.png")
img = preprocessImage(img, scale=300)
print(imageToText(img))
##############################################
##### Detecting Words ######
##############################################
#[ 0 1 2 3 4 5 6 7 8 9 10 11 ]
#['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text']
boxes = pytesseract.image_to_data(img)
# convert back to colored image
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
# draw boxes and text
for a,b in enumerate(boxes.splitlines()):
print(b)
if a!=0:
b = b.split()
if len(b)==12:
x,y,w,h = int(b[6]),int(b[7]),int(b[8]),int(b[9])
cv2.putText(img,b[11],(x,y-5),cv2.FONT_HERSHEY_SIMPLEX,1,(50,50,255),2)
cv2.rectangle(img, (x,y), (x+w, y+h), (0, 0, 255), 2)
cv2.imshow('img', img)
cv2.waitKey(0)
我无法做到完美,但差不多...
我从 CLAHE 均衡中受益匪浅。请参阅教程 here. But that wasn't enough. Still needed thresholding. Adaptive techniques didn't work well, but cv2.THRESH_TOZERO gives OK results. See thresholding tutorial here
import cv2
from pytesseract import image_to_string, image_to_data
img = cv2.imread('gradient.png', cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (0,0), fx=2.0, fy=2.0)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
img = clahe.apply(img)
img = 255-img # invert image. tesseract prefers black text on white background
ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_TOZERO)
cv2.imwrite('output.png', img)
ocr = image_to_string(img, config='--psm 6')
print(ocr)
给出 ocr 输出
Tool Crafting Part
Apprentice Craft Kit
Adept Craft Kit
Expert Craft Kit
=
Master Craft Kit
1/1
我目前正在开发一个小型 OCR 机器人。我几乎完成了所有工作,现在正在努力改进 OCR。具体来说,它有两个问题:相同颜色渐变上的 orange/red-ish 文本和出于某种原因的“1/1”的第一个 1。可悲的是,我还没有找到对我的情况有用的东西。我制作了一张小测试图,由多张图片组成,如下:
Source Image
Results
Adaptive Threshold
如您所见,渐变会产生一个斑点,该斑点有时大到足以与第一个单词(参见“学徒”)重叠,从而产生垃圾。
我尝试了很多变体,尝试了阈值、模糊、侵蚀、膨胀、使用膨胀方法进行框检测等,但没有一个效果很好。我摆脱斑点的唯一方法是使用自适应阈值。但遗憾的是我无法使用输出图像获得好的结果。
如果有人知道如何使 OCR 更强大、提高准确性并消除斑点,我将非常感谢您的帮助。谢谢
下面的代码是我'playground'想出来的更好的方法:
import cv2
import pytesseract
import numpy as np
pytesseract.pytesseract.tesseract_cmd = YOUR_PATH
def resize(img, scale_percent=300):
# use this instead?
# resize = image = imutils.resize(image, width=300)
# automatically resizes it about 300% by default
width = int(img.shape[1] * scale_percent / 100)
height = int(img.shape[0] * scale_percent / 100)
dim = (width, height)
resized = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
return resized
def preprocessImage(img, scale=300, threshhold=127):
""" input RGB colour space """
# makes results more accurate - inspired from
# another resource to improve accuracy - https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html
# converts from rgb to grayscale then enlarges it
# applies gaussian blur
# convert to b&w
# invert black and white colours (white background, black text)
grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
cv2.imshow('grayscale', grayscale)
resized = resize(grayscale, scale)
cv2.imshow('resized', resized)
blurred = cv2.medianBlur(resized, 5)
#cv2.imshow('median', blurred)
blurred = cv2.GaussianBlur(resized, (5, 5), 5)
cv2.imshow('1', blurred)
cv2.waitKey()
blackAndWhite = cv2.threshold(blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
cv2.imshow('blackAndWhite', blackAndWhite)
th3 = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2)
cv2.floodFill(th3, None, (0, 0), 255)
cv2.imshow('th3', th3)
#kernel = np.ones((3, 3), np.uint8)
#erode = cv2.erode(th3, kernel)
kernel = np.ones((5, 5), np.uint8)
#opening = cv2.morphologyEx(blackAndWhite, cv2.MORPH_OPEN, kernel)
invertedColours = cv2.bitwise_not(blackAndWhite)
return invertedColours
# excerpt from https://www.youtube.com/watch?v=6DjFscX4I_c
def imageToText(img):
# returns item name from image, preprocess if needed
boxes = pytesseract.image_to_data(img)
num = []
for count, box in enumerate(boxes.splitlines()):
if (count != 0):
box = box.split()
if (len(box) == 12):
text = box[11].strip('@®')
if (text != ''):
num.append(text)
text = ' '.join(num)
## Alternate method
# text = pytesseract.image_to_string(img)
# print("Name:", text)
return text
if __name__ == "__main__":
img = cv2.imread("test.png")
img = preprocessImage(img, scale=300)
print(imageToText(img))
##############################################
##### Detecting Words ######
##############################################
#[ 0 1 2 3 4 5 6 7 8 9 10 11 ]
#['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text']
boxes = pytesseract.image_to_data(img)
# convert back to colored image
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
# draw boxes and text
for a,b in enumerate(boxes.splitlines()):
print(b)
if a!=0:
b = b.split()
if len(b)==12:
x,y,w,h = int(b[6]),int(b[7]),int(b[8]),int(b[9])
cv2.putText(img,b[11],(x,y-5),cv2.FONT_HERSHEY_SIMPLEX,1,(50,50,255),2)
cv2.rectangle(img, (x,y), (x+w, y+h), (0, 0, 255), 2)
cv2.imshow('img', img)
cv2.waitKey(0)
我无法做到完美,但差不多...
我从 CLAHE 均衡中受益匪浅。请参阅教程 here. But that wasn't enough. Still needed thresholding. Adaptive techniques didn't work well, but cv2.THRESH_TOZERO gives OK results. See thresholding tutorial here
import cv2
from pytesseract import image_to_string, image_to_data
img = cv2.imread('gradient.png', cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, (0,0), fx=2.0, fy=2.0)
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
img = clahe.apply(img)
img = 255-img # invert image. tesseract prefers black text on white background
ret, img = cv2.threshold(img, 127, 255, cv2.THRESH_TOZERO)
cv2.imwrite('output.png', img)
ocr = image_to_string(img, config='--psm 6')
print(ocr)
给出 ocr 输出
Tool Crafting Part
Apprentice Craft Kit
Adept Craft Kit
Expert Craft Kit
=
Master Craft Kit
1/1