如何使用 pytesseract 检测渐变背景上的彩色文本

How to detect colored text on gradient background with pytesseract

我目前正在开发一个小型 OCR 机器人。我几乎完成了所有工作,现在正在努力改进 OCR。具体来说,它有两个问题:一是橙红色(orange/red-ish)文本落在同色渐变背景上时识别困难,二是不知为何"1/1"中的第一个"1"总是识别错误。可悲的是,我还没有找到对我的情况有用的方法。我用多张图片拼成了一张小测试图,如下:

Source Image

Results

Adaptive Threshold

如您所见,渐变会产生一个斑点,该斑点有时大到足以与第一个单词(参见“学徒”)重叠,从而产生垃圾。

我尝试了很多变体,尝试了阈值、模糊、侵蚀、膨胀、使用膨胀方法进行框检测等,但没有一个效果很好。我摆脱斑点的唯一方法是使用自适应阈值。但遗憾的是我无法使用输出图像获得好的结果。

如果有人知道如何使 OCR 更强大、提高准确性并消除斑点,我将非常感谢您的帮助。谢谢

下面的代码是我在"playground"里反复尝试后得到的效果最好的方法:

import cv2
import pytesseract
import numpy as np

# Path to the Tesseract executable. YOUR_PATH is a placeholder — replace it
# with the actual install location, e.g. r"C:\Program Files\Tesseract-OCR\tesseract.exe".
pytesseract.pytesseract.tesseract_cmd = YOUR_PATH

def resize(img, scale_percent=300):
    """Return *img* scaled to ``scale_percent`` percent of its size.

    Defaults to a 300% (3x) enlargement, which helps Tesseract on small text.

    NOTE(review): cv2.INTER_AREA is generally recommended for *shrinking*;
    for enlarging, INTER_CUBIC/INTER_LINEAR usually give sharper results —
    kept as-is to preserve current behaviour. (Alternative:
    imutils.resize(image, width=300).)
    """
    # Same arithmetic order as before so integer truncation is unchanged.
    new_w = int(img.shape[1] * scale_percent / 100)
    new_h = int(img.shape[0] * scale_percent / 100)
    return cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)


def preprocessImage(img, scale=300, threshhold=127):
    """Prepare an RGB/BGR screenshot for Tesseract OCR.

    Pipeline (inspired by
    https://tesseract-ocr.github.io/tessdoc/ImproveQuality.html):
    grayscale -> enlarge -> Gaussian blur -> Otsu binarisation -> invert,
    yielding black text on a white background, which Tesseract prefers.

    Parameters
    ----------
    img : numpy.ndarray
        Input image (3-channel colour).
    scale : int, optional
        Percentage passed to resize() (default 300, i.e. 3x enlargement).
    threshhold : int, optional
        Kept for backward compatibility; unused — Otsu picks the global
        threshold automatically.

    Returns
    -------
    numpy.ndarray
        Inverted binary image ready for pytesseract.

    Notes
    -----
    Cleaned up relative to the original playground version:
    - removed blocking debug cv2.imshow()/cv2.waitKey() calls;
    - removed a cv2.medianBlur() whose result was immediately overwritten;
    - removed the unused adaptive-threshold/floodFill branch and unused kernel.
    """
    grayscale = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    resized = resize(grayscale, scale)

    # Smooth out gradient noise before thresholding.
    blurred = cv2.GaussianBlur(resized, (5, 5), 5)

    # Otsu chooses the threshold automatically (the 0 argument is ignored).
    blackAndWhite = cv2.threshold(
        blurred, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    # Invert: Tesseract works best with black text on a white background.
    return cv2.bitwise_not(blackAndWhite)


# excerpt from https://www.youtube.com/watch?v=6DjFscX4I_c
def imageToText(img):
    """Extract the recognised words from *img* via Tesseract's TSV output.

    Skips the header row, keeps only complete 12-column rows (only those
    carry a 'text' field), strips stray '@'/'®' characters, and joins the
    surviving words with single spaces.
    """
    tsv_rows = pytesseract.image_to_data(img).splitlines()
    words = []
    for row in tsv_rows[1:]:            # row 0 is the column header
        fields = row.split()
        if len(fields) != 12:           # row has no recognised text
            continue
        word = fields[11].strip('@®')
        if word:
            words.append(word)
    ## Alternate method
    # text = pytesseract.image_to_string(img)
    # print("Name:", text)
    return ' '.join(words)


if __name__ == "__main__":
    img = cv2.imread("test.png")
    img = preprocessImage(img, scale=300)
    print(imageToText(img))

    ##############################################
    ##### Detecting Words  ######
    ##############################################
    #[   0          1           2           3           4          5         6       7       8        9        10       11 ]
    #['level', 'page_num', 'block_num', 'par_num', 'line_num', 'word_num', 'left', 'top', 'width', 'height', 'conf', 'text']

    boxes = pytesseract.image_to_data(img)
    # convert back to colored image
    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    # draw boxes and text
    for a,b in enumerate(boxes.splitlines()):
            print(b)
            if a!=0:
                b = b.split()
                if len(b)==12:
                    x,y,w,h = int(b[6]),int(b[7]),int(b[8]),int(b[9])
                    cv2.putText(img,b[11],(x,y-5),cv2.FONT_HERSHEY_SIMPLEX,1,(50,50,255),2)
                    cv2.rectangle(img, (x,y), (x+w, y+h), (0, 0, 255), 2)

    cv2.imshow('img', img)
    cv2.waitKey(0)

我无法做到完美,但差不多...

我从 CLAHE 均衡化中受益匪浅(参见教程 here)。但仅靠它还不够,仍然需要阈值处理。自适应阈值方法效果不佳,而 cv2.THRESH_TOZERO 能给出不错的结果(参见阈值处理教程 here)。

import cv2
from pytesseract import image_to_string, image_to_data


# Load as grayscale, upscale 2x, then boost local contrast with CLAHE
# before inverting and thresholding for Tesseract.
gray = cv2.imread('gradient.png', cv2.IMREAD_GRAYSCALE)
gray = cv2.resize(gray, (0,0), fx=2.0, fy=2.0)

# Contrast-limited adaptive histogram equalisation flattens the gradient.
equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
gray = equalizer.apply(gray)

# Invert: tesseract prefers black text on white background.
gray = 255-gray

# THRESH_TOZERO: pixels below 127 become 0, the rest keep their value.
_, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_TOZERO)

cv2.imwrite('output.png', binary)

# --psm 6: assume a single uniform block of text.
ocr = image_to_string(binary, config='--psm 6')
print(ocr)

给出 ocr 输出

Tool Crafting Part
Apprentice Craft Kit
Adept Craft Kit
Expert Craft Kit
=
Master Craft Kit
1/1