Removing scanning artifacts from an image, leaving only text (OpenCV + Python)

I'm trying to write a Python script to "clean up" scanned images before running them through Tesseract. Besides the text, the images contain some dust, scanning artifacts, odd lines near the page margins, and so on. Here's what a typical page looks like

Here's what I have so far. It tries to remove some of the dust with cv2.connectedComponentsWithStats, removes horizontal and vertical lines with morphological structuring elements, and then attempts to crop the image down to the text. It's better than nothing, since it does remove some noise, but it sometimes removes actual text as well and still leaves a few lines at the margins:

import logging

import cv2
import numpy as np

logging.info('Opening image ' + path)
image = cv2.imread(path, 0)  # flag 0 loads the image directly as grayscale
logging.info('Converting to grayscale...')
_, blackAndWhite = cv2.threshold(image, 127, 255, cv2.THRESH_BINARY_INV)
# Find and exclude small elements
logging.info('Removing small dotted regions (dust, etc.)...')
nlabels, labels, stats, centroids = cv2.connectedComponentsWithStats(blackAndWhite, None, None, None, 8, cv2.CV_32S)
sizes = stats[1:, -1] #get CC_STAT_AREA component
img2 = np.zeros((labels.shape), np.uint8)
for i in range(0, nlabels - 1):
    if sizes[i] >= 40:   #filter small dotted regions
        img2[labels == i + 1] = 255
image = cv2.bitwise_not(img2)
logging.info('Writing the modified image...')
cv2.imwrite(out_filename, image)
# ------ START CROPPING ----- #
image = cv2.imread(out_filename)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# Gaussian blur, then Otsu's threshold
logging.info("Applying Otsu's threshold")
blur = cv2.GaussianBlur(gray, (5,5), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Elongated kernels pick out horizontal and vertical line segments respectively
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (25,4))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,32))
detected_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
detected_vlines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel, iterations=2)

for l in [detected_lines, detected_vlines]:
    cnts = cv2.findContours(l, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]  # OpenCV 4 vs. 3 return values
    for c in cnts:
        # Paint over the detected line: black in the mask, white in the image
        cv2.drawContours(thresh, [c], -1, (0,0,0), 50)
        cv2.drawContours(image, [c], -1, (255,255,255), 50)

# Create rectangular structuring element and dilate
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (18,18))
dilate = cv2.dilate(thresh, kernel, iterations=4)
logging.info('Dilating text regions')

try:
    # Find contours of the dilated text regions
    cnts, hierarchy = cv2.findContours(dilate, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    logging.info('Extracting contours')
    # Collect the corner coordinates of every plausible text region
    arr = []
    for i,c in enumerate(cnts):
        x,y,w,h = cv2.boundingRect(c)
        # Exclude oddly shaped elements (very wide or tall blobs are likely lines, not text)
        if w/h > 8 or h/w > 1.6:
            continue
        arr.append((x,y))
        arr.append((x+w,y+h))
    # Calculate the coordinates and crop the image
    logging.info('Cropping the image')
    x,y,w,h = cv2.boundingRect(np.asarray(arr))
    image = image[y:y+h,x:x+w]
    if debug:
        logging.info('Showing the image (press "q" to continue)')
        label = "STAGE FOUR: CROPPED IMAGE"
        cv2.imshow(label, image)
        cv2.waitKey(0)
    logging.info('Writing to ' + out_filename)
except cv2.error:
    # boundingRect raises if no text contours survived the filtering;
    # fall through and write the uncropped image
    pass
cv2.imwrite(out_filename, image)

I'm still fairly new to image processing and don't have much experience. I'd appreciate some suggestions on how to improve the algorithm!

I would start by calling pytesseract.image_to_data() on the entire image. That gives you the position and OCR confidence of every detected word, including the garbage characters at the page edges. Then determine the region that contains valid text from the positions of the high-confidence words. Finally, run pytesseract.image_to_string() on that region to get the text (or filter the results of the pytesseract.image_to_data() call you already have; a sketch of that variant follows the code below).

This approach works for the given example. If you also want to remove the dust specks, you could look into salt-and-pepper noise filtering (see the median-filter sketch at the end), but it doesn't seem necessary here.

import cv2
import pandas as pd
import pytesseract
from io import StringIO

# Obtain OCR data
img_bgr = cv2.imread("XVePx.jpg")
img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
ocr_data = pytesseract.image_to_data(img_rgb, lang="deu")
ocr_df = pd.read_table(StringIO(ocr_data), quoting=3)

# Determine the text region based on the words (2+ characters) of high confidence (>90%)
confident_words_df = ocr_df[
    (ocr_df["conf"] > 90)
    & (ocr_df["text"].str.len() - ocr_df["text"].str.count(" ") > 1)
]
top = confident_words_df["top"].min()
left = confident_words_df["left"].min()
bot = (confident_words_df["top"] + confident_words_df["height"]).max()
right = (confident_words_df["left"] + confident_words_df["width"]).max()

# Obtain OCR string
ocr_string = pytesseract.image_to_string(img_rgb[top:bot, left:right, :], lang="deu")
print(ocr_string)
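
For the filtering variant mentioned above, a minimal sketch could look like this. It continues from the snippet above and assumes the block_num/par_num/line_num columns that image_to_data emits by default; note that the length filter also drops legitimate one-character words.

# Rebuild the text from the words already classified as confident,
# grouped by the block / paragraph / line indices from image_to_data
lines = (
    confident_words_df
    .groupby(["block_num", "par_num", "line_num"])["text"]
    .apply(lambda words: " ".join(words.astype(str)))
)
print("\n".join(lines))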
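
And for the salt-and-pepper filtering mentioned earlier, a median filter is the textbook remedy. A minimal, self-contained sketch; the kernel size of 3 is an assumption to tune against the actual speck size:

import cv2

img = cv2.imread("XVePx.jpg", cv2.IMREAD_GRAYSCALE)
# The median filter replaces each pixel with the median of its neighborhood,
# removing isolated specks while keeping stroke edges sharp
denoised = cv2.medianBlur(img, 3)  # kernel size must be an odd integer >= 3
cv2.imwrite("denoised.jpg", denoised)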