python 如何在不降低图像质量的情况下去除水平和垂直线

Question

我正在尝试从图像中删除水平线和垂直线。该图像是使用 pdf2jpg 库从 pdf 生成的。删除水平线和垂直线后，图像会被送入 pytesseract，以提取单词及其各自的坐标；这里为了测试，我只提取全文。我是 OpenCV 的新手，这段代码是我从不同网站（包括 Stack Overflow）收集代码片段拼凑而成的。除了偶尔会残留一些垂直线外，该代码几乎可以完美运行。这些残留的线段会干扰 tesseract，有时被识别为 I、1 或 |。此外，与原始图像相比，tesseract 在处理后的图像上的误读次数（例如 s 被读为 5，I 被读为 1 或 |，反之亦然）似乎更多。我认为这是因为处理后字体的清晰度低于原始图像。应该对这段代码做哪些修改，才能在不影响字体清晰度的情况下去除残留的垂直线？任何正确方向的建议或指导都将不胜感激。提前致谢。

from importlib import invalidate_caches
from pytesseract import image_to_string
#from pdf2image import convert_from_path
from pdf2jpg.pdf2jpg import convert_pdf2jpg
from PIL import Image
import sys
import cv2
import numpy

def pre_process(image, show=True):
    """Remove horizontal and vertical ruling lines from a page image.

    Long vertical and horizontal strokes are isolated with morphological
    opening, merged into a mask, and the masked pixels are painted white in
    the grayscale image.

    Args:
        image: Path to an image file (``str``) or an image array. A 3- or
            4-channel array is assumed to be in OpenCV BGR/BGRA order and is
            converted to grayscale first.
        show: When true (the default, matching the previous behaviour), the
            result is displayed in a window named "final" and the call
            blocks until a key is pressed. Pass ``False`` for batch runs.

    Returns:
        The grayscale image with the detected lines removed.

    Raises:
        OSError: If ``image`` is a path that OpenCV could not read.
    """
    if isinstance(image, str):
        path = image
        image = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise OSError("could not read image: {!r}".format(path))
    elif numpy.ndim(image) == 3:
        # Otsu thresholding below needs a single-channel image.
        channels = numpy.shape(image)[2]
        if channels == 4:
            image = cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
        elif channels == 3:
            image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Binarise (Otsu picks the threshold), then invert so ink is white.
    _, image_bin = cv2.threshold(
        image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    image_inv = 255 - image_bin

    # Kernel length is 1% of the image width.
    # NOTE(review): for images narrower than 100 px this is 0 -- confirm how
    # cv2.getStructuringElement treats a zero-sized kernel.
    kernel_len = numpy.shape(image)[1] // 100
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, kernel_len))
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (kernel_len, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode then dilate with the tall kernel: only vertical strokes survive.
    image_inv1 = cv2.erode(image_inv, vertical_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_inv1, vertical_kernel, iterations=3)

    # Same with the wide kernel: only horizontal strokes survive.
    image_inv2 = cv2.erode(image_inv, horizontal_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_inv2, horizontal_kernel, iterations=3)

    # Blend both line maps so a pixel on either kind of line is non-zero,
    # invert so lines are dark, erode, and re-binarise into a mask.
    image_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    image_vh = cv2.erode(~image_vh, kernel, iterations=2)
    _, image_vh = cv2.threshold(
        image_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Work on the inverted grayscale image so masked-out pixels (0) become
    # white (255) once inverted back.
    org_img_inv = cv2.bitwise_not(image)
    final_image_inv = cv2.bitwise_and(org_img_inv, org_img_inv, mask=image_vh)
    image = cv2.bitwise_not(final_image_inv)

    if show:
        cv2.imshow("final", image)
        cv2.waitKey(0)
    return image

if __name__ == "__main__":
    # Fail with a usage hint instead of an IndexError when no path is given.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} path_to_pdf_file".format(sys.argv[0]))
    pdf_path = sys.argv[1]

    # Render every page of the PDF to a JPEG under the "temp" directory.
    images = convert_pdf2jpg(pdf_path, "temp", dpi=100, pages="ALL")

    # OCR each page after the table lines have been removed.
    page_texts = []
    for image_path in images[0]["output_jpgfiles"]:
        image = pre_process(image_path)
        page_texts.append(image_to_string(image))

    # Each page's text is preceded by a newline, as in the earlier output.
    result = "".join("\n" + text for text in page_texts)

    # Explicit UTF-8: OCR output may contain characters that the platform's
    # default encoding cannot represent.
    with open("text.txt", "w", encoding="utf-8") as out:
        out.write(result)

附件中是我用作代码输入的 pdf，以及处理后图像的片段，供参考。

可以在命令提示符中用以下命令运行代码：

python .\read_pdf_ocr.py path_to_pdf_file

环境详情：

Python: 3.7.9
库：
- opencv-python: 4.4.0.46
- pdf2jpg: 1.0
- pytesseract: 0.3.6
Tesseract-OCR - 开源 OCR 引擎：v5.0.0-alpha.20200328

Snip of processed image

Test PDF with table

Answer 1

您可以使用 line-detector 检测给定图像中的线条。

使用 convert_pdf2jpg 转换图像后，先找到图像的边缘。您可以使用 Canny 边缘检测。

import cv2
import pytesseract

# Load the sample image, convert it from BGR to grayscale, and run the Canny
# edge detector with thresholds 50 and 200.
img = cv2.imread("ex.png")
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_cny = cv2.Canny(img_gry, 50, 200)

Canny-applied-image

Canny 边缘图的一部分：

现在我们可以使用 line-detector 来查找图像中各条线的坐标了（注意：cv2.ximgproc 需要安装 opencv-contrib-python）。

# Detect line segments on the Canny edge map.
lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
if lns is None:
    # Nothing detected: fall back to an empty list so the loop is a no-op
    # instead of raising TypeError on None.
    lns = []

# Draw on a copy so the original image stays untouched.
img_cpy = img.copy()

for ln in lns:
    # ln[0] holds the segment end points as (x1, y1, x2, y2).
    x1, y1, x2, y2 = (int(v) for v in ln[0][:4])

    # Mark the detected segment in green for visual inspection.
    cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
             color=(0, 255, 0), thickness=5)

    print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))

当我们运行代码：

输出：

Coords: (8, 6)->(586, 6)

所以表格的宽度约为 580 像素（586 - 6）。

输出：

Coords: (589, 28)->(589, 6)

所以，两条连续线之间的距离接近22 pixel (28 - 6)

共有37行

我们把每条线都画成和背景颜色一样怎么样？

我们知道两条连续线之间的距离，水平线的起点和终点。

# Start from the top edge of the table rather than from whatever value the
# detection loop last left in y1.
y1 = 6

# Paint each of the 37 horizontal rules white, stepping down 20 px per row.
for _ in range(0, 37):
    cv2.line(img, pt1=(6, y1), pt2=(590, y1),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))
    y1 += 20

结果：Horizontal lines removed

样本：

如果你看输出的最后一句：

Coords: (6, 726)->(590, 726)

所以结束坐标是 726。要删除垂直线，我们需要找到起始坐标，因为我们已经知道结束坐标。

输出：

Coords: (8, 6)->(586, 6)
Coords: (589, 28)->(589, 6)
Coords: (69, 8)->(69, 24)
Coords: (337, 8)->(337, 24)

第一个坐标给出起点，589, 69, 337和6

第一条竖线坐标为：(6, 6)->(6, 726) 第二条竖线坐标为：(69, 6)->(69, 726) 第三条竖线坐标为：(337, 6)->(337, 726) 第四条竖线坐标为：(589, 6)->(589, 726)

# Paint every vertical rule white, from the top (y=6) to the bottom (y=726)
# of the table.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, 6), pt2=(col_x, 726),
             color=(255, 255, 255), thickness=5)

结果：Vertical-lines-removed

样本：

现在，当您从输出图像中读取文本时：

LINE 1
LINE 2
LINE 3
.
.
.
SOME RANDOM TEXT FOR LINE 1 CELL 1.
SOME RANDOM TEXT FOR LINE 2 CELL 1.
SOME RANDOM TEXT FOR LINE 3 CELL 1.
SOME RANDOM TEXT FOR LINE 4 CELL 1.
.
.
.
1AM OTHER TEXT FOR LINE 1 CELL 2
1AM OTHER TEXT FOR LINE 2 CELL 2
1AM OTHER TEXT FOR LINE 3 CELL 2
1AM OTHER TEXT FOR LINE 4 CELL 2

代码：

import cv2
import pytesseract

WHITE = (255, 255, 255)
LINE_THICKNESS = 5

img = cv2.imread("ex.png")
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_cny = cv2.Canny(img_gry, 50, 200)

# Detection step, kept for reference: it was used once to discover the
# coordinates hard-coded below.
#
# lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
#
# img_cpy = img.copy()
#
# for ln in lns:
#     x1 = int(ln[0][0])
#     y1 = int(ln[0][1])
#     x2 = int(ln[0][2])
#     y2 = int(ln[0][3])
#
#     cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
#              color=(0, 255, 0), thickness=5)
#
#     print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))
#
#     cv2.imshow("img_cpy", img_cpy)
#     cv2.waitKey(0)

# Horizontal rules: 37 of them, starting at y=6 and spaced 20 px apart.
y1 = 6
for _ in range(37):
    left_end = (6, y1)
    right_end = (590, y1)
    cv2.line(img, pt1=left_end, pt2=right_end,
             color=WHITE, thickness=LINE_THICKNESS)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))
    y1 += 20

# Vertical rules: one per column boundary, spanning the table height.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, 6), pt2=(col_x, 726),
             color=WHITE, thickness=LINE_THICKNESS)

# Show the cleaned image and wait for a key press.
cv2.imshow("lns", img)
cv2.waitKey(0)

# OCR the cleaned image.
txt = pytesseract.image_to_string(img)
print(txt)

python 如何在不降低图像质量的情况下去除水平和垂直线

How to remove horizontal and vertical lines without degrading the image quality in python

python

python-3.x

opencv

python-tesseract