python 如何在不降低图像质量的情况下去除水平和垂直线
How to remove horizontal and vertical lines without degrading the image quality in python
我正在尝试从图像中删除水平线和垂直线。此图像是使用 pdf2jpg 库从 pdf 生成的。删除水平线和垂直线后,该图像将被馈送到 pytesseract 以提取单词及其各自的坐标。在这里我只是提取全文用于测试目的。
我是 OpenCV 的新手。这段代码是我把来自不同网站(包括 Stack Overflow)的代码片段拼凑起来写成的。除了偶尔会残留一些垂直线外,该代码几乎可以完美运行。这些残留的线段会干扰 tesseract,有时被识别为 I、1 或 |。此外,处理后的图像上 tesseract 的误读次数(例如 s 被读成 5,I 被读成 1 或 |,反之亦然)似乎比原始图像更多。我认为这是因为处理后的字体清晰度低于原始图像。应该对这段代码做哪些修改,才能在不影响字体清晰度的情况下去除残留的垂直线?任何正确方向的建议或指导都将不胜感激。提前致谢。
from importlib import invalidate_caches
from pytesseract import image_to_string
#from pdf2image import convert_from_path
from pdf2jpg.pdf2jpg import convert_pdf2jpg
from PIL import Image
import sys
import cv2
import numpy
def pre_process(image):
    """Erase horizontal and vertical ruling lines from a page image.

    Long straight runs of ink are isolated with directional erosion and
    dilation, turned into a mask, and painted white in the grayscale image.

    Args:
        image: Path of an image file (loaded as grayscale), or an image
            that is already loaded, which is used as-is.
            NOTE(review): the non-path branch does no colour conversion,
            so it presumably must already be single-channel 8-bit --
            confirm against callers.

    Returns:
        The grayscale image with the detected line pixels set to white.

    Raises:
        ValueError: If ``image`` is a path that OpenCV cannot read.
    """
    if isinstance(image, str):
        source_path = image
        image = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than deep inside threshold().
        if image is None:
            raise ValueError(f"could not read image file: {source_path!r}")

    # Binarise to pure black/white. With THRESH_OTSU the threshold is chosen
    # automatically, so the 128 passed here is only a placeholder.
    _, image_bin = cv2.threshold(
        image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Invert so that ink becomes white (255) on a black background.
    image_inv = 255 - image_bin

    # Kernel length is 1% of the image width; never let it collapse to 0 on
    # narrow images, and read the width without copying the whole image.
    kernel_len = max(1, numpy.shape(image)[1] // 100)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, kernel_len))
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (kernel_len, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode then dilate with a tall kernel: only long vertical runs survive.
    image_inv1 = cv2.erode(image_inv, vertical_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_inv1, vertical_kernel, iterations=3)

    # Same with a wide kernel: only long horizontal runs survive.
    image_inv2 = cv2.erode(image_inv, horizontal_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_inv2, horizontal_kernel, iterations=3)

    # Blend both line images into one picture of all lines.
    image_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    # After inversion the lines are dark, so eroding widens them slightly.
    image_vh = cv2.erode(~image_vh, kernel, iterations=2)
    # Re-binarise: the result is the mask passed to bitwise_and below.
    _, image_vh = cv2.threshold(
        image_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Work on the inverted grayscale original so masked-out pixels (value 0)
    # turn into white once the image is inverted back.
    org_img_inv = cv2.bitwise_not(image)
    final_image_inv = cv2.bitwise_and(org_img_inv, org_img_inv, mask=image_vh)
    image = cv2.bitwise_not(final_image_inv)

    # Preview the result; waitKey(0) blocks until a key is pressed.
    cv2.imshow("final", image)
    cv2.waitKey(0)
    return image
if __name__ == "__main__":
    # Script entry point: convert every PDF page to a JPEG, strip the table
    # lines from each page, OCR it and store the combined text in text.txt.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} path_to_pdf_file".format(sys.argv[0]))
    pdf_path = sys.argv[1]
    images = convert_pdf2jpg(pdf_path, "temp", dpi=100, pages="ALL")

    # Start with an empty string so the joined output keeps the leading
    # newline that repeated "\n".join((result, text)) calls would produce.
    page_texts = [""]
    for image_path in images[0]["output_jpgfiles"]:
        image = pre_process(image_path)
        page_texts.append(image_to_string(image))
    result = "\n".join(page_texts)

    # Write once, with an explicit encoding: OCR output may contain
    # characters the platform default encoding cannot represent.
    with open("text.txt", "w", encoding="utf-8") as out:
        out.write(result)
请找到附件中的 pdf,我将其用作代码的输入 pdf 和处理过的图像的片段以供参考。可以使用
从命令提示符触发代码
python .\read_pdf_ocr.py path_to_pdf_file
环境详情:
- Python: 3.7.9
- 库:
- opencv-python: 4.4.0.46
- pdf2jpg: 1.0
- pytesseract: 0.3.6
- Tesseract-OCR - 开源 OCR 引擎:v5.0.0-alpha.20200328
Snip of processed image
您可以使用 line-detector
检测给定图像中的线条。
使用 convert_pdf2jpg
转换图像后
找到图像的边缘。您可以使用 Canny
.
import cv2
import pytesseract
# Load the sample image from disk.
img = cv2.imread("ex.png")
# Reduce the BGR image to a single grayscale channel.
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Canny edge map; 50 and 200 are the two hysteresis thresholds.
img_cny = cv2.Canny(img_gry, 50, 200)
Canny-applied-image
Canny 边缘图的一部分:
现在我们可以使用line-detector
来查找图片的坐标了。
# Detect straight segments on the edge map and report their end points.
# NOTE(review): cv2.ximgproc is presumably only available in the
# opencv-contrib-python build -- confirm the installed package provides it.
lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
img_cpy = img.copy()
# NOTE(review): detect() may hand back None when no segment is found;
# treat that as "no lines" instead of failing in the loop.
for ln in (lns if lns is not None else []):
    # Each entry wraps one (x1, y1, x2, y2) float quadruple.
    x1, y1, x2, y2 = (int(v) for v in ln[0][:4])
    # Mark the segment in green on the copy; the source image is untouched.
    cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
             color=(0, 255, 0), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))
当我们运行代码:
输出:
Coords: (8, 6)->(586, 6)
所以表格的宽度约为 580 像素
(586 - 6)。
输出:
Coords: (589, 28)->(589, 6)
所以,两条连续线之间的距离接近22 pixel
(28 - 6)
共有37行
我们把每条线都画成和背景颜色一样怎么样?
我们知道两条连续线之间的距离,水平线的起点和终点。
# Paint every horizontal rule white (the background colour).
# Start from the top border: without this reset y1 would still hold the
# value left over from the detection loop.
y1 = 6
for _ in range(0, 37):
    cv2.line(img, pt1=(6, y1), pt2=(590, y1),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))
    # Step down to the next rule.
    y1 += 20
结果:Horizontal lines removed
样本:
如果你看输出的最后一句:
Coords: (6, 726)->(590, 726)
所以结束坐标是726
。要删除垂直线,我们需要找到起始坐标,因为我们已经知道结束坐标。
输出:
Coords: (8, 6)->(586, 6)
Coords: (589, 28)->(589, 6)
Coords: (69, 8)->(69, 24)
Coords: (337, 8)->(337, 24)
第一个坐标给出起点,589, 69, 337
和6
第一条竖线坐标为:(6, 6)->(6, 726)
第二条竖线坐标为:(69, 6)->(69, 726)
第三条竖线坐标为:(337, 6)->(337, 726)
第四条竖线坐标为:(589, 6)->(589, 726)
# Paint the four vertical rules white, from the top border (y=6) down to
# the last horizontal rule (y=726).
# NOTE(review): the accompanying text gives the second rule as x=69 while
# the code draws at x=72 -- confirm which one is intended.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, 6), pt2=(col_x, 726),
             color=(255, 255, 255), thickness=5)
结果:Vertical-lines-removed
样本:
现在,当您从输出图像中读取文本时:
LINE 1
LINE 2
LINE 3
.
.
.
SOME RANDOM TEXT FOR LINE 1 CELL 1.
SOME RANDOM TEXT FOR LINE 2 CELL 1.
SOME RANDOM TEXT FOR LINE 3 CELL 1.
SOME RANDOM TEXT FOR LINE 4 CELL 1.
.
.
.
1AM OTHER TEXT FOR LINE 1 CELL 2
1AM OTHER TEXT FOR LINE 2 CELL 2
1AM OTHER TEXT FOR LINE 3 CELL 2
1AM OTHER TEXT FOR LINE 4 CELL 2
代码:
import cv2
import pytesseract
# Complete example: erase the table rules of ex.png, then OCR the result.
img = cv2.imread("ex.png")
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_cny = cv2.Canny(img_gry, 50, 200)

# The FastLineDetector pass over img_cny is left out here; the coordinates
# hard-coded below stand in for its printed output.

# Horizontal rules: 37 of them, 20 px apart, starting at y=6.
for row_y in range(6, 6 + 37 * 20, 20):
    cv2.line(img, pt1=(6, row_y), pt2=(590, row_y),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, row_y, 590, row_y))

# Vertical rules: each spans the table from y=6 to y=726.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, 6), pt2=(col_x, 726),
             color=(255, 255, 255), thickness=5)

# Show the cleaned image and wait for a key press before running OCR.
cv2.imshow("lns", img)
cv2.waitKey(0)
txt = pytesseract.image_to_string(img)
print(txt)
我正在尝试从图像中删除水平线和垂直线。此图像是使用 pdf2jpg 库从 pdf 生成的。删除水平线和垂直线后,该图像将被馈送到 pytesseract 以提取单词及其各自的坐标。在这里我只是提取全文用于测试目的。 我是 OpenCV 的新手。这段代码是我把来自不同网站(包括 Stack Overflow)的代码片段拼凑起来写成的。除了偶尔会残留一些垂直线外,该代码几乎可以完美运行。这些残留的线段会干扰 tesseract,有时被识别为 I、1 或 |。此外,处理后的图像上 tesseract 的误读次数(例如 s 被读成 5,I 被读成 1 或 |,反之亦然)似乎比原始图像更多。我认为这是因为处理后的字体清晰度低于原始图像。应该对这段代码做哪些修改,才能在不影响字体清晰度的情况下去除残留的垂直线?任何正确方向的建议或指导都将不胜感激。提前致谢。
from importlib import invalidate_caches
from pytesseract import image_to_string
#from pdf2image import convert_from_path
from pdf2jpg.pdf2jpg import convert_pdf2jpg
from PIL import Image
import sys
import cv2
import numpy
def pre_process(image):
    """Erase horizontal and vertical ruling lines from a page image.

    Long straight runs of ink are isolated with directional erosion and
    dilation, turned into a mask, and painted white in the grayscale image.

    Args:
        image: Path of an image file (loaded as grayscale), or an image
            that is already loaded, which is used as-is.
            NOTE(review): the non-path branch does no colour conversion,
            so it presumably must already be single-channel 8-bit --
            confirm against callers.

    Returns:
        The grayscale image with the detected line pixels set to white.

    Raises:
        ValueError: If ``image`` is a path that OpenCV cannot read.
    """
    if isinstance(image, str):
        source_path = image
        image = cv2.imread(source_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with a clear message rather than deep inside threshold().
        if image is None:
            raise ValueError(f"could not read image file: {source_path!r}")

    # Binarise to pure black/white. With THRESH_OTSU the threshold is chosen
    # automatically, so the 128 passed here is only a placeholder.
    _, image_bin = cv2.threshold(
        image, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
    # Invert so that ink becomes white (255) on a black background.
    image_inv = 255 - image_bin

    # Kernel length is 1% of the image width; never let it collapse to 0 on
    # narrow images, and read the width without copying the whole image.
    kernel_len = max(1, numpy.shape(image)[1] // 100)
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, kernel_len))
    horizontal_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (kernel_len, 1))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2, 2))

    # Erode then dilate with a tall kernel: only long vertical runs survive.
    image_inv1 = cv2.erode(image_inv, vertical_kernel, iterations=3)
    vertical_lines = cv2.dilate(image_inv1, vertical_kernel, iterations=3)

    # Same with a wide kernel: only long horizontal runs survive.
    image_inv2 = cv2.erode(image_inv, horizontal_kernel, iterations=3)
    horizontal_lines = cv2.dilate(image_inv2, horizontal_kernel, iterations=3)

    # Blend both line images into one picture of all lines.
    image_vh = cv2.addWeighted(vertical_lines, 0.5, horizontal_lines, 0.5, 0.0)
    # After inversion the lines are dark, so eroding widens them slightly.
    image_vh = cv2.erode(~image_vh, kernel, iterations=2)
    # Re-binarise: the result is the mask passed to bitwise_and below.
    _, image_vh = cv2.threshold(
        image_vh, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)

    # Work on the inverted grayscale original so masked-out pixels (value 0)
    # turn into white once the image is inverted back.
    org_img_inv = cv2.bitwise_not(image)
    final_image_inv = cv2.bitwise_and(org_img_inv, org_img_inv, mask=image_vh)
    image = cv2.bitwise_not(final_image_inv)

    # Preview the result; waitKey(0) blocks until a key is pressed.
    cv2.imshow("final", image)
    cv2.waitKey(0)
    return image
if __name__ == "__main__":
    # Script entry point: convert every PDF page to a JPEG, strip the table
    # lines from each page, OCR it and store the combined text in text.txt.
    if len(sys.argv) < 2:
        sys.exit("usage: python {} path_to_pdf_file".format(sys.argv[0]))
    pdf_path = sys.argv[1]
    images = convert_pdf2jpg(pdf_path, "temp", dpi=100, pages="ALL")

    # Start with an empty string so the joined output keeps the leading
    # newline that repeated "\n".join((result, text)) calls would produce.
    page_texts = [""]
    for image_path in images[0]["output_jpgfiles"]:
        image = pre_process(image_path)
        page_texts.append(image_to_string(image))
    result = "\n".join(page_texts)

    # Write once, with an explicit encoding: OCR output may contain
    # characters the platform default encoding cannot represent.
    with open("text.txt", "w", encoding="utf-8") as out:
        out.write(result)
请找到附件中的 pdf,我将其用作代码的输入 pdf 和处理过的图像的片段以供参考。可以使用
从命令提示符触发代码python .\read_pdf_ocr.py path_to_pdf_file
环境详情:
- Python: 3.7.9
- 库:
- opencv-python: 4.4.0.46
- pdf2jpg: 1.0
- pytesseract: 0.3.6
- Tesseract-OCR - 开源 OCR 引擎:v5.0.0-alpha.20200328
Snip of processed image
您可以使用 line-detector
检测给定图像中的线条。
使用 convert_pdf2jpg
找到图像的边缘。您可以使用 Canny
.
import cv2
import pytesseract
# Load the sample image from disk.
img = cv2.imread("ex.png")
# Reduce the BGR image to a single grayscale channel.
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Canny edge map; 50 and 200 are the two hysteresis thresholds.
img_cny = cv2.Canny(img_gry, 50, 200)
Canny-applied-image
Canny 边缘图的一部分:
现在我们可以使用line-detector
来查找图片的坐标了。
# Detect straight segments on the edge map and report their end points.
# NOTE(review): cv2.ximgproc is presumably only available in the
# opencv-contrib-python build -- confirm the installed package provides it.
lns = cv2.ximgproc.createFastLineDetector().detect(img_cny)
img_cpy = img.copy()
# NOTE(review): detect() may hand back None when no segment is found;
# treat that as "no lines" instead of failing in the loop.
for ln in (lns if lns is not None else []):
    # Each entry wraps one (x1, y1, x2, y2) float quadruple.
    x1, y1, x2, y2 = (int(v) for v in ln[0][:4])
    # Mark the segment in green on the copy; the source image is untouched.
    cv2.line(img_cpy, pt1=(x1, y1), pt2=(x2, y2),
             color=(0, 255, 0), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(x1, y1, x2, y2))
当我们运行代码:
输出:
Coords: (8, 6)->(586, 6)
所以表格的宽度约为 580 像素
(586 - 6)。
输出:
Coords: (589, 28)->(589, 6)
所以,两条连续线之间的距离接近22 pixel
(28 - 6)
共有37行
我们把每条线都画成和背景颜色一样怎么样?
我们知道两条连续线之间的距离,水平线的起点和终点。
# Paint every horizontal rule white (the background colour).
# Start from the top border: without this reset y1 would still hold the
# value left over from the detection loop.
y1 = 6
for _ in range(0, 37):
    cv2.line(img, pt1=(6, y1), pt2=(590, y1),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, y1, 590, y1))
    # Step down to the next rule.
    y1 += 20
结果:Horizontal lines removed
样本:
如果你看输出的最后一句:
Coords: (6, 726)->(590, 726)
所以结束坐标是726
。要删除垂直线,我们需要找到起始坐标,因为我们已经知道结束坐标。
输出:
Coords: (8, 6)->(586, 6)
Coords: (589, 28)->(589, 6)
Coords: (69, 8)->(69, 24)
Coords: (337, 8)->(337, 24)
第一个坐标给出起点,589, 69, 337
和6
第一条竖线坐标为:(6, 6)->(6, 726)
第二条竖线坐标为:(69, 6)->(69, 726)
第三条竖线坐标为:(337, 6)->(337, 726)
第四条竖线坐标为:(589, 6)->(589, 726)
# Paint the four vertical rules white, from the top border (y=6) down to
# the last horizontal rule (y=726).
# NOTE(review): the accompanying text gives the second rule as x=69 while
# the code draws at x=72 -- confirm which one is intended.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, 6), pt2=(col_x, 726),
             color=(255, 255, 255), thickness=5)
结果:Vertical-lines-removed
样本:
现在,当您从输出图像中读取文本时:
LINE 1
LINE 2
LINE 3
.
.
.
SOME RANDOM TEXT FOR LINE 1 CELL 1.
SOME RANDOM TEXT FOR LINE 2 CELL 1.
SOME RANDOM TEXT FOR LINE 3 CELL 1.
SOME RANDOM TEXT FOR LINE 4 CELL 1.
.
.
.
1AM OTHER TEXT FOR LINE 1 CELL 2
1AM OTHER TEXT FOR LINE 2 CELL 2
1AM OTHER TEXT FOR LINE 3 CELL 2
1AM OTHER TEXT FOR LINE 4 CELL 2
代码:
import cv2
import pytesseract
# Complete example: erase the table rules of ex.png, then OCR the result.
img = cv2.imread("ex.png")
img_gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_cny = cv2.Canny(img_gry, 50, 200)

# The FastLineDetector pass over img_cny is left out here; the coordinates
# hard-coded below stand in for its printed output.

# Horizontal rules: 37 of them, 20 px apart, starting at y=6.
for row_y in range(6, 6 + 37 * 20, 20):
    cv2.line(img, pt1=(6, row_y), pt2=(590, row_y),
             color=(255, 255, 255), thickness=5)
    print("Coords: ({}, {})->({}, {})".format(6, row_y, 590, row_y))

# Vertical rules: each spans the table from y=6 to y=726.
for col_x in (6, 72, 337, 589):
    cv2.line(img, pt1=(col_x, 6), pt2=(col_x, 726),
             color=(255, 255, 255), thickness=5)

# Show the cleaned image and wait for a key press before running OCR.
cv2.imshow("lns", img)
cv2.waitKey(0)
txt = pytesseract.image_to_string(img)
print(txt)