如何从多张图片中仅查找加粗的文本行

How to find only the bolded text lines from no. of images

from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import os
import pandas as pd
import cv2
import numpy as np

# Render each PDF page to a JPEG so it can be processed with OpenCV/OCR.
# NOTE: os.chdir returns None, so the original `files = os.chdir(...)`
# assignment was meaningless and has been dropped.
os.chdir("C:/Users/abhishek_kumar1/Desktop/New folder")
pages = convert_from_path("d.pdf", 190, single_file=True,
                      poppler_path='C:/Users/abhishek_kumar1/Downloads/poppler-0.68.0_x86/poppler-0.68.0/bin')
image_counter = 1
for page in pages:
    # Save as page_<n>.jpg.  The counter was never incremented in the
    # original code, so a multi-page PDF would overwrite page_1.jpg
    # on every iteration.
    filename = "page_" + str(image_counter) + ".jpg"
    page.save(filename, 'JPEG')
    image_counter += 1

# Detect candidate text-line regions on the last rendered page image and
# save each region as ROI_<n>.jpg plus an annotated overview image.
img = cv2.imread(filename)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
cv2.imwrite('grey.png', gray)

# Otsu chooses the threshold automatically; BINARY_INV makes the text
# white on black, which is what findContours expects.
retval, thresh1 = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU | cv2.THRESH_BINARY_INV)
cv2.imwrite('Thresh1.png', thresh1)

# A wide, short kernel dilated several times merges the characters of a
# single line of text into one connected blob per line.
rect_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3))
dilation = cv2.dilate(thresh1, rect_kernel, iterations=6)
contours, hierarchy = cv2.findContours(dilation, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
im2 = img.copy()

ROI_number = 0
for cnt in contours[::-1]:  # reversed so numbering roughly follows page order
    x, y, w, h = cv2.boundingRect(cnt)
    # Crop from the untouched original `img`, NOT from `im2`: rectangles
    # drawn on im2 in earlier iterations would otherwise bleed into
    # overlapping crops saved later.
    ROI = img[y:y + h, x:x + w]
    cv2.imwrite('ROI_{}.jpg'.format(ROI_number), ROI)
    cv2.rectangle(im2, (x, y), (x + w, y + h), (36, 255, 12), 1)
    ROI_number += 1

cv2.imwrite('contours1.png', im2)

基于上面的代码,如何从这些图片中只找出加粗的文本行?是否有任何方法可以从图像中识别字体样式(如粗体、斜体等)?目前很难从所有图片中只提取出加粗的部分,恳请各位提出建议,谢谢。

查看python代码和结果:

import cv2
import numpy as np

# Highlight bold text via morphological reconstruction: erode a dilated
# marker back toward the thresholded mask until it stops changing.  Thin
# (regular-weight) strokes are destroyed by the initial erosion of the
# mask, so only thick (bold) strokes survive the reconstruction.
img = cv2.imread('C.png')
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 160, 255, cv2.THRESH_BINARY)[1]
kernel = np.ones((5, 5), np.uint8)
kernel2 = np.ones((3, 3), np.uint8)
marker = cv2.dilate(thresh, kernel, iterations=1)
mask = cv2.erode(thresh, kernel, iterations=1)

# Geodesic erosion: iterate until the marker is stable.
while True:
    tmp = marker.copy()
    marker = cv2.erode(marker, kernel2)
    marker = cv2.max(mask, marker)
    difference = cv2.subtract(tmp, marker)
    if cv2.countNonZero(difference) == 0:
        break

marker_color = cv2.cvtColor(marker, cv2.COLOR_GRAY2BGR)
out = cv2.bitwise_or(img, marker_color)
cv2.imwrite('out.png', out)
cv2.imshow('result', out)
# Without waitKey the HighGUI window is created and torn down immediately;
# block until a key press so the result is actually visible.
cv2.waitKey(0)
cv2.destroyAllWindows()

Alex Alex 的回答对我不起作用。这是我用文字描述的替代方案。

一般的想法是,我们将有多少黑色像素与仍能形成字符的最小可能像素进行比较。这为我们提供了从骨架到普通文本和骨架到粗体文本的区别。这样我们就可以比较清楚的把普通文字和加粗文字区分开来了。

  1. 使用 OCR 软件提取单个单词的边界框。 可选:将单个单词组合成单词行,例如 Pytesseract 中的word_num
  2. 将图像转换为灰度并反转图像颜色
  3. 对图片上选中的文字区域进行 Zhang-Suen 细化(OpenCV contrib 模块:cv2.ximgproc.thinning)
  4. 细化图像中有白色像素的总和,即值等于 255 的地方(白色像素是字母)
  5. 反转图像中有白色像素的总和
  6. 最后计算厚度:(sum_inverted_pixels - sum_skeleton_pixels) / sum_skeleton_pixels(注意:当骨架像素总和为 0 时会出现除零错误,此时应直接返回 0)
  7. 通过最小值和最大值归一化厚度
  8. 应用阈值来决定 word/line 文本何时为粗体,例如0.6 或 0.7