确定报纸文章中的栏数

Determining the number of columns within a newspaper article

假设需要分析下面这篇报纸文章的栏数(正确答案应该是 3 个文本栏)。我尝试使用 Python 的 cv2 库来获取栏数,并在 Stack Overflow 上找到了以下建议:Detect number of rows and columns in table image with OpenCV

但是,该解决方案处理的是结构良好的表格,因此可以很容易地提取列数和行数。基于该解决方案,这是我想出的代码:

import numpy as np
from imutils import contours
import cv2

# Attempt to count rows/columns of a newspaper scan by grouping contour
# centroids into rows (adapted from a table-detection recipe).

# Load image, grayscale, Gaussian blur, Otsu's threshold
image = cv2.imread('example_newspaper_article.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (5,5), 0)
# NOTE(review): THRESH_OTSU is set, so the explicit 240 is presumably ignored
# in favour of the automatically chosen threshold -- confirm in the OpenCV docs.
thresh = cv2.threshold(blur, 240, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# Find contours and remove text inside cells
cnts = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
# Accept both a 2-element return value and a longer one (presumably the
# differing return shapes of different OpenCV versions).
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
    area = cv2.contourArea(c)
    if area < 10000:
        # Paint every small contour in white with a 30 px stroke, directly
        # onto the thresholded image (modifies thresh in place).
        cv2.drawContours(thresh, [c], -1, (255, 255, 255), 30)

# Invert image
# NOTE: no inversion actually happens here; invert is just another name for
# the same array object as thresh.
invert = thresh
# offset: minimum centroid y-distance that starts a new row
# old_cY: centroid y of the contour that started the current row
# first:  True until the first row has been opened
offset, old_cY, first = 10, 0, True
visualize = cv2.cvtColor(invert, cv2.COLOR_GRAY2BGR)

# Find contours, sort from top-to-bottom and then sum up column/rows
cnts = cv2.findContours(invert, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
(cnts, _) = contours.sort_contours(cnts, method="top-to-bottom")
for c in cnts:
    # Find centroid
    # NOTE: a contour whose m00 moment is 0 raises ZeroDivisionError here.
    M = cv2.moments(c)
    cX = int(M["m10"] / M["m00"])
    cY = int(M["m01"] / M["m00"])

    # New row
    if (abs(cY) - abs(old_cY)) > offset:
        if first:
            row, table = [], []
            first = False
        old_cY = cY
        # The row collected so far is stored *before* the current contour is
        # counted, so table[0] is the empty list created just above.
        table.append(row)
        row = []
    # Cell in same row
    # NOTE: if the very first centroid lies within offset of y == 0, the branch
    # above is skipped and row is still undefined when it is appended to here.
    if ((abs(cY) - abs(old_cY)) <= offset) or first:
        row.append(1)
    # Uncomment to visualize
    #cv2.circle(visualize, (cX, cY), 10, (36, 255, 12), -1)
    #cv2.imshow('visualize', visualize)
    #cv2.waitKey(200)

# The row being collected when the loop ends is never appended to table, and
# table[0] is empty, so table[1] is the first populated row. Both prints fail
# if no row was ever opened; the second also fails with fewer than two rows.
print('Rows: {}'.format(len(table)))
print('Columns: {}'.format(len(table[1])))

# invert and thresh are the same array, so both windows show the same image.
cv2.imshow('invert', invert)
cv2.imshow('thresh', thresh)
cv2.waitKey()

我认为,增加 drawContours 方法的厚度参数会有所帮助,但不幸的是,这并不能解决问题。结果如下所示:

我想,在文本区域上绘制矩形会更有帮助吗? 有谁知道解决方案并可以帮助我吗? 提前致谢!

每当有这样的任务时,我都会沿着 y-axis 计算像素数,并尝试找出相邻列之间的(大)差异。那就是我的完整管道:

  1. 将图像转换为灰度;使用 Otsu 的逆二进制阈值在黑色背景上获得白色像素。
  2. 做一些形态学上的闭合,这里使用一个大的垂直线核来连接同一列中的所有像素。
  3. 计算所有白色像素;计算相邻列之间的绝对差异。
  4. 在该“信号”中查找峰值 – 手动或如此处所示,使用 scipy.signal.find_peaks。峰值标识每个文本列的开始和结束,因此文本列的数量是峰值数量的一半。

这是包括一些可视化的完整代码:

import cv2
import matplotlib.pyplot as plt     # Only for visualization output
import numpy as np
from scipy import signal
from skimage import io              # Only for web grabbing images

# Count text columns by projecting the binarised page onto the x-axis and
# looking for sharp jumps in the per-column count of white pixels.

# Read image from web (attention: RGB order here, scikit-image)
image = io.imread('https://i.stack.imgur.com/jbAeZ.png')

# Convert image to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Inverse binary threshold by Otsu's
thr = cv2.threshold(gray, 0, 255, cv2.THRESH_OTSU + cv2.THRESH_BINARY_INV)[1]

# Morphological closing with large vertical line kernel
# (the kernel is as tall as the image and one pixel wide)
thr_mod = cv2.morphologyEx(thr, cv2.MORPH_CLOSE, np.ones((image.shape[0], 1)))

# Count white pixels along y-axis (one total per x position)
y_count = np.sum(thr_mod / 255, 0)

# Calculate absolute difference between neighbouring x-axis values
y_count_diff = np.abs(np.diff(y_count))

# Find peaks in that "signal"; peaks closer than 50 samples are merged
peaks = signal.find_peaks(y_count_diff, distance=50)[0]

# Number of columns is half the number of found peaks (one peak for the start
# and one for the end of each column). Plain integer floor division is used
# instead of np.int, which was deprecated in NumPy 1.20 and later removed.
n_cols = peaks.shape[0] // 2

# Text output
print('Number of columns: ' + str(n_cols))

# Some visualization output
plt.figure(0)
plt.subplot(221)
plt.imshow(image)
plt.title('Original image')

plt.subplot(222)
plt.imshow(thr_mod, cmap='gray')
plt.title('Thresholded, morphlogically closed image')

plt.subplot(223)
plt.plot(y_count)
plt.plot(peaks, y_count[peaks], 'r.')
plt.title('Summed white pixels along y-axis')

plt.subplot(224)
plt.plot(y_count_diff)
plt.plot(peaks, y_count_diff[peaks], 'r.')
plt.title('Absolute difference in summed white pixels')

plt.tight_layout()
plt.show()

文本输出:

Number of columns: 3

可视化输出:

局限性:如果您的图像倾斜等,您可能会得到不好的结果。如果你有很多(大)图像穿过文本列,你也可能会得到不好的结果。一般来说,您需要调整给定实现中的细节以满足您的实际需求(没有给出更多示例)。

----------------------------------------
System information
----------------------------------------
Platform:    Windows-10-10.0.16299-SP0
Python:      3.8.5
Matplotlib:  3.3.1
NumPy:       1.19.1
OpenCV:      4.4.0
SciPy:       1.5.2
----------------------------------------

您可以在搜索列之前以稍微不同的方式准备图像。例如,您可以先水平连接文本(使用一些形态学操作)。这将为您提供具有一定高度的轮廓(标题将垂直连接为每行一个轮廓,列中的文本将连接为每行一个轮廓)。然后搜索所有轮廓并在高于您设置的特定值(可以手动计算或设置)的轮廓上绘制边界矩形。之后用更大的内核(水平和垂直)再次执行形态学操作,这样如果它们靠得很近,您就可以将所有剩余的文本连接起来。

这是一个示例代码:

import cv2
import numpy as np

# Count text columns with two rounds of morphological closing:
#   1. close with a wide kernel so neighbouring glyphs merge horizontally,
#   2. black out every blob taller than the mean blob height,
#   3. close again with a square kernel so the remaining text merges into blobs,
#   4. count the external contours that are left.

# Load the page and binarise it (inverse binary threshold with Otsu's method)
img = cv2.imread("columns.png")
grey = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
_, binary = cv2.threshold(gray := grey, 0, 255, cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU) if False else cv2.threshold(grey, 0, 255, cv2.THRESH_BINARY_INV+cv2.THRESH_OTSU)

# First closing: 5 rows x 10 columns kernel, i.e. wider than it is tall
wide_kernel = np.ones((5, 10), dtype=np.uint8)
closing = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, wide_kernel)
cv2.imwrite("closing1.png", closing)

# Bounding boxes (x, y, w, h) of all external contours after the first closing
blobs = cv2.findContours(closing, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]
boxes = [cv2.boundingRect(blob) for blob in blobs]

# The mean blob height serves as the cut-off. This will probably not carry over
# to other samples; the cut-off would then have to be computed differently or
# set by hand.
boundary = np.mean([box_h for (_, _, _, box_h) in boxes], axis=0)

# Erase (fill with black) every blob that is taller than the cut-off
for (left, top, box_w, box_h) in boxes:
    if not h_ok if False else box_h > boundary:
        cv2.rectangle(closing, (left, top), (left+box_w, top+box_h), (0, 0, 0), -1)

cv2.imwrite("closing1-filled.png", closing)

# Second closing: 25 x 25 kernel, connects blobs in both x and y direction
square_kernel = np.ones((25, 25), dtype=np.uint8)
closing = cv2.morphologyEx(closing, cv2.MORPH_CLOSE, square_kernel)

cv2.imwrite("closing2.png", closing)

# Every external contour that is left is reported as one column
columns = cv2.findContours(closing, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[0]
print("Number of columns: ", len(columns))

# Outline each detected column in green on the original image
for column in columns:
    left, top, box_w, box_h = cv2.boundingRect(column)
    cv2.rectangle(img, (left, top), (left+box_w, top+box_h), (0, 255, 0), 3)

cv2.imwrite("result.png", img)

# Show the annotated image until a key is pressed
cv2.imshow("img", img)
cv2.waitKey(0)
cv2.destroyAllWindows()

结果:

Number of columns:  3

第 1 步:

第 2 步:

第 3 步: