检测图像中的单词和图形,并将图像切成每个单词或图形的 1 个图像
Detect words and graphs in image and slice image into 1 image per word or graph
我正在构建一个网络应用程序来帮助学生学习数学。
该应用程序需要显示来自 LaTex 文件的数学内容。
这些 Latex 文件呈现(漂亮地)为 pdf,我可以通过 pdf2svg 将其干净地转换为 svg。
(svg 或 png 或任何图像格式)图像看起来像这样:
_______________________________________
| |
| 1. Word1 word2 word3 word4 |
| a. Word5 word6 word7 |
| |
| ///////////Graph1/////////// |
| |
| b. Word8 word9 word10 |
| |
| 2. Word11 word12 word13 word14 |
| |
|_______________________________________|
真实例子:
Web 应用程序的目的是操纵并向其添加内容,从而导致如下结果:
_______________________________________
| |
| 1. Word1 word2 | <-- New line break
|_______________________________________|
| |
| -> NewContent1 |
|_______________________________________|
| |
| word3 word4 |
|_______________________________________|
| |
| -> NewContent2 |
|_______________________________________|
| |
| a. Word5 word6 word7 |
|_______________________________________|
| |
| ///////////Graph1/////////// |
|_______________________________________|
| |
| -> NewContent3 |
|_______________________________________|
| |
| b. Word8 word9 word10 |
|_______________________________________|
| |
| 2. Word11 word12 word13 word14 |
|_______________________________________|
示例:
一张大的单张图片无法让我灵活地进行这种操作。
但是如果图像文件被分解成包含单个单词和单个图形的较小文件,我可以进行这些操作。
我想我需要做的是检测图像中的空白,并将图像切成多个子图像,看起来像这样:
_______________________________________
| | | | |
| 1. Word1 | word2 | word3 | word4 |
|__________|_______|_______|____________|
| | | |
| a. Word5 | word6 | word7 |
|_____________|_______|_________________|
| |
| ///////////Graph1/////////// |
|_______________________________________|
| | | |
| b. Word8 | word9 | word10 |
|_____________|_______|_________________|
| | | | |
| 2. Word11 | word12 | word13 | word14 |
|___________|________|________|_________|
我正在寻找一种方法来做到这一点。
你觉得怎么走?
感谢您的帮助!
图像质量一流,非常干净,没有歪斜,字符分离度很好。一个梦想!
首先执行二值化和斑点检测(OpenCV 中的标准)。
然后通过将纵坐标重叠的字符分组(即彼此面对排成一排)来对字符进行聚类。这将自然地隔离各个行。
现在,在每一行中,从左到右对 blob 进行排序,并按接近程度进行聚类以隔离单词。这将是一个微妙的步骤,因为单词中字符的间距与不同单词之间的间距很接近。不要期望完美的结果。这应该比投影更好。
斜体的情况更糟,因为水平间距更窄。您可能还需要查看 "slanted distance",即找到在斜体方向上与字符相切的线。这可以通过应用反向剪切变换来实现。
感谢网格,图表将显示为大斑点。
我会使用水平和垂直投影首先将图像分割成线,然后将每条线分割成更小的切片(例如单词)。
首先将图像转换为灰度,然后将其反转,以便间隙包含零并且任何 text/graphics 都是非零的。
img = cv2.imread('article.png', cv2.IMREAD_COLOR)
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_gray_inverted = 255 - img_gray
计算水平投影——每行的平均强度,使用 cv2.reduce
,并将其展平为线性阵列。
row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
现在找出所有连续间隙的行范围。您可以使用this answer.
中提供的功能
row_gaps = zero_runs(row_means)
最后计算间隙的中点,我们将使用它来切割图像。
row_cutpoints = (row_gaps[:,0] + row_gaps[:,1] - 1) / 2
你最终会遇到类似这种情况(间隙是粉红色的,切割点是红色的):
下一步是处理每个识别出的行。
bounding_boxes = []
for n,(start,end) in enumerate(zip(row_cutpoints, row_cutpoints[1:])):
line = img[start:end]
line_gray_inverted = img_gray_inverted[start:end]
计算垂直投影(每列平均强度),找到间隙和切割点。此外,计算间隙大小,以允许过滤掉各个字母之间的小间隙。
column_means = cv2.reduce(line_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
column_gaps = zero_runs(column_means)
column_gap_sizes = column_gaps[:,1] - column_gaps[:,0]
column_cutpoints = (column_gaps[:,0] + column_gaps[:,1] - 1) / 2
过滤切点。
filtered_cutpoints = column_cutpoints[column_gap_sizes > 5]
并为每个段创建边界框列表。
for xstart,xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
bounding_boxes.append(((xstart, start), (xend, end)))
现在你得到了这样的结果(间隙还是粉红色的,切割点是红色的):
现在可以切图了。我将只可视化找到的边界框:
完整脚本:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
def plot_horizontal_projection(file_name, img, projection):
fig = plt.figure(1, figsize=(12,16))
gs = gridspec.GridSpec(1, 2, width_ratios=[3,1])
ax = plt.subplot(gs[0])
im = ax.imshow(img, interpolation='nearest', aspect='auto')
ax.grid(which='major', alpha=0.5)
ax = plt.subplot(gs[1])
ax.plot(projection, np.arange(img.shape[0]), 'm')
ax.grid(which='major', alpha=0.5)
plt.xlim([0.0, 255.0])
plt.ylim([-0.5, img.shape[0] - 0.5])
ax.invert_yaxis()
fig.suptitle("FOO", fontsize=16)
gs.tight_layout(fig, rect=[0, 0.03, 1, 0.97])
fig.set_dpi(200)
fig.savefig(file_name, bbox_inches='tight', dpi=fig.dpi)
plt.clf()
def plot_vertical_projection(file_name, img, projection):
fig = plt.figure(2, figsize=(12, 4))
gs = gridspec.GridSpec(2, 1, height_ratios=[1,5])
ax = plt.subplot(gs[0])
im = ax.imshow(img, interpolation='nearest', aspect='auto')
ax.grid(which='major', alpha=0.5)
ax = plt.subplot(gs[1])
ax.plot(np.arange(img.shape[1]), projection, 'm')
ax.grid(which='major', alpha=0.5)
plt.xlim([-0.5, img.shape[1] - 0.5])
plt.ylim([0.0, 255.0])
fig.suptitle("FOO", fontsize=16)
gs.tight_layout(fig, rect=[0, 0.03, 1, 0.97])
fig.set_dpi(200)
fig.savefig(file_name, bbox_inches='tight', dpi=fig.dpi)
plt.clf()
def visualize_hp(file_name, img, row_means, row_cutpoints):
row_highlight = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
row_highlight[row_means == 0, :, :] = [255,191,191]
row_highlight[row_cutpoints, :, :] = [255,0,0]
plot_horizontal_projection(file_name, row_highlight, row_means)
def visualize_vp(file_name, img, column_means, column_cutpoints):
col_highlight = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
col_highlight[:, column_means == 0, :] = [255,191,191]
col_highlight[:, column_cutpoints, :] = [255,0,0]
plot_vertical_projection(file_name, col_highlight, column_means)
# From
def zero_runs(a):
# Create an array that is 1 where a is 0, and pad each end with an extra 0.
iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
absdiff = np.abs(np.diff(iszero))
# Runs start and end where absdiff is 1.
ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
return ranges
img = cv2.imread('article.png', cv2.IMREAD_COLOR)
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_gray_inverted = 255 - img_gray
row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
row_gaps = zero_runs(row_means)
row_cutpoints = (row_gaps[:,0] + row_gaps[:,1] - 1) / 2
visualize_hp("article_hp.png", img, row_means, row_cutpoints)
bounding_boxes = []
for n,(start,end) in enumerate(zip(row_cutpoints, row_cutpoints[1:])):
line = img[start:end]
line_gray_inverted = img_gray_inverted[start:end]
column_means = cv2.reduce(line_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
column_gaps = zero_runs(column_means)
column_gap_sizes = column_gaps[:,1] - column_gaps[:,0]
column_cutpoints = (column_gaps[:,0] + column_gaps[:,1] - 1) / 2
filtered_cutpoints = column_cutpoints[column_gap_sizes > 5]
for xstart,xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
bounding_boxes.append(((xstart, start), (xend, end)))
visualize_vp("article_vp_%02d.png" % n, line, column_means, filtered_cutpoints)
result = img.copy()
for bounding_box in bounding_boxes:
cv2.rectangle(result, bounding_box[0], bounding_box[1], (255,0,0), 2)
cv2.imwrite("article_boxes.png", result)
我正在构建一个网络应用程序来帮助学生学习数学。
该应用程序需要显示来自 LaTex 文件的数学内容。 这些 Latex 文件呈现(漂亮地)为 pdf,我可以通过 pdf2svg 将其干净地转换为 svg。
(svg 或 png 或任何图像格式)图像看起来像这样:
_______________________________________
| |
| 1. Word1 word2 word3 word4 |
| a. Word5 word6 word7 |
| |
| ///////////Graph1/////////// |
| |
| b. Word8 word9 word10 |
| |
| 2. Word11 word12 word13 word14 |
| |
|_______________________________________|
真实例子:
Web 应用程序的目的是操纵并向其添加内容,从而导致如下结果:
_______________________________________
| |
| 1. Word1 word2 | <-- New line break
|_______________________________________|
| |
| -> NewContent1 |
|_______________________________________|
| |
| word3 word4 |
|_______________________________________|
| |
| -> NewContent2 |
|_______________________________________|
| |
| a. Word5 word6 word7 |
|_______________________________________|
| |
| ///////////Graph1/////////// |
|_______________________________________|
| |
| -> NewContent3 |
|_______________________________________|
| |
| b. Word8 word9 word10 |
|_______________________________________|
| |
| 2. Word11 word12 word13 word14 |
|_______________________________________|
示例:
一张大的单张图片无法让我灵活地进行这种操作。
但是如果图像文件被分解成包含单个单词和单个图形的较小文件,我可以进行这些操作。
我想我需要做的是检测图像中的空白,并将图像切成多个子图像,看起来像这样:
_______________________________________
| | | | |
| 1. Word1 | word2 | word3 | word4 |
|__________|_______|_______|____________|
| | | |
| a. Word5 | word6 | word7 |
|_____________|_______|_________________|
| |
| ///////////Graph1/////////// |
|_______________________________________|
| | | |
| b. Word8 | word9 | word10 |
|_____________|_______|_________________|
| | | | |
| 2. Word11 | word12 | word13 | word14 |
|___________|________|________|_________|
我正在寻找一种方法来做到这一点。 你觉得怎么走?
感谢您的帮助!
图像质量一流,非常干净,没有歪斜,字符分离度很好。一个梦想!
首先执行二值化和斑点检测(OpenCV 中的标准)。
然后通过将纵坐标重叠的字符分组(即彼此面对排成一排)来对字符进行聚类。这将自然地隔离各个行。
现在,在每一行中,从左到右对 blob 进行排序,并按接近程度进行聚类以隔离单词。这将是一个微妙的步骤,因为单词中字符的间距与不同单词之间的间距很接近。不要期望完美的结果。这应该比投影更好。
斜体的情况更糟,因为水平间距更窄。您可能还需要查看 "slanted distance",即找到在斜体方向上与字符相切的线。这可以通过应用反向剪切变换来实现。
感谢网格,图表将显示为大斑点。
我会使用水平和垂直投影首先将图像分割成线,然后将每条线分割成更小的切片(例如单词)。
首先将图像转换为灰度,然后将其反转,以便间隙包含零并且任何 text/graphics 都是非零的。
img = cv2.imread('article.png', cv2.IMREAD_COLOR)
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_gray_inverted = 255 - img_gray
计算水平投影——每行的平均强度,使用 cv2.reduce
,并将其展平为线性阵列。
row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
现在找出所有连续间隙的行范围。您可以使用this answer.
中提供的功能row_gaps = zero_runs(row_means)
最后计算间隙的中点,我们将使用它来切割图像。
row_cutpoints = (row_gaps[:,0] + row_gaps[:,1] - 1) / 2
你最终会遇到类似这种情况(间隙是粉红色的,切割点是红色的):
下一步是处理每个识别出的行。
bounding_boxes = []
for n,(start,end) in enumerate(zip(row_cutpoints, row_cutpoints[1:])):
line = img[start:end]
line_gray_inverted = img_gray_inverted[start:end]
计算垂直投影(每列平均强度),找到间隙和切割点。此外,计算间隙大小,以允许过滤掉各个字母之间的小间隙。
column_means = cv2.reduce(line_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
column_gaps = zero_runs(column_means)
column_gap_sizes = column_gaps[:,1] - column_gaps[:,0]
column_cutpoints = (column_gaps[:,0] + column_gaps[:,1] - 1) / 2
过滤切点。
filtered_cutpoints = column_cutpoints[column_gap_sizes > 5]
并为每个段创建边界框列表。
for xstart,xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
bounding_boxes.append(((xstart, start), (xend, end)))
现在你得到了这样的结果(间隙还是粉红色的,切割点是红色的):
现在可以切图了。我将只可视化找到的边界框:
完整脚本:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec
def plot_horizontal_projection(file_name, img, projection):
fig = plt.figure(1, figsize=(12,16))
gs = gridspec.GridSpec(1, 2, width_ratios=[3,1])
ax = plt.subplot(gs[0])
im = ax.imshow(img, interpolation='nearest', aspect='auto')
ax.grid(which='major', alpha=0.5)
ax = plt.subplot(gs[1])
ax.plot(projection, np.arange(img.shape[0]), 'm')
ax.grid(which='major', alpha=0.5)
plt.xlim([0.0, 255.0])
plt.ylim([-0.5, img.shape[0] - 0.5])
ax.invert_yaxis()
fig.suptitle("FOO", fontsize=16)
gs.tight_layout(fig, rect=[0, 0.03, 1, 0.97])
fig.set_dpi(200)
fig.savefig(file_name, bbox_inches='tight', dpi=fig.dpi)
plt.clf()
def plot_vertical_projection(file_name, img, projection):
fig = plt.figure(2, figsize=(12, 4))
gs = gridspec.GridSpec(2, 1, height_ratios=[1,5])
ax = plt.subplot(gs[0])
im = ax.imshow(img, interpolation='nearest', aspect='auto')
ax.grid(which='major', alpha=0.5)
ax = plt.subplot(gs[1])
ax.plot(np.arange(img.shape[1]), projection, 'm')
ax.grid(which='major', alpha=0.5)
plt.xlim([-0.5, img.shape[1] - 0.5])
plt.ylim([0.0, 255.0])
fig.suptitle("FOO", fontsize=16)
gs.tight_layout(fig, rect=[0, 0.03, 1, 0.97])
fig.set_dpi(200)
fig.savefig(file_name, bbox_inches='tight', dpi=fig.dpi)
plt.clf()
def visualize_hp(file_name, img, row_means, row_cutpoints):
row_highlight = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
row_highlight[row_means == 0, :, :] = [255,191,191]
row_highlight[row_cutpoints, :, :] = [255,0,0]
plot_horizontal_projection(file_name, row_highlight, row_means)
def visualize_vp(file_name, img, column_means, column_cutpoints):
col_highlight = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
col_highlight[:, column_means == 0, :] = [255,191,191]
col_highlight[:, column_cutpoints, :] = [255,0,0]
plot_vertical_projection(file_name, col_highlight, column_means)
# From
def zero_runs(a):
# Create an array that is 1 where a is 0, and pad each end with an extra 0.
iszero = np.concatenate(([0], np.equal(a, 0).view(np.int8), [0]))
absdiff = np.abs(np.diff(iszero))
# Runs start and end where absdiff is 1.
ranges = np.where(absdiff == 1)[0].reshape(-1, 2)
return ranges
img = cv2.imread('article.png', cv2.IMREAD_COLOR)
img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
img_gray_inverted = 255 - img_gray
row_means = cv2.reduce(img_gray_inverted, 1, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
row_gaps = zero_runs(row_means)
row_cutpoints = (row_gaps[:,0] + row_gaps[:,1] - 1) / 2
visualize_hp("article_hp.png", img, row_means, row_cutpoints)
bounding_boxes = []
for n,(start,end) in enumerate(zip(row_cutpoints, row_cutpoints[1:])):
line = img[start:end]
line_gray_inverted = img_gray_inverted[start:end]
column_means = cv2.reduce(line_gray_inverted, 0, cv2.REDUCE_AVG, dtype=cv2.CV_32F).flatten()
column_gaps = zero_runs(column_means)
column_gap_sizes = column_gaps[:,1] - column_gaps[:,0]
column_cutpoints = (column_gaps[:,0] + column_gaps[:,1] - 1) / 2
filtered_cutpoints = column_cutpoints[column_gap_sizes > 5]
for xstart,xend in zip(filtered_cutpoints, filtered_cutpoints[1:]):
bounding_boxes.append(((xstart, start), (xend, end)))
visualize_vp("article_vp_%02d.png" % n, line, column_means, filtered_cutpoints)
result = img.copy()
for bounding_box in bounding_boxes:
cv2.rectangle(result, bounding_box[0], bounding_box[1], (255,0,0), 2)
cv2.imwrite("article_boxes.png", result)