在扫描文档中拆分文本行
Split text lines in scanned document
我正在尝试找到一种方法来打破已自适应阈值化的扫描文档中文本行的拆分。现在,我将文档的像素值存储为 0 到 255 之间的无符号整数,我取每行中像素的平均值,然后根据像素值的平均值是否为大于 250,然后我取它所适用的每个线范围的中值。但是,这种方法有时会失败,因为图像上可能会出现黑色斑点。
有没有更抗噪的方法来完成这个任务?
编辑:这是一些代码。 "warped"是原图的名字,"cuts"是我要分割的地方
warped = threshold_adaptive(warped, 250, offset = 10)
warped = warped.astype("uint8") * 255
# get areas where we can split image on whitespace to make OCR more accurate
color_level = np.array([np.sum(line) / len(line) for line in warped])
cuts = []
i = 0
while(i < len(color_level)):
if color_level[i] > 250:
begin = i
while(color_level[i] > 250):
i += 1
cuts.append((i + begin)/2) # middle of the whitespace region
else:
i += 1
编辑 2:添加了示例图像
根据您的输入图像,您需要将文本设为白色,将背景设为黑色
然后您需要计算帐单的旋转角度。一个简单的方法是找到所有白点的minAreaRect
(findNonZero
),你得到:
然后你可以旋转你的帐单,使文字水平:
现在您可以计算水平投影 (reduce
)。您可以取每行的平均值。在直方图上应用阈值 th
以考虑图像中的一些噪声(这里我使用 0
,即没有噪声)。只有背景的行在直方图中的值为 >0
,文本行的值为 0
。然后取直方图中每个连续的白色 bin 序列的平均 bin 坐标。那将是你的行的 y
坐标:
这里是代码。它是用 C++ 编写的,但由于大部分工作都是使用 OpenCV 函数,它应该很容易转换为 Python。至少,你可以以此为参考:
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
int main()
{
// Read image
Mat3b img = imread("path_to_image");
// Binarize image. Text is white, background is black
Mat1b bin;
cvtColor(img, bin, COLOR_BGR2GRAY);
bin = bin < 200;
// Find all white pixels
vector<Point> pts;
findNonZero(bin, pts);
// Get rotated rect of white pixels
RotatedRect box = minAreaRect(pts);
if (box.size.width > box.size.height)
{
swap(box.size.width, box.size.height);
box.angle += 90.f;
}
Point2f vertices[4];
box.points(vertices);
for (int i = 0; i < 4; ++i)
{
line(img, vertices[i], vertices[(i + 1) % 4], Scalar(0, 255, 0));
}
// Rotate the image according to the found angle
Mat1b rotated;
Mat M = getRotationMatrix2D(box.center, box.angle, 1.0);
warpAffine(bin, rotated, M, bin.size());
// Compute horizontal projections
Mat1f horProj;
reduce(rotated, horProj, 1, CV_REDUCE_AVG);
// Remove noise in histogram. White bins identify space lines, black bins identify text lines
float th = 0;
Mat1b hist = horProj <= th;
// Get mean coordinate of white white pixels groups
vector<int> ycoords;
int y = 0;
int count = 0;
bool isSpace = false;
for (int i = 0; i < rotated.rows; ++i)
{
if (!isSpace)
{
if (hist(i))
{
isSpace = true;
count = 1;
y = i;
}
}
else
{
if (!hist(i))
{
isSpace = false;
ycoords.push_back(y / count);
}
else
{
y += i;
count++;
}
}
}
// Draw line as final result
Mat3b result;
cvtColor(rotated, result, COLOR_GRAY2BGR);
for (int i = 0; i < ycoords.size(); ++i)
{
line(result, Point(0, ycoords[i]), Point(result.cols, ycoords[i]), Scalar(0, 255, 0));
}
return 0;
}
基本步骤作为@Miki,
- read the source
- threshed
- find minAreaRect
- warp by the rotated matrix
- find and draw upper and lower bounds
虽然 代码在 Python:
#!/usr/bin/python3
# 2018.01.16 01:11:49 CST
# 2018.01.16 01:55:01 CST
import cv2
import numpy as np
## (1) read
img = cv2.imread("img02.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
## (2) threshold
th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV|cv2.THRESH_OTSU)
## (3) minAreaRect on the nozeros
pts = cv2.findNonZero(threshed)
ret = cv2.minAreaRect(pts)
(cx,cy), (w,h), ang = ret
if w>h:
w,h = h,w
ang += 90
## (4) Find rotated matrix, do rotation
M = cv2.getRotationMatrix2D((cx,cy), ang, 1.0)
rotated = cv2.warpAffine(threshed, M, (img.shape[1], img.shape[0]))
## (5) find and draw the upper and lower boundary of each lines
hist = cv2.reduce(rotated,1, cv2.REDUCE_AVG).reshape(-1)
th = 2
H,W = img.shape[:2]
uppers = [y for y in range(H-1) if hist[y]<=th and hist[y+1]>th]
lowers = [y for y in range(H-1) if hist[y]>th and hist[y+1]<=th]
rotated = cv2.cvtColor(rotated, cv2.COLOR_GRAY2BGR)
for y in uppers:
cv2.line(rotated, (0,y), (W, y), (255,0,0), 1)
for y in lowers:
cv2.line(rotated, (0,y), (W, y), (0,255,0), 1)
cv2.imwrite("result.png", rotated)
最后结果:
我正在尝试找到一种方法来打破已自适应阈值化的扫描文档中文本行的拆分。现在,我将文档的像素值存储为 0 到 255 之间的无符号整数,我取每行中像素的平均值,然后根据像素值的平均值是否为大于 250,然后我取它所适用的每个线范围的中值。但是,这种方法有时会失败,因为图像上可能会出现黑色斑点。
有没有更抗噪的方法来完成这个任务?
编辑:这是一些代码。 "warped"是原图的名字,"cuts"是我要分割的地方
warped = threshold_adaptive(warped, 250, offset = 10)
warped = warped.astype("uint8") * 255
# get areas where we can split image on whitespace to make OCR more accurate
color_level = np.array([np.sum(line) / len(line) for line in warped])
cuts = []
i = 0
while(i < len(color_level)):
if color_level[i] > 250:
begin = i
while(color_level[i] > 250):
i += 1
cuts.append((i + begin)/2) # middle of the whitespace region
else:
i += 1
编辑 2:添加了示例图像
根据您的输入图像,您需要将文本设为白色,将背景设为黑色
然后您需要计算帐单的旋转角度。一个简单的方法是找到所有白点的minAreaRect
(findNonZero
),你得到:
然后你可以旋转你的帐单,使文字水平:
现在您可以计算水平投影 (reduce
)。您可以取每行的平均值。在直方图上应用阈值 th
以考虑图像中的一些噪声(这里我使用 0
,即没有噪声)。只有背景的行在直方图中的值为 >0
,文本行的值为 0
。然后取直方图中每个连续的白色 bin 序列的平均 bin 坐标。那将是你的行的 y
坐标:
这里是代码。它是用 C++ 编写的,但由于大部分工作都是使用 OpenCV 函数,它应该很容易转换为 Python。至少,你可以以此为参考:
#include <opencv2/opencv.hpp>
using namespace cv;
using namespace std;
int main()
{
// Read image
Mat3b img = imread("path_to_image");
// Binarize image. Text is white, background is black
Mat1b bin;
cvtColor(img, bin, COLOR_BGR2GRAY);
bin = bin < 200;
// Find all white pixels
vector<Point> pts;
findNonZero(bin, pts);
// Get rotated rect of white pixels
RotatedRect box = minAreaRect(pts);
if (box.size.width > box.size.height)
{
swap(box.size.width, box.size.height);
box.angle += 90.f;
}
Point2f vertices[4];
box.points(vertices);
for (int i = 0; i < 4; ++i)
{
line(img, vertices[i], vertices[(i + 1) % 4], Scalar(0, 255, 0));
}
// Rotate the image according to the found angle
Mat1b rotated;
Mat M = getRotationMatrix2D(box.center, box.angle, 1.0);
warpAffine(bin, rotated, M, bin.size());
// Compute horizontal projections
Mat1f horProj;
reduce(rotated, horProj, 1, CV_REDUCE_AVG);
// Remove noise in histogram. White bins identify space lines, black bins identify text lines
float th = 0;
Mat1b hist = horProj <= th;
// Get mean coordinate of white white pixels groups
vector<int> ycoords;
int y = 0;
int count = 0;
bool isSpace = false;
for (int i = 0; i < rotated.rows; ++i)
{
if (!isSpace)
{
if (hist(i))
{
isSpace = true;
count = 1;
y = i;
}
}
else
{
if (!hist(i))
{
isSpace = false;
ycoords.push_back(y / count);
}
else
{
y += i;
count++;
}
}
}
// Draw line as final result
Mat3b result;
cvtColor(rotated, result, COLOR_GRAY2BGR);
for (int i = 0; i < ycoords.size(); ++i)
{
line(result, Point(0, ycoords[i]), Point(result.cols, ycoords[i]), Scalar(0, 255, 0));
}
return 0;
}
基本步骤作为@Miki,
- read the source
- threshed
- find minAreaRect
- warp by the rotated matrix
- find and draw upper and lower bounds
虽然 代码在 Python:
#!/usr/bin/python3
# 2018.01.16 01:11:49 CST
# 2018.01.16 01:55:01 CST
import cv2
import numpy as np
## (1) read
img = cv2.imread("img02.jpg")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
## (2) threshold
th, threshed = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY_INV|cv2.THRESH_OTSU)
## (3) minAreaRect on the nozeros
pts = cv2.findNonZero(threshed)
ret = cv2.minAreaRect(pts)
(cx,cy), (w,h), ang = ret
if w>h:
w,h = h,w
ang += 90
## (4) Find rotated matrix, do rotation
M = cv2.getRotationMatrix2D((cx,cy), ang, 1.0)
rotated = cv2.warpAffine(threshed, M, (img.shape[1], img.shape[0]))
## (5) find and draw the upper and lower boundary of each lines
hist = cv2.reduce(rotated,1, cv2.REDUCE_AVG).reshape(-1)
th = 2
H,W = img.shape[:2]
uppers = [y for y in range(H-1) if hist[y]<=th and hist[y+1]>th]
lowers = [y for y in range(H-1) if hist[y]>th and hist[y+1]<=th]
rotated = cv2.cvtColor(rotated, cv2.COLOR_GRAY2BGR)
for y in uppers:
cv2.line(rotated, (0,y), (W, y), (255,0,0), 1)
for y in lowers:
cv2.line(rotated, (0,y), (W, y), (0,255,0), 1)
cv2.imwrite("result.png", rotated)
最后结果: