OpenCV實現基於傅里葉變換的旋轉文本校訂

時間 2019-11-05

標籤 opencv 實現基於傅里葉變換旋轉文本校訂简体版

原文原文鏈接

代碼

先給出代碼，再詳細解釋一下過程：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

#include <opencv2/core/core.hpp>

#include <opencv2/imgproc/imgproc.hpp>

#include <opencv2/highgui/highgui.hpp>

#include <iostream>

using namespace cv;

using namespace std;

#define GRAY_THRESH 150

#define HOUGH_VOTE 100

//#define DEGREE 27

int main(int argc, char **argv)

{

//Read a single-channel image

const char* filename = "imageText.jpg";

Mat srcImg = imread(filename, CV_LOAD_IMAGE_GRAYSCALE);

if(srcImg.empty())

return -1;

imshow("source", srcImg);

Point center(srcImg.cols/2, srcImg.rows/2);

#ifdef DEGREE

//Rotate source image

Mat rotMatS = getRotationMatrix2D(center, DEGREE, 1.0);

warpAffine(srcImg, srcImg, rotMatS, srcImg.size(), 1, 0, Scalar(255,255,255));

imshow("RotatedSrc", srcImg);

//imwrite("imageText_R.jpg",srcImg);

#endif

//Expand image to an optimal size, for faster processing speed

//Set widths of borders in four directions

//If borderType==BORDER_CONSTANT, fill the borders with (0,0,0)

Mat padded;

int opWidth = getOptimalDFTSize(srcImg.rows);

int opHeight = getOptimalDFTSize(srcImg.cols);

copyMakeBorder(srcImg, padded, 0, opWidth-srcImg.rows, 0, opHeight-srcImg.cols, BORDER_CONSTANT, Scalar::all(0));

Mat planes[] = {Mat_<float>(padded), Mat::zeros(padded.size(), CV_32F)};

Mat comImg;

//Merge into a double-channel image

merge(planes,2,comImg);

//Use the same image as input and output,

//so that the results can fit in Mat well

dft(comImg, comImg);

//Compute the magnitude

//planes[0]=Re(DFT(I)), planes[1]=Im(DFT(I))

//magnitude=sqrt(Re^2+Im^2)

split(comImg, planes);

magnitude(planes[0], planes[1], planes[0]);

//Switch to logarithmic scale, for better visual results

//M2=log(1+M1)

Mat magMat = planes[0];

magMat += Scalar::all(1);

log(magMat, magMat);

//Crop the spectrum

//Width and height of magMat should be even, so that they can be divided by 2

//-2 is 11111110 in binary system, operator & make sure width and height are always even

magMat = magMat(Rect(0, 0, magMat.cols & -2, magMat.rows & -2));

//Rearrange the quadrants of Fourier image,

//so that the origin is at the center of image,

//and move the high frequency to the corners

int cx = magMat.cols/2;

int cy = magMat.rows/2;

Mat q0(magMat, Rect(0, 0, cx, cy));

Mat q1(magMat, Rect(0, cy, cx, cy));

Mat q2(magMat, Rect(cx, cy, cx, cy));

Mat q3(magMat, Rect(cx, 0, cx, cy));

Mat tmp;

q0.copyTo(tmp);

q2.copyTo(q0);

tmp.copyTo(q2);

q1.copyTo(tmp);

q3.copyTo(q1);

tmp.copyTo(q3);

//Normalize the magnitude to [0,1], then to[0,255]

normalize(magMat, magMat, 0, 1, CV_MINMAX);

Mat magImg(magMat.size(), CV_8UC1);

magMat.convertTo(magImg,CV_8UC1,255,0);

imshow("magnitude", magImg);

//imwrite("imageText_mag.jpg",magImg);

//Turn into binary image

threshold(magImg,magImg,GRAY_THRESH,255,CV_THRESH_BINARY);

imshow("mag_binary", magImg);

//imwrite("imageText_bin.jpg",magImg);

//Find lines with Hough Transformation

vector<Vec2f> lines;

float pi180 = (float)CV_PI/180;

Mat linImg(magImg.size(),CV_8UC3);

HoughLines(magImg,lines,1,pi180,HOUGH_VOTE,0,0);

int numLines = lines.size();

for(int l=0; l<numLines; l++)

{

float rho = lines[l][0], theta = lines[l][1];

Point pt1, pt2;

double a = cos(theta), b = sin(theta);

double x0 = a*rho, y0 = b*rho;

pt1.x = cvRound(x0 + 1000*(-b));

pt1.y = cvRound(y0 + 1000*(a));

pt2.x = cvRound(x0 - 1000*(-b));

pt2.y = cvRound(y0 - 1000*(a));

line(linImg,pt1,pt2,Scalar(255,0,0),3,8,0);

}

imshow("lines",linImg);

//imwrite("imageText_line.jpg",linImg);

if(lines.size() == 3){

cout << "found three angels:" << endl;

cout << lines[0][1]*180/CV_PI << endl << lines[1][1]*180/CV_PI << endl << lines[2][1]*180/CV_PI << endl << endl;

}

//Find the proper angel from the three found angels

float angel=0;

float piThresh = (float)CV_PI/90;

float pi2 = CV_PI/2;

for(int l=0; l<numLines; l++)

{

float theta = lines[l][1];

if(abs(theta) < piThresh || abs(theta-pi2) < piThresh)

continue;

else{

angel = theta;

break;

}

//Calculate the rotation angel

//The image has to be square,

//so that the rotation angel can be calculate right

angel = angel<pi2 ? angel : angel-CV_PI;

if(angel != pi2){

float angelT = srcImg.rows*tan(angel)/srcImg.cols;

angel = atan(angelT);

}

float angelD = angel*180/(float)CV_PI;

cout << "the rotation angel to be applied:" << endl << angelD << endl << endl;

//Rotate the image to recover

Mat rotMat = getRotationMatrix2D(center,angelD,1.0);

Mat dstImg = Mat::ones(srcImg.size(),CV_8UC3);

warpAffine(srcImg,dstImg,rotMat,srcImg.size(),1,0,Scalar(255,255,255));

imshow("result",dstImg);

//imwrite("imageText_D.jpg",dstImg);

waitKey(0);

return 0;

}

過程
讀取圖片

Mat srcImg = imread(filename, CV_LOAD_IMAGE_GRAYSCALE);

if(srcImg.empty())

return -1;

srcImg.empty()用來判斷是否成功讀進圖像，若是srcImg中沒有數據，在後面的步驟會產生內存錯誤。
因爲處理的是文本，彩色信息不會提供額外幫助，因此要用CV_LOAD_IMAGE_GRAYSCALE代表以灰度形式讀進圖像。
假定讀取的圖像如下：

旋轉原圖像(可選)

Point center(srcImg.cols/2, srcImg.rows/2);

#ifdef DEGREE

//Rotate source image

Mat rotMatS = getRotationMatrix2D(center, DEGREE, 1.0);

warpAffine(srcImg, srcImg, rotMatS, srcImg.size(), 1, 0, Scalar(255,255,255));

imshow("RotatedSrc", srcImg);

//imwrite("H:\\imageText_02_R.jpg",srcImg);

#endif

如果手頭沒有這樣的傾斜圖像，可以選擇一張正放的文本圖像，再把 //#define DEGREE 27 那行前的註釋符號去掉。然後這部分代碼就會把所給的圖像旋轉你規定的角度，再交給後面處理。

圖像延擴

Mat padded;

int opWidth = getOptimalDFTSize(srcImg.rows);

int opHeight = getOptimalDFTSize(srcImg.cols);

copyMakeBorder(srcImg, padded, 0, opWidth-srcImg.rows, 0, opHeight-srcImg.cols, BORDER_CONSTANT, Scalar::all(0));

OpenCV中的DFT採用的是快速算法，這種算法在圖像的尺寸可以分解爲2、3和5的乘積時處理速度最快。因此需要用getOptimalDFTSize()找到最適合的尺寸，然後用copyMakeBorder()填充多餘的部分。這裏是讓原圖像和擴大的圖像左上角對齊。填充的顏色如果是純色，對變換結果的影響不會很大，後面尋找傾斜線的過程又會完全忽略這一點影響。

DFT

Mat planes[] = {Mat_<float>(padded), Mat::zeros(padded.size(), CV_32F)};

Mat comImg;

merge(planes,2,comImg);

dft(comImg, comImg);

DFT要分別計算實部和虛部，把要處理的圖像作爲輸入的實部、一個全零的圖像作爲輸入的虛部。dft()的輸入和輸出應該分別爲單張圖像，因此要先用merge()把實虛部圖像合併，分別處於圖像comImg的兩個通道內。計算得到的實虛部仍然保存在comImg的兩個通道內。

得到DFT圖像

split(comImg, planes);

magnitude(planes[0], planes[1], planes[0]);

Mat magMat = planes[0];

magMat += Scalar::all(1);

log(magMat, magMat);

通常都會用幅度圖像來表示圖像傅里葉的變換結果（傅里葉譜）。
幅度的計算公式：magnitude = sqrt(Re(DFT)^2 + Im(DFT)^2)。
因爲幅度的變化範圍很大，而通常圖像亮度範圍只有[0,255]，容易形成一大片漆黑，只有幾個點很亮。因此要用log函數把數值的範圍縮小。

magMat = magMat(Rect(0, 0, magMat.cols & -2, magMat.rows & -2));

int cx = magMat.cols/2;

int cy = magMat.rows/2;

Mat q0(magMat, Rect(0, 0, cx, cy));

Mat q1(magMat, Rect(0, cy, cx, cy));

Mat q2(magMat, Rect(cx, cy, cx, cy));

Mat q3(magMat, Rect(cx, 0, cx, cy));

Mat tmp;

q0.copyTo(tmp);

q2.copyTo(q0);

tmp.copyTo(q2);

q1.copyTo(tmp);

q3.copyTo(q1);

tmp.copyTo(q3);

normalize(magMat, magMat, 0, 1, CV_MINMAX);

Mat magImg(magMat.size(), CV_8UC1);

magMat.convertTo(magImg,CV_8UC1,255,0);

dft()直接得到的結果中，低頻部分位於四角，高頻部分位於中間。習慣上會把圖像作四等份，互相對調，使低頻部分位於圖像中心，也就是讓頻域原點位於中心。

雖然用log()縮小了數據範圍，但仍然不能保證數值都落在[0,255]以內，因此要先用normalize()規範化到[0,1]內，再用convertTo()把小數映射到[0,255]內的整數。結果保存在一幅單通道圖像內：

Hough直線檢測
從傅里葉譜能夠明顯地看到一條過中心點的傾斜直線。要想求出這個傾斜角，首先要在圖像上找出這條直線。
一個很方便的方法是採用霍夫（Hough）變換檢測直線。

threshold(magImg,magImg,GRAY_THRESH,255,CV_THRESH_BINARY);

Hough變換要求輸入圖像是二值的，因此要用threshold()把圖像二值化。
二值化的一種結果：

vector<Vec2f> lines;

float pi180 = (float)CV_PI/180;

Mat linImg(magImg.size(),CV_8UC3);

HoughLines(magImg,lines,1,pi180,HOUGH_VOTE,0,0);

int numLines = lines.size();

for(int l=0; l<numLines; l++)

{

float rho = lines[l][0], theta = lines[l][1];

Point pt1, pt2;

double a = cos(theta), b = sin(theta);

double x0 = a*rho, y0 = b*rho;

pt1.x = cvRound(x0 + 1000*(-b));

pt1.y = cvRound(y0 + 1000*(a));

pt2.x = cvRound(x0 - 1000*(-b));

pt2.y = cvRound(y0 - 1000*(a));

line(linImg,pt1,pt2,Scalar(255,0,0),3,8,0);

}

這一部分用HoughLines()檢測圖像中可能存在的直線，並把直線參數保存在向量組lines中，而後繪製出找到的直線。
兩個參數GRAY_THRESH和HOUGH_VOTE需要手動指定，不同的圖像需要設置不同的參數，同一段文本旋轉不同的角度也需要不同的參數。GRAY_THRESH越大，二值化的閾值就越高；HOUGH_VOTE越大，霍夫檢測的投票數就越高（需要更多的共線點來確定一條直線）。說白了，如果發現二值化圖像中直線附近有很多散點，就要適當提高GRAY_THRESH；如果發現從二值圖像的一條直線上檢測到了幾條角度相差很小的直線，就需要適當提高HOUGH_VOTE。我們希望得到的結果是恰好檢測到三條直線（有時只能檢測到一條直線，後面會給出一個例子）。
檢測到的直線：

計算傾斜角
上面獲得了三個角度，一個是0度，一個是90度，另外一個就是咱們所須要的傾斜角。要把這個角找出來，並且要考慮偏差。

float angel=0;

float piThresh = (float)CV_PI/90;

float pi2 = CV_PI/2;

for(int l=0; l<numLines; l++)

{

float theta = lines[l][1];

if(abs(theta) < piThresh || abs(theta-pi2) < piThresh)

continue;

else{

angel = theta;

break;

}

angel = angel<pi2 ? angel : angel-CV_PI;

if(angel != pi2){

float angelT = srcImg.rows*tan(angel)/srcImg.cols;

angel = atan(angelT);

}

float angelD = angel*180/(float)CV_PI;

因爲DFT的特色，只有輸入圖像是正方形時，檢測到的角纔是文本真正旋轉的角度。但咱們的輸入圖像不必定是正方形的，因此要根據圖像的長寬比改變這個角度。
還有一個須要注意的細節，雖然HoughLines()輸出的傾斜角在[0,180)之間，但在[0,90]和(90,180)之間這個角的含義是不一樣的。請看圖示：

當傾斜角大於90度時，(180-傾斜角)纔是直線相對豎直方向的偏離角度。在OpenCV中，逆時針旋轉，角度爲正。要把圖像轉回去，這個角度就變成了(傾斜角-180)。
校訂圖像
最後一步，固然是把圖像轉回去

Mat rotMat = getRotationMatrix2D(center,angelD,1.0);

Mat dstImg = Mat::ones(srcImg.size(),CV_8UC3);

warpAffine(srcImg,dstImg,rotMat,srcImg.size(),1,0,Scalar(255,255,255));

先用getRotationMatrix2D()得到一個2*3的仿射變換矩陣，再把這個矩陣輸入warpAffine()，作一個單純旋轉的仿射變換。warpAffine()的最後一個參數Scalar(255,255,255)是把因爲旋轉產生的空白用白色填充。
校訂的結果：

一個檢測單條直線的例子
原始圖像：

傅里葉譜：

只有一條明顯的直線。還好僅有的這條直線正是咱們所須要的。
檢測直線：

校訂結果：

對中文的效果
咱們來試試看這段程序對中文的校訂效果。
輸入圖像：

傅里葉譜：

能夠發現有許多條平行的亮線，其中過頻域原點的那條長度最長，最容易檢測出來。
檢測直線：

校訂結果：

雖然中文和英文在文字上有很大的不一樣，但字母（或者文字）的高度比較一致，使得行與行之間的分隔很明顯。因此它們的頻域特徵是類似的。

對其餘語言文字的效果
我從IMDB.com摘取影片《教父》的英文介紹，而後用谷歌翻譯成其餘文字進行測試。
阿拉伯語

一枚反例
老撾語：

傅里葉譜：

一種二值化的結果：

直線檢測：

這種文字的不少字母的上下方多了不少「筆畫」（我不知道該怎麼稱呼那些小曲線），讓行與行之間的分離變得不明顯，使得頻域特徵變得不明顯。
雖然用肉眼可以看出傅里葉譜中存在一條傾斜的直線，但它的亮度過低，二值化過程很難排除噪聲，致使直線檢測會首先檢出噪聲產生的直線。這也是我的程序目前受限之處。需要增加一個過濾散點噪聲的步驟以增加程序的適用範圍。

參考：Discrete Fourier Transform — OpenCV 2.4.7.0 documentation

代碼還能夠在這裏下載：https://github.com/johnhany/textRotCorrect

原文：http://johnhany.net/2013/11/dft-based-text-rotation-correction/