快速遍歷OpenCV Mat圖像數據的多種方法和性能分析 | opencv mat for loop

本文首發於我的博客kezunlin.me/post/61d55a…,歡迎閱讀!

opencv mat for loop

Series

Guide

Mat

  • for gray image, use type <uchar>
  • for RGB color image, use type <Vec3b>

gray format storage: one uchar per pixel

color format storage: channels are interleaved in BGR order (B G R, B G R, ...)

We can use the method isContinuous() to check whether the memory buffer is continuous or not.

color space reduction

/**
 * Reduce one 8-bit channel value to the nearest lower multiple of 10.
 *
 *   0-9     ===> 0
 *   10-19   ===> 10
 *   20-29   ===> 20
 *   ...
 *   240-249 ===> 240
 *   250-255 ===> 250
 *
 * Every channel therefore keeps 26 distinct levels, so a 3-channel pixel
 * maps from 256*256*256 possible colors down to 26*26*26.
 *
 * @param pixel  channel value in [0, 255]
 * @return       (pixel / 10) * 10
 */
uchar color_space_reduction(uchar pixel)
{
    const int divideWith = 10;
    // Integer division drops the remainder; multiplying back yields the
    // bucket's lower bound, which always fits in a uchar.
    return static_cast<uchar>((pixel / divideWith) * divideWith);
}

color table

/**
 * Build the 256-entry color-reduction lookup table, where
 * entry i is 10 * (i / 10).
 *
 * @param out_table  optional destination with room for 256 uchar values.
 *                   When omitted (nullptr) the table is computed into a
 *                   local scratch buffer and discarded, which is what the
 *                   original zero-argument form did.
 */
void get_color_table(uchar* out_table = nullptr)
{
    // cache color value in table[256]
    const int divideWith = 10;
    uchar scratch[256];
    uchar* const table = (out_table != nullptr) ? out_table : scratch;
    for (int i = 0; i < 256; ++i)
        table[i] = static_cast<uchar>(divideWith * (i / divideWith));
}

C++

ptr []

// C ptr []: faster but not safe (no bounds checking on the row pointer).
//
// Replaces every channel byte of I with table[byte], in place.
//
// @param I      matrix to modify; must have 8-bit unsigned depth
// @param table  lookup table indexed by a uchar value, so it must hold
//               256 entries
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_Cptr(Mat& I, const uchar* const table)
{
    // accept only uchar type matrices: depth() returns a depth code
    // (CV_8U, CV_8S, ...), not a size in bytes.
    CV_Assert(I.depth() == CV_8U);

    const int channels = I.channels();
    int nRows = I.rows;
    int nCols = I.cols * channels;   // bytes per row, all channels interleaved

    // Without padding between rows the whole buffer can be treated as a
    // single long row.
    if (I.isContinuous())
    {
        nCols *= nRows;
        nRows = 1;
    }

    for (int i = 0; i < nRows; ++i)
    {
        uchar* const p = I.ptr<uchar>(i);
        for (int j = 0; j < nCols; ++j)
        {
            p[j] = table[p[j]];
        }
    }
    return I;
}

ptr ++

// C ptr ++: faster but not safe (raw pointer walk, no bounds checking).
//
// Replaces every channel byte of I with table[byte], in place.
//
// @param I      matrix to modify; must have 8-bit unsigned depth
// @param table  lookup table indexed by a uchar value, so it must hold
//               256 entries
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_Cptr2(Mat& I, const uchar* const table)
{
    // accept only uchar type matrices: depth() returns a depth code
    // (CV_8U, CV_8S, ...), not a size in bytes.
    CV_Assert(I.depth() == CV_8U);

    const int channels = I.channels();
    int nRows = I.rows;
    int nCols = I.cols * channels;   // bytes per row, all channels interleaved

    // Without padding between rows the whole buffer can be treated as a
    // single long row.
    if (I.isContinuous())
    {
        nCols *= nRows;
        nRows = 1;
    }

    // Walk each row separately: for a non-continuous matrix the rows are
    // not adjacent in memory, so one pointer range starting at row 0 would
    // run into padding or unrelated data.
    for (int i = 0; i < nRows; ++i)
    {
        uchar* p = I.ptr<uchar>(i);
        const uchar* const rowEnd = p + nCols;
        for (; p != rowEnd; ++p)
        {
            *p = table[*p];
        }
    }
    return I;
}

at (i,j)


// at<uchar>(i,j): random access, slow
//
// Replaces every channel value of I with table[value], in place, using
// per-element accessors. Only 1-channel and 3-channel matrices are
// processed; any other channel count leaves I untouched.
//
// @param I      matrix to modify; must have 8-bit unsigned depth
// @param table  lookup table indexed by a uchar value, so it must hold
//               256 entries
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_atRandomAccess(Mat& I, const uchar* const table)
{
    // accept only uchar type matrices: depth() returns a depth code
    // (CV_8U, CV_8S, ...), not a size in bytes.
    CV_Assert(I.depth() == CV_8U);

    const int channels = I.channels();
    switch (channels)
    {
    case 1:
    {
        for (int i = 0; i < I.rows; ++i)
            for (int j = 0; j < I.cols; ++j)
                I.at<uchar>(i, j) = table[I.at<uchar>(i, j)];
        break;
    }
    case 3:
    {
        // Typed wrapper so each pixel can be addressed as a Vec3b.
        Mat_<Vec3b> pixels = I;

        for (int i = 0; i < I.rows; ++i)
            for (int j = 0; j < I.cols; ++j)
            {
                Vec3b& px = pixels(i, j);
                px[0] = table[px[0]];
                px[1] = table[px[1]];
                px[2] = table[px[2]];
            }
        I = pixels;
        break;
    }
    }
    return I;
}


Iterator

// MatIterator_<uchar>: safe but slow
//
// Replaces every channel value of I with table[value], in place, using
// matrix iterators. Only 1-channel and 3-channel matrices are processed;
// any other channel count leaves I untouched.
//
// @param I      matrix to modify; must have 8-bit unsigned depth
// @param table  lookup table indexed by a uchar value, so it must hold
//               256 entries
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_Iterator(Mat& I, const uchar* const table)
{
    // accept only uchar type matrices: depth() returns a depth code
    // (CV_8U, CV_8S, ...), not a size in bytes.
    CV_Assert(I.depth() == CV_8U);

    const int channels = I.channels();
    switch (channels)
    {
    case 1:
    {
        MatIterator_<uchar> it, end;
        for (it = I.begin<uchar>(), end = I.end<uchar>(); it != end; ++it)
            *it = table[*it];
        break;
    }
    case 3:
    {
        MatIterator_<Vec3b> it, end;
        for (it = I.begin<Vec3b>(), end = I.end<Vec3b>(); it != end; ++it)
        {
            (*it)[0] = table[(*it)[0]];
            (*it)[1] = table[(*it)[1]];
            (*it)[2] = table[(*it)[2]];
        }
        break;
    }
    }
    return I;
}

opencv LUT

// LUT
//
// Applies the lookup table to I in place through cv::LUT.
//
// @param I      matrix to modify; it is passed to cv::LUT as both source
//               and destination
// @param table  256 lookup values; they are copied into a temporary
//               1x256 CV_8U matrix
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_LUT(Mat& I, const uchar* const table)
{
    // cv::LUT takes the table as a matrix, so copy the raw array into one.
    Mat lookUpTable(1, 256, CV_8U);
    uchar* const p = lookUpTable.data;
    for (int i = 0; i < 256; ++i)
        p[i] = table[i];

    cv::LUT(I, lookUpTable, I);
    return I;
}

forEach

The forEach method of the Mat class utilizes all the cores on your machine to apply a function to every pixel.

// Function object for parallel execution: holds its own copy of a
// 256-entry lookup table and maps one uchar through it per call.
struct ForEachOperator
{
    uchar m_table[256];

    // Copies the 256 entries of `table` into m_table.
    ForEachOperator(const uchar* const table)
    {
        const uchar* src = table;
        for (uchar* dst = m_table; dst != m_table + 256; ++dst, ++src)
        {
            *dst = *src;
        }
    }

    // Replaces p with its table entry; the position argument is not used.
    void operator ()(uchar& p, const int* /*position*/) const
    {
        p = m_table[p];
    }
};

// forEach use multiple processors, very fast
//
// Replaces every channel byte of I with table[byte], in place, using
// Mat::forEach with a ForEachOperator.
//
// @param I      matrix to modify; must have 8-bit unsigned depth
// @param table  lookup table indexed by a uchar value, so it must hold
//               256 entries
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_forEach(Mat& I, const uchar* const table)
{
    // accept only uchar type matrices
    CV_Assert(I.depth() == CV_8U);

    // forEach<uchar> needs the element type to match the matrix element,
    // so view multi-channel data as single-channel. reshape(1) creates a
    // new header over the same pixel buffer (no copy), which means writes
    // through `flat` land in I.
    Mat flat = I.reshape(1);
    flat.forEach<uchar>(ForEachOperator(table));
    return I;
}

forEach with lambda


// forEach lambda use multiple processors, very fast (lambda slower than ForEachOperator)
//
// Replaces every channel byte of I with table[byte], in place, using
// Mat::forEach with a lambda that captures the table pointer by value.
//
// @param I      matrix to modify; must have 8-bit unsigned depth
// @param table  lookup table indexed by a uchar value, so it must hold
//               256 entries and stay valid for the duration of the call
// @return       reference to the same matrix I
Mat& ScanImageAndReduce_forEach_with_lambda(Mat& I, const uchar* const table)
{
    // accept only uchar type matrices
    CV_Assert(I.depth() == CV_8U);

    // forEach<uchar> needs the element type to match the matrix element,
    // so view multi-channel data as single-channel. reshape(1) creates a
    // new header over the same pixel buffer (no copy), which means writes
    // through `flat` land in I.
    Mat flat = I.reshape(1);
    flat.forEach<uchar>
    (
        [=](uchar &p, const int * /*position*/) -> void
        {
            p = table[p];
        }
    );
    return I;
}

time cost

no foreach

[1 Cptr   ] times=5000, total_cost=988 ms, avg_cost=0.1976 ms
[1 Cptr2  ] times=5000, total_cost=1704 ms, avg_cost=0.3408 ms
[2 atRandom] times=5000, total_cost=9611 ms, avg_cost=1.9222 ms
[3 Iterator] times=5000, total_cost=20195 ms, avg_cost=4.039 ms
[4 LUT    ] times=5000, total_cost=899 ms, avg_cost=0.1798 ms

[1 Cptr   ] times=10000, total_cost=2425 ms, avg_cost=0.2425 ms
[1 Cptr2  ] times=10000, total_cost=3391 ms, avg_cost=0.3391 ms
[2 atRandom] times=10000, total_cost=20024 ms, avg_cost=2.0024 ms
[3 Iterator] times=10000, total_cost=39980 ms, avg_cost=3.998 ms
[4 LUT    ] times=10000, total_cost=103 ms, avg_cost=0.0103 ms

foreach

[5 forEach     ] times=200000, total_cost=199 ms, avg_cost=0.000995 ms
[5 forEach lambda] times=200000, total_cost=521 ms, avg_cost=0.002605 ms

[5 forEach     ] times=20000, total_cost=17 ms, avg_cost=0.00085 ms
[5 forEach lambda] times=20000, total_cost=23 ms, avg_cost=0.00115 ms

results

| Loop Type      | Time Cost (us) |
|----------------|----------------|
| ptr []         | 242            |
| ptr ++         | 339            |
| at             | 2002           |
| iterator       | 3998           |
| LUT            | 10             |
| forEach        | 0.85           |
| forEach lambda | 1.15           |

In these measurements forEach is about 10x faster than LUT, 240~340x faster than ptr [] and ptr ++, and 2000~4000x faster than at and iterator.

code

code here

Python

pure python

# import the necessary packages
import matplotlib.pyplot as plt
import cv2
print(cv2.__version__)

%matplotlib inline

3.4.2

複製代碼
# load the original image from disk and convert it from BGR to grayscale
image = cv2.imread("cat.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# print the (rows, cols) shape of the grayscale image
print(image.shape)
#plt.imshow(image, cmap="gray")

(360, 480)

複製代碼
%load_ext cython

The cython extension is already loaded. To reload it, use:
      %reload_ext cython

複製代碼
%%cython -a
 
def threshold_python(T, image):
    """Binarize a 2-D image in place with a plain Python double loop.

    Every pixel whose value is >= T becomes 255; every other pixel becomes 0.

    :param T: threshold value compared against each pixel
    :param image: 2-D indexable array exposing ``shape``; modified in place
    :return: the same ``image`` object after thresholding
    """
    # grab the image dimensions
    h = image.shape[0]
    w = image.shape[1]

    # loop over the image, pixel by pixel
    for y in range(0, h):
        for x in range(0, w):
            # threshold the pixel
            image[y, x] = 255 if image[y, x] >= T else 0

    # return the thresholded image
    return image

%timeit threshold_python(5, image)

263 ms ± 20.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
複製代碼

cython

%%cython -a
 
import cython
 
@cython.boundscheck(False)
cpdef unsigned char[:, :] threshold_cython(int T, unsigned char [:, :] image):
    """Binarize a 2-D unsigned-char memoryview in place.

    Every pixel whose value is >= T becomes 255; every other pixel becomes 0.
    Bounds checking is switched off by the decorator, so the loops must stay
    within ``image.shape``.

    :param T: threshold value compared against each pixel
    :param image: 2-D unsigned char memoryview; modified in place
    :return: the same memoryview after thresholding
    """
    # set the variable extension types (C ints for the loop counters/sizes)
    cdef int x, y, w, h

    # grab the image dimensions
    h = image.shape[0]
    w = image.shape[1]

    # loop over the image
    for y in range(0, h):
        for x in range(0, w):
            # threshold the pixel
            image[y, x] = 255 if image[y, x] >= T else 0

    # return the thresholded image
    return image

numba

%timeit threshold_cython(5, image)

150 µs ± 7.14 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)

複製代碼
from numba import njit

@njit
def threshold_njit(T, image):
    # Binarize a 2-D image in place; this function is compiled by numba's
    # njit decorator.
    # Every pixel whose value is >= T becomes 255; every other pixel becomes 0.
    # Returns the same image object.

    # grab the image dimensions
    h = image.shape[0]
    w = image.shape[1]

    # loop over the image, pixel by pixel
    for y in range(0, h):
        for x in range(0, w):
            # threshold the pixel
            image[y, x] = 255 if image[y, x] >= T else 0

    # return the thresholded image
    return image

%timeit threshold_njit(5, image)

43.5 µs ± 142 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
複製代碼

numpy

def threshold_numpy(T, image):
    """Binarize an image in place using boolean-mask indexing.

    Every pixel whose value is >= T becomes 255; every other pixel becomes 0,
    matching the loop-based threshold_* implementations.

    :param T: threshold value compared against each pixel
    :param image: array supporting element-wise comparison and boolean-mask
        assignment (e.g. a numpy ndarray); modified in place
    :return: the same ``image`` object after thresholding
    """
    # Compute the mask once, before any write, so both assignments are
    # based on the original pixel values.
    at_or_above = image >= T
    image[at_or_above] = 255
    image[~at_or_above] = 0
    return image

%timeit threshold_numpy(5, image)

111 µs ± 334 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
複製代碼

conclusions

image = cv2.imread("cat.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
print(image.shape)

%timeit threshold_python(5, image)
%timeit threshold_cython(5, image)
%timeit threshold_njit(5, image)
%timeit threshold_numpy(5, image)複製代碼

(360, 480)
    251 ms ± 6.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    143 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    43.8 µs ± 284 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
    113 µs ± 957 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)

複製代碼
image = cv2.imread("big.jpg")
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
print(image.shape)

%timeit threshold_python(5, image)
%timeit threshold_cython(5, image)
%timeit threshold_njit(5, image)
%timeit threshold_numpy(5, image)複製代碼

(2880, 5120)
    21.8 s ± 460 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
    12.3 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    3.91 ms ± 66.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
    10.3 ms ± 179 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
複製代碼

(360, 480)

  • python: 251 ms
  • cython: 143 us
  • numba: 43 us
  • numpy: 113 us

(2880, 5120)

  • python: 21 s
  • cython: 12 ms
  • numba: 4 ms
  • numpy: 10 ms

Reference

History

  • 20180823: created.

Copyright

相關文章
相關標籤/搜索