SSE圖像算法優化系列十八：三次卷積插值的進一步SSE優化。

時間 2019-11-06

標籤 sse 圖像算法優化系列十八三次插值進一步简体版

原文原文鏈接

　　本文是在學習https://blog.csdn.net/housisong/article/details/1452249一文的基礎上對算法的理解和從新整理，再次很是感謝原文做者的深刻分析以及分享。html

　　三次卷積插值的基礎原理也是對取樣點附近的領域像素按照某種權重分佈計算加權的結果值，比起雙線性的4個領域像素計算，三次卷積涉及到了16個領域像素，這也決定了其取樣點位置不是對稱的，同時耗時比雙線性也大爲增長。算法

如左圖所示，P00爲向下取整後的取樣點的座標，其領域16個像素的位置總體靠取樣點的右下側，各個位置的重係數並非固定值，而是和取樣點的浮點座標的小數部分關。其值由函數Sin(x * pi) / (x * pi)決定，該函數曲線如右圖藍色曲線所示，當小數部分假定爲U時，在水平或者垂直方向的4個權重份量對應的x值分別爲：1+U、U、1-U以及2-U。緩存

　　實際的操做中，咱們經常用一個擬合的表達式來近似該曲線，好比原文做者提供的以下代碼：app

float SinXDivX(float X) { const float a = -1;                    // a還能夠取 a=-2,-1,-0.75,-0.5等等，起到調節銳化或模糊程度的做用
    X = abs(X); float X2 = X * X, X3 = X2 * X; if (X <= 1) return (a + 2) * X3 - (a + 3) * X2 + 1; else if (X <= 2) return a * X3 - (5 * a) * X2 + (8 * a) * X - (4 * a); else
        return 0; }

　　標準的函數應該是：ide

float SinXDivX_Standard(float X) { if (abs(X) < 0.000001f) return 1; else
        return sin(X * 3.1415926f) / (X * 3.1415926f); }

　　注意到一點，好比X取值爲0.3，若是按照標準函數函數

　　　　SinXDivX_Standard(1 + X) + SinXDivX_Standard(X) + SinXDivX_Standard(1 - X) + SinXDivX_Standard(2 - X) = 0.8767學習

　　可是若是是下式：　　測試

SinXDivX(1 + X) + SinXDivX(X) + SinXDivX(1 - X) + SinXDivX(2 - X) 則等於1。優化

　　因此使用擬合式的好處就是權重係數以後無需在進行歸一化的處理了。ui

　　對於一個浮點的映射座標，使用三次卷積插值的簡單的代碼以下所示：

void Bicubic_Original(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, float X, float Y) { int Channel = Stride / Width; int PosX = floor(X), PosY = floor(Y); float PartXX = X - PosX, PartYY = Y - PosY; unsigned char *Pixel00 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY - 1); unsigned char *Pixel01 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY - 1); unsigned char *Pixel02 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY - 1); unsigned char *Pixel03 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY - 1); unsigned char *Pixel10 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 0); unsigned char *Pixel11 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 0); unsigned char *Pixel12 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 0); unsigned char *Pixel13 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 0); unsigned char *Pixel20 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 1); unsigned char *Pixel21 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 1); unsigned char *Pixel22 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 1); unsigned char *Pixel23 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 1); unsigned char *Pixel30 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 2); unsigned char *Pixel31 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 2); unsigned char *Pixel32 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 2); unsigned char *Pixel33 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 2); float U0 = SinXDivX(1 + PartXX), U1 = SinXDivX(PartXX); float U2 = SinXDivX(1 - PartXX), U3 = SinXDivX(2 - PartXX); float V0 = SinXDivX(1 + PartYY), V1 = SinXDivX(PartYY); float V2 = SinXDivX(1 - PartYY), V3 = SinXDivX(2 - PartYY); for (int I = 0; I < Channel; I++) { float Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0; float Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1; float Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2; float Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel22[I] * U2 + Pixel33[I] * U3) * V3; Pixel[I] = IM_ClampToByte(Sum1 + Sum2 + Sum3 + Sum4 + 0.5f); } }

　　其中GetCheckedPixel爲簡單的取像素值的函數。

inline unsigned char *GetCheckedPixel(unsigned char *Src, int Width, int Height, int Stride, int Channel, int PosX, int PosY) { return Src + IM_ClampI(PosY, 0, Height - 1) * Stride + IM_ClampI(PosX, 0, Width - 1) * Channel; }

　　參考做者的源代碼，一個最直接的三次卷積插值的函數以下所示：

int IM_Resample_Original(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD, int InterpolationMode) { int Channel = StrideS / SrcW; if ((Src == NULL) || (Dest == NULL))                                return IM_STATUS_NULLREFRENCE; if ((SrcW <= 0) || (SrcH <= 0) || (DstW <= 0) || (DstH <= 0))        return IM_STATUS_INVALIDPARAMETER; if ((Channel != 1) && (Channel != 3) && (Channel != 4))                return IM_STATUS_INVALIDPARAMETER; if ((SrcW == DstW) && (SrcH == DstH)) { memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); return IM_STATUS_OK; } // 已經論證這個沒有必要用SSE去作優化，速度不會有太大的變化， 2018.3.28
    if (InterpolationMode == 0)                            // 最近鄰插值
 { } else if (InterpolationMode == 1)                    　　// 雙線性插值方式
 { } else if (InterpolationMode == 2)            　　　　　　// 三次立方插值
 { for (int Y = 0; Y < DstH; Y++) { unsigned char *LinePD = Dest + Y * StrideD; float SrcY = (Y + 0.4999999f) * SrcH / DstH - 0.5f; for (int X = 0; X < DstW; X++) { float SrcX = (X + 0.4999999f) * SrcW / DstW - 0.5f; Bicubic_Original(Src, SrcW, SrcH, StrideS, LinePD, SrcX, SrcY); LinePD += Channel; } } } return IM_STATUS_OK; }

　　這個速度是很是緩慢的，由於有大量的浮點計算和座標位置計算。

　　爲了提升速度，原文的做者對該算法進行了大量的優化，主要包括（1）使用定點數來優化縮放函數；（2）邊界和內部分開處理；（3）對SinXDivX作一個查找表；（4）對border_color作一個查找表，我按照我本身的思路進一步整理成了我比較熟悉的代碼格式，主要以下片斷所示：

// 邊界處的三次立方插值
__forceinline void Bicubic_Border(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, short *SinXDivX_Table, int SrcX, int SrcY) { int Channel = Stride / Width; int U = (unsigned char)(SrcX >> 8), V = (unsigned char)(SrcY >> 8); int U0 = SinXDivX_Table[256 + U], U1 = SinXDivX_Table[U]; int U2 = SinXDivX_Table[256 - U], U3 = SinXDivX_Table[512 - U]; int V0 = SinXDivX_Table[256 + V], V1 = SinXDivX_Table[V]; int V2 = SinXDivX_Table[256 - V], V3 = SinXDivX_Table[512 - V]; int PosX = SrcX >> 16, PosY = SrcY >> 16; unsigned char *Pixel00 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY - 1); unsigned char *Pixel01 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY - 1); unsigned char *Pixel02 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY - 1); unsigned char *Pixel03 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY - 1); unsigned char *Pixel10 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 0); unsigned char *Pixel11 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 0); unsigned char *Pixel12 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 0); unsigned char *Pixel13 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 0); unsigned char *Pixel20 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 1); unsigned char *Pixel21 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 1); unsigned char *Pixel22 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 1); unsigned char *Pixel23 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 1); unsigned char *Pixel30 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX - 1, PosY + 2); unsigned char *Pixel31 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 0, PosY + 2); unsigned char *Pixel32 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 1, PosY + 2); unsigned char *Pixel33 = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + 2, PosY + 2); for (int I = 0; I < Channel; I++) { int Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0; int Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1; int Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2; int Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel22[I] * U2 + Pixel33[I] * U3) * V3; Pixel[I] = IM_ClampToByte((Sum1 + Sum2 + Sum3 + Sum4) >> 16); } } // __forceinline強制內聯仍是能提升點速度的，畢竟這個函數的參數不少 // 若是是肯定的通道數，能夠把裏面的Channel改成固定的值，速度能提升不少
__forceinline void Bicubic_Center(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, short *SinXDivX_Table, int SrcX, int SrcY) { int Channel = Stride / Width; int U = (unsigned char)(SrcX >> 8), V = (unsigned char)(SrcY >> 8); int U0 = SinXDivX_Table[256 + U], U1 = SinXDivX_Table[U]; int U2 = SinXDivX_Table[256 - U], U3 = SinXDivX_Table[512 - U]; int V0 = SinXDivX_Table[256 + V], V1 = SinXDivX_Table[V]; int V2 = SinXDivX_Table[256 - V], V3 = SinXDivX_Table[512 - V]; int PosX = SrcX >> 16, PosY = SrcY >> 16; unsigned char *Pixel00 = Src + (PosY - 1) * Stride + (PosX - 1) * Channel; unsigned char *Pixel01 = Pixel00 + Channel; unsigned char *Pixel02 = Pixel01 + Channel; unsigned char *Pixel03 = Pixel02 + Channel; unsigned char *Pixel10 = Pixel00 + Stride; unsigned char *Pixel11 = Pixel10 + Channel; unsigned char *Pixel12 = Pixel11 + Channel; unsigned char *Pixel13 = Pixel12 + Channel; unsigned char *Pixel20 = Pixel10 + Stride; unsigned char *Pixel21 = Pixel20 + Channel; unsigned char *Pixel22 = Pixel21 + Channel; unsigned char *Pixel23 = Pixel22 + Channel; unsigned char *Pixel30 = Pixel20 + Stride; unsigned char *Pixel31 = Pixel30 + Channel; unsigned char *Pixel32 = Pixel31 + Channel; unsigned char *Pixel33 = Pixel32 + Channel; for (int I = 0; I < Channel; I++) { int Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0; int Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1; int Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2; int Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel22[I] * U2 + Pixel33[I] * U3) * V3; Pixel[I] = IM_ClampToByte((Sum1 + Sum2 + Sum3 + Sum4) >> 16); } } int IM_Resample_PureC(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD, int InterpolationMode) { int Channel = StrideS / SrcW; if ((Src == NULL) || (Dest == NULL))                                return IM_STATUS_NULLREFRENCE; if ((SrcW <= 0) || (SrcH <= 0) || (DstW <= 0) || (DstH <= 0))        return IM_STATUS_INVALIDPARAMETER; if ((Channel != 1) && (Channel != 3) && (Channel != 4))                return IM_STATUS_INVALIDPARAMETER; if ((SrcW == DstW) && (SrcH == DstH)) { memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); return IM_STATUS_OK; } // 已經論證這個沒有必要用SSE去作優化，速度不會有太大的變化， 2018.3.28
    if (InterpolationMode == 0)                            // 最近鄰插值
 { } else if (InterpolationMode == 1)                    // 雙線性插值方式
 { } else if (InterpolationMode == 2)            // 三次立方插值
 { short *SinXDivX_Table = (short *)malloc(513 * sizeof(short)); if (SinXDivX_Table == NULL) { if (SinXDivX_Table != NULL) free(SinXDivX_Table); return IM_STATUS_NULLREFRENCE; } for (int I = 0; I < 513; I++) SinXDivX_Table[I] = int(0.5 + 256 * SinXDivX(I / 256.0f));            // 創建查找表，定點化

        int AddX = (SrcW << 16) / DstW, AddY = (SrcH << 16) / DstH; int ErrorX = -(1 << 15) + (AddX >> 1), ErrorY = -(1 << 15) + (AddY >> 1); int StartX = ((1 << 16) - ErrorX) / AddX + 1;            // 計算出須要特殊處理的邊界
        int StartY = ((1 << 16) - ErrorY) / AddY + 1;            // y0+y*yr>=1; y0=ErrorY => y>=(1-ErrorY)/yr
        int EndX = (((SrcW - 3) << 16) - ErrorX) / AddX + 1; int EndY = (((SrcH - 3) << 16) - ErrorY) / AddY + 1;    // y0+y*yr<=(height-3) => y<=(height-3-ErrorY)/yr
        if (StartY >= DstH)            StartY = DstH; if (StartX >= DstW)            StartX = DstW; if (EndX < StartX)            EndX = StartX; if (EndY < StartY)            EndY = StartY; int SrcY = ErrorY; for (int Y = 0; Y < StartY; Y++, SrcY += AddY)            // 前面的不是都有效的取樣部分數據
 { unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } for (int Y = StartY; Y < EndY; Y++, SrcY += AddY) { int SrcX = ErrorX; unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0; X < StartX; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } for (int X = StartX; X < EndX; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Center(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } for (int X = EndX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } for (int Y = EndY; Y < DstH; Y++, SrcY += AddY) { unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } free(SinXDivX_Table); } return IM_STATUS_OK; }

　　用於Bicubic_Border 和Bicubic_Center在函數中大量的被調用，函數的調用開銷也是不可忽略的，在VS中能夠用__forceinline來進行強制內聯，這個大約對本例大約有10%的提速效果。

　　本例的Bicubic_Border 和Bicubic_Center函數是爲了通用不一樣通道，用了一個for循環，實際操做時爲了效率應該要分通道展開的，展開後的效率約能提升30%。

　　以上純C代碼將32位的800*600的代碼放大到1024*768大約須要40ms(若是通道分開寫，大約須要30ms)。

　　爲了進一步提升速度，咱們來考慮這個算法的SSE優化，在HouSisong的專欄裏已經有了SSE優化的代碼，不過他時直接內嵌彙編寫的，比較難以看懂，而且如今的64位操做系統時沒法內嵌彙編的了，可是仍是可使用intrinsic，因此這裏我使用intrinsic語句來處理（其實我也沒看懂HouSisong的代碼）。

　　對於邊緣部分，計算量不大，直接使用C版本的Bicubic_Border函數，重點咱們看看Bicubic_Center函數。

　　Bicubic_Center函數前面部分的代碼主要時計算權重係數和取樣點的內存座標，先無論，咱們看看核心的計算部分代碼以下：

    for (int I = 0; I < Channel; I++) { int Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0;　　　　//　　行1 int Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1;　　　　//　　行2 int Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2;　　　　//　　行3 int Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel22[I] * U2 + Pixel33[I] * U3) * V3;　　　　//　　行4 Pixel[I] = IM_ClampToByte((Sum1 + Sum2 + Sum3 + Sum4) >> 16); }

　　先考慮Channel爲1的狀況，觀察這一句：Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3，注意此時Pixel00/Pixel01/Pixel02/Pixel03在內存中是連續的，並且取值範圍在[0,255]之間，U0/U1/U2/U3根據前面的查找表創建過程，也在[0,256]之間，他們都能用short類型來表達，而這個式子爲連乘而後累加，咱們考慮使用一個特殊的SSE指令_mm_madd_epi16，在MSDN中其功能解釋以下：

Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b.

  __m128i _mm_madd_epi16 (__m128i a, __m128i b);

Return Value

　 Adds the signed 32-bit integer results pairwise and packs the 4 signed 32-bit integer results.

 
     　　r0 := (a0 * b0) + (a1 * b1)
　　r1 := (a2 * b2) + (a3 * b3)
　　r2 := (a4 * b4) + (a5 * b5)
　　r3 := (a6 * b6) + (a7 * b7) 
    

　即a和b裏分別有8個有符號的16位數，而後對應的16位數據兩兩相乘，而後在兩兩相加，最後保存到4個32位有符號數中。

　考慮咱們的應用場景，行1到行4每行的代碼都只有4次乘法和3次加法，不能直接使用，可是咱們能夠考慮把兩行整合在一塊兒，一次性計算，這樣就須要調用2次

_mm_madd_epi16 ，而後2次的結果在調用_mm_hadd_epi32這個水平方向的累加函數就能獲得新的結果，感受真的有點奇妙，核心代碼以下所示：

　　if (Channel == 1) { __m128i P01 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel0)), _mm_cvtsi32_si128(*((int *)Pixel1))));            // P00 P01 P02 P03 P10 P11 P12 P13
        __m128i P23 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel2)), _mm_cvtsi32_si128(*((int *)Pixel3))));            // P20 P21 P22 P23 P30 P31 P32 P33
        __m128i Sum01 = _mm_madd_epi16(P01, PartX);                            // P00 * U0 + P01 * U1 P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 P12 * U2 + P13 * U3
        __m128i Sum23 = _mm_madd_epi16(P23, PartX);                            // P20 * U0 + P21 * U1 P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 P32 * U2 + P33 * U3
        __m128i Sum = _mm_hadd_epi32(Sum01, Sum23);                            // P00 * U0 + P01 * U1 + P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 + P12 * U2 + P13 * U3 P20 * U0 + P21 * U1 + P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 + P32 * U2 + P33 * U3
        LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(Sum, PartY)) >> 16); }

其中_mm_hsum_epi32爲自定義的一個函數。

// 4個有符號的32位的數據相加的和。
inline int _mm_hsum_epi32(__m128i V)                        // V3 V2 V1 V0
{ // 實測這個速度要快些，_mm_extract_epi32最慢。
    __m128i T = _mm_add_epi32(V, _mm_srli_si128(V, 8));        // V3+V1 V2+V0 V1 V0 
    T = _mm_add_epi32(T, _mm_srli_si128(T, 4));                // V3+V1+V2+V0 V2+V0+V1 V1+V0 V0 
    return _mm_cvtsi128_si32(T);                            // 提取低位 
}

　　我感受有的時候這些東西用語言是沒法能明確而有效的表達的，而直接用代碼卻能達到事半功倍的效果。

　　前面已經測試過用擬合曲線那個公式能知足累加和正好爲一，而不須要歸一化的，那麼理論上這個最後的移位操做後數據應該就在【0，255】範圍內，而不須要進行Clamp的，可是實際若是沒有這個Clamp，結果圖像會有部分像素溢出的，這是由於在咱們定點化的過程當中，這個和爲1的特性已經遭到了必定的破壞了。

　　注意在Bicubic_Center的循環計算中，V份量在計算每行時是固定的，每行開始時能夠直接一次使用_mm_setr_epi32來設置，，U份量計算每行時對於每一個像素都是變化的，咱們能夠對每一個像素用_mm_setr_epi32來設置，可是屢次使用這個intrinsic是個比較耗時的過程，所以咱們應該把每行的U保存到一個臨時內存中，而後每次使用時從不一樣的Load方可提升速度。

　　當不是單通道的圖像時，好比4通道，優化的思路是相同的，只不過咱們須要作更多的拆分和組合工做，把原始的數據組合成符合SIMD指令須要的格式，這就須要靈活的使用_mm_shuffle_epi八、_mm_unpacklo_epi3二、_mm_unpackhi_epi3二、_mm_unpacklo_epi八、_mm_unpacklo_epi8等語句的組合，這些語句都是很是快速和高效的，對於32位圖像，因爲一次性能夠處理4個字節，最後的IM_ClampToByte還能夠直接使用SIMD的抗飽和指令（_mm_packus_epi32）代替，效率能提升少量，並且還能夠調用不進行緩存的_mm_stream_si32指令直接寫內存，所以能極大的提升效率。具體的組合代碼請參考本文附件。

　　最後貼出基於SSE優化的代碼：

int IM_Resample_SSE(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD, int InterpolationMode) { int Channel = StrideS / SrcW; if ((Src == NULL) || (Dest == NULL))                                return IM_STATUS_NULLREFRENCE; if ((SrcW <= 0) || (SrcH <= 0) || (DstW <= 0) || (DstH <= 0))        return IM_STATUS_INVALIDPARAMETER; if ((Channel != 1) && (Channel != 3) && (Channel != 4))                return IM_STATUS_INVALIDPARAMETER; if ((SrcW == DstW) && (SrcH == DstH)) { memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); return IM_STATUS_OK; } // 已經論證這個沒有必要用SSE去作優化，速度不會有太大的變化， 2018.3.28
    if (InterpolationMode == 0)                            // 最近鄰插值
 { } else if (InterpolationMode == 1)                    // 雙線性插值方式
 { } else if (InterpolationMode == 2)            // 三次立方插值
 { short *SinXDivX_Table = (short *)malloc(513 * sizeof(short)); short *Table = (short *)malloc(DstW * 4 * sizeof(short)); if ((SinXDivX_Table == NULL) || (Table == NULL)) { if (SinXDivX_Table != NULL) free(SinXDivX_Table); if (Table != NULL) free(Table); return IM_STATUS_NULLREFRENCE; } for (int I = 0; I < 513; I++) SinXDivX_Table[I] = int(0.5 + 256 * SinXDivX(I / 256.0f));            // 創建查找表，定點化

        int AddX = (SrcW << 16) / DstW, AddY = (SrcH << 16) / DstH; int ErrorX = -(1 << 15) + (AddX >> 1), ErrorY = -(1 << 15) + (AddY >> 1); int StartX = ((1 << 16) - ErrorX) / AddX + 1;            // 計算出須要特殊處理的邊界
        int StartY = ((1 << 16) - ErrorY) / AddY + 1;            // y0+y*yr>=1; y0=ErrorY => y>=(1-ErrorY)/yr
        int EndX = (((SrcW - 3) << 16) - ErrorX) / AddX + 1; int EndY = (((SrcH - 3) << 16) - ErrorY) / AddY + 1;    // y0+y*yr<=(height-3) => y<=(height-3-ErrorY)/yr
        if (StartY >= DstH)            StartY = DstH; if (StartX >= DstW)            StartX = DstW; if (EndX < StartX)            EndX = StartX; if (EndY < StartY)            EndY = StartY; for (int X = StartX, SrcX = ErrorX + StartX * AddX; X < EndX; X++, SrcX += AddX) { int U = (unsigned char)(SrcX >> 8);                    // StartX以前和EndX以後的數據雖然沒用，可是爲了方便仍是分配了內存
            Table[X * 4 + 0] = SinXDivX_Table[256 + U];            // 前面創建這樣的一個表，方便後面用SSE進行讀取和優化
            Table[X * 4 + 1] = SinXDivX_Table[U]; Table[X * 4 + 2] = SinXDivX_Table[256 - U]; Table[X * 4 + 3] = SinXDivX_Table[512 - U]; } int SrcY = ErrorY; for (int Y = 0; Y < StartY; Y++, SrcY += AddY)            // 前面的不是都有效的取樣部分數據
 { unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } for (int Y = StartY; Y < EndY; Y++, SrcY += AddY) { int SrcX = ErrorX; unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0; X < StartX; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } int V = (unsigned char)(SrcY >> 8); unsigned char *LineY = Src + ((SrcY >> 16) - 1) * StrideS; __m128i PartY = _mm_setr_epi32(SinXDivX_Table[256 + V], SinXDivX_Table[V], SinXDivX_Table[256 - V], SinXDivX_Table[512 - V]); for (int X = StartX; X < EndX; X++, SrcX += AddX, LinePD += Channel) { __m128i PartX = _mm_loadl_epi64((__m128i *)(Table + X * 4)); PartX = _mm_unpacklo_epi64(PartX, PartX);                                // U0 U1 U2 U3 U0 U1 U2 U3
                unsigned char *Pixel0 = LineY + ((SrcX >> 16) - 1) * Channel; unsigned char *Pixel1 = Pixel0 + StrideS; unsigned char *Pixel2 = Pixel1 + StrideS; unsigned char *Pixel3 = Pixel2 + StrideS; if (Channel == 1) { __m128i P01 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel0)), _mm_cvtsi32_si128(*((int *)Pixel1))));            // P00 P01 P02 P03 P10 P11 P12 P13
                    __m128i P23 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel2)), _mm_cvtsi32_si128(*((int *)Pixel3))));            // P20 P21 P22 P23 P30 P31 P32 P33
                    __m128i Sum01 = _mm_madd_epi16(P01, PartX);                            // P00 * U0 + P01 * U1 P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 P12 * U2 + P13 * U3
                    __m128i Sum23 = _mm_madd_epi16(P23, PartX);                            // P20 * U0 + P21 * U1 P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 P32 * U2 + P33 * U3
                    __m128i Sum = _mm_hadd_epi32(Sum01, Sum23);                            // P00 * U0 + P01 * U1 + P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 + P12 * U2 + P13 * U3 P20 * U0 + P21 * U1 + P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 + P32 * U2 + P33 * U3
                    LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(Sum, PartY)) >> 16); } else if (Channel == 3) { } else if (Channel == 4) { __m128i P0 = _mm_loadu_si128((__m128i *)Pixel0), P1 = _mm_loadu_si128((__m128i *)Pixel1); __m128i P2 = _mm_loadu_si128((__m128i *)Pixel2), P3 = _mm_loadu_si128((__m128i *)Pixel3); // 如下組合方式比使用 _mm_shuffle_epi8 和 _mm_or_si128要少8條指令
                    P0 = _mm_shuffle_epi8(P0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B0 G0 R0 A0
                    P1 = _mm_shuffle_epi8(P1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B1 G1 R1 A1
                    P2 = _mm_shuffle_epi8(P2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B2 G2 R2 A2
                    P3 = _mm_shuffle_epi8(P3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B3 G3 R3 A3
 __m128i BG01 = _mm_unpacklo_epi32(P0, P1);        // B0 B1 G0 G1
                    __m128i RA01 = _mm_unpackhi_epi32(P0, P1);        // R0 R1 A0 A1
                    __m128i BG23 = _mm_unpacklo_epi32(P2, P3);        // B2 B3 G2 G3
                    __m128i RA23 = _mm_unpackhi_epi32(P2, P3);        // R2 R3 A2 A3
 __m128i B01 = _mm_unpacklo_epi8(BG01, _mm_setzero_si128()); __m128i B23 = _mm_unpacklo_epi8(BG23, _mm_setzero_si128()); __m128i SumB = _mm_hadd_epi32(_mm_madd_epi16(B01, PartX), _mm_madd_epi16(B23, PartX)); __m128i G01 = _mm_unpackhi_epi8(BG01, _mm_setzero_si128()); __m128i G23 = _mm_unpackhi_epi8(BG23, _mm_setzero_si128()); __m128i SumG = _mm_hadd_epi32(_mm_madd_epi16(G01, PartX), _mm_madd_epi16(G23, PartX)); __m128i R01 = _mm_unpacklo_epi8(RA01, _mm_setzero_si128()); __m128i R23 = _mm_unpacklo_epi8(RA23, _mm_setzero_si128()); __m128i SumR = _mm_hadd_epi32(_mm_madd_epi16(R01, PartX), _mm_madd_epi16(R23, PartX)); __m128i A01 = _mm_unpackhi_epi8(RA01, _mm_setzero_si128()); __m128i A23 = _mm_unpackhi_epi8(RA23, _mm_setzero_si128()); __m128i SumA = _mm_hadd_epi32(_mm_madd_epi16(A01, PartX), _mm_madd_epi16(A23, PartX)); // 這個竟然比註釋掉的還快點
                    __m128i Result = _mm_setr_epi32(_mm_hsum_epi32(_mm_mullo_epi32(SumB, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumG, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumR, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumA, PartY))); Result = _mm_srai_epi32(Result, 16); // *((int *)LinePD) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(Result, Result), Result));
                    _mm_stream_si32((int *)LinePD, _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(Result, Result), Result))); //LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumB, PartY)) >> 16); // 確實有部分存在超出unsigned char範圍的，由於定點化的緣故 //LinePD[1] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumG, PartY)) >> 16); //LinePD[2] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumR, PartY)) >> 16); //LinePD[3] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumA, PartY)) >> 16);
 } } for (int X = EndX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } for (int Y = EndY; Y < DstH; Y++, SrcY += AddY) { unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } free(Table); free(SinXDivX_Table); } return IM_STATUS_OK; }