



     如左圖所示,P00爲向下取整後的取樣點的座標,其鄰域16個像素的位置總體靠取樣點的右下側,各個位置的權重係數並非固定值,而是和取樣點的浮點座標的小數部分有關。其值由函數Sin(x * pi) / (x * pi)決定,該函數曲線如右圖藍色曲線所示,當小數部分假定爲U時,在水平或者垂直方向的4個權重份量對應的x值分別爲:1+U、U、1-U以及2-U。




/*
 * Piecewise-cubic approximation of sin(pi*x)/(pi*x) used as the bicubic
 * interpolation kernel.
 *
 * X       Distance from the sample position to a tap; the sign is ignored.
 * return  Kernel weight: a cubic in |X| on [0, 1], a second cubic on (1, 2],
 *         and 0 beyond 2.
 *
 * The constant a tunes sharpening versus blurring; the original author lists
 * -2, -1, -0.75 and -0.5 as usable values.
 */
float SinXDivX(float X)
{
    const float a = -1;

    /* Take the magnitude by hand: abs() is the integer function in C and
     * would truncate the fractional part of X. */
    if (X < 0)
        X = -X;

    float X2 = X * X, X3 = X2 * X;

    if (X <= 1)
        return (a + 2) * X3 - (a + 3) * X2 + 1;
    if (X <= 2)
        return a * X3 - (5 * a) * X2 + (8 * a) * X - (4 * a);
    return 0;
}


float SinXDivX_Standard(float X) { if (abs(X) < 0.000001f) return 1; else
        return sin(X * 3.1415926f) / (X * 3.1415926f); }


    SinXDivX_Standard(1 + X) + SinXDivX_Standard(X) + SinXDivX_Standard(1 - X) + SinXDivX_Standard(2 - X) = 0.8767

  可是若是下式:

            SinXDivX(1 + X) + SinXDivX(X) + SinXDivX(1 - X) + SinXDivX(2 - X) 則等於1。



/*
 * Reference (floating-point) bicubic sample of one destination pixel.
 *
 * Src     First byte of the source image.
 * Width   Source width in pixels.
 * Height  Source height in pixels.
 * Stride  Bytes per source row; the channel count is derived as Stride / Width.
 * Pixel   Output: receives one byte per channel.
 * X, Y    Sampling position in source pixel coordinates (may be fractional
 *         and may lie outside the image; taps are fetched through
 *         GetCheckedPixel).
 *
 * The 4x4 neighbourhood spans columns floor(X)-1 .. floor(X)+2 and rows
 * floor(Y)-1 .. floor(Y)+2. Horizontal weights are SinXDivX evaluated at
 * 1+u, u, 1-u, 2-u for the fractional part u of X; vertical weights likewise.
 */
void Bicubic_Original(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, float X, float Y)
{
    int Channel = Stride / Width;
    int PosX = floor(X), PosY = floor(Y);
    float PartXX = X - PosX, PartYY = Y - PosY;

    float WeightX[4] = { SinXDivX(1 + PartXX), SinXDivX(PartXX), SinXDivX(1 - PartXX), SinXDivX(2 - PartXX) };
    float WeightY[4] = { SinXDivX(1 + PartYY), SinXDivX(PartYY), SinXDivX(1 - PartYY), SinXDivX(2 - PartYY) };

    /* Tap[Row][Col] points at the pixel at (PosX + Col - 1, PosY + Row - 1).
     * Building the table in a loop removes the hand-copied pointer list that
     * let the last row read Pixel22 where Pixel32 was intended. */
    unsigned char *Tap[4][4];
    for (int Row = 0; Row < 4; Row++) {
        for (int Col = 0; Col < 4; Col++) {
            Tap[Row][Col] = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + Col - 1, PosY + Row - 1);
        }
    }

    for (int I = 0; I < Channel; I++) {
        float Total = 0;
        for (int Row = 0; Row < 4; Row++) {
            float RowSum = 0;
            for (int Col = 0; Col < 4; Col++) {
                RowSum += Tap[Row][Col][I] * WeightX[Col];
            }
            Total += RowSum * WeightY[Row];
        }
        /* +0.5f so the conversion to an integer rounds to nearest. */
        Pixel[I] = IM_ClampToByte(Total + 0.5f);
    }
}


/*
 * Address of the pixel at (PosX, PosY), with both coordinates clamped into
 * the image so out-of-range requests resolve to the nearest edge pixel.
 *
 * Src      First byte of the image.
 * Width    Image width in pixels.
 * Height   Image height in pixels.
 * Stride   Bytes per row.
 * Channel  Bytes per pixel.
 * return   Pointer to the first channel byte of the clamped pixel.
 */
inline unsigned char *GetCheckedPixel(unsigned char *Src, int Width, int Height, int Stride, int Channel, int PosX, int PosY)
{
    const int Row = IM_ClampI(PosY, 0, Height - 1);
    const int Col = IM_ClampI(PosX, 0, Width - 1);
    unsigned char *RowStart = Src + Row * Stride;
    return RowStart + Col * Channel;
}


/*
 * Reference resampler: resizes Src (SrcW x SrcH) into Dest (DstW x DstH).
 *
 * Src, Dest           Source and destination pixel buffers.
 * SrcW, SrcH          Source size in pixels.
 * StrideS             Bytes per source row; channel count is StrideS / SrcW.
 * DstW, DstH          Destination size in pixels.
 * StrideD             Bytes per destination row.
 * InterpolationMode   0 = nearest neighbour, 1 = bilinear, 2 = bicubic.
 *
 * return  IM_STATUS_NULLREFRENCE if a buffer is NULL,
 *         IM_STATUS_INVALIDPARAMETER for non-positive sizes or a channel
 *         count other than 1, 3 or 4, otherwise IM_STATUS_OK.
 *
 * NOTE(review): the bodies for modes 0 and 1 are empty in this listing, so
 * those modes (and any other mode value) return IM_STATUS_OK without writing
 * to Dest.
 */
int IM_Resample_Original(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD, int InterpolationMode)
{
    if ((Src == NULL) || (Dest == NULL))
        return IM_STATUS_NULLREFRENCE;
    if ((SrcW <= 0) || (SrcH <= 0) || (DstW <= 0) || (DstH <= 0))
        return IM_STATUS_INVALIDPARAMETER;

    /* Derived only after SrcW is known to be positive, so the division
     * cannot trap on SrcW == 0. */
    int Channel = StrideS / SrcW;
    if ((Channel != 1) && (Channel != 3) && (Channel != 4))
        return IM_STATUS_INVALIDPARAMETER;

    if ((SrcW == DstW) && (SrcH == DstH)) {
        /* Same size: plain copy, one row at a time so that row padding in
         * either buffer (stride larger than SrcW * Channel) is respected. */
        for (int Y = 0; Y < SrcH; Y++) {
            memcpy(Dest + Y * StrideD, Src + Y * StrideS, SrcW * Channel * sizeof(unsigned char));
        }
        return IM_STATUS_OK;
    }

    // Author's note (2018-03-28): optimising this with SSE was judged unnecessary; the speed barely changes.
    if (InterpolationMode == 0) {           // nearest neighbour
    } else if (InterpolationMode == 1) {    // bilinear
    } else if (InterpolationMode == 2) {    // bicubic
        for (int Y = 0; Y < DstH; Y++) {
            unsigned char *LinePD = Dest + Y * StrideD;
            /* Map the destination pixel centre to source coordinates. */
            float SrcY = (Y + 0.4999999f) * SrcH / DstH - 0.5f;
            for (int X = 0; X < DstW; X++) {
                float SrcX = (X + 0.4999999f) * SrcW / DstW - 0.5f;
                Bicubic_Original(Src, SrcW, SrcH, StrideS, LinePD, SrcX, SrcY);
                LinePD += Channel;
            }
        }
    }
    return IM_STATUS_OK;
}


  爲了提升速度,原文的做者對該算法進行了大量的優化,主要包括(1)使用定點數來優化縮放函數;(2)邊界和內部分開處理;(3)對SinXDivX作一個查找表; (4)對border_color作一個查找表,我按照我本身的思路進一步整理成了我比較熟悉的代碼格式,主要以下片斷所示:

// Bicubic interpolation for destination pixels near the image border.
//
// Src, Width, Height, Stride  Source image; channel count is Stride / Width.
// Pixel                       Output: receives one byte per channel.
// SinXDivX_Table              Kernel lookup table indexed by distance * 256; the
//                             resample functions in this file fill it with
//                             256 * SinXDivX(i / 256), i = 0..512.
// SrcX, SrcY                  Sampling position in 16.16 fixed point.
//
// Every tap is fetched through GetCheckedPixel, so taps that fall outside the
// image are clamped to the nearest edge pixel.
__forceinline void Bicubic_Border(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, short *SinXDivX_Table, int SrcX, int SrcY)
{
    int Channel = Stride / Width;

    // Bits 8..15 of the coordinate are the top 8 bits of the fraction.
    int U = (unsigned char)(SrcX >> 8), V = (unsigned char)(SrcY >> 8);

    // Kernel distances 1+u, u, 1-u, 2-u expressed as table indices.
    int WeightX[4] = { SinXDivX_Table[256 + U], SinXDivX_Table[U], SinXDivX_Table[256 - U], SinXDivX_Table[512 - U] };
    int WeightY[4] = { SinXDivX_Table[256 + V], SinXDivX_Table[V], SinXDivX_Table[256 - V], SinXDivX_Table[512 - V] };

    int PosX = SrcX >> 16, PosY = SrcY >> 16;

    // Tap[Row][Col] points at pixel (PosX + Col - 1, PosY + Row - 1). Filling the
    // table in a loop replaces the hand-copied pointer list in which the last
    // row read Pixel22 where Pixel32 was intended.
    unsigned char *Tap[4][4];
    for (int Row = 0; Row < 4; Row++) {
        for (int Col = 0; Col < 4; Col++) {
            Tap[Row][Col] = GetCheckedPixel(Src, Width, Height, Stride, Channel, PosX + Col - 1, PosY + Row - 1);
        }
    }

    for (int I = 0; I < Channel; I++) {
        int Total = 0;
        for (int Row = 0; Row < 4; Row++) {
            int RowSum = 0;
            for (int Col = 0; Col < 4; Col++) {
                RowSum += Tap[Row][Col][I] * WeightX[Col];
            }
            Total += RowSum * WeightY[Row];
        }
        // Both weight sets carry a factor of 256, hence the shift by 16.
        Pixel[I] = IM_ClampToByte(Total >> 16);
    }
}
// Author's remarks: forcing inlining with __forceinline still gains some speed because these
// kernels take many parameters; when the channel count is known, replacing Channel with a
// constant speeds things up considerably.
// Bicubic interpolation for interior destination pixels.
//
// Same contract as the border variant, but taps are addressed directly without
// clamping. The caller must guarantee that columns (SrcX >> 16) - 1 .. + 2 and
// rows (SrcY >> 16) - 1 .. + 2 all lie inside the image.
//
// Src, Width, Height, Stride  Source image; channel count is Stride / Width.
// Pixel                       Output: receives one byte per channel.
// SinXDivX_Table              Kernel lookup table indexed by distance * 256.
// SrcX, SrcY                  Sampling position in 16.16 fixed point.
__forceinline void Bicubic_Center(unsigned char *Src, int Width, int Height, int Stride, unsigned char *Pixel, short *SinXDivX_Table, int SrcX, int SrcY)
{
    int Channel = Stride / Width;

    // Bits 8..15 of the coordinate are the top 8 bits of the fraction.
    int U = (unsigned char)(SrcX >> 8), V = (unsigned char)(SrcY >> 8);

    // Kernel distances 1+u, u, 1-u, 2-u expressed as table indices.
    int WeightX[4] = { SinXDivX_Table[256 + U], SinXDivX_Table[U], SinXDivX_Table[256 - U], SinXDivX_Table[512 - U] };
    int WeightY[4] = { SinXDivX_Table[256 + V], SinXDivX_Table[V], SinXDivX_Table[256 - V], SinXDivX_Table[512 - V] };

    int PosX = SrcX >> 16, PosY = SrcY >> 16;

    // Top-left tap of the 4x4 window; the others are reached by stepping Channel
    // bytes per column and Stride bytes per row. Indexing the rows in a loop
    // replaces the hand-copied pointer list in which the last row read Pixel22
    // where Pixel32 was intended.
    unsigned char *Window = Src + (PosY - 1) * Stride + (PosX - 1) * Channel;

    for (int I = 0; I < Channel; I++) {
        int Total = 0;
        for (int Row = 0; Row < 4; Row++) {
            const unsigned char *RowPtr = Window + Row * Stride;
            int RowSum = 0;
            for (int Col = 0; Col < 4; Col++) {
                RowSum += RowPtr[Col * Channel + I] * WeightX[Col];
            }
            Total += RowSum * WeightY[Row];
        }
        // Both weight sets carry a factor of 256, hence the shift by 16.
        Pixel[I] = IM_ClampToByte(Total >> 16);
    }
}

// Fixed-point, plain-C resampler: resizes Src (SrcW x SrcH) into Dest (DstW x DstH).
//
// Src, Dest           Source and destination pixel buffers.
// SrcW, SrcH          Source size in pixels.
// StrideS             Bytes per source row; channel count is StrideS / SrcW.
// DstW, DstH          Destination size in pixels.
// StrideD             Bytes per destination row.
// InterpolationMode   0 = nearest neighbour, 1 = bilinear, 2 = bicubic.
//
// return  IM_STATUS_NULLREFRENCE if a buffer is NULL or the lookup table cannot be
//         allocated; IM_STATUS_INVALIDPARAMETER for non-positive sizes, a channel
//         count other than 1, 3 or 4, or (bicubic only) sizes that do not fit the
//         16.16 fixed-point arithmetic; otherwise IM_STATUS_OK.
//
// NOTE(review): the bodies for modes 0 and 1 are empty in this listing, so those
// modes (and any other mode value) return IM_STATUS_OK without writing to Dest.
int IM_Resample_PureC(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD, int InterpolationMode)
{
    if ((Src == NULL) || (Dest == NULL))
        return IM_STATUS_NULLREFRENCE;
    if ((SrcW <= 0) || (SrcH <= 0) || (DstW <= 0) || (DstH <= 0))
        return IM_STATUS_INVALIDPARAMETER;

    // Derived only after SrcW is known to be positive, so the division cannot
    // trap on SrcW == 0.
    int Channel = StrideS / SrcW;
    if ((Channel != 1) && (Channel != 3) && (Channel != 4))
        return IM_STATUS_INVALIDPARAMETER;

    if ((SrcW == DstW) && (SrcH == DstH)) {
        // Same size: plain copy, one row at a time so that row padding in either
        // buffer (stride larger than SrcW * Channel) is respected.
        for (int Y = 0; Y < SrcH; Y++) {
            memcpy(Dest + Y * StrideD, Src + Y * StrideS, SrcW * Channel * sizeof(unsigned char));
        }
        return IM_STATUS_OK;
    }

    // Author's note (2018-03-28): optimising this with SSE was judged unnecessary; the speed barely changes.
    if (InterpolationMode == 0) {           // nearest neighbour
    } else if (InterpolationMode == 1) {    // bilinear
    } else if (InterpolationMode == 2) {    // bicubic
        // Coordinates are 16.16 fixed point held in int, so SrcW << 16 must not
        // overflow.
        if ((SrcW >= 32768) || (SrcH >= 32768))
            return IM_STATUS_INVALIDPARAMETER;

        // Source step per destination pixel, 16.16 fixed point.
        int AddX = (SrcW << 16) / DstW, AddY = (SrcH << 16) / DstH;
        // A zero step (destination more than 65536 times larger than the source)
        // would divide by zero below.
        if ((AddX <= 0) || (AddY <= 0))
            return IM_STATUS_INVALIDPARAMETER;

        short *SinXDivX_Table = (short *)malloc(513 * sizeof(short));
        if (SinXDivX_Table == NULL)
            return IM_STATUS_NULLREFRENCE;

        // Lookup table: kernel value at distance I / 256, scaled by 256. Rounded to
        // nearest for both signs; truncating 0.5 + v toward zero would push the
        // negative lobe values up by one and make the four weights sum above 256.
        for (int I = 0; I < 513; I++) {
            float Weight = 256 * SinXDivX(I / 256.0f);
            SinXDivX_Table[I] = (short)((Weight >= 0) ? (int)(0.5 + Weight) : -(int)(0.5 - Weight));
        }

        // Source coordinate of destination pixel 0: (0 + 0.5) * step - 0.5.
        int ErrorX = -(1 << 15) + (AddX >> 1), ErrorY = -(1 << 15) + (AddY >> 1);

        // Destination pixels in [StartX, EndX) x [StartY, EndY) have an integer
        // source position in [1, Src - 3], so their whole 4x4 window is inside
        // the image and needs no clamping:
        //   ErrorY + y * AddY >= 1          =>  y >= (1 - ErrorY) / AddY
        //   ErrorY + y * AddY <= SrcH - 3   =>  y <= (SrcH - 3 - ErrorY) / AddY
        int StartX = ((1 << 16) - ErrorX) / AddX + 1;
        int StartY = ((1 << 16) - ErrorY) / AddY + 1;
        // Multiplication instead of a shift because SrcW - 3 may be negative.
        int EndX = ((SrcW - 3) * 65536 - ErrorX) / AddX + 1;
        int EndY = ((SrcH - 3) * 65536 - ErrorY) / AddY + 1;
        if (StartY >= DstH) StartY = DstH;
        if (StartX >= DstW) StartX = DstW;
        if (EndX > DstW) EndX = DstW;
        if (EndY > DstH) EndY = DstH;
        if (EndX < StartX) EndX = StartX;
        if (EndY < StartY) EndY = StartY;

        int SrcY = ErrorY;

        // Top rows: every pixel may touch the border.
        for (int Y = 0; Y < StartY; Y++, SrcY += AddY) {
            unsigned char *LinePD = Dest + Y * StrideD;
            for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) {
                Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY);
            }
        }

        // Middle rows: border handling on the left and right, fast path between.
        for (int Y = StartY; Y < EndY; Y++, SrcY += AddY) {
            int SrcX = ErrorX;
            unsigned char *LinePD = Dest + Y * StrideD;
            for (int X = 0; X < StartX; X++, SrcX += AddX, LinePD += Channel) {
                Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY);
            }
            for (int X = StartX; X < EndX; X++, SrcX += AddX, LinePD += Channel) {
                Bicubic_Center(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY);
            }
            for (int X = EndX; X < DstW; X++, SrcX += AddX, LinePD += Channel) {
                Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY);
            }
        }

        // Bottom rows: every pixel may touch the border.
        for (int Y = EndY; Y < DstH; Y++, SrcY += AddY) {
            unsigned char *LinePD = Dest + Y * StrideD;
            for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) {
                Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY);
            }
        }

        free(SinXDivX_Table);
    }
    return IM_STATUS_OK;
}

  由於Bicubic_Border 和Bicubic_Center在函數中大量的被調用,函數的調用開銷也是不可忽略的,在VS中能夠用__forceinline來進行強制內聯,這對本例大約有10%的提速效果。

  本例的Bicubic_Border 和Bicubic_Center函數是爲了通用不一樣通道,用了一個for循環,實際操做時爲了效率應該要分通道展開的,展開後的效率約能提升30%。





    // Per-channel weighted sum over the 4x4 window: each row is weighted
    // horizontally by U0..U3, then the row result vertically by V0..V3.
    for (int I = 0; I < Channel; I++) {
        int Sum1 = (Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3) * V0;    // row 1
        int Sum2 = (Pixel10[I] * U0 + Pixel11[I] * U1 + Pixel12[I] * U2 + Pixel13[I] * U3) * V1;    // row 2
        int Sum3 = (Pixel20[I] * U0 + Pixel21[I] * U1 + Pixel22[I] * U2 + Pixel23[I] * U3) * V2;    // row 3
        int Sum4 = (Pixel30[I] * U0 + Pixel31[I] * U1 + Pixel32[I] * U2 + Pixel33[I] * U3) * V3;    // row 4 (third tap is Pixel32, not Pixel22)
        // Both weight sets carry a factor of 256, hence the shift by 16.
        Pixel[I] = IM_ClampToByte((Sum1 + Sum2 + Sum3 + Sum4) >> 16);
    }

  先考慮Channel爲1的狀況,觀察這一句:Pixel00[I] * U0 + Pixel01[I] * U1 + Pixel02[I] * U2 + Pixel03[I] * U3, 注意此時Pixel00/Pixel01/Pixel02/Pixel03在內存中是連續的,並且取值範圍在[0,255]之間,U0/U1/U2/U3根據前面的查找表創建過程,大約在[-38,256]之間(a = -1時外側兩個權重爲負值),他們都能用有符號的short類型來表達, 而這個式子爲連乘而後累加,咱們考慮使用一個特殊的SSE指令_mm_madd_epi16,在MSDN中其功能解釋以下:

      Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit integers from b.

  __m128i _mm_madd_epi16 (__m128i a, __m128i b); 
  r0 := (a0 * b0) + (a1 * b1)
  r1 := (a2 * b2) + (a3 * b3)
  r2 := (a4 * b4) + (a5 * b5)
  r3 := (a6 * b6) + (a7 * b7)



_mm_madd_epi16 ,而後2次的結果再調用_mm_hadd_epi32這個水平方向的累加函數就能獲得新的結果,感受真的有點奇妙,核心代碼以下所示:
  // Single-channel fast path (excerpt from the SSE resampler's inner loop).
  // Pixel0..Pixel3 point at the first tap of each of the four window rows;
  // each 32-bit load below picks up that row's four consecutive 8-bit taps.
  // PartX is expected to hold the horizontal weights twice (U0 U1 U2 U3 U0 U1 U2 U3)
  // as 16-bit lanes and PartY the vertical weights V0..V3 as 32-bit lanes.
  if (Channel == 1) { __m128i P01 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel0)), _mm_cvtsi32_si128(*((int *)Pixel1))));            // rows 0 and 1 widened to 16 bit: P00 P01 P02 P03 P10 P11 P12 P13
        __m128i P23 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel2)), _mm_cvtsi32_si128(*((int *)Pixel3))));            // rows 2 and 3 widened to 16 bit: P20 P21 P22 P23 P30 P31 P32 P33
        __m128i Sum01 = _mm_madd_epi16(P01, PartX);                            // pairwise products summed: P00 * U0 + P01 * U1 | P02 * U2 + P03 * U3 | P10 * U0 + P11 * U1 | P12 * U2 + P13 * U3
        __m128i Sum23 = _mm_madd_epi16(P23, PartX);                            // same for rows 2 and 3: P20 * U0 + P21 * U1 | P22 * U2 + P23 * U3 | P30 * U0 + P31 * U1 | P32 * U2 + P33 * U3
        __m128i Sum = _mm_hadd_epi32(Sum01, Sum23);                            // adjacent lanes added, giving one horizontally weighted sum per row: row0 | row1 | row2 | row3
        // Weight each row sum by its vertical weight, add the four lanes, drop the 16 fractional bits and clamp to a byte.
        LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(Sum, PartY)) >> 16); }


// Horizontal sum: returns the sum of the four signed 32-bit lanes of V.
// Author's measurement: this shift-and-add form was faster than alternatives,
// with _mm_extract_epi32 being the slowest.
inline int _mm_hsum_epi32(__m128i V)                        // lanes, high to low: V3 V2 V1 V0
{
    // Shift right by 8 bytes so lanes 3 and 2 line up with lanes 1 and 0.
    __m128i T = _mm_add_epi32(V, _mm_srli_si128(V, 8));        // low lanes now: V3+V1 (lane 1), V2+V0 (lane 0)
    // Shift right by 4 bytes so lane 1 lines up with lane 0.
    T = _mm_add_epi32(T, _mm_srli_si128(T, 4));                // lane 0 now: V0+V1+V2+V3
    return _mm_cvtsi128_si32(T);                            // extract lane 0
}






int IM_Resample_SSE(unsigned char *Src, unsigned char *Dest, int SrcW, int SrcH, int StrideS, int DstW, int DstH, int StrideD, int InterpolationMode) { int Channel = StrideS / SrcW; if ((Src == NULL) || (Dest == NULL))                                return IM_STATUS_NULLREFRENCE; if ((SrcW <= 0) || (SrcH <= 0) || (DstW <= 0) || (DstH <= 0))        return IM_STATUS_INVALIDPARAMETER; if ((Channel != 1) && (Channel != 3) && (Channel != 4))                return IM_STATUS_INVALIDPARAMETER; if ((SrcW == DstW) && (SrcH == DstH)) { memcpy(Dest, Src, SrcW * SrcH * Channel * sizeof(unsigned char)); return IM_STATUS_OK; } // 已經論證這個沒有必要用SSE去作優化,速度不會有太大的變化, 2018.3.28
    if (InterpolationMode == 0)                            // 最近鄰插值
 { } else if (InterpolationMode == 1)                    // 雙線性插值方式
 { } else if (InterpolationMode == 2)            // 三次立方插值
 { short *SinXDivX_Table = (short *)malloc(513 * sizeof(short)); short *Table = (short *)malloc(DstW * 4 * sizeof(short)); if ((SinXDivX_Table == NULL) || (Table == NULL)) { if (SinXDivX_Table != NULL) free(SinXDivX_Table); if (Table != NULL) free(Table); return IM_STATUS_NULLREFRENCE; } for (int I = 0; I < 513; I++) SinXDivX_Table[I] = int(0.5 + 256 * SinXDivX(I / 256.0f));            // 創建查找表,定點化

        int AddX = (SrcW << 16) / DstW, AddY = (SrcH << 16) / DstH; int ErrorX = -(1 << 15) + (AddX >> 1), ErrorY = -(1 << 15) + (AddY >> 1); int StartX = ((1 << 16) - ErrorX) / AddX + 1;            // 計算出須要特殊處理的邊界
        int StartY = ((1 << 16) - ErrorY) / AddY + 1;            // y0+y*yr>=1; y0=ErrorY => y>=(1-ErrorY)/yr
        int EndX = (((SrcW - 3) << 16) - ErrorX) / AddX + 1; int EndY = (((SrcH - 3) << 16) - ErrorY) / AddY + 1;    // y0+y*yr<=(height-3) => y<=(height-3-ErrorY)/yr
        if (StartY >= DstH)            StartY = DstH; if (StartX >= DstW)            StartX = DstW; if (EndX < StartX)            EndX = StartX; if (EndY < StartY)            EndY = StartY; for (int X = StartX, SrcX = ErrorX + StartX * AddX; X < EndX; X++, SrcX += AddX) { int U = (unsigned char)(SrcX >> 8);                    // StartX以前和EndX以後的數據雖然沒用,可是爲了方便仍是分配了內存
            Table[X * 4 + 0] = SinXDivX_Table[256 + U];            // 前面創建這樣的一個表,方便後面用SSE進行讀取和優化
            Table[X * 4 + 1] = SinXDivX_Table[U]; Table[X * 4 + 2] = SinXDivX_Table[256 - U]; Table[X * 4 + 3] = SinXDivX_Table[512 - U]; } int SrcY = ErrorY; for (int Y = 0; Y < StartY; Y++, SrcY += AddY)            // 前面的不是都有效的取樣部分數據
 { unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } for (int Y = StartY; Y < EndY; Y++, SrcY += AddY) { int SrcX = ErrorX; unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0; X < StartX; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } int V = (unsigned char)(SrcY >> 8); unsigned char *LineY = Src + ((SrcY >> 16) - 1) * StrideS; __m128i PartY = _mm_setr_epi32(SinXDivX_Table[256 + V], SinXDivX_Table[V], SinXDivX_Table[256 - V], SinXDivX_Table[512 - V]); for (int X = StartX; X < EndX; X++, SrcX += AddX, LinePD += Channel) { __m128i PartX = _mm_loadl_epi64((__m128i *)(Table + X * 4)); PartX = _mm_unpacklo_epi64(PartX, PartX);                                // U0 U1 U2 U3 U0 U1 U2 U3
                unsigned char *Pixel0 = LineY + ((SrcX >> 16) - 1) * Channel; unsigned char *Pixel1 = Pixel0 + StrideS; unsigned char *Pixel2 = Pixel1 + StrideS; unsigned char *Pixel3 = Pixel2 + StrideS; if (Channel == 1) { __m128i P01 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel0)), _mm_cvtsi32_si128(*((int *)Pixel1))));            // P00 P01 P02 P03 P10 P11 P12 P13
                    __m128i P23 = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*((int *)Pixel2)), _mm_cvtsi32_si128(*((int *)Pixel3))));            // P20 P21 P22 P23 P30 P31 P32 P33
                    __m128i Sum01 = _mm_madd_epi16(P01, PartX);                            // P00 * U0 + P01 * U1 P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 P12 * U2 + P13 * U3
                    __m128i Sum23 = _mm_madd_epi16(P23, PartX);                            // P20 * U0 + P21 * U1 P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 P32 * U2 + P33 * U3
                    __m128i Sum = _mm_hadd_epi32(Sum01, Sum23);                            // P00 * U0 + P01 * U1 + P02 * U2 + P03 * U3 P10 * U0 + P11 * U1 + P12 * U2 + P13 * U3 P20 * U0 + P21 * U1 + P22 * U2 + P23 * U3 P30 * U0 + P31 * U1 + P32 * U2 + P33 * U3
                    LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(Sum, PartY)) >> 16); } else if (Channel == 3) { } else if (Channel == 4) { __m128i P0 = _mm_loadu_si128((__m128i *)Pixel0), P1 = _mm_loadu_si128((__m128i *)Pixel1); __m128i P2 = _mm_loadu_si128((__m128i *)Pixel2), P3 = _mm_loadu_si128((__m128i *)Pixel3); // 如下組合方式比使用 _mm_shuffle_epi8 和 _mm_or_si128要少8條指令
                    P0 = _mm_shuffle_epi8(P0, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B0 G0 R0 A0
                    P1 = _mm_shuffle_epi8(P1, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B1 G1 R1 A1
                    P2 = _mm_shuffle_epi8(P2, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B2 G2 R2 A2
                    P3 = _mm_shuffle_epi8(P3, _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15));        // B3 G3 R3 A3
 __m128i BG01 = _mm_unpacklo_epi32(P0, P1);        // B0 B1 G0 G1
                    __m128i RA01 = _mm_unpackhi_epi32(P0, P1);        // R0 R1 A0 A1
                    __m128i BG23 = _mm_unpacklo_epi32(P2, P3);        // B2 B3 G2 G3
                    __m128i RA23 = _mm_unpackhi_epi32(P2, P3);        // R2 R3 A2 A3
 __m128i B01 = _mm_unpacklo_epi8(BG01, _mm_setzero_si128()); __m128i B23 = _mm_unpacklo_epi8(BG23, _mm_setzero_si128()); __m128i SumB = _mm_hadd_epi32(_mm_madd_epi16(B01, PartX), _mm_madd_epi16(B23, PartX)); __m128i G01 = _mm_unpackhi_epi8(BG01, _mm_setzero_si128()); __m128i G23 = _mm_unpackhi_epi8(BG23, _mm_setzero_si128()); __m128i SumG = _mm_hadd_epi32(_mm_madd_epi16(G01, PartX), _mm_madd_epi16(G23, PartX)); __m128i R01 = _mm_unpacklo_epi8(RA01, _mm_setzero_si128()); __m128i R23 = _mm_unpacklo_epi8(RA23, _mm_setzero_si128()); __m128i SumR = _mm_hadd_epi32(_mm_madd_epi16(R01, PartX), _mm_madd_epi16(R23, PartX)); __m128i A01 = _mm_unpackhi_epi8(RA01, _mm_setzero_si128()); __m128i A23 = _mm_unpackhi_epi8(RA23, _mm_setzero_si128()); __m128i SumA = _mm_hadd_epi32(_mm_madd_epi16(A01, PartX), _mm_madd_epi16(A23, PartX)); // 這個竟然比註釋掉的還快點
                    __m128i Result = _mm_setr_epi32(_mm_hsum_epi32(_mm_mullo_epi32(SumB, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumG, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumR, PartY)), _mm_hsum_epi32(_mm_mullo_epi32(SumA, PartY))); Result = _mm_srai_epi32(Result, 16); // *((int *)LinePD) = _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(Result, Result), Result));
                    _mm_stream_si32((int *)LinePD, _mm_cvtsi128_si32(_mm_packus_epi16(_mm_packus_epi32(Result, Result), Result))); //LinePD[0] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumB, PartY)) >> 16); // 確實有部分存在超出unsigned char範圍的,由於定點化的緣故 //LinePD[1] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumG, PartY)) >> 16); //LinePD[2] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumR, PartY)) >> 16); //LinePD[3] = IM_ClampToByte(_mm_hsum_epi32(_mm_mullo_epi32(SumA, PartY)) >> 16);
 } } for (int X = EndX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } for (int Y = EndY; Y < DstH; Y++, SrcY += AddY) { unsigned char *LinePD = Dest + Y * StrideD; for (int X = 0, SrcX = ErrorX; X < DstW; X++, SrcX += AddX, LinePD += Channel) { Bicubic_Border(Src, SrcW, SrcH, StrideS, LinePD, SinXDivX_Table, SrcX, SrcY); } } free(Table); free(SinXDivX_Table); } return IM_STATUS_OK; }




