1. 8位加:
*(__m128i*)(dest + i * 16) = _mm_add_epi8(*(__m128i*)(srcA + i * 16), *(__m128i*)(srcB + i * 16));
16位減法
__m128i _mm_sub_epi16 (__m128i a, __m128i b);
r0 := a0 - b0
r1 := a1 - b1
...
r7 := a7 - b7
2. 加載128位數據
__m128i Src1 = _mm_loadu_si128((__m128i *)(LinePS + 0));
3. 把16個8bit數據送給 dst
__m128i _mm_setr_epi8 (char e15, char e14, char e13, char e12, char e11, char e10, char e9, char e8, char e7, char e6, char e5, char e4, char e3, char e2, char e1, char e0)
4. Src1 的第 0/3/6/9/12/15 個字節給到Blue的前6個字節, 能夠用於BGR B份量提取
Blue = _mm_shuffle_epi8(Src1, _mm_setr_epi8(0, 3, 6, 9, 12, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1));
5. 或運算 把src2 的第 2 5 8 11 14 個字節和Blue進行或運算, 即填充Blue的中間5個字節
Blue = _mm_or_si128(Blue, _mm_shuffle_epi8(Src2, _mm_setr_epi8(-1, -1, -1, -1, -1, -1, 2, 5, 8, 11, 14, -1, -1, -1, -1, -1)));
6. Blue的低8個字節零擴展為16位後放在Blue16L中, 防止後面的乘積溢出
__m128i Blue16L = _mm_unpacklo_epi8(Blue, Zero);
7. 算術右移 (32位元素, 高位補符號位)
__m128i _mm_srai_epi32 (__m128i a, int count);
r0 := a0 >> count
r1 := a1 >> count
r2 := a2 >> count
r3 := a3 >> count
算術右移 (16位元素, 移位量取自 count 的低64位)
__m128i _mm_sra_epi16 (__m128i a, __m128i count);
r0 := a0 >> count
r1 := a1 >> count
...
r7 := a7 >> count
8. 把 a 和 b 中的16位有符號整數按無符號飽和壓縮成16個8位整數 (a 在低8字節, b 在高8字節)
__m128i _mm_packus_epi16 (__m128i a, __m128i b);
r0 := UnsignedSaturate(a0) r1 := UnsignedSaturate(a1) ... r7 := UnsignedSaturate(a7) r8 := UnsignedSaturate(b0) r9 := UnsignedSaturate(b1) ... r15 := UnsignedSaturate(b7)
9. 把 a 的128位數據存儲到 p 指向的內存 (不要求地址對齊)
void _mm_storeu_si128 (__m128i *p, __m128i a);
10 乘法 __m128i _mm_mullo_epi32( __m128i a, __m128i b );
r0 := a0 * b0
r1 := a1 * b1
r2 := a2 * b2
r3 := a3 * b3
11 比較 a 和 b 的低32位 (0-31bit) 單精度浮點數是否相等 若相等,返回1 否則返回0
int _mm_comieq_ss (__m128 a, __m128 b)