【Optimization/x86】x86 SSE Intrinsic: 點乘算法的Intrinsic實現

Date: 2018.6.12


一、參考:

https://blog.csdn.net/liujiayu2/article/details/39964813
https://blog.csdn.net/u010839382/article/details/52743664
http://www.javashuo.com/article/p-xlnniefl-r.html
https://blog.csdn.net/yutianzuijin/article/details/79944292

二、點乘算法的C實現和x86 SSE Intrinsic實現
#include <stdio.h>
#include <conio.h>
#include <time.h>
#include <windows.h>
#include <xmmintrin.h>

/*
 * Scalar reference implementation of the dot product.
 *
 * A, B : input vectors; each must hold at least n readable floats.
 *        They are only read, so they are const-qualified (same as dot_sse).
 * n    : number of elements to multiply and accumulate.
 *
 * Returns A[0]*B[0] + A[1]*B[1] + ... + A[n-1]*B[n-1], accumulated left to
 * right in a float. Returns 0 when n <= 0 (the loop body never runs).
 */
float dot(const float* A, const float* B, int n)
{
    int i;
    float sum = 0.0f;

    for (i = 0; i < n; i++)
    {
        sum += A[i] * B[i];
    }

    return sum;
}

/*
 * SSE implementation of the dot product.
 *
 * A, B : input vectors; each must hold at least n readable floats.
 *        No alignment is required (unaligned loads are used).
 * n    : number of elements to multiply and accumulate.
 *
 * Elements are processed four at a time with packed multiply/add; the
 * remaining n % 4 elements are handled by a scalar tail loop, so the
 * function never reads past element n-1. Returns 0 when n <= 0.
 *
 * Note: because four partial sums are kept in parallel lanes, the order of
 * floating-point additions differs from a plain left-to-right loop when
 * n > 4, so results may differ from the scalar version in the last bits.
 */
float dot_sse(const float* A, const float* B, int n)
{
    __m128 acc = _mm_setzero_ps();
    float lanes[4];
    float sum_s;
    /* Largest multiple of 4 that is <= n (0 for non-positive n). */
    int vec_end = (n > 0) ? (n - (n % 4)) : 0;
    int i;

    for (i = 0; i < vec_end; i += 4)
    {
        __m128 aa = _mm_loadu_ps(A + i);
        __m128 bb = _mm_loadu_ps(B + i);

        acc = _mm_add_ps(acc, _mm_mul_ps(aa, bb));
    }

    /*
     * Horizontal sum: spill the four lanes to memory instead of touching
     * compiler-specific members of __m128, so this builds on any compiler
     * that provides <xmmintrin.h>.
     */
    _mm_storeu_ps(lanes, acc);
    sum_s = lanes[0] + lanes[1] + lanes[2] + lanes[3];

    /* Scalar tail for the last n % 4 elements. */
    for (; i < n; i++)
    {
        sum_s += A[i] * B[i];
    }

    return sum_s;
}
/*
 * Benchmark driver.
 *
 * Calls dot() and dot_sse() 50000 times each on the same pair of
 * 4-element vectors, timing each batch with the Windows performance
 * counter, then prints both results, both elapsed times in seconds, and
 * the ratio time_c / time_sse. Waits for a key press before exiting.
 */
int main()
{
    float vec_a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    float vec_b[4] = { 6.0f, 7.0f, 8.0f, 9.0f };
    int iterations = 50000;
    int iter;
    float result_c = 0.0f;
    float result_sse = 0.0f;
    double seconds_c;
    double seconds_sse;
    LARGE_INTEGER ticks_per_second;
    LARGE_INTEGER ticks_begin;
    LARGE_INTEGER ticks_end;

    /* Counter frequency converts tick deltas into seconds. */
    QueryPerformanceFrequency(&ticks_per_second);

    /* --- scalar C version --- */
    QueryPerformanceCounter(&ticks_begin);
    iter = 0;
    while (iter < iterations)
    {
        result_c = dot(vec_a, vec_b, 4);
        iter++;
    }
    QueryPerformanceCounter(&ticks_end);
    seconds_c = (double)(ticks_end.QuadPart - ticks_begin.QuadPart);
    seconds_c = seconds_c / (double)(ticks_per_second.QuadPart);

    /* --- SSE intrinsic version --- */
    QueryPerformanceCounter(&ticks_begin);
    iter = 0;
    while (iter < iterations)
    {
        result_sse = dot_sse(vec_a, vec_b, 4);
        iter++;
    }
    QueryPerformanceCounter(&ticks_end);
    seconds_sse = (double)(ticks_end.QuadPart - ticks_begin.QuadPart);
    seconds_sse = seconds_sse / (double)(ticks_per_second.QuadPart);

    /* Report results, timings, and the C-to-SSE time ratio. */
    printf("sum_C: %f, sum_sse: %f\n", result_c, result_sse);
    printf("time_c: %f, time_sse: %f\n", seconds_c, seconds_sse);
    printf("ratio: %f", seconds_c / seconds_sse);

    /* Block until a key is pressed. */
    _getch();
    return 0;
}
相關文章
相關標籤/搜索