GPU編程(三): CPU與GPU的矩陣乘法對比

目錄

  • 前言
  • 代碼
  • 計時函數
  • 最後

前言

在上一篇的最後, 我提到了一個矩陣乘法, 此次與CPU進行對比, 從中能夠很明顯地看出GPU在並行計算上的優點.


計時函數

在貼出代碼以前, 來看下我經常使用的計時函數, 能夠精確到微秒級. 首先頭文件是#include<sys/time.h>. 結構體爲:

/* Time stamp split into a seconds field and a microseconds field. */
struct timeval{
    long tv_sec; /* seconds */
    long tv_usec; /* microseconds */
};
複製代碼

來看下使用的小栗子:

/* Usage example: measure how long a small summation loop takes. */
struct timeval start, end;
int sum = 0;

/* Take one stamp right before the work and one right after it. */
gettimeofday(&start, NULL);
for (int k = 0; k < 10000; k++) {
    sum += k;
}
gettimeofday(&end, NULL);

/* Elapsed time in seconds: the difference of the seconds fields plus the
 * difference of the microseconds fields scaled down by one million. */
double wholeSeconds = end.tv_sec - start.tv_sec;
double fraction = (end.tv_usec - start.tv_usec) / 1000000.0;
double timeuse = wholeSeconds + fraction;
printf("Use Time:%f\n", timeuse);
複製代碼

代碼

其實CPU部分的代碼就是for循環. 你可能會考慮到用多線程, 可是我實測效果不太好, 這篇有代碼, 能夠去看看. 因此用的是基礎for循環.

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h> 
#include <unistd.h>

#define w 1000

/* Matrix descriptor: two integer dimensions plus a pointer to float storage.
 * NOTE(review): matMul() and main() in this listing work on raw float
 * arrays and never reference this struct. */
struct Matrix
{
    int width;        /* presumably the number of columns - confirm */
    int height;       /* presumably the number of rows - confirm */
    float *elements;  /* pointer to the float storage */
};

/*
 * Multiply two square matrices on the CPU: P = M * N.
 *
 * M, N and P each hold width * width floats, addressed as
 * [row * width + col]. Every output entry is the sum over k of
 * M[row][k] * N[k][col], accumulated in a float in increasing k order.
 * A width of zero or less leaves P untouched.
 */
void matMul(float * M, float * N, float * P, int width){
	for (int row = 0; row < width; row++){
		/* Start of the current row in the left operand and in the output. */
		const float *lhsRow = M + row * width;
		float *outRow = P + row * width;

		for (int col = 0; col < width; col++){
			float acc = 0;
			for (int k = 0; k < width; k++){
				acc += lhsRow[k] * N[k * width + col];
			}
			outRow[col] = acc;
		}
	}
}

/*
 * CPU benchmark driver: fills two w x w matrices with constants, multiplies
 * them with matMul() and prints the wall-clock time of that call in seconds.
 *
 * Returns 0 on success, 1 if any of the three buffers cannot be allocated.
 */
int main(){
	int width = w;
	int height = w;

	/* Size arithmetic is done in size_t so the element count cannot wrap
	 * around in int when w is raised. */
	size_t count = (size_t)width * (size_t)height;
	size_t nBytes = count * sizeof (float);

	float * m = (float *)malloc (nBytes);
	float * n = (float *)malloc (nBytes);
	float * p = (float *)malloc (nBytes);
	if (m == NULL || n == NULL || p == NULL){
		fprintf(stderr, "malloc failed for %zu bytes\n", nBytes);
		/* free(NULL) is a no-op, so releasing all three is safe here. */
		free(m);
		free(n);
		free(p);
		return 1;
	}

	for (size_t i = 0; i < count; i++){
		m[i] = 1.0f;
		n[i] = 2.0f;
	}

	struct timeval t1,t2;
	double timeuse;

	/* Only the multiplication itself sits between the two time stamps. */
	gettimeofday(&t1,NULL);
	matMul(m, n, p, width);
	gettimeofday(&t2,NULL);

	timeuse = t2.tv_sec - t1.tv_sec + (t2.tv_usec - t1.tv_usec)/1000000.0;
	printf("Use Time:%f\n",timeuse);

	free(m);
	free(n);
	free(p);

	return 0;
}
複製代碼

cuda部分的代碼直接貼出來, 解析能夠看以前的文章.

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <unistd.h>

#define w 1000

// Matrix descriptor shared by the host code and the kernel.
// getElement()/setElement() address the storage as
// elements[row * width + col], i.e. one row after another.
struct Matrix
{
	int width;        // row stride used in the index computation
	int height;       // presumably the number of rows - confirm
	float *elements;  // float storage indexed as described above
};

// Device helper: reads the entry of A at (row, col), with storage addressed
// as row * width + col. No bounds checking is performed.
__device__ float getElement(Matrix *A, int row, int col)
{
        const int offset = row * A->width + col;
        return A->elements[offset];
}

// Device helper: stores value into the entry of A at (row, col), with storage
// addressed as row * width + col. No bounds checking is performed.
__device__ void setElement(Matrix *A, int row, int col, float value)
{
        float *slot = A->elements + (row * A->width + col);
        *slot = value;
}

// Matrix product C = A * B, one output entry per thread.
//
// Launch layout: 2D grid of 2D blocks. A thread's x index selects the column
// of C and its y index selects the row. The grid may be rounded up past the
// matrix edge; threads that fall outside C return without touching memory.
// Without that guard a 1000 x 1000 problem launched with 32 x 32 blocks would
// have rows/columns 1000..1023 reading and writing beyond the buffers.
//
// Preconditions: A is C->height x K and B is K x C->width, with K = A->width.
// The Matrix structs and their element buffers must be readable from device
// code. No shared memory is used.
__global__ void matMulKernel(Matrix *A, Matrix *B, Matrix *C)
{
        int row = threadIdx.y + blockIdx.y * blockDim.y;
        int col = threadIdx.x + blockIdx.x * blockDim.x;

        // Surplus threads created by the ceil-divided grid have no output entry.
        if (row >= C->height || col >= C->width)
        {
                return;
        }

        // Float literal keeps the accumulator arithmetic in single precision.
        float Cvalue = 0.0f;
        for (int i = 0; i < A->width; ++i)
        {
                Cvalue += getElement(A, row, i) * getElement(B, i, col);
        }
        setElement(C, row, col, Cvalue);
}

// Prints a readable message and exits when a CUDA runtime call has failed.
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess)
	{
		fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

// GPU benchmark driver: builds three w x w matrices in managed memory, fills
// A and B with constants, runs matMulKernel once and prints the wall-clock
// time of launch + completion in seconds.
//
// Every CUDA call is checked: an unchecked failure would otherwise still print
// a plausible-looking timing for a kernel that never ran.
int main()
{
	int width = w;
	int height = w;

	Matrix *A, *B, *C;

	checkCuda(cudaMallocManaged((void**)&A, sizeof(Matrix)), "allocating A");
	checkCuda(cudaMallocManaged((void**)&B, sizeof(Matrix)), "allocating B");
	checkCuda(cudaMallocManaged((void**)&C, sizeof(Matrix)), "allocating C");

	// Size arithmetic in size_t: the byte count must not be narrowed to int.
	size_t count = (size_t)width * (size_t)height;
	size_t nBytes = count * sizeof(float);

	checkCuda(cudaMallocManaged((void**)&A->elements, nBytes), "allocating A->elements");
	checkCuda(cudaMallocManaged((void**)&B->elements, nBytes), "allocating B->elements");
	checkCuda(cudaMallocManaged((void**)&C->elements, nBytes), "allocating C->elements");

	A->height = height;
	A->width = width;
	B->height = height;
	B->width = width;
	C->height = height;
	C->width = width;

	for (size_t i = 0; i < count; ++i)
	{
		A->elements[i] = 1.0f;
		B->elements[i] = 2.0f;
	}

	// Ceil-division so the grid covers the whole matrix even when the
	// dimensions are not multiples of the block size.
	dim3 blockSize(32, 32);
	dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
		(height + blockSize.y - 1) / blockSize.y);

	struct timeval t1,t2;
	double timeuse;

	gettimeofday(&t1,NULL);

	matMulKernel<<<gridSize, blockSize>>>(A, B, C);
	// Launch-configuration errors are only visible through cudaGetLastError();
	// faults during execution surface at the synchronizing call.
	cudaError_t launchStatus = cudaGetLastError();
	cudaError_t syncStatus = cudaDeviceSynchronize();

	gettimeofday(&t2,NULL);

	// Checked after the second time stamp so error reporting is not timed.
	checkCuda(launchStatus, "kernel launch");
	checkCuda(syncStatus, "kernel execution");

	timeuse = t2.tv_sec - t1.tv_sec + (t2.tv_usec - t1.tv_usec)/1000000.0;
	printf("Use Time:%f\n", timeuse);

	// Element buffers go first: their pointers live inside the structs.
	checkCuda(cudaFree(A->elements), "freeing A->elements");
	checkCuda(cudaFree(B->elements), "freeing B->elements");
	checkCuda(cudaFree(C->elements), "freeing C->elements");
	checkCuda(cudaFree(A), "freeing A");
	checkCuda(cudaFree(B), "freeing B");
	checkCuda(cudaFree(C), "freeing C");

	return 0;
}
複製代碼

來看下結果圖:

結果圖

GPU是GT 750M, CPU是i7-4700MQ. 其實CPU是比GPU好不少的, 可是在並行計算上GPU的優點依舊明顯.


最後

喜歡記得點贊哦, 有意見或者建議評論區見~

相關文章
相關標籤/搜索