I spent three hours fiddling today, stuck, of all things, on one maddening little CUDA routine. I had set out to imitate an example from Rachal zhang's CUDA study notes, but ended up tying myself in knots; in the end I did manage to figure a few things out.
RZ's notes explain CUDA kernels and memory quite clearly, and after reading them I decided to write a two-dimensional array addition. In plain C++ the addition is trivial: C[i][j] = A[i][j] + B[i][j] does it.
#include <iostream>

static const int M = 4;
static const int N = 3;

void CppMatAdd(int A[M][N], int B[M][N], int C[M][N])
{
    for (int i = 0; i < M; i++)
        for (int j = 0; j < N; j++)
            C[i][j] = A[i][j] + B[i][j];
}

int main()
{
    int a[M][N] = {1,2,3,4,5,6,7,8,9,10,11,12};
    int b[M][N] = {1,2,3,4,5,6,7,8,9,10,11,12};
    int c[M][N];
    CppMatAdd(a, b, c);
    std::cout << c[0][0];
}
Running the code above performs the addition of two two-dimensional matrices (that is, arrays).
CUDA computation, however, runs on the GPU, and a dedicated region of memory has to be set aside for the GPU to compute in. The upshot is that we must keep two kinds of memory, host memory and device memory, accessed by the CPU and the GPU respectively.
For the one-dimensional case it is enough to set up a host variable and a matching device variable; RZ's blog covers the specifics.
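For concreteness, here is a minimal sketch of that one-dimensional workflow. The kernel name addVec and the size n = 12 are my own choices for illustration, not taken from RZ's post:

#include "cuda_runtime.h"
#include <stdio.h>

// Each thread adds one element of the two input vectors
__global__ void addVec(const int *A, const int *B, int *C, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        C[i] = A[i] + B[i];
}

int main()
{
    const int n = 12;                          // hypothetical size for this sketch
    int a[n], b[n], c[n];                      // host memory
    for (int i = 0; i < n; i++) { a[i] = i; b[i] = i + 1; }

    int *dev_a, *dev_b, *dev_c;                // device memory
    cudaMalloc((void**)&dev_a, n * sizeof(int));
    cudaMalloc((void**)&dev_b, n * sizeof(int));
    cudaMalloc((void**)&dev_c, n * sizeof(int));

    // Host-to-device copies, then the kernel, then the result back out
    cudaMemcpy(dev_a, a, n * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, n * sizeof(int), cudaMemcpyHostToDevice);

    addVec<<<(n + 255) / 256, 256>>>(dev_a, dev_b, dev_c, n);

    cudaMemcpy(c, dev_c, n * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++) printf("%d ", c[i]);

    cudaFree(dev_a); cudaFree(dev_b); cudaFree(dev_c);
    return 0;
}

Because everything lives in one contiguous block on each side, a single cudaMemcpy per array is all the bookkeeping required.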
The trouble starts in two dimensions. At first I did the analogous thing: declare host variables and device variables, allocate memory for each in one-to-one correspondence, copy the data over, run the kernel on the GPU, and finally copy the result back out. But no matter how I debugged it, the result was wrong. The main reason is that the two-dimensional array the kernel receives here is really an array of pointers to one-dimensional rows, so the data cannot be copied following the one-dimensional pattern: the row pointers handed to the device must themselves be device addresses, built and copied separately from the row data. The corresponding code is accordingly a lot messier, and at bottom it still reduces the addition to one-dimensional arrays. The code is below; I won't belabor the details. My coding skills are limited, so I'm taking it slowly, but today at least left me with a much clearer picture of pointers.
/*--------------------------------------------
 * Date: 2015-3-18
 * Author: 李根
 * FileName: .cpp
 * Description: CUDA 2D array addition
 ------------------------------------------------*/
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>

static const int M = 4;
static const int N = 3;

// Matrix-addition kernel: x covers columns, y covers rows,
// matching the grid dimensions computed in main
__global__ void addMat(int **A, int **B, int **C)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < M && col < N)
        C[row][col] = A[row][col] + B[row][col];
}

int main()
{
    // Host-side tables of row pointers (they will hold *device* addresses)
    int **A = (int **)malloc(M * sizeof(int *));
    int **B = (int **)malloc(M * sizeof(int *));
    int **C = (int **)malloc(M * sizeof(int *));
    // Host-side contiguous data blocks
    int *dataA = (int *)malloc(M * N * sizeof(int));
    int *dataB = (int *)malloc(M * N * sizeof(int));
    int *dataC = (int *)malloc(M * N * sizeof(int));

    int **dev_A;     // device table of row pointers
    int **dev_B;
    int **dev_C;
    int *dev_dataA;  // device contiguous data blocks
    int *dev_dataB;
    int *dev_dataC;
    cudaMalloc((void**)&dev_A, M * sizeof(int*));
    cudaMalloc((void**)&dev_dataA, M * N * sizeof(int));
    cudaMalloc((void**)&dev_B, M * sizeof(int*));
    cudaMalloc((void**)&dev_dataB, M * N * sizeof(int));
    cudaMalloc((void**)&dev_C, M * sizeof(int*));
    cudaMalloc((void**)&dev_dataC, M * N * sizeof(int));

    for (int i = 0; i < M * N; i++) {
        dataA[i] = i;
        dataB[i] = i + 1;
        dataC[i] = 0;
    }
    // Copy the raw data blocks: M*N ints, so sizeof(int), not sizeof(int*)
    cudaMemcpy(dev_dataA, dataA, M * N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_dataB, dataB, M * N * sizeof(int), cudaMemcpyHostToDevice);

    // Build the row-pointer tables on the host, pointing into the *device* data
    for (int i = 0; i < M; i++) {
        A[i] = dev_dataA + N * i;
        B[i] = dev_dataB + N * i;
        C[i] = dev_dataC + N * i;
    }
    // Then copy the pointer tables themselves to the device
    cudaMemcpy(dev_A, A, M * sizeof(int*), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_B, B, M * sizeof(int*), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_C, C, M * sizeof(int*), cudaMemcpyHostToDevice);

    dim3 threadPerBlock(16, 16);
    dim3 numBlocks((N + threadPerBlock.x - 1) / threadPerBlock.x,
                   (M + threadPerBlock.y - 1) / threadPerBlock.y);
    addMat<<<numBlocks, threadPerBlock>>>(dev_A, dev_B, dev_C);

    cudaMemcpy(dataC, dev_dataC, M * N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < M * N; i++)
        std::cout << dataC[i] << " ";

    cudaFree(dev_dataC); cudaFree(dev_C); free(C); free(dataC);
    cudaFree(dev_dataB); cudaFree(dev_B); free(B); free(dataB);
    cudaFree(dev_dataA); cudaFree(dev_A); free(A); free(dataA);
    getchar();
}
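Looking back, the whole row-pointer table can be sidestepped by leaving each matrix flattened on the device and computing the offset row * N + col inside the kernel, which is exactly the "reduce it to one dimension" idea above. A minimal sketch under that assumption, reusing M and N from the file above (the kernel name addMatFlat is my own, not from RZ's notes):

// Each thread adds one element of the flattened M x N matrices
__global__ void addMatFlat(const int *A, const int *B, int *C)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < M && col < N)
        C[row * N + col] = A[row * N + col] + B[row * N + col];
}

// Launched with the same grid as above, but on the flat buffers directly:
// addMatFlat<<<numBlocks, threadPerBlock>>>(dev_dataA, dev_dataB, dev_dataC);
// so only the three dev_data* allocations and copies are needed,
// and dev_A/dev_B/dev_C disappear entirely.

Either way, with dataA[i] = i and dataB[i] = i + 1, the program should print the odd numbers 1 3 5 ... 23.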
The blog is back to regular updates; I'll keep accumulating bit by bit.