右鍵解決方案屬性 → 配置屬性 → CUDA C/C++ → Device → Code Generation,加入 compute_20,sm_20,並且把下面的「從父級或項目屬性默認設置繼承」的勾選去掉。
// Timing templates: paste the relevant half around the code being measured.

// ---- CPU side: wall-clock ticks from clock() ----
#include <time.h>

clock_t start, end;
start = clock();
// cpu codes
end = clock();
// clock() returns ticks whose length is platform-dependent; divide by
// CLOCKS_PER_SEC and scale to milliseconds so the number is comparable with
// the GPU figure below.
printf("CPU Time: %.5f\n", 1000.0f * (float)(end - start) / CLOCKS_PER_SEC);

// ---- GPU side: CUDA events recorded on the default stream ----
cudaEvent_t st, ed;
cudaEventCreate(&st);
cudaEventCreate(&ed);
cudaEventRecord(st, 0);
// gpu codes
cudaEventRecord(ed, 0);
cudaEventSynchronize(ed);  // wait until the stop event has actually been reached
float gpu_time;
cudaEventElapsedTime(&gpu_time, st, ed);  // elapsed time in milliseconds
printf("GPU Time: %.5f\n", gpu_time);
cudaEventDestroy(st);
cudaEventDestroy(ed);
#include <stdlib.h>
#include <stdio.h>
#include <cuda.h>
#include <device_launch_parameters.h>
#include <cuda_runtime.h>
#include <book.h>

const int Size = 256;   // total number of input elements (== block * thread)
const int block = 8;    // number of blocks in the grid
const int thread = 32;  // threads per block

/*
 * calc: per-block sum using interleaved addressing with a modulo test.
 *
 * Each block sums the blockDim.x consecutive floats starting at
 * in[blockIdx.x * blockDim.x] and writes the total to out[blockIdx.x].
 * The reduction is done in place, so `in` is overwritten.
 * Expects a 1-D launch with blockDim.x <= thread.
 */
__global__ void calc(float *in, float *out)
{
    unsigned int tid = threadIdx.x;
    // Slice of the input owned by this block.
    float *target = in + blockIdx.x * blockDim.x;

    // Block-uniform guard: either every thread leaves or none does, so no
    // thread is left waiting at a __syncthreads() below.
    if (blockDim.x > (unsigned int)thread) return;

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        // The second test keeps the read inside this block's slice when
        // blockDim.x is not a power of two.
        if (tid % (stride * 2) == 0 && tid + stride < blockDim.x) {
            target[tid] += target[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        out[blockIdx.x] = target[0];
    }
}

/*
 * calc2: per-block sum using interleaved addressing with a computed index
 * (index = 2 * stride * tid) instead of a modulo test.
 *
 * Same contract as calc: in-place reduction of this block's slice of `in`,
 * result written to out[blockIdx.x]; 1-D launch with blockDim.x <= thread.
 */
__global__ void calc2(float *in, float *out)
{
    unsigned int tid = threadIdx.x;
    float *target = in + blockIdx.x * blockDim.x;

    // Block-uniform guard (see calc).
    if (blockDim.x > (unsigned int)thread) return;

    // stride = 1, 2, 4, 8, ...
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        unsigned int index = 2 * stride * tid;
        if (index + stride < blockDim.x) {
            target[index] += target[index + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        out[blockIdx.x] = target[0];
    }
}

/*
 * calc3: per-block sum using sequential addressing (the stride halves each
 * step and thread i adds element i + stride).
 *
 * Same contract as calc. blockDim.x must be a power of two, otherwise some
 * elements are never folded into the result.
 */
__global__ void calc3(float *in, float *out)
{
    unsigned int tid = threadIdx.x;
    float *target = in + blockIdx.x * blockDim.x;

    // Block-uniform guard (see calc).
    if (blockDim.x > (unsigned int)thread) return;

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            target[tid] += target[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        out[blockIdx.x] = target[0];
    }
}

/*
 * calc4: same sequential-addressing reduction as calc3, but the block first
 * copies its slice into shared memory, so `in` is left untouched.
 *
 * Uses `thread` floats of static shared memory; requires blockDim.x <= thread
 * and blockDim.x a power of two. Writes the block total to out[blockIdx.x].
 */
__global__ void calc4(float *in, float *out)
{
    __shared__ float share_in[thread];

    int tid = threadIdx.x;
    int bid = blockIdx.x;
    float *target = in + bid * blockDim.x;

    // Block-uniform guard: the shared buffer only holds `thread` elements.
    // (The original test `tid < thread` was inverted and made every thread
    // return before doing any work.)
    if (blockDim.x > (unsigned int)thread) return;

    share_in[tid] = target[tid];
    __syncthreads();  // all loads must land before any thread reads a neighbour

    for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
        if (tid < stride) {
            share_in[tid] += share_in[tid + stride];
        }
        __syncthreads();
    }

    if (tid == 0) {
        out[bid] = share_in[0];
    }
}

/*
 * Fills an array with 0..Size-1, reduces it per block on the GPU with calc4,
 * adds the per-block partial sums on the host and prints the elapsed GPU time
 * (milliseconds, including both copies) together with the total.
 */
int main()
{
    // host buffers
    float *indata = (float *)malloc(sizeof(float) * Size);    // Size inputs
    float *outdata = (float *)malloc(sizeof(float) * block);  // one partial sum per block
    float *ans = (float *)malloc(sizeof(float));              // final total
    if (indata == NULL || outdata == NULL || ans == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device buffers
    float *dev_indata = NULL;   // Size inputs
    float *dev_outdata = NULL;  // one partial sum per block
    HANDLE_ERROR(cudaMalloc((void **)&dev_indata, sizeof(float) * Size));
    HANDLE_ERROR(cudaMalloc((void **)&dev_outdata, sizeof(float) * block));

    // init & generate data
    for (int i = 0; i < Size; i++) {
        indata[i] = i;
    }
    *ans = 0;

    // time start
    cudaEvent_t st, ed;
    HANDLE_ERROR(cudaEventCreate(&st));
    HANDLE_ERROR(cudaEventCreate(&ed));
    HANDLE_ERROR(cudaEventRecord(st, 0));

    // memcpy to device
    HANDLE_ERROR(cudaMemcpy(dev_indata, indata, sizeof(float) * Size, cudaMemcpyHostToDevice));

    // kernel launch
    calc4<<<block, thread>>>(dev_indata, dev_outdata);
    HANDLE_ERROR(cudaGetLastError());       // invalid launch configuration
    HANDLE_ERROR(cudaDeviceSynchronize());  // faults raised while the kernel ran

    // memcpy to host
    HANDLE_ERROR(cudaMemcpy(outdata, dev_outdata, sizeof(float) * block, cudaMemcpyDeviceToHost));

    // time end
    HANDLE_ERROR(cudaEventRecord(ed, 0));
    HANDLE_ERROR(cudaEventSynchronize(ed));
    float gpu_time;
    HANDLE_ERROR(cudaEventElapsedTime(&gpu_time, st, ed));

    // combine the per-block partial sums on the host
    for (int i = 0; i < block; i++) {
        //printf("%.3f\n",outdata[i]);
        *ans += outdata[i];
    }
    printf("GPU Time: %.5f\nAns: %.5f\n", gpu_time, *ans);

    // release timing events
    HANDLE_ERROR(cudaEventDestroy(st));
    HANDLE_ERROR(cudaEventDestroy(ed));

    // Device buffers go back through cudaFree, host buffers through free.
    // (The original passed the host pointers to cudaFree and leaked the
    // device allocations.)
    HANDLE_ERROR(cudaFree(dev_indata));
    HANDLE_ERROR(cudaFree(dev_outdata));
    free(indata);
    free(outdata);
    free(ans);

    getchar();
    return 0;
}
#include <stdlib.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <cuda.h>
#include <device_launch_parameters.h>

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

const int N = 20;  // matrices are N x N, stored row-major

/*
 * mul: out = a * b for N x N row-major int matrices.
 *
 * Thread (threadIdx.x, threadIdx.y) computes out[threadIdx.x][threadIdx.y].
 * blockIdx is not used, so the kernel is meant to be launched with a single
 * block of at least N x N threads; surplus threads exit immediately.
 */
__global__ void mul(int *a, int *b, int *out)
{
    unsigned int tidx = threadIdx.x;  // output row
    unsigned int tidy = threadIdx.y;  // output column

    // Bound each coordinate on its own: a test on the flattened offset would
    // let an out-of-range column alias into the next row.
    if (tidx >= N || tidy >= N) return;

    int t = 0;
    for (int i = 0; i < N; i++) {
        t += a[tidx * N + i] * b[i * N + tidy];
    }
    out[tidx * N + tidy] = t;
}

/*
 * Multiplies two N x N matrices on the CPU (printed as a reference) and on
 * the GPU, printing the GPU result and the elapsed GPU time in milliseconds
 * (both copies plus the kernel).
 */
int main()
{
    const size_t bytes = sizeof(int) * N * N;

    // host malloc
    int *matrix1 = (int *)malloc(bytes);
    int *matrix2 = (int *)malloc(bytes);
    int *output = (int *)malloc(bytes);
    if (matrix1 == NULL || matrix2 == NULL || output == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device malloc
    int *dev_matrix1 = NULL;
    int *dev_matrix2 = NULL;
    int *dev_output = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_matrix1, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_matrix2, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_output, bytes));

    // init / generate data
    for (int i = 0; i < N * N; i++) {
        matrix1[i] = i + 1;
        matrix2[i] = i + 1;
        output[i] = 0;
    }

    // CPU reference
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            int tp = 0;
            for (int k = 0; k < N; k++) {
                tp += matrix1[i * N + k] * matrix2[k * N + j];
            }
            printf("%d ", tp);
        }
    }
    printf("\n----------\n");

    // time start
    cudaEvent_t st, ed;
    CUDA_CHECK(cudaEventCreate(&st));
    CUDA_CHECK(cudaEventCreate(&ed));
    CUDA_CHECK(cudaEventRecord(st, 0));

    // memcpy to device
    CUDA_CHECK(cudaMemcpy(dev_matrix1, matrix1, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_matrix2, matrix2, bytes, cudaMemcpyHostToDevice));

    // One block is enough: the kernel indexes by threadIdx only, so any extra
    // block would just recompute and rewrite the same outputs.
    mul<<<1, dim3(N, N)>>>(dev_matrix1, dev_matrix2, dev_output);
    CUDA_CHECK(cudaGetLastError());

    // memcpy to host (blocking, so it also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(output, dev_output, bytes, cudaMemcpyDeviceToHost));

    // time end: stop the clock before printing so console I/O is not measured
    CUDA_CHECK(cudaEventRecord(ed, 0));
    CUDA_CHECK(cudaEventSynchronize(ed));
    float gpu_time;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, st, ed));

    // output
    for (int i = 0; i < N * N; i++) {
        printf("%d ", output[i]);
    }
    printf("\n");
    printf("gpu time: %.5f\n", gpu_time);

    // release timing events
    CUDA_CHECK(cudaEventDestroy(st));
    CUDA_CHECK(cudaEventDestroy(ed));

    // release device and host memory
    CUDA_CHECK(cudaFree(dev_matrix1));
    CUDA_CHECK(cudaFree(dev_matrix2));
    CUDA_CHECK(cudaFree(dev_output));
    free(matrix1);
    free(matrix2);
    free(output);
    return 0;
}
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

const int N = 5;  // matrices are N x N, stored row-major

/* Prints an N x N row-major matrix, one row per line, tab-separated. */
void output(int *arr)
{
    for (int i = 0; i < N * N; i++) {
        printf("%d\t", arr[i]);
        if ((i + 1) % N == 0) printf("\n");
    }
    printf("\n");
}

/*
 * trans: straightforward transpose, out[x][y] = in[y][x].
 *
 * Each thread moves one element straight from global to global memory.
 * Works for any 2-D launch that covers N x N threads; threads outside the
 * matrix do nothing.
 */
__global__ void trans(int *in, int *out)
{
    unsigned int xIndex = threadIdx.x + blockDim.x * blockIdx.x;
    unsigned int yIndex = threadIdx.y + blockDim.y * blockIdx.y;

    if (xIndex < N && yIndex < N) {
        unsigned int index_in = xIndex + N * yIndex;
        unsigned int index_out = yIndex + N * xIndex;
        out[index_out] = in[index_in];
    }
}

/*
 * trans2: transpose staged through a shared-memory tile.
 *
 * The tile is N x N, the same size as the matrix, so only the elements with
 * global index < N are touched. Expects blockDim = (N, N).
 * Static shared memory: N * (N + 1) ints.
 */
__global__ void trans2(int *in, int *out)
{
    // int tile to match the data type (the original stored ints in a float
    // array); the extra column is padding against shared-memory bank conflicts.
    __shared__ int tile[N][N + 1];

    unsigned int xIndex = blockIdx.x * N + threadIdx.x;
    unsigned int yIndex = blockIdx.y * N + threadIdx.y;
    if ((xIndex < N) && (yIndex < N)) {
        unsigned int index_in = yIndex * N + xIndex;
        // Stored as [row][column]; the read below swaps the two indices,
        // which is what performs the transpose. (The original used the same
        // index order for the store and the load and therefore only copied.)
        tile[threadIdx.y][threadIdx.x] = in[index_in];
    }

    __syncthreads();  // the whole tile must be written before it is read back

    // Block coordinates are swapped for the output position.
    xIndex = blockIdx.y * N + threadIdx.x;
    yIndex = blockIdx.x * N + threadIdx.y;
    if ((xIndex < N) && (yIndex < N)) {
        unsigned int index_out = yIndex * N + xIndex;
        out[index_out] = tile[threadIdx.x][threadIdx.y];
    }
}

/*
 * Builds the matrix 1..N*N, transposes it on the GPU with trans and prints
 * the input followed by the transposed result.
 */
int main()
{
    const size_t bytes = sizeof(int) * N * N;

    // host malloc
    int *in = (int *)malloc(bytes);
    int *out = (int *)malloc(bytes);
    if (in == NULL || out == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device malloc
    int *dev_in = NULL;
    int *dev_out = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_in, bytes));
    CUDA_CHECK(cudaMalloc((void **)&dev_out, bytes));

    // init
    for (int i = 0; i < N * N; i++) {
        in[i] = i + 1;
    }

    // memcpy to device
    CUDA_CHECK(cudaMemcpy(dev_in, in, bytes, cudaMemcpyHostToDevice));

    // kernel launch: a single N x N block covers the whole matrix
    trans<<<1, dim3(N, N)>>>(dev_in, dev_out);
    CUDA_CHECK(cudaGetLastError());

    // memcpy back (blocking, so it also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(out, dev_out, bytes, cudaMemcpyDeviceToHost));

    // print input and result
    output(in);
    printf("\n--------\n");
    output(out);

    // release device and host memory
    CUDA_CHECK(cudaFree(dev_in));
    CUDA_CHECK(cudaFree(dev_out));
    free(in);
    free(out);
    return 0;
}
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

const int N = 26;      // number of histogram bins (symbol values 0..N-1)
const int L = 128;
const int S = L * 4;   // number of symbols in the input
const int block = 4;   // blocks in the grid
const int thread = 32; // threads per block

/*
 * rec: histogram of the S symbols in `book` into the N counters of `record`.
 *
 * Each block accumulates a private histogram in shared memory with atomics,
 * then adds it to `record` with one global atomic per bin. `record` must be
 * zeroed by the caller before the launch. Works for any 1-D grid/block size.
 * Static shared memory: N ints.
 */
__global__ void rec(char *book, int *record)
{
    __shared__ int temp[N];
    unsigned int tid = threadIdx.x;

    // Clear the block-local bins. Looping by blockDim.x stays inside temp[]
    // whether the block has more or fewer threads than there are bins (the
    // original indexed temp[tid] directly and overran it for tid >= N).
    for (unsigned int bin = tid; bin < N; bin += blockDim.x) {
        temp[bin] = 0;
    }
    __syncthreads();

    // Each thread walks the input, stepping by the total thread count.
    int index = tid + blockIdx.x * blockDim.x;
    int offset = blockDim.x * gridDim.x;
    //printf("%d-%d\n",index,offset);
    while (index < S) {
        int symbol = book[index];
        // Ignore values that do not map to a bin rather than corrupting memory.
        if (symbol >= 0 && symbol < N) {
            atomicAdd(&(temp[symbol]), 1);
        }
        index += offset;
    }
    __syncthreads();  // every count must be in temp[] before it is published

    // Fold this block's histogram into the global one.
    for (unsigned int bin = tid; bin < N; bin += blockDim.x) {
        atomicAdd(&(record[bin]), temp[bin]);
    }
}

/*
 * Generates S pseudo-random symbols in 0..N-1, prints the CPU histogram and
 * then the histogram computed on the GPU by rec.
 */
int main()
{
    // host malloc
    char *book = (char *)malloc(sizeof(char) * S);
    int *record = (int *)malloc(sizeof(int) * N);
    if (book == NULL || record == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device malloc
    char *dev_book = NULL;
    int *dev_record = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_book, sizeof(char) * S));
    CUDA_CHECK(cudaMalloc((void **)&dev_record, sizeof(int) * N));

    // init: unsigned arithmetic gives the same values as the original signed
    // expression whenever that did not overflow, and can never go negative.
    for (int i = 0; i < S; i++) {
        srand((unsigned int)i + (unsigned int)rand());
        book[i] = (char)(((unsigned int)(i + i * i) + (unsigned int)rand()) % N);
    }

    // cpu reference
    int tp[N] = {0};
    for (int i = 0; i < S; i++) {
        tp[book[i]]++;
    }
    for (int i = 0; i < N; i++) printf("%d ", tp[i]);
    printf("\n");

    // memcpy to device
    CUDA_CHECK(cudaMemcpy(dev_book, book, sizeof(char) * S, cudaMemcpyHostToDevice));
    // The kernel only adds into record, so the counters must start at zero.
    CUDA_CHECK(cudaMemset(dev_record, 0, sizeof(int) * N));

    // kernel launch
    rec<<<block, thread>>>(dev_book, dev_record);
    CUDA_CHECK(cudaGetLastError());

    // memcpy to host (blocking, so it also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(record, dev_record, sizeof(int) * N, cudaMemcpyDeviceToHost));

    // output
    for (int i = 0; i < N; i++) {
        printf("%d ", record[i]);
    }
    printf("\n");

    // release device and host memory
    CUDA_CHECK(cudaFree(dev_book));
    CUDA_CHECK(cudaFree(dev_record));
    free(book);
    free(record);
    return 0;
}
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>

/*
 * author : pprp
 * theme  : sum of squares (version 1: one contiguous chunk per thread)
 */

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

const int N = 128;      // number of input elements
const int block = 4;    // number of worker threads (and of partial sums)
const int thread = 32;  // not used by this version

/*
 * calc0: splits arr[0..N) into `block` contiguous chunks; thread t sums the
 * squares of chunk t and stores the partial sum in result[t].
 *
 * Indexed by threadIdx.x only, so it is meant for a single-block launch.
 * Threads with threadIdx.x >= block do nothing. `result` holds `block` ints.
 */
__global__ void calc0(int *arr, int *result)
{
    int tid = threadIdx.x;
    if (tid >= block) return;  // result[] only has `block` slots

    // Ceil-divide so the tail is still covered when N is not a multiple of
    // block, then clamp the last chunk to N.
    const int chunk = (N + block - 1) / block;
    const int first = tid * chunk;
    const int last = (first + chunk < N) ? (first + chunk) : N;

    int sum = 0;
    for (int i = first; i < last; i++) {
        sum += arr[i] * arr[i];
    }
    result[tid] = sum;
    //printf("sum: %d\n",sum);
}

/*
 * Computes the sum of squares of 1..N on the CPU and on the GPU and prints
 * both results with their timings in milliseconds.
 */
int main()
{
    // host malloc
    int *arr = (int *)malloc(sizeof(int) * N);
    int *result = (int *)malloc(sizeof(int) * block);
    if (arr == NULL || result == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device malloc
    int *dev_arr = NULL;
    int *dev_result = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_arr, sizeof(int) * N));
    CUDA_CHECK(cudaMalloc((void **)&dev_result, sizeof(int) * block));

    // init
    for (int i = 0; i < N; i++) {
        arr[i] = i + 1;
    }
    for (int i = 0; i < block; i++) {
        result[i] = 0;
    }

    // cpu reference
    clock_t start, end;
    start = clock();
    unsigned int res = 0;
    for (int i = 0; i < N; i++) {
        res += arr[i] * arr[i];
    }
    end = clock();
    // clock() ticks are converted to milliseconds to match the GPU figure;
    // %u because res is unsigned.
    printf("cpu ans : %u\ncpu time: %.5f\n", res,
           1000.0f * float(end - start) / CLOCKS_PER_SEC);

    // time start
    cudaEvent_t st, ed;
    CUDA_CHECK(cudaEventCreate(&st));
    CUDA_CHECK(cudaEventCreate(&ed));
    CUDA_CHECK(cudaEventRecord(st, 0));

    // memcpy host -> device
    CUDA_CHECK(cudaMemcpy(dev_arr, arr, sizeof(int) * N, cudaMemcpyHostToDevice));

    // kernel launch: one block of `block` threads, one partial sum each
    calc0<<<1, block>>>(dev_arr, dev_result);
    CUDA_CHECK(cudaGetLastError());

    // memcpy device -> host (blocking, so it also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(result, dev_result, sizeof(int) * block, cudaMemcpyDeviceToHost));

    // time end
    CUDA_CHECK(cudaEventRecord(ed, 0));
    CUDA_CHECK(cudaEventSynchronize(ed));
    float gpu_time;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, st, ed));

    // combine the partial sums on the host
    int res2 = 0;
    for (int i = 0; i < block; i++) {
        res2 += result[i];
        //printf("test: %d\n",result[i]);
    }
    printf("gpu ans :%d\ngpu time: %.5f\n", res2, gpu_time);

    // release timing events
    CUDA_CHECK(cudaEventDestroy(st));
    CUDA_CHECK(cudaEventDestroy(ed));

    // release device and host memory
    CUDA_CHECK(cudaFree(dev_arr));
    CUDA_CHECK(cudaFree(dev_result));
    free(arr);
    free(result);
    return 0;
}
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>

/*
 * author : pprp
 * theme  : sum of squares (version 2: elements interleaved across threads)
 */

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

const int N = 128;      // number of input elements
const int block = 4;    // number of worker threads (and of partial sums)
const int thread = 32;  // not used by this version

/*
 * calc0: thread t sums the squares of arr[t], arr[t + block],
 * arr[t + 2*block], ... and stores the partial sum in result[t].
 *
 * Indexed by threadIdx.x only, so it is meant for a single-block launch.
 * Threads with threadIdx.x >= block do nothing. `result` holds `block` ints.
 */
__global__ void calc0(int *arr, int *result)
{
    int tid = threadIdx.x;
    // `>=`, not `>`: tid == block would otherwise write result[block], one
    // element past the end of the buffer.
    if (tid >= block) return;

    int sum = 0;
    for (int i = tid; i < N; i += block) {
        sum += arr[i] * arr[i];
    }
    result[tid] = sum;
}

/*
 * Computes the sum of squares of 1..N on the CPU and on the GPU and prints
 * both results with their timings in milliseconds.
 */
int main()
{
    // host malloc
    int *arr = (int *)malloc(sizeof(int) * N);
    int *result = (int *)malloc(sizeof(int) * block);
    if (arr == NULL || result == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device malloc
    int *dev_arr = NULL;
    int *dev_result = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_arr, sizeof(int) * N));
    CUDA_CHECK(cudaMalloc((void **)&dev_result, sizeof(int) * block));

    // init
    for (int i = 0; i < N; i++) {
        arr[i] = i + 1;
    }
    for (int i = 0; i < block; i++) {
        result[i] = 0;
    }

    // cpu reference
    clock_t start, end;
    start = clock();
    unsigned int res = 0;
    for (int i = 0; i < N; i++) {
        res += arr[i] * arr[i];
    }
    end = clock();
    // clock() ticks are converted to milliseconds to match the GPU figure;
    // %u because res is unsigned.
    printf("cpu ans : %u\ncpu time: %.5f\n", res,
           1000.0f * float(end - start) / CLOCKS_PER_SEC);

    // time start
    cudaEvent_t st, ed;
    CUDA_CHECK(cudaEventCreate(&st));
    CUDA_CHECK(cudaEventCreate(&ed));
    CUDA_CHECK(cudaEventRecord(st, 0));

    // memcpy host -> device
    CUDA_CHECK(cudaMemcpy(dev_arr, arr, sizeof(int) * N, cudaMemcpyHostToDevice));

    // kernel launch: one block of `block` threads, one partial sum each
    calc0<<<1, block>>>(dev_arr, dev_result);
    CUDA_CHECK(cudaGetLastError());

    // memcpy device -> host (blocking, so it also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(result, dev_result, sizeof(int) * block, cudaMemcpyDeviceToHost));

    // time end
    CUDA_CHECK(cudaEventRecord(ed, 0));
    CUDA_CHECK(cudaEventSynchronize(ed));
    float gpu_time;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, st, ed));

    // combine the partial sums on the host
    int res2 = 0;
    for (int i = 0; i < block; i++) {
        res2 += result[i];
        //printf("test: %d\n",result[i]);
    }
    printf("gpu ans :%d\ngpu time: %.5f\n", res2, gpu_time);

    // release timing events
    CUDA_CHECK(cudaEventDestroy(st));
    CUDA_CHECK(cudaEventDestroy(ed));

    // release device and host memory
    CUDA_CHECK(cudaFree(dev_arr));
    CUDA_CHECK(cudaFree(dev_result));
    free(arr);
    free(result);
    return 0;
}
#include <iostream>
#include <stdlib.h>
#include <stdio.h>
#include "cuda.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <time.h>

/*
 * author : pprp
 * theme  : sum of squares (version 3: several blocks, one partial sum per thread)
 */

// Abort with file/line and the CUDA error string when a runtime call fails.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)
#endif

const int N = 32;      // number of input elements
const int block = 4;   // blocks in the grid
const int thread = 8;  // threads per block

/*
 * calc0: every launched thread sums the squares of arr[g], arr[g + T],
 * arr[g + 2T], ... (g = global thread index, T = gridDim.x * blockDim.x)
 * and stores its partial sum in result[g].
 *
 * `result` must therefore hold gridDim.x * blockDim.x ints. Threads whose
 * index is >= N store 0. No shared memory and no barrier are needed because
 * threads never read each other's data.
 *
 * Define CALC0_DEBUG_PRINT at compile time to have each thread print its
 * partial sum; it is off by default because device printf is slow and would
 * distort the timing taken around the launch.
 */
__global__ void calc0(int *arr, int *result)
{
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const int step = blockDim.x * gridDim.x;

    int sum = 0;
    for (int i = gid; i < N; i += step) {
        sum += arr[i] * arr[i];
    }
    result[gid] = sum;
#ifdef CALC0_DEBUG_PRINT
    printf("++%d \n", sum);
#endif
}

/*
 * Computes the sum of squares of 1..N on the CPU and on the GPU and prints
 * both results with their timings in milliseconds.
 */
int main()
{
    // One partial sum per launched thread: size the result buffers by the
    // launch configuration rather than by N, so they stay correct when
    // block * thread differs from N.
    const int total = block * thread;

    // host malloc
    int *arr = (int *)malloc(sizeof(int) * N);
    int *result = (int *)malloc(sizeof(int) * total);
    if (arr == NULL || result == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // device malloc
    int *dev_arr = NULL;
    int *dev_result = NULL;
    CUDA_CHECK(cudaMalloc((void **)&dev_arr, sizeof(int) * N));
    CUDA_CHECK(cudaMalloc((void **)&dev_result, sizeof(int) * total));

    // init
    for (int i = 0; i < N; i++) {
        arr[i] = i + 1;
    }
    for (int i = 0; i < total; i++) {
        result[i] = 0;
    }

    // cpu reference
    clock_t start, end;
    start = clock();
    unsigned int res = 0;
    for (int i = 0; i < N; i++) {
        res += arr[i] * arr[i];
    }
    end = clock();
    // clock() ticks are converted to milliseconds to match the GPU figure;
    // %u because res is unsigned.
    printf("cpu ans : %u\ncpu time: %.5f\n", res,
           1000.0f * float(end - start) / CLOCKS_PER_SEC);

    // time start
    cudaEvent_t st, ed;
    CUDA_CHECK(cudaEventCreate(&st));
    CUDA_CHECK(cudaEventCreate(&ed));
    CUDA_CHECK(cudaEventRecord(st, 0));

    // memcpy host -> device
    CUDA_CHECK(cudaMemcpy(dev_arr, arr, sizeof(int) * N, cudaMemcpyHostToDevice));

    // kernel launch
    calc0<<<block, thread>>>(dev_arr, dev_result);
    CUDA_CHECK(cudaGetLastError());

    // memcpy device -> host (blocking, so it also waits for the kernel)
    CUDA_CHECK(cudaMemcpy(result, dev_result, sizeof(int) * total, cudaMemcpyDeviceToHost));

    // time end
    CUDA_CHECK(cudaEventRecord(ed, 0));
    CUDA_CHECK(cudaEventSynchronize(ed));
    float gpu_time;
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, st, ed));

    // combine the per-thread partial sums on the host
    int res2 = 0;
    for (int i = 0; i < total; i++) {
        res2 += result[i];
        //printf("test: %d\n",result[i]);
    }
    printf("gpu ans :%d\ngpu time: %.5f\n", res2, gpu_time);

    // release timing events
    CUDA_CHECK(cudaEventDestroy(st));
    CUDA_CHECK(cudaEventDestroy(ed));

    // release device and host memory
    CUDA_CHECK(cudaFree(dev_arr));
    CUDA_CHECK(cudaFree(dev_result));
    free(arr);
    free(result);
    return 0;
}