最近有很多朋友在多次循環執行OpenCL內核程序的時候碰到一些問題。由於這類問題在OpenCL初學者當中可能比較普遍,所以我這裏給出一個清晰簡單的demo來演示如何簡單又高效地循環執行OpenCL內核。
以下程序的大致意思與流程是:
內核程序含有兩個參數,第一個參數既是輸入又是輸出,第二個參數僅僅用於輸入。不過第一個參數只對其初始化一次,而第二個參數在每次循環執行新一次的內核程序前會再傳遞一次數據。這麼做有助於同學們更好地去理解、掌握存儲器對象的基本使用方法。
存儲器對象在通過cl_context上下文創建完之後,其所在的GPU端的位置就不變了。所以,我們在循環執行內核程序之前不需要把存儲器對象釋放掉,然後重新分配——這麼做比較低效。我們完全可以重用同一個存儲器對象。
以下代碼在我的MacBook Air上能完全通過編譯並執行,沒有任何warning。
執行環境:基於Haswell微架構的Intel Core i7 4650U,Intel HD Graphics 5000,8GB DDR3L,128GB SSD。
OS X 10.9.2 Mavericks,Xcode 5.1,Apple LLVM 5.1,支持GNU11標準的C編譯器。
#include <stdio.h> #include <string.h> #include <stdlib.h> #ifdef __APPLE__ #include <OpenCL/opencl.h> #else #include <CL/cl.h> #endif int main(void) { cl_int ret; cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memObj1 = NULL; cl_mem memObj2 = NULL; char *kernelSource = NULL; cl_program program = NULL; cl_kernel kernel = NULL; int *pInputBuffer1 = NULL; int *pInputBuffer2 = NULL; int *pOutputBuffer = NULL; clGetPlatformIDs(1, &platform_id, NULL); if(platform_id == NULL) { puts("Get OpenCL platform failed!"); goto FINISH; } clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if(device_id == NULL) { puts("No GPU available as a compute device!"); goto FINISH; } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if(context == NULL) { puts("Context not established!"); goto FINISH; } command_queue = clCreateCommandQueue(context, device_id, 0, &ret); if(command_queue == NULL) { puts("Command queue cannot be created!"); goto FINISH; } // Specify the path of the kernel source const char *pFileName = "/Users/zennychen/Downloads/test.cl"; FILE *fp = fopen(pFileName, "r"); if (fp == NULL) { puts("The specified kernel source file cannot be opened!"); goto FINISH; } fseek(fp, 0, SEEK_END); const long kernelLength = ftell(fp); fseek(fp, 0, SEEK_SET); kernelSource = malloc(kernelLength); fread(kernelSource, 1, kernelLength, fp); fclose(fp); program = clCreateProgramWithSource(context, 1, (const char**)&kernelSource, (const size_t*)&kernelLength, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[8 * 1024]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); goto FINISH; } kernel = clCreateKernel(program, "test", &ret); if(kernel == NULL) { puts("Kernel failed to 
create!"); goto FINISH; } const size_t contentLength = sizeof(*pInputBuffer1) * 1024 * 1024; // 這裏預分配的緩存大小爲4MB,第一個參數是讀寫的 memObj1 = clCreateBuffer(context, CL_MEM_READ_WRITE, contentLength, NULL, &ret); if(memObj1 == NULL) { puts("Memory object1 failed to create!"); goto FINISH; } // 這裏預分配的緩存大小爲4MB,第一個參數是隻讀的 memObj2 = clCreateBuffer(context, CL_MEM_READ_ONLY, contentLength, NULL, &ret); if(memObj1 == NULL) { puts("Memory object2 failed to create!"); goto FINISH; } ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memObj1); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&memObj2); if(ret != CL_SUCCESS) { puts("Set arguments error!"); goto FINISH; } // 如下爲在主機端分配輸入緩存 pInputBuffer1 = malloc(contentLength); pInputBuffer2 = malloc(contentLength); // 而後對此工做緩存進行初始化 for(int i = 0; i < 1024 * 1024; i++) pInputBuffer1[i] = i + 1; memset(pInputBuffer2, 0, contentLength); // 而後分配輸出緩存 pOutputBuffer = malloc(contentLength); // 先將第一個參數的數據傳入GPU端,之後就不去改動了 ret = clEnqueueWriteBuffer(command_queue, memObj1, CL_TRUE, 0, contentLength, pInputBuffer1, 0, NULL, NULL); if(ret != CL_SUCCESS) { puts("Data transfer failed"); goto FINISH; } int count = 5; // 執行5次循環 do { // 先將第二個參數傳給GPU ret = clEnqueueWriteBuffer(command_queue, memObj2, CL_TRUE, 0, contentLength, pInputBuffer2, 0, NULL, NULL); if(ret != CL_SUCCESS) { puts("Data transfer failed"); goto FINISH; } // 這裏指定將總共有1024 * 1024個work-item ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, (const size_t[]){1024 * 1024}, NULL, 0, NULL, NULL); // 將結果拷貝給主機端 ret = clEnqueueReadBuffer(command_queue, memObj1, CL_TRUE, 0, contentLength, pOutputBuffer, 0, NULL, NULL); // 作次同步,這裏偷懶,不用wait event機制了~ clFinish(command_queue); // 作校驗 const int newValue = 5 - count + 1; const int addition = (5 - count) * newValue / 2; for(int i = 0; i < 1024 * 1024; i++) { if(pOutputBuffer[i] != i + 1 + addition) { puts("Result error!"); break; } } // 最後,給第二個緩存初始化新數據 for(int i = 0; i < 1024 * 1024; i++) pInputBuffer2[i] = newValue; } 
while(--count > 0); FINISH: /* Finalization */ if(pInputBuffer1 != NULL) free(pInputBuffer1); if(pInputBuffer2 != NULL) free(pInputBuffer2); if(pOutputBuffer != NULL) free(pOutputBuffer); if(kernelSource != NULL) free(kernelSource); if(memObj1 != NULL) clReleaseMemObject(memObj1); if(memObj2 != NULL) clReleaseMemObject(memObj2); if(kernel != NULL) clReleaseKernel(kernel); if(program != NULL) clReleaseProgram(program); if(command_queue != NULL) clReleaseCommandQueue(command_queue); if(context != NULL) clReleaseContext(context); return 0; }
上面OpenCL內核源文件的路徑被寫死了——「/Users/zennychen/Downloads/test.cl」。各位可以根據自己的環境重新指定。
另外,上面用了一些C99語法特性。如果是用Win7的小夥伴,請使用Visual Studio 2013(Express/Professional)的C編譯器。
下面是OpenCL內核源文件:
/*
 * Element-wise accumulate: each work-item adds its element of pIn to the
 * matching element of pInOut.
 *
 * pInOut  buffer that is both read and written
 * pIn     buffer that is only read
 *
 * There is no bounds check, so the global work size in dimension 0 must not
 * exceed the number of elements in either buffer.
 */
__kernel void test(__global int *pInOut, __global int *pIn)
{
    /* This work-item's position in dimension 0 selects the element. */
    const int gid = get_global_id(0);

    pInOut[gid] = pInOut[gid] + pIn[gid];
}