Read the fucking official documents!
--By 魯迅
A picture is worth a thousand words.
--By 高爾基
說明:
OpenCL
,目標是淺顯易懂,若是沒有達到這個效果,就當我沒說這話;Middleware
的系統軟件工程師,不是一個好碼農;kernels
的語言(基於C99);opencl_runtime_16.1.2_x64_rh_6.4.0.37.tgz
以人工智能場景爲例來理解一下:假如在某個AI芯片上跑人臉識別應用,CPU擅長控制,AI processor擅長計算,軟件的flow就可以進行拆分——用CPU來負責控制視頻流的輸入輸出和前後處理,用AI processor來完成深度學習模型運算、完成識別,這就是一個典型的異構處理場景。如果該AI芯片的SDK支持OpenCL,那麼上層的軟件就可以基於OpenCL進行開發了。
話不多說,看看OpenCL的架構吧。
OpenCL架構,可以從平臺模型、內存模型、執行模型、編程模型四個角度來展開。
平臺模型:硬件拓撲關係的抽象描述
Compute Unit(CU)
;Processing Element(PE)
,最終的計算由PE來完成;
執行模型:Host如何利用OpenCL Device的計算資源完成高效的計算處理過程
OpenCL的Execution Model由兩個不同的執行單元定義:1)運行在OpenCL設備上的kernel;2)運行在Host上的Host program;
其中,OpenCL使用Context表明kernel的執行環境:
Context包含如下資源:
有兩種方式來找到work-item:
以一維爲例:
以二維爲例:
三維的方式也類似,略去。
內存模型:Host和OpenCL Device怎麼來看待數據
OpenCL的內存模型中,包含如下幾種類型的內存:
下邊來一個實際的代碼測試跑跑,Talk is cheap, show me the code!
opencl_runtime_16.1.2_x64_rh_6.4.0.37.tgz
);#include <iostream> #include <fstream> #include <sstream> #include <CL/cl.h> const int DATA_SIZE = 10; int main(void) { /* 1. get platform & device information */ cl_uint num_platforms; cl_platform_id first_platform_id; clGetPlatformIDs(1, &first_platform_id, &num_platforms); /* 2. create context */ cl_int err_num; cl_context context = nullptr; cl_context_properties context_prop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)first_platform_id, 0 }; context = clCreateContextFromType(context_prop, CL_DEVICE_TYPE_CPU, nullptr, nullptr, &err_num); /* 3. create command queue */ cl_command_queue command_queue; cl_device_id *devices; size_t device_buffer_size = -1; clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, nullptr, &device_buffer_size); devices = new cl_device_id[device_buffer_size / sizeof(cl_device_id)]; clGetContextInfo(context, CL_CONTEXT_DEVICES, device_buffer_size, devices, nullptr); command_queue = clCreateCommandQueueWithProperties(context, devices[0], nullptr, nullptr); delete [] devices; /* 4. create program */ std::ifstream kernel_file("vector_add.cl", std::ios::in); std::ostringstream oss; oss << kernel_file.rdbuf(); std::string srcStdStr = oss.str(); const char *srcStr = srcStdStr.c_str(); cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&srcStr, nullptr, nullptr); /* 5. build program */ clBuildProgram(program, 0, nullptr, nullptr, nullptr, nullptr); /* 6. create kernel */ cl_kernel kernel; kernel = clCreateKernel(program, "vector_add", nullptr); /* 7. 
set input data && create memory object */ float output[DATA_SIZE]; float input_x[DATA_SIZE]; float input_y[DATA_SIZE]; for (int i = 0; i < DATA_SIZE; i++) { input_x[i] = (float)i; input_y[i] = (float)(2 * i); } cl_mem mem_object_x; cl_mem mem_object_y; cl_mem mem_object_output; mem_object_x = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * DATA_SIZE, input_x, nullptr); mem_object_y = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * DATA_SIZE, input_y, nullptr); mem_object_output = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * DATA_SIZE, nullptr, nullptr); /* 8. set kernel argument */ clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_object_x); clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_object_y); clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_object_output); /* 9. send kernel to execute */ size_t globalWorkSize[1] = {DATA_SIZE}; size_t localWorkSize[1] = {1}; clEnqueueNDRangeKernel(command_queue, kernel, 1, nullptr, globalWorkSize, localWorkSize, 0, nullptr, nullptr); /* 10. read data from output */ clEnqueueReadBuffer(command_queue, mem_object_output, CL_TRUE, 0, DATA_SIZE * sizeof(float), output, 0, nullptr, nullptr); for (int i = 0; i < DATA_SIZE; i++) { std::cout << output[i] << " "; } std::cout << std::endl; /* 11. clean up */ clRetainMemObject(mem_object_x); clRetainMemObject(mem_object_y); clRetainMemObject(mem_object_output); clReleaseCommandQueue(command_queue); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseContext(context); return 0; }
vector_add.cl
文件中內容如下:
/*
 * Element-wise addition of two float vectors.
 * Each work-item handles the single element selected by its global id in
 * dimension 0: output[id] = input_x[id] + input_y[id].
 */
__kernel void vector_add(__global const float *input_x,
                         __global const float *input_y,
                         __global float *output)
{
    const int idx = get_global_id(0);
    const float lhs = input_x[idx];
    const float rhs = input_y[idx];

    output[idx] = lhs + rhs;
}
The OpenCL Specification
歡迎關注公衆號,不定期分享技術文章