之前寫cuda:初始化環境,申請顯存,初始化顯存,launch kernel,拷貝數據,釋放顯存。一個頁面大部分都是這些繁雜但又必須的操作,有時還會忘掉釋放部分顯存。
今天用C++11封裝了這些CUDA操作,然後就可以專注於寫kernel代碼了。.cu文件就像glsl shader文件一樣簡潔明瞭。
例如:./kernel.cu文件,裏面只有一個fill函數用於填充數組A。
// Fill A[0..cnt) with A[id] = id * 2.
// Uses a grid-stride loop, so any grid/block configuration covers the
// whole array (cnt may be larger than the total thread count).
extern "C" __global__ void fill(int * A, int cnt){
    const int stride = blockDim.x * gridDim.x;
    int id = blockDim.x * blockIdx.x + threadIdx.x;
    while (id < cnt){
        A[id] = id * 2;
        id += stride;
    }
}
下面的main.cpp演示了Cuder類的使用。
#include "Cuder.h"

const int N = 65536;

std::string get_ptx_path(const char*);

int main(){
	int A[N];
	for (int i = 0; i < N; ++i) A[i] = i;

	// The constructor is private (and copy ctor / copy assignment are
	// deleted) so CUcontexts cannot be created casually; always go
	// through getInstance().
	redips::Cuder cuder = redips::Cuder::getInstance();

	// Register a module: either compile a .cu file (it works much like a
	// glsl shader file) or load a pre-built .ptx file directly.
	//std::string module_file = "kernel.cu";
	std::string module_file = get_ptx_path("kernel.cu");
	cuder.addModule(module_file);

	// Allocate a device array of sizeof(int)*N bytes named "a_dev"; the
	// name identifies the array in later calls. A non-null third argument
	// also triggers a cpu->gpu copy.
	cuder.applyArray("a_dev", sizeof(int)*N, A);

	// Run the "fill" kernel of the module; the first two arguments set the
	// grid and block sizes. { "a_dev", N } is a C++11 initializer_list: a
	// string names a previously allocated device array, any other value is
	// passed to the kernel by value.
	cuder.launch(dim3(512, 1, 1), dim3(256, 1, 1), module_file, "fill", { "a_dev", N });

	// Copy the device array "a_dev" back into A.
	cuder.fetchArray("a_dev", sizeof(int)*N, A);
	return 0;
}

// Maps "kernel.cu" to the .ptx file the build generates for the current
// platform/configuration, e.g. "./ptx/x64/Release/kernel.cu.ptx".
std::string get_ptx_path(const char* cuFile){
	std::string path = "./ptx/";
#ifdef WIN32
	path += "Win32/";
#else
	path += "x64/";
#endif
#ifdef _DEBUG
	path += "Debug/";
#else
	path += "Release/";
#endif
	return path + cuFile + ".ptx";
}
cuder.addModule(...)函數的參數是一個.cu文件或者.ptx文件。
1. 若是是.cu文件,該函數負責將函數編譯成ptx代碼。而後封裝到CUmodule裏。
2. 若是是.ptx文件,該函數只是將ptx封裝到CUmodule裏。
建議使用第二種方式,nvidia的optix就是這麼作的。好處是在編譯階段編譯總比運行時編譯好,若是代碼有錯誤編譯時就會提示。這時須要兩點配置:
2.a 在生成依賴項裏添加cuda 編譯器,而後相應的.cu文件設定爲用該編譯器編譯。
2.b 設定將.cu文件生成到指定路徑下的ptx文件,而後在程序中指定該ptx文件的路徑。
下面貼上Cuder.h的代碼
#pragma once #include <map> #include <string> #include <vector> #include <cuda.h> #include <nvrtc.h> #include <fstream> #include <sstream> #include <iostream> #include <cudaProfiler.h> #include <cuda_runtime.h> #include <helper_cuda_drvapi.h> namespace redips{ class Cuder{ CUcontext context; std::map <std::string, CUmodule> modules; std::map <std::string, CUdeviceptr> devptrs; Cuder(){ checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); } void release(){ //for (auto module : modules) delete module.second; for (auto dptr : devptrs) cuMemFree(dptr.second); devptrs.clear(); modules.clear(); cuCtxDestroy(context); } public: class ValueHolder{ public: void * value = nullptr; bool is_string = false; ValueHolder(const char* str){ value = (void*)str; is_string = true; } template <typename T> ValueHolder(const T& data){ value = new T(data); } }; static Cuder getInstance(){ if (!cuda_enviroment_initialized) initialize(); return Cuder(); } //forbidden copy-constructor and assignment function Cuder(const Cuder&) = delete; Cuder& operator= (const Cuder& another) = delete; Cuder(Cuder&& another){ this->context = another.context; another.context = nullptr; this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs)); this->modules = std::map<std::string, CUmodule>(std::move(another.modules)); } Cuder& operator= (Cuder&& another) { if (this->context == another.context) return *this; release(); this->context = another.context; another.context = nullptr; this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs)); this->modules = std::map<std::string, CUmodule>(std::move(another.modules)); return *this; } virtual ~Cuder(){ release(); }; public: bool launch(dim3 gridDim, dim3 blockDim, std::string module, std::string kernel_function, std::initializer_list<ValueHolder> params){ //get kernel address if (!modules.count(module)){ std::cerr << "[Cuder] : error: doesn't exists an module named " << module << std::endl; return false; } CUfunction 
kernel_addr; if (CUDA_SUCCESS != cuModuleGetFunction(&kernel_addr, modules[module], kernel_function.c_str())){ std::cerr << "[Cuder] : error: doesn't exists an kernel named " << kernel_function << " in module " << module << std::endl; return false; } //setup params std::vector<void*> pamary; for (auto v : params){ if (v.is_string){ if (devptrs.count((const char*)(v.value))) pamary.push_back((void*)(&(devptrs[(const char*)(v.value)]))); else{ std::cerr << "[Cuder] : error: launch failed. doesn't exists an array named " << (const char*)(v.value) << std::endl;; return false; } } else pamary.push_back(v.value); } cudaEvent_t start, stop; float elapsedTime = 0.0; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); bool result = (CUDA_SUCCESS == cuLaunchKernel(kernel_addr,/* grid dim */gridDim.x, gridDim.y, gridDim.z, /* block dim */blockDim.x, blockDim.y, blockDim.z, /* shared mem, stream */ 0, 0, &pamary[0], /* arguments */0)); cuCtxSynchronize(); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); std::cout << "[Cuder] : launch finish. 
cost " << elapsedTime << "ms" << std::endl; return result; } bool addModule(std::string cufile){ if (modules.count(cufile)){ std::cerr << "[Cuder] : error: already has an modules named " << cufile << std::endl;; return false; } std::string ptx = get_ptx(cufile); if (ptx.length() > 0){ CUmodule module; checkCudaErrors(cuModuleLoadDataEx(&module, ptx.c_str(), 0, 0, 0)); modules[cufile] = module; return true; } else{ std::cerr << "[Cuder] : error: add module " << cufile << " failed!\n"; return false; } } void applyArray(const char* name, size_t size, void* h_ptr=nullptr){ if (devptrs.count(name)){ std::cerr << "[Cuder] : error: already has an array named " << name << std::endl;; return; } CUdeviceptr d_ptr; checkCudaErrors(cuMemAlloc(&d_ptr, size)); if (h_ptr) checkCudaErrors(cuMemcpyHtoD(d_ptr, h_ptr, size)); devptrs[name] = d_ptr; } void fetchArray(const char* name, size_t size,void * h_ptr){ if (!devptrs.count(name)){ std::cerr << "[Cuder] : error: doesn't exists an array named " << name << std::endl;; return; } checkCudaErrors(cuMemcpyDtoH(h_ptr, devptrs[name], size)); } private: static int devID; static CUdevice cuDevice; static bool cuda_enviroment_initialized; static void initialize(){ // picks the best CUDA device [with highest Gflops/s] available devID = gpuGetMaxGflopsDeviceIdDRV(); checkCudaErrors(cuDeviceGet(&cuDevice, devID)); // print device information { char name[100]; int major = 0, minor = 0; checkCudaErrors(cuDeviceGetName(name, 100, cuDevice)); checkCudaErrors(cuDeviceComputeCapability(&major, &minor, cuDevice)); printf("[Cuder] : Using CUDA Device [%d]: %s, %d.%d compute capability\n", devID, name, major, minor); } //initialize checkCudaErrors(cuInit(0)); cuda_enviroment_initialized = true; } //若是是ptx文件則直接返回文件內容,若是是cu文件則編譯後返回ptx std::string get_ptx(std::string filename){ std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate); if (!inputFile.is_open()) { std::cerr << "[Cuder] : error: unable to open " << filename << " 
for reading!\n"; return ""; } std::streampos pos = inputFile.tellg(); size_t inputSize = (size_t)pos; char * memBlock = new char[inputSize + 1]; inputFile.seekg(0, std::ios::beg); inputFile.read(memBlock, inputSize); inputFile.close(); memBlock[inputSize] = '\x0'; if (filename.find(".ptx") != std::string::npos) return std::string(std::move(memBlock)); // compile nvrtcProgram prog; if (nvrtcCreateProgram(&prog, memBlock, filename.c_str(), 0, NULL, NULL) == NVRTC_SUCCESS){ delete memBlock; if (nvrtcCompileProgram(prog, 0, nullptr) == NVRTC_SUCCESS){ // dump log size_t logSize; nvrtcGetProgramLogSize(prog, &logSize); if (logSize>0){ char *log = new char[logSize + 1]; nvrtcGetProgramLog(prog, log); log[logSize] = '\x0'; std::cout << "[Cuder] : compile [" << filename << "] " << log << std::endl; delete(log); } else std::cout << "[Cuder] : compile [" << filename << "] finish" << std::endl; // fetch PTX size_t ptxSize; nvrtcGetPTXSize(prog, &ptxSize); char *ptx = new char[ptxSize+1]; nvrtcGetPTX(prog, ptx); nvrtcDestroyProgram(&prog); return std::string(std::move(ptx)); } } delete memBlock; return ""; } }; bool Cuder::cuda_enviroment_initialized = false; int Cuder::devID = 0; CUdevice Cuder::cuDevice = 0; };
下面貼一下VS裏面須要的配置
//include C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.5\common\inc //lib C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\x64 cuda.lib cudart.lib nvrtc.lib