CUDA 顯存操作:CUDER —— 用 C++11 封裝的 CUDA 類

CUDA 9 的編譯器和語言改進

使用 CUDA 9,nvcc 編譯器增加了對 C++14 的支持,其中包括以下新功能:

通用的 lambda 表達式,其中使用 auto 關鍵字代替參數類型,例如:

auto lambda = [](auto a, auto b){ return a * b; };

函數返回類型推導(使用 auto 關鍵字作為返回類型,如上例所示)

放寬了對 constexpr 函數內容的限制,現在可以包含變量聲明、if、switch 和循環。

CUDA 9 中的 NVCC 也更快:與 CUDA 8 相比,編譯時間平均減少 20%,最多可減少 50%。

·擴大了開發平臺和主機編譯器支持,包括 Microsoft Visual Studio 2017、Clang 3.9、PGI 17.1 和 GCC 6.x。

CUDER:用C++11封裝的CUDA類

之前寫 CUDA:初始化環境、申請顯存、初始化顯存、launch kernel、拷貝數據、釋放顯存。一個頁面大部分都是這些繁雜但又必須的操作,有時還會忘掉釋放部分顯存。

今天用 C++11 封裝了這些 CUDA 操作,然後就可以專注於寫 kernel 代碼了。.cu 文件就像 glsl shader 文件一樣簡潔明瞭。

例如:./kernel.cu文件,裏面只有一個fill函數用於填充數組A。

// Fills A[0..cnt) with A[id] = id * 2.
// Uses a grid-stride loop (each thread advances by the total thread count
// of the grid), so any grid/block configuration covers the whole array.
extern "C"  __global__ void fill(int * A, int cnt){
    const int gap = blockDim.x*gridDim.x;  // total number of threads in the grid
    for (int id = blockDim.x*blockIdx.x + threadIdx.x; id < cnt; id += gap)
        A[id] = id * 2;
}  // fixed: removed stray ';' after the function body

下面的main.cpp演示了Cuder類的使用。

複製代碼
#include "Cuder.h"
const int N = 65536;
std::string get_ptx_path(const char*);

int main(){
    // Host-side input data: A[i] = i.
    int A[N];
    int idx = 0;
    while (idx < N){ A[idx] = idx; ++idx; }

    // CUcontext creation is deliberately restricted: the constructor is
    // private and copy construction/assignment are disabled, so the only
    // way to obtain a Cuder is through getInstance().
    redips::Cuder cuder = redips::Cuder::getInstance();

    // Register a module: either a .cu file (compiled at runtime, much like
    // a glsl shader file) or a pre-built .ptx file.
    //std::string ptx_path = "kernel.cu";
    std::string ptx_path = get_ptx_path("kernel.cu");
    cuder.addModule(ptx_path);

    // Allocate a device array of sizeof(int)*N bytes named "a_dev"; the
    // name identifies this allocation in later calls. Because the third
    // argument is non-null, the host data is also copied to the GPU.
    cuder.applyArray("a_dev", sizeof(int)*N, A);

    // Launch kernel "fill" from the registered module; the first two
    // arguments set gridSize and blockSize. In the initializer_list a
    // string names a device array allocated above, while any other value
    // is forwarded to the kernel as a plain argument.
    cuder.launch(dim3(512, 1, 1), dim3(256, 1, 1), ptx_path, "fill", { "a_dev", N });

    // Copy the device array "a_dev" back into the host array A.
    cuder.fetchArray("a_dev", sizeof(int)*N, A);
    return 0;
}

// Maps a .cu source name to the .ptx file produced by the build, e.g.
// "./ptx/x64/Release/kernel.cu.ptx" for a 64-bit release configuration.
// Platform and configuration segments are selected at compile time.
std::string get_ptx_path(const char* cuFile){
#ifdef WIN32
    const char* platform = "Win32/";
#else
    const char* platform = "x64/";
#endif

#ifdef _DEBUG
    const char* config = "Debug/";
#else 
    const char* config = "Release/";
#endif

    std::string result("./ptx/");
    result += platform;
    result += config;
    result += cuFile;
    result += ".ptx";
    return result;
}
複製代碼

 cuder.addModule(...)函數的參數是一個.cu文件或者.ptx文件。

1. 若是是.cu文件,該函數負責將函數編譯成ptx代碼。而後封裝到CUmodule裏。
2. 若是是.ptx文件,該函數只是將ptx封裝到CUmodule裏。
建議使用第二種方式,NVIDIA 的 OptiX 就是這麼做的。好處是:在編譯階段編譯總比運行時編譯好,如果代碼有錯誤,編譯時就會提示。此時需要兩點配置:
2.a 在生成依賴項裏添加cuda 編譯器,而後相應的.cu文件設定爲用該編譯器編譯。
2.b 設定將.cu文件生成到指定路徑下的ptx文件,而後在程序中指定該ptx文件的路徑。

 

下面貼上Cuder.h的代碼


#pragma once
#include <map>
#include <string>
#include <vector>
#include <cuda.h>
#include <nvrtc.h>
#include <fstream>
#include <sstream>
#include <iostream>
#include <cudaProfiler.h>
#include <cuda_runtime.h>
#include <helper_cuda_drvapi.h>

namespace redips{
    class Cuder{
        CUcontext context;
        std::map <std::string, CUmodule> modules;
        std::map <std::string, CUdeviceptr> devptrs;
        
        Cuder(){ 
            checkCudaErrors(cuCtxCreate(&context, 0, cuDevice)); 
        }
        void release(){
            //for (auto module : modules) delete module.second;
            for (auto dptr : devptrs)    cuMemFree(dptr.second);
            devptrs.clear();
            modules.clear();
            cuCtxDestroy(context);
        }
    public:
        class ValueHolder{
        public:
            void * value = nullptr;
            bool is_string = false;
            ValueHolder(const char* str){
                value = (void*)str;
                is_string = true;
            }
            template <typename T>
            ValueHolder(const T& data){
                value = new T(data);
            }
        };

        static Cuder getInstance(){
            if (!cuda_enviroment_initialized) initialize();
            return Cuder();
        }

        //forbidden copy-constructor and assignment function
        Cuder(const Cuder&) = delete;
        Cuder& operator= (const Cuder& another) = delete;

        Cuder(Cuder&& another){
            this->context = another.context;
            another.context = nullptr;
            this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs));
            this->modules = std::map<std::string, CUmodule>(std::move(another.modules));
        }
        Cuder& operator= (Cuder&& another) {
            if (this->context == another.context) return *this;
            release();
            this->context = another.context; 
            another.context = nullptr;
            this->devptrs = std::map<std::string, CUdeviceptr>(std::move(another.devptrs));
            this->modules = std::map<std::string, CUmodule>(std::move(another.modules));
            return *this;
        }
        
        virtual ~Cuder(){ release();    };
        
    public:
        bool launch(dim3 gridDim, dim3 blockDim, std::string module, std::string kernel_function, std::initializer_list<ValueHolder> params){
            //get kernel address
            if (!modules.count(module)){
                std::cerr << "[Cuder] : error: doesn't exists an module named " << module << std::endl; return false;
            }
            CUfunction kernel_addr;
            if (CUDA_SUCCESS != cuModuleGetFunction(&kernel_addr, modules[module], kernel_function.c_str())){
                std::cerr << "[Cuder] : error: doesn't exists an kernel named " << kernel_function << " in module " << module << std::endl; return false;
            }
            //setup params
            std::vector<void*> pamary;
            for (auto v : params){
                if (v.is_string){
                    if (devptrs.count((const char*)(v.value))) pamary.push_back((void*)(&(devptrs[(const char*)(v.value)])));
                    else{
                        std::cerr << "[Cuder] : error: launch failed. doesn't exists an array named " << (const char*)(v.value) << std::endl;;
                        return false;
                    }
                }
                else pamary.push_back(v.value);
            }

            cudaEvent_t start, stop;
            float elapsedTime = 0.0;
            cudaEventCreate(&start);
            cudaEventCreate(&stop);
            cudaEventRecord(start, 0);

            bool result = (CUDA_SUCCESS == cuLaunchKernel(kernel_addr,/* grid dim */gridDim.x, gridDim.y, gridDim.z, /* block dim */blockDim.x, blockDim.y, blockDim.z, /* shared mem, stream */ 0, 0, &pamary[0], /* arguments */0));
            cuCtxSynchronize();

            cudaEventRecord(stop, 0);
            cudaEventSynchronize(stop);
            cudaEventElapsedTime(&elapsedTime, start, stop);
            std::cout << "[Cuder] : launch finish. cost " << elapsedTime << "ms" << std::endl;
            return result;
        }
        bool addModule(std::string cufile){
            if (modules.count(cufile)){
                std::cerr << "[Cuder] : error: already has an modules named " << cufile << std::endl;;
                return false;
            }

            std::string ptx = get_ptx(cufile);
            
            if (ptx.length() > 0){
                CUmodule module;
                checkCudaErrors(cuModuleLoadDataEx(&module, ptx.c_str(), 0, 0, 0));
                modules[cufile] = module;
                return true;
            }
            else{
                std::cerr << "[Cuder] : error: add module " << cufile << " failed!\n";
                return false;
            }
        }
        void applyArray(const char* name, size_t size, void* h_ptr=nullptr){
            if (devptrs.count(name)){
                std::cerr << "[Cuder] : error: already has an array named " << name << std::endl;;
                return;
            }
            CUdeviceptr d_ptr;
            checkCudaErrors(cuMemAlloc(&d_ptr, size));
            if (h_ptr) 
                checkCudaErrors(cuMemcpyHtoD(d_ptr, h_ptr, size));
            devptrs[name] = d_ptr;
        }
        void fetchArray(const char* name, size_t size,void * h_ptr){
            if (!devptrs.count(name)){
                std::cerr << "[Cuder] : error: doesn't exists an array named " << name << std::endl;;
                return;
            }
            checkCudaErrors(cuMemcpyDtoH(h_ptr, devptrs[name], size));
        }
        
    private:
        static int devID;
        static CUdevice cuDevice;
        static bool cuda_enviroment_initialized;
        static void initialize(){
            // picks the best CUDA device [with highest Gflops/s] available
            devID = gpuGetMaxGflopsDeviceIdDRV();
            checkCudaErrors(cuDeviceGet(&cuDevice, devID));
            // print device information
            {
                char name[100]; int major = 0, minor = 0;
                checkCudaErrors(cuDeviceGetName(name, 100, cuDevice));
                checkCudaErrors(cuDeviceComputeCapability(&major, &minor, cuDevice));
                printf("[Cuder] : Using CUDA Device [%d]: %s, %d.%d compute capability\n", devID, name, major, minor);
            }
            //initialize
            checkCudaErrors(cuInit(0));

            cuda_enviroment_initialized = true;
        }
        //若是是ptx文件則直接返回文件內容,若是是cu文件則編譯後返回ptx
        std::string get_ptx(std::string filename){
            std::ifstream inputFile(filename, std::ios::in | std::ios::binary | std::ios::ate);
            if (!inputFile.is_open()) {
                std::cerr << "[Cuder] : error: unable to open " << filename << " for reading!\n";
                return "";
            }

            std::streampos pos = inputFile.tellg();
            size_t inputSize = (size_t)pos;
            char * memBlock = new char[inputSize + 1];

            inputFile.seekg(0, std::ios::beg);
            inputFile.read(memBlock, inputSize);
            inputFile.close();
            memBlock[inputSize] = '\x0';

            if (filename.find(".ptx") != std::string::npos) 
                return std::string(std::move(memBlock));
            // compile
            nvrtcProgram prog;
            if (nvrtcCreateProgram(&prog, memBlock, filename.c_str(), 0, NULL, NULL) == NVRTC_SUCCESS){
                delete memBlock;
                if (nvrtcCompileProgram(prog, 0, nullptr) == NVRTC_SUCCESS){
                    // dump log
                    size_t logSize; 
                    nvrtcGetProgramLogSize(prog, &logSize);
                    if (logSize>0){
                        char *log = new char[logSize + 1];
                        nvrtcGetProgramLog(prog, log);
                        log[logSize] = '\x0';
                        std::cout << "[Cuder] : compile [" << filename << "] " << log << std::endl;
                        delete(log);
                    }
                    else std::cout << "[Cuder] : compile [" << filename << "] finish" << std::endl;

                    // fetch PTX
                    size_t ptxSize;
                    nvrtcGetPTXSize(prog, &ptxSize);
                    char *ptx = new char[ptxSize+1];
                    nvrtcGetPTX(prog, ptx);
                    nvrtcDestroyProgram(&prog);
                    return std::string(std::move(ptx));
                }
            }
            delete memBlock;
            return "";
        }
    };
    bool Cuder::cuda_enviroment_initialized = false;
    int Cuder::devID = 0;
    CUdevice Cuder::cuDevice = 0;
};

 

 下面貼一下VS裏面須要的配置


//include 
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\include
C:\ProgramData\NVIDIA Corporation\CUDA Samples\v7.5\common\inc
//lib
C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.5\lib\x64

cuda.lib
cudart.lib
nvrtc.lib
相關文章
相關標籤/搜索