整合Yolov3到UE4/Unity3D

時間 2019-11-06

標籤整合 yolov3 yolov ue4 unity3d unity 简体版

原文原文鏈接

　　這篇實際上是前文 CUDA版Grabcut的實現的後續，和上文同樣，先放視頻。html

　　(博客園好像不支持視頻，gif文件太大，視頻連接）算法

　　在上文用CUDA實現opencv下的grabcut後，當時問題主要是最後須要mincut須要上千次push-relabel才能獲得滿意結果，後改成種子點方式，不到100次就能夠獲得滿意結果，可是種子點須要本身來畫，不是很方便，所以，引入深度神經網絡先用單楨計算種子點，而後根據這些確認的種子點來計算GMM，如視頻這樣，以很小代價成功處理1080P下的數據（上面開了錄製視頻與Unity編輯器佔用了大量CPU）。數組

　　和前文同樣，先說其中大致的實現過程。安全

　　1 對於實現yolov3的框架darknet的改造，如今darknet只能傳入CPU數據進行計算，可是咱們框架內部在讀取攝像頭後就直接丟到GPU上了，原始的YUV轉RGBA都算好在GPU上，若是丟CPU數據，就要先download,而後再upload，這個操做太浪費時間而且是不必的，咱們須要改造這些接口。網絡

　　2 根據一楨計算一個滿意的分割圖，而後根據分割圖算GMM，而後根據這個GMM算後續圖的mask(前景與背景變化不大的狀況下), 其中grabcut的mincut裏須要迭代屢次push-relabel，所以這裏給的分辨率須要先降採樣，那麼算的結果mask並非太清晰，須要整合 CUDA加opencv復現導向濾波算法咱們才能獲得滿意的清晰邊緣。app

　　3 爲了更高效率，咱們只考慮顯存直接對接方案，相比於CUDA提供的與DX11交互的例子，直接把引擎UE4/Unity3D裏的Dx11上下文拿過來，而後把咱們處理完後的數據給對應的DX11中，有個很大問題，攝像頭30FPS就夠了，遊戲這塊咱們須要60FPS向上，在這過程當中，遊戲引擎的渲染線程被CUDA處理數據線程佔用致使遊戲會卡，就想CUDA處理線程和遊戲引擎的渲染線程都去上廁所，CUDA處理線程佔着廁所，遊戲引擎的渲染線程就只能在外面等着，而咱們要的是，渲染線程看廁所被佔了，就立刻去處理事情，等過會再來看看廁所是否空着了。框架

　　4 cuda端引用的dll過多，原來一些基本的藍綠幕/深度扣像在DX11都有實現，如今的CUDA端要作成一個能方便接入的系統，提供一些高端功能，如神經網絡檢查是否有人在鏡頭內，鏡頭內的方位等，而且沒有也不影響原來項目，相似插件模塊。編輯器

　　相對上文的一些改變，中間由於顯卡須要適配2070等20系最新顯卡的緣由，原來的openc4.0alpha(含cuda模塊)+cuda8不能使用，因而從新編譯了下opencv4.1(含cuda模塊)+cuda10.1，中間有些坑，之後若是有時間就記錄下，換了後也有個很差的地方，若是不是比較新的N卡驅動，就加載不了如今的cuda模塊。考慮性價比，用的是yolov3-tiny框架，每楨中佔比不到5ms,檢測效果還行。ide

　　首先針對第一點，咱們須要改進的是能直接傳入的GPU方法，首先達到相似以下效果。函數

cv::cuda::resize(mat, netFrame, cv::Size(netWidth, netHeight), 0, 0, cv::INTER_LINEAR);
image2netData_gpu(netFrame, netInput);
network_predict_gpudata(net, netInput);
detection *dets = get_network_boxes(net, netWidth, netHeight, cthresh, 0, 0, 1, &nboxes);

darknet接收GPU數據　　

　　咱們傳入的GpuMat,直接在顯卡完成縮放，轉化成特徵圖，而後把數據丟給神經網絡去計算，詳細實現代碼以下。

inline __global__ void image2netData(PtrStepSz<uchar4> source, float* outData, int size)
{
    //rgbargbargba->rrrgggbbb
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const int idy = blockDim.y * blockIdx.y + threadIdx.y;
    if (idx < source.cols && idy < source.rows)
    {
        float4 color = rgbauchar42float4(source(idy, idx));
        int nindex = idy * source.cols + idx;
        outData[nindex] = color.x;
        outData[size + nindex] = color.y;
        outData[2 * size + nindex] = color.z;
    }
}
void network_predict_gpudata(network * net, float *input)
{
    network orig = *net;        
    cudaMemcpy(net->input_gpu, input, net->w*net->h * sizeof(float) * 3, cudaMemcpyDeviceToDevice);
    net->truth = 0;
    net->train = 0;
    net->delta = 0;
    forward_network_gpudata(net);
    *net = orig;
}
void forward_network_gpudata(network *netp)
{
    network net = *netp;
    //cuda_set_device(net.gpu_index);    
    int i;
    for (i = 0; i < net.n; ++i) {
        net.index = i;
        layer l = net.layers[i];
        if (l.delta_gpu) {
            fill_gpu(l.outputs * l.batch, 0, l.delta_gpu, 1);
        }
        l.forward_gpu(l, net);
        net.input_gpu = l.output_gpu;
        net.input = l.output;
        if (l.truth) {
            net.truth_gpu = l.output_gpu;
            net.truth = l.output;
        }
    }
    pull_network_output(netp);
    calc_network_cost(netp);
}

forward_network_gpudata

　　這個改造比我想像中簡單的多，畢竟C框架，實現都在明面上。

　　而後咱們要根據咱們設想，先根據單楨計算GMM，就是視頻中點擊計算GMM的button過程，以下是實現過程。

inline __global__ void setMask(PtrStepSz<uchar> source, PtrStepSz<uchar> mask, int radius, cv::Rect rect)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const int idy = blockDim.y * blockIdx.y + threadIdx.y;
    if (idx < source.cols && idy < source.rows)
    {
        int mmask = 0;
        //前景爲255
        if (idx > rect.x && idx < rect.x + rect.width && idy > rect.y && idy < rect.y + rect.height)
        {
            mmask = mask(idy, idx);
            int smask = source(idy, idx);
            if (smask == 3)
                mmask = 1;
        }
        mask(idy, idx) = mmask;
    }
}

inline __global__ void drawRect(PtrStepSz<uchar4> source, int xmin, int xmax, int ymin, int ymax, int radius, uchar4 drawColor)
{
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const int idy = blockDim.y * blockIdx.y + threadIdx.y;
    if (idx < source.cols && idy < source.rows)
    {
        int4 xx = make_int4(idx, xmax, idy, ymax);
        int4 yy = make_int4(xmin, idx, ymin, idy);

        int4 xy = abs(xx - yy);
        //只要有一個條件知足就行（分別表明左邊，右邊，上邊，下邊）
        int sum = (xy.x < radius) + (xy.y < radius) + (xy.z < radius) + (xy.w < radius);
        float2 lr = make_float2(xy.x + xy.y, xy.z + xy.w);
        float2 rl = make_float2(xmax - xmin, ymax - ymin);
        if (sum > 0 && length(lr - rl) < radius)
        {
            source(idy, idx) = drawColor;
        }
    }
}
#if MRDNN
//深度學習下自動計算一楨
vector<PersonBox> boxs;
int num = detect->checkImage(grabCutTempFrame, 0.3f, boxs);
if (num > 0)
{
    GpuMat halfCutTemp = GpuMat(grabCutHeight, grabCutWidth, CV_8UC4);
    GpuMat halfTemp = GpuMat(grabCutHeight, grabCutWidth, CV_8UC3);
    GpuMat resultMask = GpuMat(grabCutHeight, grabCutWidth, CV_8UC1);

    cv::Mat cpuResult, cpuHalfCutTemp;
    cv::cuda::resize(grabCutTempFrame, halfCutTemp, cv::Size(grabCutWidth, grabCutHeight), 0, 0, cv::INTER_LINEAR, cvCudaStream);
    //本身實現，避免引入opencv_cudafilters410/opencv_cudaarithm410，加起來快七百M
    rgba2bgr_gpu(halfCutTemp, halfTemp, cudaStream);
    halfTemp.download(cpuHalfCutTemp, cvCudaStream);
    cudaStreamSynchronize(cudaStream);
    //選擇一個邊框
    cv::Rect rectangle;
    rectangle.width = boxs[0].width*grabCutWidth;
    rectangle.height = boxs[0].height*grabCutHeight;
    rectangle.x = boxs[0].centerX*grabCutWidth - rectangle.width / 2;
    rectangle.y = boxs[0].centerY*grabCutHeight - rectangle.height / 2;
    //用opencv的方式來計算mask,由於opencv自己用八向，切割的邊緣更清晰
    cv::Mat bgModel, fgModel;
    cv::grabCut(cpuHalfCutTemp, cpuResult, rectangle, bgModel, fgModel, 3, cv::GC_INIT_WITH_RECT);
    //顯示爲3的就是GC_PR_FGD，
    resultMask.upload(cpuResult, cvCudaStream);
    //根據resultMask與rectangle構建grabCut->mask
    setMask_gpu(resultMask, grabCut->mask, 1, rectangle, cudaStream);
    //顯示邊框
    uchar4 drawColor = make_uchar4(128, 0, 0, 255);
    cv::Rect rectangle2;
    rectangle2.width = boxs[0].width*width;
    rectangle2.height = boxs[0].height*height;
    rectangle2.x = boxs[0].centerX*width - rectangle2.width / 2;
    rectangle2.y = boxs[0].centerY*height - rectangle2.height / 2;
    drawRect_gpu(grabCutRectFrame, rectangle2, 3, drawColor, cudaStream);
    //GpuMat的析構函數會調用，能夠不用手動調用 release
    halfCutTemp.release();
    halfTemp.release();
    resultMask.release();
}
#endif

計算GMM

　　裏面用的opencv自己提供的grabcut來算mask,由於單楨他用的八方向的，邊緣清晰度更好，分割效果更好，而後用們上一篇的實現根據他的mask來計算GMM的值，而後每楨用這個GMM與push-relabel算的一張低分辨率下的mask,正好用來作導向濾波的引導圖，通過CUDA加opencv復現導向濾波算法的處理，就能獲得邊緣清晰的扣圖效果。

　　其中算k-means/GMM的這個過程能夠看上文，和前文不同的是，原來k-means計算整張圖全部點，這個k-means只計算其中肯定的前景與背景，須要針對上文的updateCluster修改以下，讓計算過程能夠針對種子點模式。　　

//把source全部收集到一塊gridDim.x*gridDim.y塊數據上。
template<int blockx, int blocky, bool bSeed>
__global__ void updateCluster(PtrStepSz<uchar4> source, PtrStepSz<uchar> clusterIndex, PtrStepSz<uchar> mask, float4* kencter, int* kindexs)
{
    __shared__ float3 centers[blockx*blocky][CUDA_GRABCUT_K2];
    __shared__ int indexs[blockx*blocky][CUDA_GRABCUT_K2];
    const int idx = blockDim.x * blockIdx.x + threadIdx.x;
    const int idy = blockDim.y * blockIdx.y + threadIdx.y;
    const int threadId = threadIdx.x + threadIdx.y * blockDim.x;
#pragma unroll CUDA_GRABCUT_K2
    for (int i = 0; i < CUDA_GRABCUT_K2; i++)
    {
        centers[threadId][i] = make_float3(0.f);
        indexs[threadId][i] = 0;
    }
    __syncthreads();
    if (idx < source.cols && idy < source.rows)
    {
        //全部值都放入共享centers
        int index = clusterIndex(idy, idx);
        uchar umask = mask(idy, idx);
        if (!bSeed || (bSeed && checkObvious(umask)))
        {
            bool bFg = checkFg(umask);
            int kindex = bFg ? index : (index + CUDA_GRABCUT_K);
            float4 color = rgbauchar42float4(source(idy, idx));
            centers[threadId][kindex] = make_float3(color);
            indexs[threadId][kindex] = 1;
        }
        __syncthreads();
        //每一個線程塊進行二分聚合,每次操做都保存到前一半數組裏，直到最後保存在線程塊裏第一個線程上(這塊比較費時,0.1ms)
        for (uint stride = blockDim.x*blockDim.y / 2; stride > 0; stride >>= 1)
        {
            //int tid = (threadId&(stride - 1));
            if (threadId < stride)//stride 2^n
            {
#pragma unroll CUDA_GRABCUT_K2
                for (int i = 0; i < CUDA_GRABCUT_K2; i++)
                {
                    centers[threadId][i] += centers[threadId + stride][i];
                    indexs[threadId][i] += indexs[threadId + stride][i];
                }
            }
            //if (stride > 32)
            __syncthreads();
        }
        //每塊的第一個線程集合，把共享centers存入臨時顯存塊上kencter
        if (threadIdx.x == 0 && threadIdx.y == 0)
        {
            int blockId = blockIdx.x + blockIdx.y * gridDim.x;
#pragma unroll CUDA_GRABCUT_K2
            for (int i = 0; i < CUDA_GRABCUT_K2; i++)
            {
                int id = blockId * 2 * CUDA_GRABCUT_K + i;
                kencter[id] = make_float4(centers[0][i], 0.f);
                kindexs[id] = indexs[0][i];
            }
        }
    }
}

updateCluster

　　其中bSeed爲true代表是計算種子點模式，相似還須要修改updateGMM方法實現，種子點模式下，能夠增長手動選擇前景與背景，如graphcut相似。

　　這樣，主要實現就算完成了，接下來，咱們整合進引擎，公司主要使用UE4，我通常來講測試功能更喜歡用Unity3D,對於我測試功能來講，UI與邏輯編寫方便與編譯快是我須要的，因此主要以Unity3D講解，後面會簡單說下與UE4的對接，其實對接部分，unity3d要編寫原生插件才能拿到Unity3D的DX11上下文，以及要寫C++/C#的轉接層，對於對接部分來講，UE4明顯編寫更舒服。

　　就如前面所說，要讓CUDA處理數據的線程不佔着引擎渲染線程，須要的是一個能在CUDA與DX11不一樣線程安全訪問的顯存設計，遺憾的是我沒找到(有知道CUDA與DX11有相似DX11之間共享顯存能有鎖訪問相似設計的告訴一聲)，可是在我前文 UE4/Unity3D中同時捕獲多高清攝像頭的高效插件裏咱們知道DX11自己之間有共享顯存鎖訪問機制，根據這個，我從新整理了下思路，先聲明一個獨立的DX11上下文，在這個上下文中有共享紋理，這個DX11的共享紋理與CUDA自己綁定，而且CUDA在處理數據線程中佔着這個DX11上下文，等CUDA處理完線程就丟給這個DX11的共享紋理，而後再讓遊戲引擎UE4/Unity3D的渲染線程的DX11上下文去訪問這個DX11的共享紋理，這樣用一個獨立的DX11作中轉，也能達到咱們須要的效果。

　　以下是CUDA與DX11交互的主要代碼，其中DX11之間用鎖共享訪問的邏輯請看 UE4/Unity3D中同時捕獲多高清攝像頭的高效插件這個連接裏，相關代碼在這不貼了，這樣就避免在CUDA中處理完後還download到內存中，而後在遊戲中把內存數據upload在對應引擎的顯存上，這樣太浪費而且低效。　　

//一塊CUDA與DX11綁定的資源
struct Dx11CudaResource
{
    //cuda資源
    cudaGraphicsResource *cudaResource = nullptr;
    //cuda具體資源
    cudaArray *cuArray = nullptr;
    //對應DX11紋理
    ID3D11Texture2D *texture = nullptr;
    //對應CPU數據
    uint8_t *cpuData = nullptr;
};
//代表一個DX11上下文的共享顯存 
struct MGDLL_EXPORT GpuSharedResource
{
public:
    ID3D11Texture2D* texture = nullptr;
    HANDLE sharedHandle = nullptr;
    bool bGpuUpdate = false;
public:
    //申請一塊共享顯存,mc表示一個Dx11上下文章
    bool restart(class MainCompute* mc, int width, int height, DXGI_FORMAT format = DXGI_FORMAT_R8G8B8A8_UNORM);
    void release();
};
bool GpuSharedResource::restart(MainCompute* mc, int width, int height, DXGI_FORMAT format)
{
    SafeRelease(texture);
    bool result = mc->CreateTextureBuffer(nullptr, width, height, format, &texture, false, true);
    sharedHandle = MainCompute::GetSharedHandle(texture);
    bGpuUpdate = false;
    return result;
}

void GpuSharedResource::release()
{
    SafeRelease(texture);
}
//Dx11CudaResource 表示Cuda與Dx11聯接二方的資源
//GpuSharedResource 表示共享紋理與句柄，MainCompute爲對應DX11上下文包裝
inline bool registerCudaResource(Dx11CudaResource& cudaDx11, GpuSharedResource& sharedResource, MainCompute* mc, int width, int height)
{
    if (cudaDx11.cudaResource != nullptr && cudaDx11.texture != nullptr)
    {
        cudaGraphicsUnregisterResource(cudaDx11.cudaResource);
        cudaDx11.cudaResource = nullptr;
    }
    bool bInit = sharedResource.restart(mc, width, height);
    if (bInit)
    {
        cudaDx11.texture = sharedResource.texture;
        auto result = cudaGraphicsD3D11RegisterResource(&cudaDx11.cudaResource, cudaDx11.texture, cudaGraphicsRegisterFlagsNone);
        if (result != cudaSuccess)
        {
            LogMessage(warn, "cudaGraphicsD3D11RegisterResource fails.");
        }
    }
    return bInit;
}
//把CUDA資源複製給DX11紋理
inline void gpuMat2D3dTexture(cv::cuda::GpuMat frame, Dx11CudaResource& cudaResource, cudaStream_t stream)
{
    if (cudaResource.texture != nullptr)
    {
        //cuda map dx11,資源數組間map
        cudaGraphicsMapResources(1, &cudaResource.cudaResource, stream);
        //map單個資源
        cudaGraphicsSubResourceGetMappedArray(&cudaResource.cuArray, cudaResource.cudaResource, 0, 0);
        //從cuda顯存裏把數據複製給dx11註冊的cuda顯存裏
        cudaMemcpy2DToArray(cudaResource.cuArray, 0, 0, frame.ptr(), frame.step, frame.cols * sizeof(int32_t), frame.rows, cudaMemcpyDeviceToDevice);
        //cuda unmap dx11
        cudaGraphicsUnmapResources(1, &cudaResource.cudaResource, stream);
    }
}
//複製GPU處理後的扣像線程給共享顯存，經過鎖訪問，避免和引擎的遊戲線程一塊兒訪問這塊顯存
if (dataType & KeyingGPU)
{
    CComPtr<IDXGIKeyedMutex> pDX11Mutex = nullptr;
    auto hResult = keyingRc.texture->QueryInterface(__uuidof(IDXGIKeyedMutex), (LPVOID*)&pDX11Mutex);
    DWORD dresult = pDX11Mutex->AcquireSync(0, 0);
    if (dresult == WAIT_OBJECT_0)
    {
        gpuMat2Texture(keying, resultOpFrame);
    }
    dresult = pDX11Mutex->ReleaseSync(1);
    keyingRc.bGpuUpdate = true;
}

//dx11共享顯存把結果丟給遊戲引擎的紋理
//d3ddevice 遊戲引擎的dx11上下文
//texture 遊戲引擎的紋理
//sharedHandle CUDA那邊的遊戲引擎dx11上下文裏共享紋理的句柄
void copySharedToTexture(ID3D11Device * d3ddevice, HANDLE & sharedHandle, ID3D11Texture2D * texture)
{
    if (!d3ddevice)
        return;
    CComPtr<ID3D11DeviceContext> d3dcontext = nullptr;
    //ID3D11DeviceContext* d3dcontext = nullptr;
    d3ddevice->GetImmediateContext(&d3dcontext);
    if (!d3dcontext)
        return;
    if (sharedHandle && texture)
    {
        CComPtr<ID3D11Texture2D> pBuffer = nullptr;
        HRESULT hr = d3ddevice->OpenSharedResource(sharedHandle, __uuidof(ID3D11Texture2D), (void**)(&pBuffer));
        if (FAILED(hr) || pBuffer == nullptr)
        {
            LogMessage(error, "open shared texture error.");
            return;
        }
        CComPtr<IDXGIKeyedMutex> pDX11Mutex = nullptr;
        auto hResult = pBuffer->QueryInterface(__uuidof(IDXGIKeyedMutex), (LPVOID*)&pDX11Mutex);
        if (FAILED(hResult) || pDX11Mutex == nullptr)
        {
            LogMessage(error, "get IDXGIKeyedMutex failed.");
            return;
        }
        DWORD result = pDX11Mutex->AcquireSync(1, 0);
        if (result == WAIT_OBJECT_0 && pBuffer)
        {
            d3dcontext->CopyResource(texture, pBuffer);
        }
        result = pDX11Mutex->ReleaseSync(0);
    }
}

CUDA與引擎顯存交互

　　讓照上文這個順序，先聲明一個獨立的DX11上下文，裏面生成對應共享紋理，CUDA先綁定這邊資源與這個DX11的共享紋理，等CUDA處理完數據，而後把數據複製到這個綁定的DX11共享資源上，這一步，經過鎖知道遊戲引擎的渲染線程是否在訪問，若是在訪問，就放棄，繼續向下執行，而遊戲引擎也經過鎖安全訪問這個CUDA的計算結果，而後帶給UE4呈現，不會給遊戲引擎自己帶來延遲。注意顯存泄漏的問題，Gpumat咱們不須要多管，可是這裏咱們本身聲明的，須要注意釋放，註冊CUDA與DX11資源後面再次註冊是要先取消，不然顯存泄漏。

　　如今愈來愈多數據密集計算，大部分適合GPU算，如今顯卡的顯存也愈來愈大，徹底可把數據加載到顯存，不一樣的流計算不一樣的部分，最後統一計算呈現效果，把CPU從數據密集處理中脫離出來，專一邏輯處理。

　　整個完整過程就差很少了，最後就是CUDA模塊與相關引用的dll確實過大，可能有些狀況原來的DX11模塊就徹底能處理，而且A卡暫時還用不了CUDA，因此這塊設計成一個插件模塊，全部的DLL放入一個文件夾中，若是不要，直接去掉這個文件夾，相關功能自動使用DX11處理，若是檢測到有這個文件夾，就先檢測是否能成功加載，而後確認是否使用CUDA模塊來處理這個功能。

　　整個實現很是簡單，對於這個需求也徹底足夠了。以下，先定義插件層與工廠類。

template<typename T>
class ImageFactory {
public:
    virtual T *create(int type) = 0;
};

template<typename T>
class PluginManager
{
public:
    static PluginManager<T>& getInstance()
    {
        static PluginManager m_instance;
        return m_instance;
    };
    ~PluginManager()
    {
        //Release();
    };
private:
    //static PluginManager<T>* instance;
    PluginManager() {};
private:
    std::vector<ImageFactory<T>*> factorys;
    std::vector<T*> models;
public:
    //註冊生產類        
    void registerFactory(ImageFactory<T>* vpf, int type)
    {
        if (type >= factorys.size())
            factorys.resize(type + 1);
        factorys[type] = vpf;
    };
    //產生一個實體    
    T* createModel(int type)
    {
        if (type >= factorys.size())
            return nullptr;
        ImageFactory<T>* factory = factorys[type];
        T* model = factory->create(type);
        models.push_back(model);
        return model;
    };
    void Release()
    {
        for (T* model : models)
        {
            SafeDelete(model);
        }
        models.empty();
        for (ImageFactory<T>* factory : factorys)
        {
            SafeDelete(factory);
        }
        factorys.empty();
    }
};

void registerFactory(ImageFactory<VideoProcess>* factory, int type)
{
    PluginManager<VideoProcess>::getInstance().registerFactory(factory, type);
}

PluginManager

　　而後在CUDA層定義相應的實現接口與加載是否知足加載的API。

class CudaVideoProcessFactory :public ImageFactory<VideoProcess>
{
public:
    CudaVideoProcessFactory() {};
    ~CudaVideoProcessFactory() {};
public:
    virtual VideoProcess* create(int type) override;
};

extern "C" GUDLL_EXPORT void registerFactory();

extern "C" GUDLL_EXPORT bool bCanLoad();

VideoProcess * CudaVideoProcessFactory::create(int type)
{
    VideoProcess * vp = new VideoProcessCuda();
    return vp;
}

void registerFactory()
{
    registerFactory(new CudaVideoProcessFactory(), Cuda);
    registerFactory(new CudaComputePipeFactory(), Cuda);
}

bool bCanLoad()
{
    int count = cv::cuda::getCudaEnabledDeviceCount();
    if (count > 0)
    {
        int deviceId = 0;
        auto device = cv::cuda::DeviceInfo::DeviceInfo(deviceId);
        bool compation = device.isCompatible();
        if (compation)
        {
            //cv::cuda::setDevice(deviceId);
        }
        return true;
    }
    return false;
}

VideoProcessFactory

　　最後是在插件層根據定義的目錄結構加載CUDA層。

inline HINSTANCE LoadGPUCompuate()
{
    //HINSTANCE hdll = nullptr;
    typedef bool(*bCanLoad)();
    typedef void(*registerfactory)();
    hdll = LoadLibraryEx(L"GPUCompute.dll", nullptr, LOAD_WITH_ALTERED_SEARCH_PATH);
    if (hdll == nullptr)
    {
        //DWORD error_id = GetLastError();
        //LogMessage(error, to_string(error_id).c_str());
        wchar_t sz[512] = { 0 };
        HMODULE ihdll = GetModuleHandle(L"ImageUtility.dll");
        ::GetModuleFileName(ihdll, sz, 512);
        ::PathRemoveFileSpec(sz);
        ::PathAppend(sz, L"GPUCompute");
        SetDllDirectory(sz);//SetCurrentDirectory(sz);
        hdll = LoadLibraryEx(L"GPUCompute.dll", nullptr, LOAD_WITH_ALTERED_SEARCH_PATH);
        //error_id = GetLastError();
        //LogMessage(error, to_string(error_id).c_str());
    }
    if (hdll)
    {
        bCanLoad bcd = (bCanLoad)GetProcAddress(hdll, "bCanLoad");
        if (bcd())
        {
            registerfactory rf = (registerfactory)GetProcAddress(hdll, "registerFactory");
            if (rf)
                rf();
        }
        else
        {
            FreeLibrary(hdll);
            hdll = nullptr;
        }
    }
    return hdll;
}

LoadGPUCompuate

　　這樣就能根據需求肯定是否要加載CUDA與深度神經網絡模塊，後面改爲caffe這樣也是適用的。

　　在相應的遊戲引擎Unity3D/UE4目錄下，只須要如上定義DLL的目錄就行，若是不須要有深度神經網絡與CUDA相關功能，直接去掉這個GPUCompute目錄也不會對原來的DX11計算模塊有影響。

　　最後記，如今這個Grabcut效果只能說特定環境下還能夠，好比人和背景差異比較大的狀況下，若是相近，效果不是太好，如上視頻中，衣服的左邊和右邊邊緣扣的差不少，就是由於左邊受光照多致使與背景差異小，這個功能暫時只能作個備用項，還須要繼續改進，可是從這裏面，咱們底層框架適配了引擎與深度神經網絡，這爲後面擴展更多功能打下基礎。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。