Anyone familiar with object detection knows what the NMS algorithm is, but do you know how to implement it as a PyTorch operator written directly in C++ and CUDA? I have recently been reading the mmdetection source code and found that writing C++ and CUDA extensions is actually not that hard, so let me walk you through it.
C++ extensions let users create custom operators outside the PyTorch framework, i.e. decoupled from the PyTorch backend. This differs from the way native PyTorch operators are implemented. C++ extensions are designed to spare you much of the boilerplate involved in integrating an operator into the PyTorch backend, while giving PyTorch-based projects a high degree of flexibility.
The official documentation gives an LLTM example that is also worth a look.
First, a quick review of the NMS algorithm:
Here is a pure NumPy implementation:
import numpy as np

def nms(bounding_boxes, Nt):
    if len(bounding_boxes) == 0:
        return [], []
    bboxes = np.array(bounding_boxes)

    # box coordinates and confidence scores
    x1 = bboxes[:, 0]
    y1 = bboxes[:, 1]
    x2 = bboxes[:, 2]
    y2 = bboxes[:, 3]
    scores = bboxes[:, 4]

    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(scores)  # ascending; the highest score is at the end

    picked_boxes = []
    while order.size > 0:
        # pick the highest-scoring remaining box
        index = order[-1]
        picked_boxes.append(bounding_boxes[index])

        # intersection of the picked box with all remaining boxes
        x11 = np.maximum(x1[index], x1[order[:-1]])
        y11 = np.maximum(y1[index], y1[order[:-1]])
        x22 = np.minimum(x2[index], x2[order[:-1]])
        y22 = np.minimum(y2[index], y2[order[:-1]])
        w = np.maximum(0.0, x22 - x11 + 1)
        h = np.maximum(0.0, y22 - y11 + 1)
        intersection = w * h

        # keep only boxes whose IoU with the picked box is below the threshold
        ious = intersection / (areas[index] + areas[order[:-1]] - intersection)
        left = np.where(ious < Nt)
        order = order[left]
    return picked_boxes
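As a quick sanity check of the NumPy version, a made-up example (the boxes and the 0.5 threshold below are arbitrary illustration values, not from mmdetection) might look like this:

# Each box is [x1, y1, x2, y2, score]; 0.5 is an arbitrary IoU threshold.
boxes = [
    [100, 100, 210, 210, 0.72],
    [105, 105, 215, 215, 0.80],  # heavily overlaps the first box
    [300, 300, 380, 400, 0.92],
]
kept = nms(boxes, Nt=0.5)
print(kept)  # expect the 0.92 and 0.80 boxes; the 0.72 box is suppressed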
You need to write the following five files:
(1) nms_kernel.cu: implements the nms_cuda_forward function, mainly using the ATen and THC libraries. It is written in C++/CUDA and involves calls such as lazyInitCUDA, THCudaFree and THCCeilDiv; the algorithm itself is much the same as the NumPy version above.
(2) nms_cuda.cpp: a thin wrapper that calls nms_cuda_forward from nms_kernel.cu and exposes it as the nms_cuda function.
(3) nms_ext.cpp: further wraps the nms_cuda function as nms and binds it into a Python-callable function via PYBIND11_MODULE.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("nms", &nms, "non-maximum suppression");
}
The snippet above effectively tells Python that the function is exposed under the name nms.
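Assuming the extension has already been compiled (see the setup.py step below), one way to confirm the binding worked is to import the module and poke at it from Python; the session below is only illustrative:

import torch                      # import torch first so the extension's shared libraries resolve
from mmdet.ops.nms import nms_ext

print(nms_ext.nms.__doc__)        # shows the "non-maximum suppression" docstring set in PYBIND11_MODULE
dets = torch.rand(8, 5).cuda()    # N x 5 boxes with random values, purely to exercise the call
keep = nms_ext.nms(dets, 0.5)     # returns the indices of the kept boxes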
(4) setup.py: compiles nms_ext; once this is done you can call the C++ extension as a PyTorch operator via nms_ext.nms.
make_cuda_ext(
    name='nms_ext',
    module='mmdet.ops.nms',
    sources=['src/nms_ext.cpp', 'src/cpu/nms_cpu.cpp'],
    sources_cuda=[
        'src/cuda/nms_cuda.cpp', 'src/cuda/nms_kernel.cu'
    ]),
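If you only need the NMS operator and not the rest of mmdetection's build machinery, a stripped-down setup.py along the lines below should be enough. This is a sketch under two assumptions: the file layout under src/ and the package name my_nms are placeholders, and it presumes a PyTorch version that still ships the THC headers used by nms_kernel.cu.

from setuptools import setup
from torch.utils.cpp_extension import BuildExtension, CUDAExtension

setup(
    name='my_nms',  # placeholder package name
    ext_modules=[
        CUDAExtension(
            name='nms_ext',
            sources=[
                'src/nms_ext.cpp',
                'src/cuda/nms_cuda.cpp',
                'src/cuda/nms_kernel.cu',
            ],
            define_macros=[('WITH_CUDA', None)],  # enables the CUDA branch in nms_ext.cpp
        )
    ],
    cmdclass={'build_ext': BuildExtension},
)

Building in place with python setup.py build_ext --inplace then produces an importable nms_ext module.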
(5) nms_wrapper.py: wraps nms_ext.nms one more time for convenience. Usage example:
from . import nms_ext
inds = nms_ext.nms(dets_th, iou_thr)
More complete code is listed below, although I have trimmed it to keep only the NMS-related parts; to see the full code, click the file names below.
nms_kernel.cu (part of this was presumably written by Facebook)
// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/DeviceGuard.h>
#include <THC/THC.h>
#include <THC/THCDeviceUtils.cuh>

#include <vector>
#include <iostream>

int const threadsPerBlock = sizeof(unsigned long long) * 8;

__device__ inline float devIoU(float const * const a, float const * const b) {
  float left = max(a[0], b[0]), right = min(a[2], b[2]);
  float top = max(a[1], b[1]), bottom = min(a[3], b[3]);
  float width = max(right - left, 0.f), height = max(bottom - top, 0.f);
  float interS = width * height;
  float Sa = (a[2] - a[0]) * (a[3] - a[1]);
  float Sb = (b[2] - b[0]) * (b[3] - b[1]);
  return interS / (Sa + Sb - interS);
}

__global__ void nms_kernel(const int n_boxes, const float nms_overlap_thresh,
                           const float *dev_boxes, unsigned long long *dev_mask) {
  const int row_start = blockIdx.y;
  const int col_start = blockIdx.x;

  // if (row_start > col_start) return;

  const int row_size =
      min(n_boxes - row_start * threadsPerBlock, threadsPerBlock);
  const int col_size =
      min(n_boxes - col_start * threadsPerBlock, threadsPerBlock);

  __shared__ float block_boxes[threadsPerBlock * 5];
  if (threadIdx.x < col_size) {
    block_boxes[threadIdx.x * 5 + 0] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 0];
    block_boxes[threadIdx.x * 5 + 1] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 1];
    block_boxes[threadIdx.x * 5 + 2] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 2];
    block_boxes[threadIdx.x * 5 + 3] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 3];
    block_boxes[threadIdx.x * 5 + 4] =
        dev_boxes[(threadsPerBlock * col_start + threadIdx.x) * 5 + 4];
  }
  __syncthreads();

  if (threadIdx.x < row_size) {
    const int cur_box_idx = threadsPerBlock * row_start + threadIdx.x;
    const float *cur_box = dev_boxes + cur_box_idx * 5;
    int i = 0;
    unsigned long long t = 0;
    int start = 0;
    if (row_start == col_start) {
      start = threadIdx.x + 1;
    }
    for (i = start; i < col_size; i++) {
      if (devIoU(cur_box, block_boxes + i * 5) > nms_overlap_thresh) {
        t |= 1ULL << i;
      }
    }
    const int col_blocks = THCCeilDiv(n_boxes, threadsPerBlock);
    dev_mask[cur_box_idx * col_blocks + col_start] = t;
  }
}

// boxes is a N x 5 tensor
at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh) {
  // Ensure CUDA uses the input tensor device.
  at::DeviceGuard guard(boxes.device());

  using scalar_t = float;
  AT_ASSERTM(boxes.device().is_cuda(), "boxes must be a CUDA tensor");
  auto scores = boxes.select(1, 4);
  auto order_t = std::get<1>(scores.sort(0, /* descending=*/true));
  auto boxes_sorted = boxes.index_select(0, order_t);

  int boxes_num = boxes.size(0);

  const int col_blocks = THCCeilDiv(boxes_num, threadsPerBlock);

  scalar_t* boxes_dev = boxes_sorted.data_ptr<scalar_t>();

  THCState *state = at::globalContext().lazyInitCUDA(); // TODO replace with getTHCState

  unsigned long long* mask_dev = NULL;
  //THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
  //                         boxes_num * col_blocks * sizeof(unsigned long long)));
  mask_dev = (unsigned long long*) THCudaMalloc(state,
      boxes_num * col_blocks * sizeof(unsigned long long));

  dim3 blocks(THCCeilDiv(boxes_num, threadsPerBlock),
              THCCeilDiv(boxes_num, threadsPerBlock));
  dim3 threads(threadsPerBlock);
  nms_kernel<<<blocks, threads, 0, at::cuda::getCurrentCUDAStream()>>>(
      boxes_num, nms_overlap_thresh, boxes_dev, mask_dev);

  std::vector<unsigned long long> mask_host(boxes_num * col_blocks);
  THCudaCheck(cudaMemcpyAsync(
      &mask_host[0],
      mask_dev,
      sizeof(unsigned long long) * boxes_num * col_blocks,
      cudaMemcpyDeviceToHost,
      at::cuda::getCurrentCUDAStream()));

  std::vector<unsigned long long> remv(col_blocks);
  memset(&remv[0], 0, sizeof(unsigned long long) * col_blocks);

  at::Tensor keep = at::empty({boxes_num},
      boxes.options().dtype(at::kLong).device(at::kCPU));
  int64_t* keep_out = keep.data_ptr<int64_t>();

  int num_to_keep = 0;
  for (int i = 0; i < boxes_num; i++) {
    int nblock = i / threadsPerBlock;
    int inblock = i % threadsPerBlock;

    if (!(remv[nblock] & (1ULL << inblock))) {
      keep_out[num_to_keep++] = i;
      unsigned long long *p = &mask_host[0] + i * col_blocks;
      for (int j = nblock; j < col_blocks; j++) {
        remv[j] |= p[j];
      }
    }
  }

  THCudaFree(state, mask_dev);
  // TODO improve this part
  return order_t.index({
      keep.narrow(/*dim=*/0, /*start=*/0, /*length=*/num_to_keep).to(
          order_t.device(), keep.scalar_type())});
}
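The least obvious part of this file is the bitmask bookkeeping: each block compares one box against up to 64 others and packs the "suppress this one" bits into a single unsigned long long, and the host-side loop at the end walks those masks in descending-score order. A rough NumPy re-rendering of that host loop, written only to illustrate the logic (it is not code from mmdetection), could look like:

import numpy as np

def gather_keep(mask_host, boxes_num, threads_per_block=64):
    # mask_host: (boxes_num, col_blocks) array of uint64 bitmasks produced by
    # the kernel, with rows/columns ordered by descending score.
    col_blocks = (boxes_num + threads_per_block - 1) // threads_per_block
    remv = np.zeros(col_blocks, dtype=np.uint64)  # bits of boxes already suppressed
    keep = []
    for i in range(boxes_num):
        nblock, inblock = divmod(i, threads_per_block)
        if not (int(remv[nblock]) >> inblock) & 1:     # box i was not suppressed
            keep.append(i)
            remv[nblock:] |= mask_host[i, nblock:]     # suppress lower-scoring boxes it overlaps
    return keep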
nms_cuda.cpp

#include <torch/extension.h>

#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x, " must be a CUDAtensor ")

at::Tensor nms_cuda_forward(const at::Tensor boxes, float nms_overlap_thresh);

at::Tensor nms_cuda(const at::Tensor& dets, const float threshold) {
  CHECK_CUDA(dets);
  if (dets.numel() == 0)
    return at::empty({0}, dets.options().dtype(at::kLong).device(at::kCPU));
  return nms_cuda_forward(dets, threshold);
}
nms_ext.cpp

#include <torch/extension.h>

#ifdef WITH_CUDA
at::Tensor nms_cuda(const at::Tensor& dets, const float threshold);
#endif

at::Tensor nms(const at::Tensor& dets, const float threshold){
  if (dets.device().is_cuda()) {
#ifdef WITH_CUDA
    return nms_cuda(dets, threshold);
#else
    AT_ERROR("nms is not compiled with GPU support");
#endif
  }
  // CPU path trimmed from this listing; the full file falls back to
  // return nms_cpu(dets, threshold);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("nms", &nms, "non-maximum suppression");
}
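As an aside, if you just want to experiment without writing a setup.py at all, PyTorch can JIT-compile the same sources at import time via torch.utils.cpp_extension.load; the paths below are placeholders matching the layout above:

from torch.utils.cpp_extension import load

# JIT-compiles the extension on first import; requires nvcc and a writable build cache.
nms_ext = load(
    name='nms_ext',
    sources=['src/nms_ext.cpp', 'src/cuda/nms_cuda.cpp', 'src/cuda/nms_kernel.cu'],
    extra_cflags=['-DWITH_CUDA'],
    extra_cuda_cflags=['-DWITH_CUDA'],
    verbose=True,
)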
setup.py

import os

import torch
from setuptools import find_packages, setup
from torch.utils.cpp_extension import BuildExtension, CppExtension, CUDAExtension

# Helper functions such as write_version_py, get_version, readme and
# parse_requirements are defined elsewhere in the full setup.py.


def make_cuda_ext(name, module, sources, sources_cuda=[]):

    define_macros = []
    extra_compile_args = {'cxx': []}

    if torch.cuda.is_available() or os.getenv('FORCE_CUDA', '0') == '1':
        define_macros += [('WITH_CUDA', None)]
        extension = CUDAExtension
        extra_compile_args['nvcc'] = [
            '-D__CUDA_NO_HALF_OPERATORS__',
            '-D__CUDA_NO_HALF_CONVERSIONS__',
            '-D__CUDA_NO_HALF2_OPERATORS__',
        ]
        sources += sources_cuda
    else:
        print(f'Compiling {name} without CUDA')
        extension = CppExtension
        # raise EnvironmentError('CUDA is required to compile MMDetection!')

    return extension(
        name=f'{module}.{name}',
        sources=[os.path.join(*module.split('.'), p) for p in sources],
        define_macros=define_macros,
        extra_compile_args=extra_compile_args)


if __name__ == '__main__':
    write_version_py()
    setup(
        name='mmdet',
        version=get_version(),
        description='Open MMLab Detection Toolbox and Benchmark',
        long_description=readme(),
        author='OpenMMLab',
        author_email='chenkaidev@gmail.com',
        keywords='computer vision, object detection',
        url='https://github.com/open-mmlab/mmdetection',
        packages=find_packages(exclude=('configs', 'tools', 'demo')),
        package_data={'mmdet.ops': ['*/*.so']},
        classifiers=[
            'Development Status :: 4 - Beta',
            'License :: OSI Approved :: Apache Software License',
            'Operating System :: OS Independent',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: 3.7',
        ],
        license='Apache License 2.0',
        setup_requires=parse_requirements('requirements/build.txt'),
        tests_require=parse_requirements('requirements/tests.txt'),
        install_requires=parse_requirements('requirements/runtime.txt'),
        extras_require={
            'all': parse_requirements('requirements.txt'),
            'tests': parse_requirements('requirements/tests.txt'),
            'build': parse_requirements('requirements/build.txt'),
            'optional': parse_requirements('requirements/optional.txt'),
        },
        ext_modules=[
            make_cuda_ext(
                name='compiling_info',
                module='mmdet.ops.utils',
                sources=['src/compiling_info.cpp']),
            make_cuda_ext(
                name='nms_ext',
                module='mmdet.ops.nms',
                sources=['src/nms_ext.cpp', 'src/cpu/nms_cpu.cpp'],
                sources_cuda=[
                    'src/cuda/nms_cuda.cpp', 'src/cuda/nms_kernel.cu'
                ]),
        ],
        cmdclass={'build_ext': BuildExtension},
        zip_safe=False)
nms_wrapper.py

import numpy as np
import torch

from . import nms_ext


def nms(dets, iou_thr, device_id=None):
    # convert dets (tensor or numpy array) to tensor
    if isinstance(dets, torch.Tensor):
        is_numpy = False
        dets_th = dets
    elif isinstance(dets, np.ndarray):
        is_numpy = True
        device = 'cpu' if device_id is None else f'cuda:{device_id}'
        dets_th = torch.from_numpy(dets).to(device)
    else:
        raise TypeError('dets must be either a Tensor or numpy array, '
                        f'but got {type(dets)}')

    # execute cpu or cuda nms
    if dets_th.shape[0] == 0:
        inds = dets_th.new_zeros(0, dtype=torch.long)
    else:
        if dets_th.is_cuda:
            inds = nms_ext.nms(dets_th, iou_thr)
        else:
            # both branches call nms_ext.nms here; the C++ side dispatches
            # to the CPU or CUDA implementation based on the tensor's device
            inds = nms_ext.nms(dets_th, iou_thr)

    if is_numpy:
        inds = inds.cpu().numpy()
    return dets[inds, :], inds
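With the wrapper in place, callers can pass either a NumPy array or a tensor and get back the kept detections plus their indices. A small made-up example (arbitrary boxes and threshold; import path assuming mmdetection's layout):

import numpy as np
from mmdet.ops.nms.nms_wrapper import nms

# Each row is [x1, y1, x2, y2, score]; 0.5 is an arbitrary IoU threshold.
dets = np.array([
    [100., 100., 210., 210., 0.72],
    [105., 105., 215., 215., 0.80],   # heavily overlaps the first box
    [300., 300., 380., 400., 0.92],
], dtype=np.float32)

kept_dets, inds = nms(dets, 0.5, device_id=0)  # device_id=0 runs the CUDA path
print(inds)       # indices of the kept boxes in the original dets
print(kept_dets)  # the corresponding rows of dets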
To write a C++ and CUDA extension for PyTorch, there are really just four main steps:
(1) Write the CUDA kernel and the C++ function that launches it (nms_kernel.cu, nms_cuda.cpp).
(2) Wrap that function and bind it to Python with PYBIND11_MODULE (nms_ext.cpp).
(3) Compile it with setup.py, using CUDAExtension/CppExtension and BuildExtension.
(4) Import the compiled module and, if you like, wrap it in a small Python helper for convenience (nms_wrapper.py).
The function can then be called from PyTorch under whatever name it was bound with.