【視頻處理】YV12ToARGB

時間 2019-11-06

標籤視頻處理 yv12toargb toargb 简体版

原文原文鏈接

前面提到了YV12轉RGB的各類實現方法和優化方法，主要是CPU上的實現。本文主要介紹基於GPU的YV12轉RGB的實現。

1. 基於OpenGL的實現

利用OpenGL shader實現將YV12轉RGB：將Y、U、V分量數據作爲紋理數據，並構造YUV轉RGB的shader代碼，最終紋理數據在shader代碼作用下，實現YV12轉RGB。該方法適合於將YV12轉RGB後直接顯示；若YV12轉化成RGB後，還需要進行圖像處理操作，則利用OpenGL進行紋理數據的圖像處理操作不方便。說明：由於本文着重於基於Cuda的實現，因而未驗證基於OpenGL的代碼實現。

具體資料可參考：

http://blog.csdn.net/xiaoguaihai/article/details/8672631

http://www.fourcc.org/source/YUV420P-OpenGL-GLSLang.c

2. 基於Cuda的實現

YV12轉RGB的過程是逐一獲取像素的Y、U、V分量，然後通過轉換公式計算得到RGB。基於CUDA的實現關鍵在於兩個步驟：Y、U、V分量的獲取，RGB的計算。Y、U、V分量的獲取與YUV的內存佈局有關，RGB的計算公式一般是固定不變的。具體的代碼實現如下所示，主要參考NV12ToARGB.cu的代碼，在該代碼的基礎上，保持RGB的計算方法不變，修改了Y、U、V分量的獲取方法。

// CUDA driver and runtime API declarations.
#include "cuda.h"
#include "cuda_runtime_api.h"

// Width, in bits, of one colour component as handled by the conversion code.
#define COLOR_COMPONENT_BIT_SIZE 10

// Mask selecting a single COLOR_COMPONENT_BIT_SIZE-bit component (2^10 - 1).
#define COLOR_COMPONENT_MASK 0x3FF

// Row-major 3x3 matrix applied to (luma, Cb - 512, Cr - 512) by YUV2RGB:
//   row 0 -> red, row 1 -> green, row 2 -> blue.
// NOTE(review): the coefficients look like BT.601 limited-range scaled for
// 10-bit input -- confirm against the intended colour space.
__constant__ float constHueColorSpaceMat[9] = {
    1.1644f,  0.0f,     1.596f,
    1.1644f, -0.3918f, -0.813f,
    1.1644f,  2.0172f,  0.0f
};

// Converts one 10-bit YCbCr triple to floating-point RGB.
//
//   yuvi  : three consecutive ints {Y, Cb, Cr}; chroma is expected to be
//           centred on 512 (the 10-bit mid-point), which is removed here.
//   red, green, blue : receive the unclamped results; callers clamp
//           (see RGBA_pack_10bit).
//
// The matrix is read from constHueColorSpaceMat (row-major, one row per
// output channel).
__device__ static void YUV2RGB(const int* yuvi, float* red, float* green, float* blue)
{
    // Luma is used as-is; chroma is re-centred around zero.
    const float luma     = (float)yuvi[0];
    const float chromaCb = (float)yuvi[1] - 512.0f;
    const float chromaCr = (float)yuvi[2] - 512.0f;

    // One matrix row per output channel.
    *red   = (luma     * constHueColorSpaceMat[0]) +
             (chromaCb * constHueColorSpaceMat[1]) +
             (chromaCr * constHueColorSpaceMat[2]);

    *green = (luma     * constHueColorSpaceMat[3]) +
             (chromaCb * constHueColorSpaceMat[4]) +
             (chromaCr * constHueColorSpaceMat[5]);

    *blue  = (luma     * constHueColorSpaceMat[6]) +
             (chromaCb * constHueColorSpaceMat[7]) +
             (chromaCr * constHueColorSpaceMat[8]);
}

// Clamps three 10-bit-range colour values and packs them, together with a
// pre-shifted alpha, into one 32-bit pixel laid out as 0xAARRGGBB.
//
//   red, green, blue : channel values; clamped to [0, 1023], truncated to
//                      int and reduced to 8 bits by dropping the 2 low bits.
//   alpha            : alpha already positioned in bits 24..31; it is OR-ed
//                      in unchanged.
//
// Returns the packed pixel.
//
// Declared __host__ __device__ (a superset of the original __device__) so the
// same routine can be exercised from a CPU reference test.
__host__ __device__ static inline int RGBA_pack_10bit(float red, float green, float blue, int alpha)
{
    // Clamp to the 10-bit range using the single-precision min/max so no
    // double-precision promotion happens on the device.
    red   = fminf(fmaxf(red,   0.0f), 1023.0f);
    green = fminf(fmaxf(green, 0.0f), 1023.0f);
    blue  = fminf(fmaxf(blue,  0.0f), 1023.0f);

    // 10-bit -> 8-bit per channel, then place each channel in its byte.
    const int b = (int)blue  >> 2;
    const int g = (int)green >> 2;
    const int r = (int)red   >> 2;

    return b | (g << 8) | (r << 16) | alpha;
}

// Converts a YV12 frame to packed 32-bit ARGB, one 2x2 pixel quad per thread.
//
//   pYV12  : source frame read as a full-resolution Y plane of width*height
//            bytes, followed by the V plane and then the U plane, each
//            (width/2) x (height/2) bytes with a pitch of width/2.
//   pARGB  : destination, width*height 32-bit pixels, row pitch == width.
//   width, height : frame size in pixels.
//
// Launch layout: 2D grid of 2D blocks; each thread covers pixels
// (x, y), (x+1, y), (x, y+1), (x+1, y+1) where x and y are twice the thread's
// global index, so the grid must span ceil(width/2) x ceil(height/2) threads.
// No shared memory is used.
//
// Quads that do not fit entirely inside the frame are skipped, so with an odd
// width or height the last column/row is never written.
// NOTE(review): the chroma pitch of width/2 also assumes an even width.
__global__ void YV12ToARGB_FourPixel(const unsigned char* pYV12, unsigned int* pARGB, int width, int height)
{
    // Top-left pixel of this thread's 2x2 quad.
    const int x = blockIdx.x * (blockDim.x << 1) + (threadIdx.x << 1);
    const int y = blockIdx.y * (blockDim.y << 1) + (threadIdx.y << 1);

    // Skip quads that would cross the right or bottom edge.
    if ((x + 1) >= width || (y + 1) >= height)
        return;

    // Plane offsets: V follows Y, U follows V (a quarter of the Y plane each).
    const unsigned int vOffset     = width * height;
    const unsigned int uOffset     = vOffset + (vOffset >> 2);
    const unsigned int chromaPitch = width >> 1;
    const unsigned int chromaIndex = (y >> 1) * chromaPitch + (x >> 1);

    // 8-bit samples are widened to the 10-bit range YUV2RGB works in.
    const int depthShift = COLOR_COMPONENT_BIT_SIZE - 8;

    // One U/V pair is shared by all four pixels of the quad (4:2:0 sampling).
    const int chromaCb = pYV12[uOffset + chromaIndex] << depthShift; // U
    const int chromaCr = pYV12[vOffset + chromaIndex] << depthShift; // V

    // Fully opaque alpha, built unsigned to avoid overflowing a signed shift.
    const int opaqueAlpha = (int)(0xffu << 24);

#pragma unroll
    for (int dy = 0; dy < 2; ++dy)
    {
#pragma unroll
        for (int dx = 0; dx < 2; ++dx)
        {
            const int pixelIndex = (y + dy) * width + (x + dx);

            // {Y, Cb, Cr} for this pixel, all in 10-bit range.
            const int yuvi[3] = { pYV12[pixelIndex] << depthShift, chromaCb, chromaCr };

            float red, green, blue;
            YUV2RGB(yuvi, &red, &green, &blue);

            pARGB[pixelIndex] = RGBA_pack_10bit(red, green, blue, opaqueAlpha);
        }
    }
}

// Host entry point: converts one YV12 frame in host memory to packed 32-bit
// ARGB in host memory using the YV12ToARGB_FourPixel kernel.
//
//   pYV12  : host buffer of width*height*3/2 bytes (Y plane, then V, then U).
//   pARGB  : host buffer of width*height*4 bytes that receives the result.
//   width, height : frame size in pixels; both must be positive and even,
//                   because the kernel works on whole 2x2 quads and uses a
//                   chroma pitch of width/2.
//
// Returns true on success. Returns false, leaving pARGB unspecified, when an
// argument is invalid or any CUDA call fails. Device buffers are allocated
// and released on every call.
bool YV12ToARGB(unsigned char* pYV12, unsigned char* pARGB, int width, int height)
{
    if (pYV12 == NULL || pARGB == NULL)
        return false;
    if (width <= 0 || height <= 0 || (width & 1) != 0 || (height & 1) != 0)
        return false;

    // Size arithmetic in size_t so large frames cannot overflow 32 bits.
    const size_t pixelCount = (size_t)width * (size_t)height;
    const size_t srcMemSize = pixelCount * 3 / 2;
    const size_t dstMemSize = pixelCount * 4;

    unsigned char* d_src = NULL;
    unsigned char* d_dst = NULL;

    // Short-circuit evaluation stops at the first failing call.
    bool ok = cudaMalloc((void**)&d_src, srcMemSize) == cudaSuccess
           && cudaMalloc((void**)&d_dst, dstMemSize) == cudaSuccess
           && cudaMemcpy(d_src, pYV12, srcMemSize, cudaMemcpyHostToDevice) == cudaSuccess;

    if (ok)
    {
        // Each thread handles a 2x2 quad, so a block covers
        // (2*block.x) x (2*block.y) pixels; round the grid up.
        dim3 block(32, 8);
        int gridx = (width  + 2 * block.x - 1) / (2 * block.x);
        int gridy = (height + 2 * block.y - 1) / (2 * block.y);
        dim3 grid(gridx, gridy);

        YV12ToARGB_FourPixel<<<grid, block>>>(d_src, (unsigned int*)d_dst, width, height);

        // cudaGetLastError reports launch failures; the blocking copy that
        // follows waits for the kernel and reports execution failures.
        ok = cudaGetLastError() == cudaSuccess
          && cudaMemcpy(pARGB, d_dst, dstMemSize, cudaMemcpyDeviceToHost) == cudaSuccess;
    }

    // cudaFree accepts a null pointer, so this is safe on every path.
    cudaFree(d_src);
    cudaFree(d_dst);

    return ok;
}

　　線程內存訪問示意圖如下所示，每個線程訪問4個Y、1個U、1個V，最終轉換得到4個ARGB值。由於YV12屬於YUV4:2:0採樣，每四個Y共用一組UV分量，即Y(0,0)、Y(0,1)、Y(1,0)、Y(1,1)共用V(0,0)和U(0,0)，如紅色框標註所示。

3. 基於Cuda的實現優化

優化主要關注於兩個方面：單個線程處理像素粒度和數據傳輸。單個線程處理粒度分爲：OnePixelPerThread、TwoPixelPerThread、FourPixelPerThread。數據傳輸優化主要採用Pageable Memory、Pinned Memory、Mapped Memory(Zero Copy)。經測試，不同實現版本的轉換效率如下表所示，測試序列：1920*1080，時間統計包括內核函數執行時間和數據傳輸時間，單位爲ms。

	OnePixel	TwoPixel	FourPixel
Pageable	6.91691	6.64319	6.2873
Pinned	5.31999	5.01890	4.71937
Mapped	3.39043	48.5298	23.8327

由上表可知，不使用Mapped Memory(Zero Copy)時，單個線程處理像素的粒度越大，內核函數執行的時間越小，轉換效率越好。使用Mapped Memory(Zero Copy)時，單線程處理單像素時，轉換效率最好。

單個線程處理四個像素時，內核函數執行時間最少；使用Pinned Memory會減少數據傳輸時間；使用Mapped Memory消除數據傳輸過程，但會增加內核函數執行時間，最終優化效果與內核函數訪問內存的方式有關。建議使用Pinned Memory+FourPixelPerThread的優化版本。

　　利用NVIDIA提供的性能分析工具，分析Pinned Memory+FourPixelPerThread版本程序，分析結果如下圖所示，內核計算時間佔1/4左右，數據傳輸時間佔3/4左右。總體而言，內核計算任務過少，導致並行優化的效果無法抵消數據傳輸的開銷。

相關標籤/搜索

每日一句

每一个你不满意的现在，都有一个你没有努力的曾经。