cudamatrix/cublas-wrappers.h
This header provides thin wrappers around the cuBLAS interface (simplified function names plus wrappers for a few Kaldi-specific functions).
For example:
cublasSgemm_v2 is wrapped as cublas_gemm
cublas_copy_kaldi_fd and cublas_copy_kaldi_df are wrapped as cublas_copy
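A minimal sketch of that overloading pattern, assuming the cuBLAS v2 API (the argument list below mirrors cublasSgemm_v2/cublasDgemm_v2 and may differ in detail from the actual wrappers in cublas-wrappers.h):

#include <cublas_v2.h>

// Illustration only: a single overloaded name dispatching to the
// precision-specific cuBLAS entry point.
inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa,
                                  cublasOperation_t transb, int m, int n, int k,
                                  float alpha, const float *A, int lda,
                                  const float *B, int ldb, float beta,
                                  float *C, int ldc) {
  return cublasSgemm_v2(handle, transa, transb, m, n, k, &alpha, A, lda,
                        B, ldb, &beta, C, ldc);
}
inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa,
                                  cublasOperation_t transb, int m, int n, int k,
                                  double alpha, const double *A, int lda,
                                  const double *B, int ldb, double beta,
                                  double *C, int ldc) {
  return cublasDgemm_v2(handle, transa, transb, m, n, k, &alpha, A, lda,
                        B, ldb, &beta, C, ldc);
}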
cudamatrix/cu-kernels.{h,cu}
Taking the cuda_add_col_sum_mat function as an example: these files provide thin wrappers around the Kaldi CUDA kernels or cuBLAS calls (overloaded for the different floating-point precisions).
cudamatrix/cu-kernels.h
inline void cuda_add_col_sum_mat(int Gr, int Bl, double* result,
                                 const double* mat, const MatrixDim d,
                                 const double alpha, const double beta) {
  cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
}
inline void cuda_add_col_sum_mat(int Gr, int Bl, float* result,
                                 const float* mat, const MatrixDim d,
                                 const float alpha, const float beta) {
  cudaF_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
}
//...
Kernel definition:
cudamatrix/cu-kernels.cu
// Reduce a matrix 'mat' to a column vector 'result'
template<EnumTransformReduce TransReduceType, typename Real>
__global__
static void _transform_reduce_mat_cols(
    Real *result, const Real *mat, const MatrixDim d,
    const TransReduceOp<TransReduceType, Real> op) {

  __shared__ Real sdata[CU1DBLOCK];
  const int tid = threadIdx.x;
  const int i = blockIdx.x;
  const int row_start = i * d.stride;

  Real tdata = op.InitValue();
  for (int j = tid; j < d.cols; j += CU1DBLOCK) {
    tdata = op.Reduce(tdata, op.Transform(mat[row_start + j]));
  }
  sdata[tid] = tdata;
  __syncthreads();

  // Tree reduce
#pragma unroll
  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
    if (tid < shift)
      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
    __syncthreads();
  }

  // Reduce last warp. Threads implicitly synchronized within a warp.
  if (tid < warpSize) {
    for (int shift = warpSize; shift > 0; shift >>= 1)
      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
  }

  // Output to vector result.
  if (tid == 0) {
    result[i] = op.PostReduce(sdata[0], result[i]);
  }
}

void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat,
                           const MatrixDim d, const double alpha,
                           const double beta) {
  _transform_reduce_mat_cols<<<Gr, Bl>>>(result, mat, d,
                                         TransReduceOp<SUMAB, double>(alpha, beta));
}
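Relating the launch parameters to the kernel: blockIdx.x selects a matrix row and the block-wide reduction assumes exactly CU1DBLOCK threads, so a caller launches one block per row with CU1DBLOCK threads each. A hedged host-side sketch (the wrapper function name and the explicit synchronization are only illustrative; in Kaldi the call sits inside the CuVectorBase/CuMatrixBase methods):

// Illustrative launch configuration for the kernel above:
// one block per row, CU1DBLOCK threads per block.
void add_col_sum_mat_example(double *result, const double *mat,
                             const MatrixDim d, double alpha, double beta) {
  int Gr = d.rows;        // grid size: one CUDA block per matrix row
  int Bl = CU1DBLOCK;     // block size: matches the shared-memory buffer in the kernel
  cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
  cudaDeviceSynchronize();  // wait for the kernel (illustration only)
}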
cudamatrix/cu-vector.h
Similar to matrix/kaldi-vector.h, this header declares several vector classes. The difference is that their operations are implemented on top of CUDA or CBLAS.
class CuVectorBase
Abstract CUDA vector class. It encapsulates the basic operations and memory optimizations and provides only vector operations; it does not handle resizing or construction.
Resizing and constructors are the responsibility of the derived classes CuVector and CuSubVector.
Vector initialization
void SetZero();
Vector information
MatrixIndexT Dim() const { return dim_; }
Vector access and conversion
inline Real* Data() { return data_; }
inline Real operator() (MatrixIndexT i) const
CuSubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l)
Vector copy functions
void CopyFromVec(const CuVectorBase<Real> &v);
Vector operations
void ApplyLog();
void AddVec(const Real alpha, const CuVectorBase<OtherReal> &v, Real beta = 1.0);
// *this += alpha * M [or M^T]
// e.g. linear_params_.AddMat(alpha, other->linear_params_);
//      i.e. linear_params_ += alpha * other->linear_params_
void AddMat(const Real alpha, const MatrixBase<Real> &M,
            MatrixTransposeType transA = kNoTrans);
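A hedged usage sketch of the operations listed above, assuming a CUDA-enabled build (the values are only illustrative):

#include "cudamatrix/cu-vector.h"
#include "matrix/kaldi-vector.h"

// Illustration of the CuVectorBase interface listed above.
void VectorOpsExample() {
  kaldi::Vector<float> host(3);
  host(0) = 1.0; host(1) = 2.0; host(2) = 3.0;

  kaldi::CuVector<float> v(host);      // copy the host vector to the device
  kaldi::CuVector<float> w(v.Dim());   // allocated and zeroed

  w.AddVec(2.0, v);                    // w = 1.0 * w + 2.0 * v
  w.ApplyLog();                        // element-wise natural log
  kaldi::CuSubVector<float> head = w.Range(0, 2);  // view of w(0..1), no copy
  head.SetZero();                      // writes through the view into w
}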
//*this = alpha * diag(M * M^T) + beta * *this
For example, with

  M = ( 1 2 3        M^T = ( 1 4 7
        4 5 6                2 5 8
        7 8 9 )              3 6 9 )

the diagonal of M * M^T is (1^2+2^2+3^2, 4^2+5^2+6^2, 7^2+8^2+9^2) = (14, 77, 194).
template<typename Real>
void CuVectorBase<Real>::AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
                                     MatrixTransposeType trans, Real beta) {
  // *this = alpha * diag(M * M^T) + beta * *this
  MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans);
  this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta);
}
// *this = alpha * diag(M * N^T) + beta * *this
void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
                                       MatrixTransposeType transM,
                                       const CuMatrixBase<Real> &N,
                                       MatrixTransposeType transN, Real beta) {
  // ...
  // data_ = alpha * diag(M.Data() * N.Data()^T) + beta * data_
  cuda_add_diag_mat_mat_MNT(dimGrid, dimBlock, alpha, M.Data(), M.Dim(),
                            N.Data(), N.Stride(), beta, data_);
  // ...
}
The kernel it dispatches to in cu-kernels.cu:
// v = alpha * diag(M * N^T) + beta * v
static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
                                  const MatrixDim dim_M, const Real* N,
                                  const int stride_N, const Real beta,
                                  Real* v)
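As a usage note, AddDiagMat2 is a convenient way to compute per-row squared norms; a hedged sketch (the helper name is illustrative):

#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"

// norms(i) = sum_j M(i, j)^2, i.e. diag(M * M^T)(i).
void RowSquaredNorms(const kaldi::CuMatrixBase<float> &M,
                     kaldi::CuVector<float> *norms) {
  norms->Resize(M.NumRows());                        // zero-initialized
  norms->AddDiagMat2(1.0, M, kaldi::kNoTrans, 0.0);  // *norms = diag(M * M^T)
}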
class CuVector: public CuVectorBase<Real>
This class represents an ordinary CUDA vector and implements resizing and the usual constructors.
Various constructors
explicit CuVector(const CuVector<Real> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
template<typename OtherReal>
explicit CuVector(const CuVectorBase<OtherReal> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
template<typename OtherReal>
explicit CuVector(const VectorBase<OtherReal> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(Vector<Real>(v));
}
Overloaded assignment operators
CuVector<Real> &operator = (const CuVectorBase<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
CuVector<Real> &operator = (const CuVector<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
CuVector<Real> &operator = (const VectorBase<Real> &other) {
Resize(other.Dim());
this->CopyFromVec(other);
return *this;
}
Utils
void Swap(CuVector<Real> *vec);
void Swap(Vector<Real> *vec);
void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
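A hedged sketch of these utilities (the fill values are illustrative):

#include "cudamatrix/cu-vector.h"
#include "matrix/kaldi-vector.h"

// Illustration of assignment, Swap and Resize on CuVector.
void CuVectorUtilsExample() {
  kaldi::Vector<float> host(5);
  host.Set(1.0);

  kaldi::CuVector<float> a;
  a = host;        // operator=(const VectorBase<Real>&): resize + copy to device

  kaldi::CuVector<float> b;
  b.Swap(&a);      // b now holds the data; a gets b's old (empty) contents
  b.Resize(10);    // reallocate; contents are zeroed (kSetZero by default)
}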
class CuSubVector: public CuVectorBase<Real>
This class represents a generalized vector, or vector view, that does not own its data; it can represent a sub-vector of another vector or a row of a matrix. It implements several constructors used for indexing.
Various constructors
CuSubVector(const CuVectorBase<Real> &t, const MatrixIndexT origin,
const MatrixIndexT length) : CuVectorBase<Real>() {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
static_cast<UnsignedMatrixIndexT>(length) <=
static_cast<UnsignedMatrixIndexT>(t.Dim()));
CuVectorBase<Real>::data_ = const_cast<Real*>(t.Data()+origin);
CuVectorBase<Real>::dim_ = length;
}
/// Copy constructor
/// this constructor needed for Range() to work in base class.
CuSubVector(const CuSubVector &other) : CuVectorBase<Real> () {
CuVectorBase<Real>::data_ = other.data_;
CuVectorBase<Real>::dim_ = other.dim_;
}
CuSubVector(const Real* data, MatrixIndexT length) : CuVectorBase<Real> () {
// Yes, we're evading C's restrictions on const here, and yes, it can be used
// to do wrong stuff; unfortunately the workaround would be very difficult.
CuVectorBase<Real>::data_ = const_cast<Real*>(data);
CuVectorBase<Real>::dim_ = length;
}
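A hedged sketch showing that CuSubVector is only a view onto existing storage (it assumes CuVectorBase also provides a Set(Real) fill method, which is not listed above):

#include "cudamatrix/cu-vector.h"

// A CuSubVector shares storage with the vector it was taken from.
void SubVectorViewExample() {
  kaldi::CuVector<float> v(10);             // zero-initialized
  kaldi::CuSubVector<float> head(v, 0, 5);  // view of elements [0, 5)
  head.Set(1.0);                            // writes land in v(0)..v(4)
  // v now contains five 1.0s followed by five 0.0s.
}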
cudamatrix/cu-matrix.h
Similar to matrix/kaldi-matrix.h, this header declares several matrix classes. The difference is that their operations are implemented on top of CUDA or CBLAS. When Kaldi is compiled with CUDA support and a GPU is available (CuDevice::Instantiate().Enabled() == true), computation runs on the CUDA device; otherwise it falls back to the CPU (CBLAS).
class CuMatrixBase
Abstract CUDA matrix class. It encapsulates the basic operations and memory optimizations and provides only matrix operations; it does not handle resizing or construction.
Resizing and constructors are the responsibility of the derived classes CuMatrix and CuSubMatrix.
class CuMatrix
This class represents an ordinary CUDA matrix and implements resizing and the usual constructors.
class CuSubMatrix
This class represents a generalized matrix, or matrix view, that does not own its data; it can represent a sub-matrix of another matrix. It implements several constructors used for indexing.
It inherits from CuMatrixBase and is used to operate on sub-matrices (blocks) of a matrix.
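A hedged sketch of the matrix analogue (it assumes the usual CuMatrixBase::Range(row_offset, num_rows, col_offset, num_cols) accessor and a CuMatrixBase::Add(Real) element-wise add; both names are assumptions here):

#include "cudamatrix/cu-matrix.h"

// A CuSubMatrix is a view into a block of an existing matrix, so operations
// on it update the owning CuMatrix in place.
void SubMatrixExample() {
  kaldi::CuMatrix<float> mat(8, 8);                         // zero-initialized
  kaldi::CuSubMatrix<float> block = mat.Range(0, 4, 0, 4);  // top-left 4x4 block
  block.Add(1.0);                                           // add 1.0 to every element of the block
  // Only the top-left 4x4 block of 'mat' is now 1.0.
}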