cudamatrix/cublas-wrappers.h
This header provides thin wrappers around the cuBLAS interface (simplified function names plus wrappers for a few Kaldi-specific functions).
For example:
cublasSgemm_v2 is wrapped as cublas_gemm
cublas_copy_kaldi_fd and cublas_copy_kaldi_df are wrapped as cublas_copy
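A minimal sketch of that overloading pattern, assuming the cuBLAS v2 API (the argument list below mirrors cublasSgemm_v2/cublasDgemm_v2 and may differ in detail from the actual wrappers in cublas-wrappers.h):

#include <cublas_v2.h>

// Illustration only: a single overloaded name dispatching to the
// precision-specific cuBLAS entry point.
inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa,
                                  cublasOperation_t transb, int m, int n, int k,
                                  float alpha, const float *A, int lda,
                                  const float *B, int ldb, float beta,
                                  float *C, int ldc) {
  return cublasSgemm_v2(handle, transa, transb, m, n, k, &alpha, A, lda,
                        B, ldb, &beta, C, ldc);
}
inline cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa,
                                  cublasOperation_t transb, int m, int n, int k,
                                  double alpha, const double *A, int lda,
                                  const double *B, int ldb, double beta,
                                  double *C, int ldc) {
  return cublasDgemm_v2(handle, transa, transb, m, n, k, &alpha, A, lda,
                        B, ldb, &beta, C, ldc);
}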
cudamatrix/cu-kernels.{h,cu}
Taking the cuda_add_col_sum_mat function as an example: these files provide thin wrappers around the Kaldi CUDA kernels or cuBLAS calls (overloaded for the different floating-point precisions).
cudamatrix/cu-kernels.h
inline void cuda_add_col_sum_mat(int Gr, int Bl, double* result,
                                 const double* mat, const MatrixDim d,
                                 const double alpha, const double beta) {
  cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
}
inline void cuda_add_col_sum_mat(int Gr, int Bl, float* result,
                                 const float* mat, const MatrixDim d,
                                 const float alpha, const float beta) {
  cudaF_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
}
//...
Kernel definition:
cudamatrix/cu-kernels.cu
// Reduce a matrix 'mat' to a column vector 'result'
template<EnumTransformReduce TransReduceType, typename Real>
__global__
static void _transform_reduce_mat_cols(
    Real *result, const Real *mat, const MatrixDim d,
    const TransReduceOp<TransReduceType, Real> op) {

  __shared__ Real sdata[CU1DBLOCK];
  const int tid = threadIdx.x;
  const int i = blockIdx.x;
  const int row_start = i * d.stride;

  Real tdata = op.InitValue();
  for (int j = tid; j < d.cols; j += CU1DBLOCK) {
    tdata = op.Reduce(tdata, op.Transform(mat[row_start + j]));
  }
  sdata[tid] = tdata;
  __syncthreads();

  // Tree reduce
#pragma unroll
  for (int shift = CU1DBLOCK / 2; shift > warpSize; shift >>= 1) {
    if (tid < shift)
      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
    __syncthreads();
  }

  // Reduce last warp. Threads implicitly synchronized within a warp.
  if (tid < warpSize) {
    for (int shift = warpSize; shift > 0; shift >>= 1)
      sdata[tid] = op.Reduce(sdata[tid], sdata[tid + shift]);
  }

  // Output to vector result.
  if (tid == 0) {
    result[i] = op.PostReduce(sdata[0], result[i]);
  }
}

void cudaD_add_col_sum_mat(int Gr, int Bl, double* result, const double* mat,
                           const MatrixDim d, const double alpha,
                           const double beta) {
  _transform_reduce_mat_cols<<<Gr, Bl>>>(result, mat, d,
                                         TransReduceOp<SUMAB, double>(alpha, beta));
}
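Relating the launch parameters to the kernel: blockIdx.x selects a matrix row and the block-wide reduction assumes exactly CU1DBLOCK threads, so a caller launches one block per row with CU1DBLOCK threads each. A hedged host-side sketch (the wrapper function name and the explicit synchronization are only illustrative; in Kaldi the call sits inside the CuVectorBase/CuMatrixBase methods):

// Illustrative launch configuration for the kernel above:
// one block per row, CU1DBLOCK threads per block.
void add_col_sum_mat_example(double *result, const double *mat,
                             const MatrixDim d, double alpha, double beta) {
  int Gr = d.rows;        // grid size: one CUDA block per matrix row
  int Bl = CU1DBLOCK;     // block size: matches the shared-memory buffer in the kernel
  cudaD_add_col_sum_mat(Gr, Bl, result, mat, d, alpha, beta);
  cudaDeviceSynchronize();  // wait for the kernel (illustration only)
}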
cudamatrix/cu-vector.h
Similar to matrix/kaldi-vector.h, this header declares several vector classes. The difference is that their operations are implemented on top of CUDA or CBLAS.
class CuVectorBase
Abstract CUDA vector class. It encapsulates the basic operations and memory optimizations and provides only vector operations; it does not handle resizing or construction.
Resizing and constructors are the responsibility of the derived classes CuVector and CuSubVector.
Vector initialization
void SetZero();
Vector information
MatrixIndexT Dim() const { return dim_; }
Vector access and conversion
inline Real* Data() { return data_; }
inline Real operator() (MatrixIndexT i) const
CuSubVector<Real> Range(const MatrixIndexT o, const MatrixIndexT l)
Vector copy functions
void CopyFromVec(const CuVectorBase<Real> &v);
Vector operations
void ApplyLog();
void AddVec(const Real alpha, const CuVectorBase<OtherReal> &v, Real beta = 1.0);
// *this += alpha * M [or M^T]
// e.g. linear_params_.AddMat(alpha, other->linear_params_);
//      i.e. linear_params_ += alpha * other->linear_params_
void AddMat(const Real alpha, const MatrixBase<Real> &M,
            MatrixTransposeType transA = kNoTrans);
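A hedged usage sketch of the operations listed above, assuming a CUDA-enabled build (the values are only illustrative):

#include "cudamatrix/cu-vector.h"
#include "matrix/kaldi-vector.h"

// Illustration of the CuVectorBase interface listed above.
void VectorOpsExample() {
  kaldi::Vector<float> host(3);
  host(0) = 1.0; host(1) = 2.0; host(2) = 3.0;

  kaldi::CuVector<float> v(host);      // copy the host vector to the device
  kaldi::CuVector<float> w(v.Dim());   // allocated and zeroed

  w.AddVec(2.0, v);                    // w = 1.0 * w + 2.0 * v
  w.ApplyLog();                        // element-wise natural log
  kaldi::CuSubVector<float> head = w.Range(0, 2);  // view of w(0..1), no copy
  head.SetZero();                      // writes through the view into w
}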
//*this = alpha * diag(M * M^T) + beta * *this
For example, with

  M = ( 1 2 3        M^T = ( 1 4 7
        4 5 6                2 5 8
        7 8 9 )              3 6 9 )

the diagonal of M * M^T is (1^2+2^2+3^2, 4^2+5^2+6^2, 7^2+8^2+9^2) = (14, 77, 194).
template<typename Real>
void CuVectorBase<Real>::AddDiagMat2(Real alpha, const CuMatrixBase<Real> &M,
                                     MatrixTransposeType trans, Real beta) {
  // *this = alpha * diag(M * M^T) + beta * *this
  MatrixTransposeType other_trans = (trans == kTrans ? kNoTrans : kTrans);
  this->AddDiagMatMat(alpha, M, trans, M, other_trans, beta);
}
// *this = alpha * diag(M * N^T) + beta * *this
void CuVectorBase<Real>::AddDiagMatMat(Real alpha, const CuMatrixBase<Real> &M,
                                       MatrixTransposeType transM,
                                       const CuMatrixBase<Real> &N,
                                       MatrixTransposeType transN, Real beta) {
  // ...
  // data_ = alpha * diag(M.Data() * N.Data()^T) + beta * data_
  cuda_add_diag_mat_mat_MNT(dimGrid, dimBlock, alpha, M.Data(), M.Dim(),
                            N.Data(), N.Stride(), beta, data_);
  // ...
}
The kernel it dispatches to in cu-kernels.cu:
// v = alpha * diag(M * N^T) + beta * v
static void _add_diag_mat_mat_MNT(const Real alpha, const Real* M,
                                  const MatrixDim dim_M, const Real* N,
                                  const int stride_N, const Real beta,
                                  Real* v)
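As a usage note, AddDiagMat2 is a convenient way to compute per-row squared norms; a hedged sketch (the helper name is illustrative):

#include "cudamatrix/cu-matrix.h"
#include "cudamatrix/cu-vector.h"

// norms(i) = sum_j M(i, j)^2, i.e. diag(M * M^T)(i).
void RowSquaredNorms(const kaldi::CuMatrixBase<float> &M,
                     kaldi::CuVector<float> *norms) {
  norms->Resize(M.NumRows());                        // zero-initialized
  norms->AddDiagMat2(1.0, M, kaldi::kNoTrans, 0.0);  // *norms = diag(M * M^T)
}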
class CuVector: public CuVectorBase<Real>
This class represents an ordinary CUDA vector and implements resizing and the usual constructors.
Various constructors
explicit CuVector(const CuVector<Real> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
template<typename OtherReal>
explicit CuVector(const CuVectorBase<OtherReal> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(v);
}
template<typename OtherReal>
explicit CuVector(const VectorBase<OtherReal> &v) : CuVectorBase<Real>() {
Resize(v.Dim(), kUndefined);
this->CopyFromVec(Vector<Real>(v));
}
Overloaded assignment operators
CuVector<Real> &operator = (const CuVectorBase<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
CuVector<Real> &operator = (const CuVector<Real> &other) {
Resize(other.Dim(), kUndefined);
this->CopyFromVec(other);
return *this;
}
CuVector<Real> &operator = (const VectorBase<Real> &other) {
Resize(other.Dim());
this->CopyFromVec(other);
return *this;
}
Utils
void Swap(CuVector<Real> *vec);
void Swap(Vector<Real> *vec);
void Resize(MatrixIndexT length, MatrixResizeType resize_type = kSetZero);
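A hedged sketch of these utilities (the fill values are illustrative):

#include "cudamatrix/cu-vector.h"
#include "matrix/kaldi-vector.h"

// Illustration of assignment, Swap and Resize on CuVector.
void CuVectorUtilsExample() {
  kaldi::Vector<float> host(5);
  host.Set(1.0);

  kaldi::CuVector<float> a;
  a = host;        // operator=(const VectorBase<Real>&): resize + copy to device

  kaldi::CuVector<float> b;
  b.Swap(&a);      // b now holds the data; a gets b's old (empty) contents
  b.Resize(10);    // reallocate; contents are zeroed (kSetZero by default)
}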
class CuSubVector: public CuVectorBase<Real>
This class represents a generalized vector, or vector view, that does not own its data; it can represent a sub-vector of another vector or a row of a matrix. It implements several constructors used for indexing.
Various constructors
CuSubVector(const CuVectorBase<Real> &t, const MatrixIndexT origin,
const MatrixIndexT length) : CuVectorBase<Real>() {
KALDI_ASSERT(static_cast<UnsignedMatrixIndexT>(origin)+
static_cast<UnsignedMatrixIndexT>(length) <=
static_cast<UnsignedMatrixIndexT>(t.Dim()));
CuVectorBase<Real>::data_ = const_cast<Real*>(t.Data()+origin);
CuVectorBase<Real>::dim_ = length;
}
/// Copy constructor
/// this constructor needed for Range() to work in base class.
CuSubVector(const CuSubVector &other) : CuVectorBase<Real> () {
CuVectorBase<Real>::data_ = other.data_;
CuVectorBase<Real>::dim_ = other.dim_;
}
CuSubVector(const Real* data, MatrixIndexT length) : CuVectorBase<Real> () {
// Yes, we're evading C's restrictions on const here, and yes, it can be used
// to do wrong stuff; unfortunately the workaround would be very difficult.
CuVectorBase<Real>::data_ = const_cast<Real*>(data);
CuVectorBase<Real>::dim_ = length;
}
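A hedged sketch showing that CuSubVector is only a view onto existing storage (it assumes CuVectorBase also provides a Set(Real) fill method, which is not listed above):

#include "cudamatrix/cu-vector.h"

// A CuSubVector shares storage with the vector it was taken from.
void SubVectorViewExample() {
  kaldi::CuVector<float> v(10);             // zero-initialized
  kaldi::CuSubVector<float> head(v, 0, 5);  // view of elements [0, 5)
  head.Set(1.0);                            // writes land in v(0)..v(4)
  // v now contains five 1.0s followed by five 0.0s.
}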
cudamatrix/cu-matrix.h
Similar to matrix/kaldi-matrix.h, this header declares several matrix classes. The difference is that their operations are implemented on top of CUDA or CBLAS. When Kaldi is compiled with CUDA support and a GPU is available (CuDevice::Instantiate().Enabled() == true), computation runs on the CUDA device; otherwise it falls back to the CPU (CBLAS).
class CuMatrixBase
Abstract CUDA matrix class. It encapsulates the basic operations and memory optimizations and provides only matrix operations; it does not handle resizing or construction.
Resizing and constructors are the responsibility of the derived classes CuMatrix and CuSubMatrix.
class CuMatrix
This class represents an ordinary CUDA matrix and implements resizing and the usual constructors.
class CuSubMatrix
This class represents a generalized matrix, or matrix view, that does not own its data; it can represent a sub-matrix of another matrix. It implements several constructors used for indexing.
It inherits from CuMatrixBase and is used to operate on sub-matrices (blocks) of a matrix.
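A hedged sketch of the matrix analogue (it assumes the usual CuMatrixBase::Range(row_offset, num_rows, col_offset, num_cols) accessor and a CuMatrixBase::Add(Real) element-wise add; both names are assumptions here):

#include "cudamatrix/cu-matrix.h"

// A CuSubMatrix is a view into a block of an existing matrix, so operations
// on it update the owning CuMatrix in place.
void SubMatrixExample() {
  kaldi::CuMatrix<float> mat(8, 8);                         // zero-initialized
  kaldi::CuSubMatrix<float> block = mat.Range(0, 4, 0, 4);  // top-left 4x4 block
  block.Add(1.0);                                           // add 1.0 to every element of the block
  // Only the top-left 4x4 block of 'mat' is now 1.0.
}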