2015-10-13 167 views
1

我想在mex文件中使用cublasSgemmBatched從matlab中乘以多個矩陣。錯誤在mex中使用cublasSgemmBatched

我MATLAB代碼非常簡單:

gpuDevice(1); 
a = single(rand(400,10,1500,'gpuArray')); 
b = single(rand(10,12,1500,'gpuArray')); 
c = MatCuda(a,b) 

我得到以下錯誤:使用gpuArray /的subsref 意外錯誤

錯誤CUDA執行過程中發生。 CUDA的錯誤是: 未知錯誤

和這裏的mexFunction代碼:

void mexFunction(int nlhs, mxArray *plhs[], 
       int nrhs, const mxArray *prhs[]){ 

char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput"; 
char const * const errMsg = "Invalid input to MEX file."; 

/* Declare all variables.*/ 
mxGPUArray const *A; 
mxGPUArray const *B; 
mxGPUArray *C; 

/* Initialize the MathWorks GPU API. */ 
mxInitGPU(); 

/* Throw an error if the input is not a GPU array. */ 
if ((nrhs != 2) || !(mxIsGPUArray(prhs[0])) || !(mxIsGPUArray(prhs[1]))) { 
    mexErrMsgIdAndTxt(errId, errMsg); 
} 

A = mxGPUCreateFromMxArray(prhs[0]); 
B = mxGPUCreateFromMxArray(prhs[1]); 

if ((mxGPUGetClassID(A) != mxSINGLE_CLASS) || (mxGPUGetClassID(B) != mxSINGLE_CLASS)) { 
    mexErrMsgIdAndTxt(errId, errMsg); 
} 

float const *d_A; 
float const *d_B; 
d_A = (float const *)(mxGPUGetDataReadOnly(A)); 
d_B = (float const *)(mxGPUGetDataReadOnly(B)); 

const mwSize *dimsA = mxGPUGetDimensions(A); 
size_t nrowsA = dimsA[0]; 
size_t ncolsA = dimsA[1]; 
size_t nMatricesA = dimsA[2]; 
mxFree((void*) dimsA); 

const mwSize *dimsB = mxGPUGetDimensions(B); 
size_t nrowsB = dimsB[0]; 
size_t ncolsB = dimsB[1]; 
size_t nMatricesB = dimsB[2]; 
mxFree((void*)dimsB); 

size_t nrowsC = nrowsA; 
size_t ncolsC = ncolsB; 

mwSize dimsC[3] = { nrowsA, ncolsB, nMatricesB }; 
C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A), 
    dimsC, 
    mxGPUGetClassID(A), 
    mxGPUGetComplexity(A), 
    MX_GPU_DO_NOT_INITIALIZE); 

float *d_C; 
d_C = (float *)(mxGPUGetData(C)); 

cublasHandle_t handle; 
cublasStatus_t ret; 
ret = cublasCreate(&handle); 
if (ret != CUBLAS_STATUS_SUCCESS) 
{ 
    printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); 
    exit(EXIT_FAILURE); 
} 
const float alpha = 1.0f; 
const float beta = 0.0f; 
ret = cublasSgemmBatched(handle, CUBLAS_OP_N, CUBLAS_OP_N, nrowsA, ncolsB, ncolsA, &alpha, &d_A, nrowsA, &d_B, nrowsB, &beta, &d_C, nrowsC, nMatricesA); 

if (ret != CUBLAS_STATUS_SUCCESS) 
{ 
    printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); 
    exit(EXIT_FAILURE); 
} 

ret = cublasDestroy(handle); 
if (ret != CUBLAS_STATUS_SUCCESS) 
{ 
    printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); 
    exit(EXIT_FAILURE); 
} 

plhs[0] = mxGPUCreateMxArrayOnGPU(C); 
mxGPUDestroyGPUArray(A); 
mxGPUDestroyGPUArray(B); 
mxGPUDestroyGPUArray(C); 
} 

我懷疑這是有關功能cublasSgemmBatched,因爲當我從代碼中刪除它,然後我沒有得到這個錯誤。

幫助將非常感謝! 謝謝!

+1

gemmBatched比大多數cublas函數要複雜得多。您不僅需要複製矩陣數組以進行乘法運算,還必須將指針數組複製到這些矩陣。你可以通過編寫一個正確使用該函數的普通C/C++代碼來測試你的理解,或者看看[this](http://stackoverflow.com/questions/23743384/how-performing-multiple-matrix-multiplications -in-CUDA/23743838#23743838)。 –

+1

您尚未掌握gemmBatched函數的要求,因此您的調用肯定是不正確的。我們不會將具有'd_A','d_B'和'd_C'的參數的(主機)地址傳遞給參數。這些是指針指針參數,必須將它們正確設置爲一組設備指針,然後將其複製到設備。我認爲你的方法完全忽略了這一點。 –

+0

謝謝羅伯特! – nonobrez

回答

1

這裏你不需要一個MEX文件,你可以使用pagefun來做到這一點。另外,我建議直接建造ab,single精度而不是鑄造。換句話說,

a = rand(400,10,1500,'single','gpuArray'); 
b = rand(10,12,1500,'single','gpuArray'); 
c = pagefun(@mtimes, a, b); 
0

cublasDgemm適合我。我只是將常規數組傳遞給mexfunction。這是我的示例代碼。

#include <cuda_runtime.h> 
#include <cublas_v2.h> 
#include <time.h> 
#include "mex.h" 
#include "mxGPUArray.h" 

typedef struct _matrixSize 
{ 
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; 
} sMatrixSize; 

void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size); 

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[]) 
{ 
    mxGPUArray const *A; 
    mxGPUArray const *B; 
    mxGPUArray *C; 
    _matrixSize matrix_size; 
    mwSize const *A_sz; 
    mwSize const *B_sz; 

    double const *d_A; 
    double const *d_B; 
    double *d_C; 

    char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput"; 
    char const * const errMsg = "Invalid input to MEX file."; 

    if (nrhs != 2) { 
     mexErrMsgTxt("Need two inputs"); 
    } 

    A = mxGPUCreateFromMxArray(prhs[0]); 
    B = mxGPUCreateFromMxArray(prhs[1]); 

    A_sz=mxGPUGetDimensions(A); 
    B_sz = mxGPUGetDimensions(B); 

    matrix_size.uiWA = (unsigned int)A_sz[0]; matrix_size.uiHA = (unsigned int)A_sz[1]; 
    matrix_size.uiWB = (unsigned int)B_sz[0]; matrix_size.uiHB = (unsigned int)B_sz[1]; 
    mwSize C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 }; 

    d_A = (double const *)(mxGPUGetDataReadOnly(A)); 
    d_B = (double const *)(mxGPUGetDataReadOnly(B)); 

    C = mxGPUCreateGPUArray(mxGPUGetNumberOfDimensions(A), 
     C_sz, 
     mxGPUGetClassID(A), 
     mxGPUGetComplexity(A), 
     MX_GPU_DO_NOT_INITIALIZE); 

    d_C = (double *)(mxGPUGetData(C)); 

    matrixMultiply(d_A, d_B, d_C, matrix_size); 
    plhs[0] = mxGPUCreateMxArrayOnGPU(C); 

    mxFree((void*)A_sz); 
    mxFree((void*)B_sz); 
    mxGPUDestroyGPUArray(A); 
    mxGPUDestroyGPUArray(B); 
    mxGPUDestroyGPUArray(C); 
} 

void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size) 
{ 
    cublasStatus_t status; 
    cublasHandle_t handle; 
    status=cublasCreate(&handle); 
    if (status != CUBLAS_STATUS_SUCCESS) 
    { 
     if (status == CUBLAS_STATUS_NOT_INITIALIZED) { 
      mexPrintf("CUBLAS initializing error"); 
     } 
     getchar(); 
     return; 
    } 
    const double alpha = 1.0f; 
    const double beta = 0.0f; 
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); 
    cudaThreadSynchronize(); 
    cublasDestroy(handle); 
} 

還有一種更原生的cuda風格。兩者都適合我。

#include <cuda_runtime.h> 
#include <cublas_v2.h> 
#include <time.h> 
#include "mex.h" 
#include "mxGPUArray.h" 

typedef struct _matrixSize 
{ 
    unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; 
} sMatrixSize; 

void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size); 

void mexFunction(int nlhs, mxArray *plhs[], int nrhs, mxArray const *prhs[]) 
{ 
    mxArray const *mA; 
    mxArray const *mB; 

    _matrixSize matrix_size; 
    size_t A_w, A_h, B_w, B_h; 

    double *d_A; 
    double *d_B; 
    double *d_C; 

    double *h_A; 
    double *h_B; 
    double *h_C; 

    char const * const errId = "parallel:gpu:mexGPUExample:InvalidInput"; 
    char const * const errMsg = "Invalid input to MEX file."; 

    if (nrhs != 2) { 
     mexErrMsgTxt("Need two inputs"); 
    } 

    mA = prhs[0]; mB = prhs[1]; 
    A_w = mxGetM(mA);A_h = mxGetN(mA);B_w = mxGetM(mB);B_h = mxGetN(mB); 

    matrix_size.uiWA = (unsigned int)A_w; matrix_size.uiHA = (unsigned int)A_h; 
    matrix_size.uiWB = (unsigned int)B_w; matrix_size.uiHB = (unsigned int)B_h; 
    matrix_size.uiWC = (unsigned int)A_w; matrix_size.uiHC = (unsigned int)B_h; 

    mwSize const C_sz[3] = { matrix_size.uiWA, matrix_size.uiHB, 1 }; 

    unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; 
    unsigned int mem_size_A = sizeof(double) * size_A; 
    h_A = (double*)mxGetData(mA); 

    unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; 
    unsigned int mem_size_B = sizeof(double) * size_B; 
    h_B = (double*)mxGetData(mB); 

    unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; 
    unsigned int mem_size_C = sizeof(double) * size_C; 

    plhs[0] = mxCreateNumericArray(3, C_sz, mxDOUBLE_CLASS, mxREAL); 
    h_C = (double*)mxGetData(plhs[0]); 

    cudaMalloc((void **)&d_A, mem_size_A); 
    cudaMalloc((void **)&d_B, mem_size_B); 
    cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); 
    cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); 
    cudaMalloc((void **)&d_C, mem_size_C); 

    matrixMultiply(d_A, d_B, d_C, matrix_size); 
    cudaMemcpy(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost); 
    cudaThreadSynchronize(); 

    cudaFree(d_A); 
    cudaFree(d_B); 
    cudaFree(d_C);  
} 


void matrixMultiply(double const* d_A, double const* d_B, double* d_C, sMatrixSize &matrix_size) 
{ 
    cublasHandle_t handle; 
    cublasCreate(&handle); 
    const double alpha = 1.0f; 
    const double beta = 0.0f; 
    cublasDgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWB); 
    cublasDestroy(handle); 
}