2015-10-15 133 views
1

CUBLAS庫是否存在cublasDgetrfBatched()的最大批量限制?我正在做一個比較CPU和GPU之間時序的基準問題。對於1000的批量處理,我正在讓GPU時序大於CPU時序。但是,對於100的批量處理,我正在通過CPU獲得一些加速。CUBLAS Library允許的cublasDgetrfBatched()允許的最大批量大小

我在下面發佈了用於基準測試的代碼。

1. main.cpp

/*main.cpp goes below*/ 
#include<stdio.h> 
#include <time.h> 
#include <cuda.h> 
#include <cuda_runtime.h> 
#include <cublas_v2.h> 
#include "mathlib_blas.h" 

int main(){ 

double**mat;       
double**mat_scratch1; 
int *ipvt; 
double *fVec; 
double *fVecSave; 
double *fVec_scratch; 
double *A;      
double *B; 
double **devPtrA; 
double **devPtrB; 
double **devPtrA_dev; 
double **devPtrB_dev; 
double *d_x; 
double *x; 
int *d_pivot_array ; 


int *d_info_array; 
int *h_info_array; 
int batchsize; 
int neqn; 

cublasHandle_t handle; 
cublasStatus_t status; 
cudaError_t error; 

clock_t start, end, start1, end1; 

double rcond; 
batchsize = 32; 
neqn = 172; 

mat = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double)); 
mat_scratch1 = (double**) ArrayAlloc2d((size_t) neqn, (size_t) neqn, sizeof(double)); 


ipvt = (int*) calloc((size_t) neqn, sizeof(int)); 
fVec = (double*) calloc((size_t) neqn, sizeof(double)); 
fVecSave = (double*) calloc((size_t) neqn, sizeof(double)); 
fVec_scratch = (double*) calloc((size_t) neqn, sizeof(double)); 

A = (double*)malloc(neqn*neqn*sizeof(A[0])); 
B = (double*)malloc(neqn*neqn*sizeof(B[0])); 

devPtrA = (double**)malloc(batchsize*sizeof(*devPtrA)); 
devPtrB = (double**)malloc(batchsize*sizeof(*devPtrB)); 

for(int b_count =0; b_count<batchsize; b_count++){ 
cudaMalloc((void **)&devPtrA[b_count], neqn*neqn * sizeof(devPtrA[0][0])); 
cudaMalloc((void **)&devPtrB[b_count], batchsize*neqn * sizeof(devPtrB[0][0])); 
} 

cudaMalloc((void **)&devPtrA_dev, batchsize*sizeof(*devPtrA)); 
cudaMalloc((void **)&devPtrB_dev, batchsize*sizeof(*devPtrB)); 


cudaMemcpy(devPtrA_dev, devPtrA, batchsize*sizeof(*devPtrA), cudaMemcpyHostToDevice); 
cudaMemcpy(devPtrB_dev, devPtrB, batchsize*sizeof(*devPtrB), cudaMemcpyHostToDevice); 


cudaMalloc((void **)&d_x, neqn*sizeof(double)); 
x =(double *)malloc(neqn*sizeof(double)); 

cudaMalloc((void **)&d_pivot_array, batchsize*neqn*sizeof(int)); 

cudaMalloc((void **)&d_info_array, batchsize*sizeof(int)); 

h_info_array =(int*)malloc(batchsize*sizeof(int)); 

cublasCreate(&handle); 

srand(time(NULL)); 



/* Fill in the CPU and GPU Matrix */ 
for (int iRow = 0; iRow < neqn; iRow++) { 
    double sumCol = 0; 
    for (int iColumn = 0; iColumn < neqn; iColumn++) { 
     for(int b_count =0; b_count<batchsize; b_count++){ 
      A[neqn*iColumn + iRow] = rand()%10 ; 
      mat[iRow][iColumn] = A[neqn*iColumn + iRow]; 
     } 
     sumCol +=A[neqn*iColumn + iRow]; 
    } 
    fVec[iRow] = sumCol; 
    fVecSave[iRow] = sumCol; 
} 

/*CPU_CODE GOES HERE */ 

start = clock(); 
for(int b_count =0; b_count<batchsize; b_count++){ 

    for (int iRow = 0; iRow < neqn; iRow++) { 
     for (int iColumn = 0; iColumn < neqn; iColumn++) { 
      mat_scratch1[iColumn][iRow]= mat[iColumn][iRow]; 
     } 
    } 

    dgeco_blas(mat_scratch1, neqn, ipvt, &rcond, fVecSave); 
    } 

     for (int iRow = 0; iRow < neqn; iRow++) { 
     for (int iColumn = 0; iColumn < neqn; iColumn++) { 
      mat[iColumn][iRow]= mat_scratch1[iColumn][iRow]; 
     } 
    } 

    for(int b_count =0; b_count<batchsize; b_count++){ 
    for(int i = 0; i < neqn; i++) fVec_scratch[i] = fVec[i];  
    dgesl_blas(mat, neqn, ipvt , fVec_scratch, 0); 
    } 

end = clock(); 
float seconds = (float)(end - start)/CLOCKS_PER_SEC; 

printf("Time in seconds(CPU) : %lf \n", seconds); 
/*CPU_CODE ENDS HERE */ 


start1 = clock(); 

for(int b_count =0; b_count<batchsize; b_count++){ 
    status = cublasSetMatrix(neqn, neqn, sizeof(A[0]), A, neqn, devPtrA[b_count], neqn); 
} 

status = cublasDgetrfBatched(handle, neqn, (double**)devPtrA_dev,neqn,d_pivot_array,d_info_array,batchsize); 
if (status != CUBLAS_STATUS_SUCCESS) fprintf(stderr,"error in dgetrf %i\n",status); 

cudaMemcpy(h_info_array, d_info_array, batchsize*sizeof(int), cudaMemcpyDeviceToHost); 


for(int b_count =0; b_count<batchsize; b_count++){ 
    cudaMemcpy(devPtrB[b_count], fVec, neqn*sizeof(double),cudaMemcpyHostToDevice);  /* for testing purpose only */ 
    } 

status = cublasDgetrsBatched(handle, CUBLAS_OP_N, neqn, batchsize, (const double**)devPtrA_dev, 
        neqn, d_pivot_array,devPtrB_dev, neqn, h_info_array, batchsize);                 

for(int b_count =0; b_count<batchsize; b_count++){ 
    cudaMemcpy(fVec,devPtrB[b_count], neqn*sizeof(double),cudaMemcpyDeviceToHost);  /* for testing purpose only */ 
} 


end1 = clock(); 
float seconds1 = (float)(end1 - start1)/CLOCKS_PER_SEC; 

printf("Time in seconds(GPU) : %lf \n", seconds1); 

printf("Speedup(CPU/GPU) : %lf \n", seconds/seconds1); 

system("pause"); 
/* End of the main portion of the code */ 

free(mat); 
free(mat_scratch1); 
free(ipvt); 
free(fVec); 
free(fVecSave); 
free(fVec_scratch); 
free(A); 
free(B); 
cudaFree(devPtrA[0]); 
cudaFree(devPtrB[0]); 
cudaFree(devPtrA_dev); 
cudaFree(devPtrB_dev); 
free(devPtrA); 
free(devPtrB); 
cudaFree(d_x); 
free(x); 
cudaFree(d_pivot_array); 
cudaFree(d_info_array); 
free(h_info_array); 
cublasDestroy_v2(handle); 


} 

2. mathlib_blas.h

#include <stdio.h> 
#include <math.h> 

#define maxm(a,b) (((a) > (b)) ? (a) : (b)) 
#define minm(a,b) (((a) < (b)) ? (a) : (b)) 
#define signum(a,b) (((b) < (0)) ? (-a) : (a)) 

void **ArrayAlloc2d(const int size1, const int size2, const size_t sizeType); 
void dgefa_blas(double **a,int n, int ipvt[],int *info); 
void dgesl_blas(double **a,int n,int ipvt[],double b[],int job); 
void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z); 



void **ArrayAlloc2d(const int size1, const int size2, const size_t sizeType) 
{ 
    void** array = nullptr; 

    array = (void**)calloc(size1, sizeof(void*)); 

    if (array != nullptr) { 
    if (size2 > 0) { 
     void* data = calloc(size1*size2, sizeType); 

     if (data != nullptr) { 
      char* addr = (char*)data; 

      for (int index1 = 0; index1 < size1; index1++) { 
       array[index1] = (void*)addr; 
       addr += sizeType*size2; /* char is always 1 byte */ 
      } 
     } else { 

      free(array); 
      free(data); 
      array = nullptr; 


     } 
    } 
    } else { 

    } 


    return array; 
} 

void dgeco_blas(double **a,int n, int *ipvt, double *rcond,double *z) 
{ 
double anorm,ek,s,sm,t,vecdot,vecsum,wk,wkm,ynorm; 
int i,info,j,k,kb,kp1,l; 

/* Compute 1-norm of a */ 
anorm = 0.0; 
for (j = 0; j < n; j++) { 
    vecsum = 0.0; 
    for (i = 0;i < n; i++) 
    vecsum += fabs(a[i][j]); 
    anorm = maxm(anorm,vecsum); 
} 

/* Factor. */ 
dgefa_blas(a,n,ipvt,&info); 

/* rcond = 1/(norm(a) * (estimate of norm(inverse(a)))). 
* estimate = norm(z)/norm(y), where a*z=y and trans(a)*y=e. 
* trans(a) is the transpose of a. The components of e are 
* chosen to cause maximum local growth in the elements of 
* w, where trans(u)*w=e. The vectors are frequently rescaled 
* to avoid overflow. 
*/ 
ek = 1.0; 
for (j = 0; j < n; j++) 
    z[j] = 0.0; 
    for (k = 0; k < n; k++) { 
    if (z[k] != 0.0) 
    ek = signum(ek,-z[k]); 
    if (fabs(ek-z[k]) > fabs(a[k][k])) { 
    s = fabs(a[k][k])/fabs(ek-z[k]); 
    /* dscal(n,s,z,1) */ 
    for (i = 0; i < n; i++) 
     z[i] *= s; 
    ek *= s; 
    } 
    wk = ek - z[k]; 
    wkm = -ek - z[k]; 
    s = fabs(wk); 
    sm = fabs(wkm); 
    if (a[k][k] != 0.0) { 
    wk /= a[k][k]; 
    wkm /= a[k][k]; 
    } 
    else { 
    wk = 1.0; 
    wkm = 1.0; 
    } 
    kp1 = k + 1; 
    if (kp1 < n) { 
    for (j = kp1; j < n; j++) { 
     sm += (fabs(z[j] + wkm * a[k][j])); 
     z[j] += (wk * a[k][j]); 
     s += fabs(z[j]); 
    } 
    if (s < sm) { 
     t = wkm -wk; 
     wk = wkm; 
     for (j = kp1; j < n; j++) 
      z[j] += (t * a[k][j]); 
    } 
    } 
    z[k] = wk; 
    } 

    /* dasum(n,s,z,1) */ 
vecsum = 0.0; 

for (i = 0;i < n; i++) 
    vecsum += fabs(z[i]); 
s = 1.0/vecsum; 

/* dscal(n,s,z) */ 
for (i = 0; i < n; i++) 
    z[i] *= s; 

/* Solve trans(l)*y= w 
*/ 
for (kb = 0; kb < n; kb++) { 
    k = n - kb - 1; 
    if (k < (n-1)) { 
    /* sdot(n-k,a(k+1,k),1,z(k+1),1) */ 
    vecdot = 0.0; 
    for (i = k+1;i < n; i++) 
     vecdot += (a[i][k] * z[i]); 
    z[k] += vecdot; 
    } 
    if (fabs(z[k]) > 1.0) { 
    s = 1.0/fabs(z[k]); 
    /* dscal(n,s,z) */ 
    for (i = 0; i < n; i++) 
     z[i] *= s; 
    } 
    l = ipvt[k]; 
    t = z[l]; 
    z[l] = z[k]; 
    z[k] = t; 
    } /* endfor kb */ 
    /* dasum(n,z,1) */ 
    vecsum = 0.0; 
    for (i = 0; i < n; i++) 
    vecsum += fabs(z[i]); 
    s = 1.0/vecsum; 
    /* dscal(n,s,z) */ 
    for (i = 0; i < n; i++) 
    z[i] *= s; 

    ynorm = 1.0; 
    /* 
    * Solve l * v = y 
    */ 
    for (k = 0; k < n; k++) { 
    l = ipvt[k]; 
    t = z[l]; 
    z[l] = z[k]; 
    z[k] = t; 
    if (k < (n-1)) { 
    /* daxpy(n-k,t,a[k+1][k],1,z[k+1],1) */ 
    for (i = k+1;i < n; i++) 
     z[i] += (t * a[i][k]); 
    } 
    if (fabs(z[k]) > 1.0) { 
    s = 1.0/fabs(z[k]); 
    /* dscal(n,s,z,1) */ 
    for (i = 0; i < n; i++) 
     z[i] *= s; 
    ynorm *= s; 
    } 
    } 
    /* dasum(n,z,1) */ 
    vecsum = 0.0; 
    for (i = 0; i < n; i++) 
    vecsum += fabs(z[i]); 
    s = 1.0/vecsum; 
    /* dscal(n,s,z,1) */ 
    for (i = 0; i < n; i++) 
    z[i] *= s; 
    ynorm *= s; 

    /* Solve u * z = v */ 
    for (kb = 0; kb < n; kb++) { 
    k = n - kb - 1; 
    if (fabs(z[k]) > fabs(a[k][k])) { 
    s = fabs(a[k][k])/fabs(z[k]); 
    /* dscal(n,s,z,1) */ 
    for (i = 0; i < n; i++) 
     z[i] *= s; 
    ynorm *= s; 
    } 
    if (a[k][k] != 0.0) 
    z[k] /= a[k][k]; 
    if (a[k][k] == 0.0) 
    z[k] = 1.0; 
    t = -z[k]; 
    /* daxpy(k-1,t,a[1][k],1,z[1],1) */ 
    for (i = 0; i < k; i++) 
    z[i] += (t * a[i][k]); 
    } 
    /* Make znorm = 1.0 */ 
    /* dasum(n,z,1) */ 
    vecsum = 0.0; 
    for (i = 0; i < n; i++) 
    vecsum += fabs(z[i]); 
    s = 1.0/vecsum; 

    /* dscal(n,s,z,1) */ 
    for (i = 0; i < n; i++) 
    z[i] *= s; 
    ynorm *= s; 

    if (anorm != 0.0) *rcond = ynorm/anorm; 
    if (anorm == 0.0) *rcond = 0.0; 
    } 



    void dgefa_blas(double **a,int n, int ipvt[],int *info) 
    { 
    double dmax,t; 
    int i,j,k,kp1,l,nm1; 

*info = 0; 
nm1 = n - 1; 
if (n > 0) { 
    for (k = 0; k < nm1; k++) { 
    kp1 = k + 1; 
    /* Find l = pivot index. */ 
    dmax = fabs(a[k][k]); 
    l = k; 
    for (i = k+1; i < n; i++) { 
     if (fabs(a[i][k]) <= dmax) continue; 
     l = i; 
    } 
    ipvt[k] = l; 

    /* Zero pivot implies this column already triangularized. */ 
    if (a[l][k] == 0.0) { 
     *info = k; 
     continue; 
    } 

    /* Interchange if necessary. */ 
    if (l != k) { 
     t = a[l][k]; 
     a[l][k] = a[k][k]; 
     a[k][k] = t; 
    } 

    /* Compute multipliers. */ 
    if (a[k][k] == 0.0) printf("\n!ERROR. Singular matrix.\n"); 
    t = -1.0/a[k][k]; 
    for (i = k+1; i < n; i++) 
     a[i][k] *= t; 

    /* Row elimination with column indexing. */ 
    for (j = kp1; j < n; j++) { 
     t = a[l][j]; 
     if (l != k) { 
      a[l][j] = a[k][j]; 
      a[k][j] = t; 
     } 
     for (i = k+1; i < n; i++) 
      a[i][j] += (t * a[i][k]); 
    } 
    } 
    } 
    ipvt[n-1] = n-1; 
    if (a[n-1][n-1] == 0.0) *info = n-1; 
} 



void dgesl_blas(double **a,int n,int ipvt[],double b[],int job) 
{ 
double t; 
int i,k,kb,l,nm1; 

nm1 = n - 1; 
if (job == 0) { 

    /* job = 0, solve a * x = b. 
    * First solve l * y = b. 
    */ 
    if (n > 0) { 
    for (k = 0; k < nm1; k++) { 
     l = ipvt[k]; 
     t = b[l]; 
     if (l != k) { 
      b[l] = b[k]; 
      b[k] = t; 
     } 
     /* saxpy(n-k,t,a(k+1,k),1,b(k+1),1); */ 
     for (i=k+1;i < n;i++) 
      b[i] += (t * a[i][k]); 
    } 
    } 

    /* Now solve u * x = y. */ 
    for (kb = 0; kb < n; kb++) { 
    k = n - kb-1; 
    b[k] /= a[k][k]; 
    t = -b[k]; 
    /*  saxpy(k-1,t,a(1,k),1,b(1),1); */ 
    for (i = 0; i < k ; i++) 
     b[i] += (t * a[i][k]); 
    } 
    return; 
    } 
    /* job != 0, solve trans(a) * x = b. 
* First solve trans(u) * x = y. 
*/ 
for (k = 0; k < n; k++) { 
    /* t = ddot(k-1,a(1,k),1,b(1),1); */ 
    t = 0; 
    for (i = 0; i < k; i++) 
    t += (a[i][k] * b[i]); 
    b[k] = (b[k] - t)/a[k][k]; 
} 

/* Now solve trans(l) * x = y. */ 

if (n > 0) { 
    for (kb = 0; kb < nm1; kb++) { 
    k = n - 2 - kb; 
    /* b[k] = b[k] + ddot(n-k,a(k+1,k),1,b(k+1),1); */ 
    t = 0; 

    for (i = k+1;i < n; i++) 
     t += (a[i][k] * b[i]); 
    b[k] += t; 
    l = ipvt[k]; 

    if (l != k) { 
     t = b[l]; 
     b[l] = b[k]; 
     b[k] = t; 
    } 
    } 
    } 
} 
+0

「GPU時序比CPU時序較大」是指GPU是慢?無論如何,您應該將您使用的bechmark代碼發佈爲[mcve]。 –

+0

我已經上傳了代碼的GPU版本[這裏](https://www.dropbox.com/s/4yndy95o6u5wqor/main.cpp?dl=0) – sa112

回答

3

不應該有100批次尺寸和1000的批量大小之間的任何行爲上的差異(當然會有性能差別 - 1000的批量大小可能需要更長的時間。)

沒有published limits to the batch size,除了impl icit內存限制。事實上,除非GPU返回不正確的結果,否則沒有理由認爲你已經遇到任何硬性限制。

(如果你想探索一些行爲或性能的問題,這個問題沒有正確地寫入到解決這個問題。)

+0

我已經上傳了GPU版本的代碼[here]( https://www.dropbox.com/s/4yndy95o6u5wqor/main.cpp?dl=0) – sa112

+1

(1)從發佈的代碼,它不清楚什麼是計時,以及如何。 (2)求解器的速度在一定程度上取決於數據,因爲可能使用了部分旋轉。通過對每次運行使用不同的隨機數據,測量可能是不可重複的。 – njuffa

+0

並且沒有與CPU版本 –