0
我想運行一個調用 cublasSgemm 函數的非常簡單的內核(問題:在內核中調用 cuBLAS 函數時,應如何編譯 CUDA 代碼)。我的代碼是:
#include <cstdio>  // printf: the only formatted-output routine callable from device code

// Kernel that multiplies the n x n single-precision matrix dev_mat by itself
// with cublasSgemm and stores the product in dev_cor (alpha = 1, beta = 0, no
// transposes, leading dimension n for every operand).
//
// Parameters:
//   dev_mat - input matrix, used as both GEMM operands.
//   dev_cor - output matrix, overwritten with the product.
//   n       - row/column count and leading dimension passed to cublasSgemm.
//   handle  - cuBLAS handle forwarded unchanged to cublasSgemm.
//
// No thread or block index is consulted, so every launched thread issues the
// same GEMM call.
//
// NOTE(review): calling cuBLAS from device code depended on the separate
// cublas_device library (relocatable device code, e.g.
// `nvcc -arch=sm_35 -rdc=true ... -lcublas_device -lcudadevrt`), which newer
// CUDA toolkits no longer ship — confirm against the toolkit in use.
// NOTE(review): the device-side API presumably expects a handle created in
// device code rather than one created on the host — verify against the caller.
__global__ void cor (float * dev_mat,float * dev_cor,int n,cublasHandle_t handle)
{
    // Float literals keep the scalars in single precision (a bare 1.0 is double).
    const float alpha = 1.0f;
    const float beta = 0.0f;

    cublasStatus_t stat = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, n, n,
                                      &alpha, dev_mat, n, dev_mat, n,
                                      &beta, dev_cor, n);
    if (stat != CUBLAS_STATUS_SUCCESS)
    {
        // std::cout is a host-only object; streaming to it from a __global__
        // function is what triggers "calling a __host__ function ... is not
        // allowed". Device-side printf emits the same message legally.
        printf("error in cublas sgemm \n");
    }
}
// Host driver: fills an m x n host matrix with rand() % 10 values, uploads it
// with cublasSetMatrix, creates a cuBLAS handle, launches cor<<<1,1>>> on the
// device buffers, copies the product back into h_cor and releases everything.
//
// Returns 0 on success, the failing cublasStatus_t value when cublasSetMatrix
// or cublasCreate fails, and 1 when a CUDA runtime call or the kernel fails.
int main()
{
    const int m = 1000, n = 1000;
    const size_t bytes = static_cast<size_t>(m) * n * sizeof(float);

    float *h_mat = new float[m * n];
    float *h_cor = new float[m * n];
    float *dev_mat = 0, *dev_cor = 0;
    cublasHandle_t handle = 0;
    bool handleCreated = false;
    int exitCode = 0;  // single exit path so no early return can skip clean-up

    for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++)
        {
            h_mat[i * n + j] = rand() % 10;
        }

    // Allocation failures were previously ignored and surfaced later as
    // unrelated-looking errors.
    cudaError_t cudaStat = cudaMalloc(&dev_mat, bytes);
    if (cudaStat == cudaSuccess)
        cudaStat = cudaMalloc(&dev_cor, bytes);
    if (cudaStat != cudaSuccess)
    {
        cout << "error in cudaMalloc: " << cudaGetErrorString(cudaStat) << " \n";
        exitCode = 1;
    }

    if (exitCode == 0)
    {
        cublasStatus_t stat = cublasSetMatrix(m, n, sizeof(float), h_mat, m, dev_mat, m);
        if (stat != CUBLAS_STATUS_SUCCESS)
        {
            cout<<"error in cublassetmatrix \n";
            exitCode = stat;
        }
    }

    if (exitCode == 0)
    {
        cublasStatus_t stat = cublasCreate(&handle);
        if (stat != CUBLAS_STATUS_SUCCESS)
        {
            cout<<"error in cublas create handle \n";
            exitCode = stat;
        }
        else
        {
            handleCreated = true;
        }
    }

    if (exitCode == 0)
    {
        cor<<<1,1>>>(dev_mat, dev_cor, n, handle);

        // A launch returns nothing: configuration errors show up through
        // cudaGetLastError(), execution errors at the next synchronising call.
        cudaStat = cudaGetLastError();
        if (cudaStat == cudaSuccess)
            cudaStat = cudaDeviceSynchronize();
        // Retrieve the product; h_cor was allocated but never filled before.
        if (cudaStat == cudaSuccess)
            cudaStat = cudaMemcpy(h_cor, dev_cor, bytes, cudaMemcpyDeviceToHost);
        if (cudaStat != cudaSuccess)
        {
            cout << "error running cor kernel: " << cudaGetErrorString(cudaStat) << " \n";
            exitCode = 1;
        }
    }

    // Clean-up runs on every path. dev_cor and the handle used to leak, and
    // the early returns leaked the host buffers as well.
    if (handleCreated)
        cublasDestroy(handle);
    cudaFree(dev_cor);
    cudaFree(dev_mat);
    delete [] h_cor;
    delete [] h_mat;
    return exitCode;
}
我試着用下面的命令來編譯這段代碼:
nvcc -lcublas cublassegmm_inside_kernel.cu -o cublassegmm_inside_kernel
但我得到了以下錯誤:
calling a __host__ function("std::operator<< <std::char_traits<char> > ") from a __global__ function("cor") is not allowed
我讀了這個連結(This link),但我還是不明白該如何編譯!有人可以向我解釋,或建議一些參考資料嗎?非常感謝。