Low host-to-device transfer rate with NVIDIA Quadro M4000

I am running OpenCL on an NVIDIA Quadro M4000 installed in a PCIe 3.0 x16 slot. The card's documentation says the CPU->GPU transfer rate can reach 15.7 GB/s, but my benchmark reaches only ~2.4 GB/s. I know the effective transfer rate can differ from the theoretical one, but I did not expect the gap to be this large.
Does anyone have experience with CPU->GPU data transfers on Quadro cards?
Thanks.
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <cstring>
#include <string>
#include <cmath>
#include <CL/cl.h>
#include <Windows.h>
using namespace std;
SYSTEMTIME last_call;
cl_platform_id platform_id = NULL;
cl_uint ret_num_platform;
cl_device_id device_id = NULL;
cl_uint ret_num_device;
cl_context context = NULL;
cl_command_queue command_queue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_int err;
// Crude wall-clock profiler: drains the queue, then prints the per-field
// delta since the previous call. Note that GetSystemTime ticks at roughly
// 10-16 ms and the field-wise subtraction can go negative across a
// second/minute boundary.
void _profile(const char* msg){
SYSTEMTIME tmp;
clFinish(command_queue);
GetSystemTime(&tmp);
printf("__Profile --- %s --- : %d : %d : %d\n", msg, (tmp.wMinute - last_call.wMinute),
(tmp.wSecond - last_call.wSecond),
(tmp.wMilliseconds - last_call.wMilliseconds));
last_call = tmp;
}
int main()
{
// Reading Kernel Program
const char *kernel_src_std = "__kernel void copy(__global const float *x, __global float *z){\
const int id = get_global_id(0);\
z[id] = x[id]; \
}";
size_t kernel_src_size = strlen(kernel_src_std);
// Create Input data
int w = 1920;
int h = 1080;
int c = 3;
float* input = (float*)malloc(w * h * c * sizeof(float));
for(int i=0;i<w*h*c;i++)
input[i] = (float)rand()/RAND_MAX;
// getting platform ID
err = clGetPlatformIDs(1, &platform_id, &ret_num_platform);
// Get Device ID
err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_device);
// Create Context
context = clCreateContext(NULL,1,&device_id,NULL,NULL,&err);
// Create Command Queue
command_queue = clCreateCommandQueue(context, device_id, 0, &err);
// Create buffer Object
cl_mem buf_in = clCreateBuffer(context,CL_MEM_READ_ONLY, sizeof(float) * w*h*c,
0, &err);
cl_mem buf_out = clCreateBuffer(context,CL_MEM_WRITE_ONLY, sizeof(float) * w*h*c,
0, &err);
_profile("Start transfer input...");
// Copy Data from Host to Device
err = clEnqueueWriteBuffer(command_queue, buf_in, CL_TRUE, 0, sizeof(float)*w*h*c, input, 0, NULL, NULL);
_profile("End transfer input...");
// Create and Build Program
program = clCreateProgramWithSource(context, 1, &kernel_src_std, 0, &err);
// Create Kernel
kernel = clCreateKernel(program,"copy",&err);
// Set Kernel Arguments
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&buf_in);
err = clSetKernelArg(kernel, 1,sizeof(cl_mem), (void *)&buf_out);
// Execute Kernel
size_t ws[]={h*w*c};
size_t lws[]={100};
err = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, ws, lws, 0, NULL, NULL);
// Create output buf
float* output = (float*)malloc(sizeof(float)*w*h*c);
// Read output Data, from Device to Host
err = clEnqueueReadBuffer(command_queue, buf_out, CL_TRUE, 0, sizeof(float)*w*h*c, output, 0, NULL, NULL);
//Release Objects
clReleaseMemObject(buf_in);
clReleaseMemObject(buf_out);
clReleaseKernel(kernel);
clReleaseProgram(program);
clReleaseCommandQueue(command_queue);
clReleaseContext(context);
free(input);
free(output);
getchar(); // keep the console window open until a key is pressed
return(0);
}
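A side note on the measurement itself: GetSystemTime resolves to roughly 10-16 ms, while the payload here is 1920*1080*3*4 = 24,883,200 bytes (~23.7 MiB), which takes about 2 ms at 12 GB/s and about 10 ms at 2.4 GB/s, so timer granularity alone can distort the result. Below is a minimal sketch of timing the same write with OpenCL profiling events instead; it reuses context, device_id, buf_in, and input from the code above, and error checking is omitted.

// Sketch: measure a host->device copy with OpenCL event profiling.
// The queue must be created with CL_QUEUE_PROFILING_ENABLE.
cl_command_queue prof_queue =
    clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err);

cl_event ev;
size_t bytes = sizeof(float) * 1920 * 1080 * 3;
err = clEnqueueWriteBuffer(prof_queue, buf_in, CL_TRUE, 0, bytes,
                           input, 0, NULL, &ev);

cl_ulong t_start, t_end; // timestamps are in nanoseconds
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START,
                        sizeof(t_start), &t_start, NULL);
clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END,
                        sizeof(t_end), &t_end, NULL);
clReleaseEvent(ev);

double seconds = (t_end - t_start) * 1e-9;
printf("transfer: %.3f ms, %.2f GB/s\n",
       seconds * 1e3, bytes / seconds / 1e9);

Because the write is blocking, the event is already complete when the call returns, so the profiling info can be queried immediately and the effective bandwidth follows directly from bytes divided by the measured time.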
It may depend on how you transfer the data. How do you do it? Is it just one huge array that you transfer to the GPU? Also: did you compare your results against the bandwidth benchmark shipped with the CUDA samples? – CygnusX1
Yes, I am transferring a single array of 6,220,800 floats. Could that be the cause? I could not find the OpenCL samples in my installation; people seem to think NVIDIA no longer maintains them. I use clEnqueueWriteBuffer for the transfer. – lity
@lity The question appears to be about OpenCL but is also tagged [cuda]; I suggest removing that tag to avoid confusion. Make sure the card sits in a PCIe x16 slot, not a PCIe x4 slot. For transfer sizes >= 16 MB, the maximum practical transfer rate on PCIe gen3 x16 is about 11-12 GB/sec. You need "pinned memory" on the host for maximum transfer speed, but I am not sure whether OpenCL supports that. – njuffa
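On the pinned-memory point: OpenCL can typically achieve the equivalent of CUDA's page-locked memory by allocating a buffer with CL_MEM_ALLOC_HOST_PTR, mapping it with clEnqueueMapBuffer, and staging transfers through the mapped pointer. The sketch below shows this pattern; it reuses context, command_queue, buf_in, and input from the question's code, and error checking is omitted.

// Sketch: stage the host->device copy through pinned host memory.
// CL_MEM_ALLOC_HOST_PTR asks the driver to allocate host memory it can
// pin; on NVIDIA this usually yields page-locked memory.
size_t nbytes = sizeof(float) * 1920 * 1080 * 3;
cl_mem pinned = clCreateBuffer(context,
                               CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                               nbytes, NULL, &err);
float *staging = (float *)clEnqueueMapBuffer(command_queue, pinned, CL_TRUE,
                                             CL_MAP_WRITE, 0, nbytes,
                                             0, NULL, NULL, &err);
memcpy(staging, input, nbytes);  // fill the pinned staging area
err = clEnqueueWriteBuffer(command_queue, buf_in, CL_TRUE, 0, nbytes,
                           staging, 0, NULL, NULL);
clEnqueueUnmapMemObject(command_queue, pinned, staging, 0, NULL, NULL);
clReleaseMemObject(pinned);

The extra memcpy into the staging buffer costs something, but a write issued from pinned memory avoids the driver's internal staging copy and usually lands much closer to the 11-12 GB/sec figure quoted above.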