4
標題:OpenCL:奇怪的內核行爲
我一直在反覆嘗試,想讓一段代碼在 OpenCL 中正常工作,卻始終摸不着頭緒。
由於我沒有得到最初的預期結果,我一直在嘗試各種方法來弄清楚哪裏出了問題。所以我想出了下面的代碼,並且在成功執行後,它並沒有產生預期的結果。
此代碼的最初設想是執行指定數量的線程並將線程號複製到數組中。
Threads: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
然而,我實際得到的結果是:
Threads: 0 0 0 3 0 0 0 7 0 0 0 11 0 0 0 15
我得到的結果呈現出一種規律:每 4 個元素中只有最後一個(即索引 n 滿足 (n % 4) == 3 的位置)被寫入了數字。我開始懷疑,是否由於某種原因,數據被當作 int 處理之後又被轉換成了 char。
gcc main.c -o threadsopencl -std=c99 -framework OpenCL
#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif
#include <stdlib.h> // warning: implicit declaration of function ‘malloc’
#include <stdio.h> // error: ‘stderr’ undeclared (first use in this function)
int main(int argc, char **argv)
{
/* Retrieve Platforms */
cl_uint Platforms = 0;
printf("Checking for OpenCL platforms.\n");
if (CL_SUCCESS == clGetPlatformIDs (0, NULL, &Platforms))
{
printf("Found %d platform.\n", Platforms);
if (Platforms > 0)
{
/* Retrieve Platform ID */
printf("Retrieving OpenCL platform details.\n");
cl_platform_id *Platform = malloc((sizeof(cl_platform_id) * Platforms));
clGetPlatformIDs(Platforms, Platform, &Platforms);
/* Retrieve Devices on Platform */
cl_uint GPUs = 0;
printf("Retrieving GPU devices associated with the detected platform.\n");
clGetDeviceIDs(Platform[0], CL_DEVICE_TYPE_GPU, 0, NULL, &GPUs);
if (GPUs > 0)
{
printf("Found %d GPU device(s).\n", GPUs);
cl_device_id *GPU = malloc((sizeof(cl_device_id) * GPUs));
clGetDeviceIDs(Platform[0], CL_DEVICE_TYPE_GPU, GPUs, GPU, &GPUs);
cl_uint Error;
printf("Creating OpenCL context and associating it with the detected GPU device.\n");
//clCreateContext(NULL, 1, &devices[device_no], &pfn_notify, NULL, &_err)
cl_context GPUcontext = clCreateContext(NULL, 1, &GPU[0], 0, NULL, &Error);
//clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &Error);
if (Error != CL_SUCCESS)
{
printf("Failed to create an OpenCL context!\n");
return 1;
}
const char *program_source[] = {
"__kernel void NumberOfThreads(__global uchar *thread)\n",
"{\n",
"uchar id = convert_uchar(get_global_id(0));\n",
"thread[id] = id;\n",
"}\n"
};
printf("Creating a program for execution on the device.\n");
cl_program AES = clCreateProgramWithSource(GPUcontext, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &Error);
if (Error != CL_SUCCESS)
{
printf("Failed to create a program from source!\n");
return 1;
}
printf("Attempting compilation!\n");
if (clBuildProgram(AES, GPUs, &GPU[0], "", NULL, NULL) != CL_SUCCESS) {
printf("Program compilation failed!\n");
char buffer[10240];
clGetProgramBuildInfo(AES, GPU[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL);
fprintf(stderr, "CL Compilation failed:\n%s", buffer);
exit(2); // abort();
}
/* Since OpenCL compilation failed is due to incomplete code work */
printf("Allocating space for the data to be executed within the context.\n");
cl_mem Threads = clCreateBuffer(GPUcontext, CL_MEM_WRITE_ONLY, 16*sizeof(char), NULL, &Error);
if (Error != CL_SUCCESS)
{
printf("Failed to allocate buffer for State Matrix!\n");
return 1;
}
printf("Creating an OpenCL kernel!\n");
cl_kernel ThreadsKernel = clCreateKernel(AES, "NumberOfThreads", &Error);
clSetKernelArg(ThreadsKernel, 0, sizeof(cl_mem), &Threads);
if (Error != CL_SUCCESS)
{
printf("Failed to create kernel object!\n");
return 1;
}
printf("Setting up an execution queue.\n");
cl_command_queue ExecutionQueue = clCreateCommandQueue(GPUcontext, GPU[0], 0, &Error);
if (Error != CL_SUCCESS)
{
printf("Failed to create command queue!\n");
return 1;
}
printf("Commencing with kernel execution!\n");
cl_event ExecutionComplete;
size_t global_work_size[1] = { 16 };
if (clEnqueueNDRangeKernel(ExecutionQueue, ThreadsKernel, 1, NULL, global_work_size, NULL, 0, NULL, &ExecutionComplete) != CL_SUCCESS)
{
//printf("Failed to execute kernel! Error %d\n", (unsigned int)Error);
switch(Error)
{
case CL_INVALID_PROGRAM_EXECUTABLE:
printf("CL_INVALID_PROGRAM_EXECUTABLE\n");
break;
case CL_INVALID_COMMAND_QUEUE:
printf("CL_INVALID_COMMAND_QUEUE\n");
break;
case CL_INVALID_KERNEL:
printf("CL_INVALID_KERNEL\n");
break;
case CL_INVALID_CONTEXT:
printf("CL_INVALID_CONTEXT\n");
break;
case CL_INVALID_KERNEL_ARGS:
printf("CL_INVALID_KERNEL_ARGS\n");
break;
case CL_INVALID_WORK_DIMENSION:
printf("CL_INVALID_WORK_DIMENSION\n");
break;
case CL_INVALID_GLOBAL_WORK_SIZE:
printf("CL_INVALID_GLOBAL_WORK_SIZE\n");
break;
case CL_INVALID_WORK_GROUP_SIZE:
printf("CL_INVALID_WORK_GROUP_SIZE\n");
break;
case CL_INVALID_WORK_ITEM_SIZE:
printf("CL_INVALID_WORK_ITEM_SIZE\n");
break;
case CL_INVALID_GLOBAL_OFFSET:
printf("CL_INVALID_GLOBAL_OFFSET\n");
break;
case CL_OUT_OF_RESOURCES:
printf("CL_OUT_OF_RESOURCES\n");
break;
case CL_MEM_OBJECT_ALLOCATION_FAILURE:
printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n");
break;
case CL_INVALID_EVENT_WAIT_LIST:
printf("CL_INVALID_EVENT_WAIT_LIST\n");
break;
case CL_OUT_OF_HOST_MEMORY:
printf("CL_OUT_OF_HOST_MEMORY\n");
break;
default:
printf("Failed to execute kernel! %u\n", (unsigned int)Error);
}
return 1;
}
clWaitForEvents(1, &ExecutionComplete);
clReleaseEvent(ExecutionComplete);
printf("ThreadValue:");
for (char Loop = 0; Loop < 16; Loop++)
{
unsigned char ThreadValue = 0;
if (clEnqueueReadBuffer(ExecutionQueue, Threads, CL_TRUE, Loop, 1, &ThreadValue, 0, NULL, NULL) != CL_SUCCESS)
{
printf("Failed to copy data back from Device to Host!\n");
return 1;
}
printf(" %d", ThreadValue);
}
printf("\n");
printf("Freeing memory and exiting!\n");
clReleaseMemObject(Threads);
clReleaseKernel(ThreadsKernel);
clReleaseProgram(AES);
clReleaseContext(GPUcontext);
}
}
}
return 0;
}
看來你在做內核的tid * tid,所以預期的結果應該是0,1,3,9 ......在我的AMD機器上,我得到了這個結果。 – Meluha
get_global_id(0) - >檢索線程號,然後將其轉換爲unsigned char,然後在線程數組中選擇它(最初在內核調用期間通過參數傳遞),然後存儲線程號*乘以自身。 – Nocturnal
已修正爲 (id * id)。但即使這樣,運行代碼後打印的仍是 0 0 0 9 0 0 0 49 0 0 0 121 0 0 0 225,也就是先有 3 個值爲 0 的字節,然後纔是內核函數乘法運算的結果。代碼本應輸出以下結果:0 1 4 9 16 25 36 49 64 81 100 121 144 169 196 225 – Nocturnal