OpenCL，C++：簡單的浮點向量求和程序出現意外結果

這是一個簡單的程序：它從兩個文件中各讀取一組 float4 向量，然後計算對應元素之和。但它的結果出乎意料！

主文件：

#include <limits.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <iostream> 
#include <iomanip> 
#include <array> 
#include <fstream> 
#include <sstream> 
#include <string> 
#include <algorithm> 
#include <iterator> 


#ifdef __APPLE__ 
#include <OpenCL/opencl.h> 
#else 
#include <CL/cl.h> 
#include <time.h> 
#endif 



const int number_of_points = 16; // number of points in Both A and B files (number of rows) 
const int number_of_axis = 4;  // number of points axis in Both A and B files (number of Columns) 


using namespace std; 

// Terminates the program with exit status 1 when an OpenCL call failed.
//   err       - status code returned by the OpenCL call
//   operation - short description of the call, echoed in the diagnostic
// Returns normally only when err equals CL_SUCCESS.
void checkError(cl_int err, const char *operation) 
{
    if (err == CL_SUCCESS)
        return;

    // Report which step failed together with the raw OpenCL status code.
    fprintf(stderr, "Error during operation '%s': %d\n", operation, err);
    exit(1);
}

int main(int argc, char *argv[]) { 
    clock_t tStart = clock(); 
    // Create the two input vectors 
    // working variables 
    int i; 
    ifstream input_fileA, input_fileB; // input files 
    string line; // transfer row from file to array 
    float x;  // transfer word from file to array 
    int row = 0; // number of rows of file A,B (= array) 
    int col = 0; // number of rows of file A,B (= array) 

    // working arrays 

    // working arrays 
// int mem_size_TempA = number_of_points * number_of_axis * sizeof(cl_float); 
// int mem_size_TempB = number_of_points * number_of_axis * sizeof(cl_float); 

    float tempAArray[number_of_points][number_of_axis]={{0}}; // array contains file A data 
    float tempBArray[number_of_points][number_of_axis]={{0}}; // array contains file B data 



    int mem_size_InputA = number_of_points ; 
    int mem_size_InputB = number_of_points ; 
    int mem_size_Output = number_of_points ; 

    float *inputAArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file A data 
    float *inputBArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data 
    float *outputArray = (float*) malloc(number_of_points*sizeof(cl_float4)); // array contains file B data 


    // import input files 
    input_fileA.open(argv[1]); 
    input_fileB.open(argv[2]); 


    // transfer input files data to array 
    // input file A to arrayA 
    row = 0; 
    while (getline(input_fileA, line)) 
    { 

     istringstream streamA(line); 
     col = 0; 
     while(streamA >> x){ 
      tempAArray[row][col] = x; 
      col++; 
     } 
     row++; 
    } 

    // input file B to arrayB 
    row = 0; 
    while (getline(input_fileB, line)) 
    { 

     istringstream streamB(line); 
     col = 0; 
     while(streamB >> x){ 
      tempBArray[row][col] = x; 
      col++; 
     } 
     row++; 
    } 

    // switch columns of B array 
    for(int row_of_arrayB = 0; row_of_arrayB < number_of_points; row_of_arrayB++) 
    { 
     float temporary = tempBArray[row_of_arrayB][2]; 
     tempBArray[row_of_arrayB][2] = tempBArray[row_of_arrayB][1]; 
     tempBArray[row_of_arrayB][1] = temporary; 
    } 

    // from Array to 3d vectors 
// for (int row_of_array = 0; row_of_array<number_of_points; row_of_array++) 
// { 
//  inputAArray[row_of_array] = (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2],0); 
//  inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0); 
// } 

    for (int row_of_array=0; row_of_array < number_of_points; row_of_array++) 
    { 

     inputAArray[row_of_array*4+0] = tempAArray[row_of_array][0]; 
     inputAArray[row_of_array*4+1] = tempAArray[row_of_array][1]; 
     inputAArray[row_of_array*4+2] = tempAArray[row_of_array][2]; 
     inputAArray[row_of_array*4+3] = 0.0f; 

//  inputAArray[row_of_array]= float(4) (tempAArray[row_of_array][0], tempAArray[row_of_array][1], tempAArray[row_of_array][2], 0.0f); 

     inputBArray[row_of_array*4+0] = tempBArray[row_of_array][0]; 
     inputBArray[row_of_array*4+1] = tempBArray[row_of_array][1]; 
     inputBArray[row_of_array*4+2] = tempBArray[row_of_array][2]; 
     inputBArray[row_of_array*4+3] = 0.0f; 

     outputArray[row_of_array*4+0] = 0.0f; 
     outputArray[row_of_array*4+1] = 0.0f; 
     outputArray[row_of_array*4+2] = 0.0f; 
     outputArray[row_of_array*4+3] = 0.0f; 
//  inputBArray[row_of_array] = (tempBArray[row_of_array][0], tempBArray[row_of_array][1], tempBArray[row_of_array][2],0); 

    } 
// for (int row_of_array=0; row_of_array < number_of_points; row_of_array++) 
// { 
//  printf("0: %f, 1: %f, 2: %f, 3: %f \n", inputAArray[row_of_array*number_of_points+0], inputAArray[row_of_array*number_of_points+1], 
//    inputAArray[row_of_array*number_of_points+2], inputAArray[row_of_array*number_of_points+3]); 
// } 
    // close input files 
    input_fileA.close(); 
    input_fileB.close(); 




    // Load the kernel source code into the array source_str 
    FILE *fp; 
    char *source_str; 
    size_t source_size; 

    fp = fopen("calculate_bottom_SNM_kernel.cl", "r"); 
    if (!fp) { 
     fprintf(stderr, "Failed to load kernel.\n"); 
     exit(1); 
    } 

    fseek(fp, 0, SEEK_END); 
    size_t programLength = ftell(fp); 
    rewind(fp); 

    source_str = (char*)malloc(programLength+1); 
    source_size = fread(source_str, 1, programLength, fp); 
    source_str[programLength] = '\0'; 
    fclose(fp); 

    // Get platform and device information 
    cl_platform_id platform_id = NULL; 
    cl_device_id device_id = NULL; 
    cl_uint ret_num_devices; 
    cl_uint ret_num_platforms; 
    cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); 
    ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 1, 
      &device_id, &ret_num_devices); 

    // Create an OpenCL context 
    cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); 

    // Create a command queue 
    cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); 

    // Create memory buffers on the device for each vector 
    cl_mem inputa_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
      mem_size_InputA*sizeof(cl_float4) , NULL, &ret); 
    cl_mem inputb_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, 
      mem_size_InputB*sizeof(cl_float4), NULL, &ret); 

    cl_mem output_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 
      mem_size_Output*sizeof(cl_float4), NULL, &ret); 


    // Copy the lists A and B to their respective memory buffers 
    ret = clEnqueueWriteBuffer(command_queue, inputa_mem_obj, CL_TRUE, 0, 
      mem_size_InputA*sizeof(cl_float4), inputAArray, 0, NULL, NULL); 
    ret = clEnqueueWriteBuffer(command_queue, inputb_mem_obj, CL_TRUE, 0, 
      mem_size_InputB*sizeof(cl_float4), inputBArray, 0, NULL, NULL); 


    // Create a program from the kernel source 
    cl_program program = clCreateProgramWithSource(context, 1, 
      (const char **)&source_str, (const size_t *)&source_size, &ret); 

    // Build the program 

    ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); 
    if (ret == CL_BUILD_PROGRAM_FAILURE) 
     { 
     // Get size of build log 
     size_t logSize; 
     ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 
            0, NULL, &logSize); 
     checkError(ret, "getting build log size"); 

     // Get build log 
     char log[logSize]; 
     ret = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 
            logSize, log, NULL); 
     checkError(ret, "getting build log"); 

     printf("OpenCL program build log:\n%s\n", log); 
     exit(1); 
     } 


    // Create the OpenCL kernel 
    cl_kernel kernel = clCreateKernel(program, "calculate_bottom_SNM", &ret); 

    // Set the arguments of the kernel 
    ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputa_mem_obj); 
    ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&inputb_mem_obj); 
    ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&output_mem_obj); 

    // Execute the OpenCL kernel on the list 
    size_t global_item_size = number_of_points; // Process the entire lists 
    size_t local_item_size = 4; // Process in groups of 64 

    ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, 
      &global_item_size, &local_item_size, 0, NULL, NULL); 

    // Read the memory buffer C on the device to the local variable C 
// int *C = (int*)malloc(sizeof(int)*number_of_points); 


// float *C = (float*)malloc(sizeof(float)*number_of_points); 
    ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0, 
      mem_size_Output, outputArray, 0, NULL, NULL); 


    // Display the result to the screen 
// float buttomSNM = 0; 
    for(i = 0; i < number_of_points; i++) 
    { 
      printf("%f + %f = %f, \n",inputAArray[i*4+0],inputBArray[i*4+0], outputArray[i*4+0]); 
    } 

    // Clean up 
    ret = clFlush(command_queue); 
    ret = clFinish(command_queue); 
    ret = clReleaseKernel(kernel); 
    ret = clReleaseProgram(program); 
    ret = clReleaseMemObject(inputa_mem_obj); 
    ret = clReleaseMemObject(inputb_mem_obj); 
    ret = clReleaseMemObject(output_mem_obj); 
    ret = clReleaseCommandQueue(command_queue); 
    ret = clReleaseContext(context); 
    free (inputAArray); 
    free (inputBArray); 
    free (outputArray); 

printf("ALL Time taken: %.2fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC); 
    return 0; 
}

內核：

// Element-wise sum of two float4 arrays: outputArray[i] = inputAArray[i] + inputBArray[i].
// Each work-item handles the single float4 at its global index in dimension 0.
__kernel void calculate_bottom_SNM(__global float4 *inputAArray, __global float4 *inputBArray, 
         __global float4 *outputArray) { 

    // Index of the vector this work-item is responsible for
    const int gid = get_global_id(0); 

    // float4 addition is component-wise, so x, y, z and w are all summed here
    outputArray[gid] = inputAArray[gid] + inputBArray[gid]; 

}

第一個輸入文件 A：

0 0.000000e+00 9.998994e-01  
1 1.000000e-03 9.998981e-01  
2 2.000000e-03 9.998967e-01  
3 3.000000e-03 9.998953e-01  
4 4.000000e-03 9.998939e-01  
5 5.000000e-03 9.998925e-01  
6 6.000000e-03 9.998911e-01  
7 7.000000e-03 9.998896e-01  
8 8.000000e-03 9.998881e-01  
9 9.000000e-03 9.998865e-01  
10 1.000000e-02 9.998850e-01  
11 1.100000e-02 9.998834e-01  
12 1.200000e-02 9.998817e-01  
13 1.300000e-02 9.998800e-01  
14 1.400000e-02 9.998783e-01  
15 1.500000e-02 9.998766e-01

第二個輸入文件 B：

0 0.000000e+00 9.998966e-01  
1 1.000000e-03 9.998953e-01  
2 2.000000e-03 9.998939e-01  
3 3.000000e-03 9.998925e-01  
4 4.000000e-03 9.998911e-01  
5 5.000000e-03 9.998896e-01  
6 6.000000e-03 9.998881e-01  
7 7.000000e-03 9.998866e-01  
8 8.000000e-03 9.998850e-01  
9 9.000000e-03 9.998834e-01  
10 1.000000e-02 9.998818e-01  
11 1.100000e-02 9.998801e-01  
12 1.200000e-02 9.998785e-01  
13 1.300000e-02 9.998767e-01  
14 1.400000e-02 9.998750e-01  
15 1.500000e-02 9.998732e-01

輸出應該是上面兩個文件對應元素相加的結果。我只打印了第一列，但其他列的行爲也相同：

輸出：

0.000000 + 0.000000 = 0.000000, 
1.000000 + 1.000000 = 0.000000, 
2.000000 + 2.000000 = 0.000000, 
3.000000 + 3.000000 = 0.000000, 
4.000000 + 4.000000 = 0.000000, 
5.000000 + 5.000000 = 0.000000, 
6.000000 + 6.000000 = 0.000000, 
7.000000 + 7.000000 = 0.000000, 
8.000000 + 8.000000 = 0.000000, 
9.000000 + 9.000000 = 0.000000, 
10.000000 + 10.000000 = 0.000000, 
11.000000 + 11.000000 = 0.000000, 
12.000000 + 12.000000 = 0.000000, 
13.000000 + 13.000000 = 0.000000, 
14.000000 + 14.000000 = 0.000000, 
15.000000 + 15.000000 = 0.000000, 
ALL Time taken: 0.07s

先謝謝了，

來源

2015-04-01 Rami Aqqad

對不起，但在我看來這並不「簡單」。我看到很多無法立即理解的東西。例如，當你只是想對浮點數求和時，爲什麼需要「將內核源代碼加載到數組 source_str 中」？爲什麼使用 malloc 之類而不是 std::vector？ – user463035818 2015-04-01 11:19:16

我是 OpenCL 新手，我拿了一份示例代碼並對其進行了修改，所以實際上我也不清楚，也許你是對的。順便說一句，我已經更新了代碼，並從內核中刪除了一些行，但仍然得到同樣的意外結果！ – 2015-04-01 11:33:53

檢查OpenCL API的返回值。他們執行完美嗎？ – Meluha 2015-04-01 11:39:19

你從設備複製回主機的字節數不正確：

int mem_size_Output = number_of_points ; 

... 

ret = clEnqueueReadBuffer(command_queue, output_mem_obj, CL_TRUE, 0, 
     mem_size_Output, outputArray, 0, NULL, NULL);

緩衝區中的數據量是 number_of_points * sizeof(cl_float4) 字節。

來源

2015-04-01 12:02:24 jprice

至少它應該正確讀取第一個1/16，但它不會。所以一定還有其他的東西...... – DarkZeros 2015-04-01 14:50:42

@DarkZeros它讀取16個字節，這是一個單獨的'cl_float4'。第一個值確實是正確的（0 + 0 = 0）。 – jprice 2015-04-01 15:02:58

噢，的確，第一個結果是零，而且是唯一正確的。 :) – DarkZeros 2015-04-01 17:35:58

OpenCL，C++：簡單的浮點向量求和程序出現意外結果

回答

相關問題