CUDA線索索引爲非常簡單的內核VS 2015更新返回意外的結果3

-1

我正在編寫我的第一個內核並設置一個簡單的程序來將線索索引分配給數組中的該位置，它適用於第一個1/4數組，即它的大小爲100時停止在25和大小爲50的時候停止。然後剩下的都是零。我進行了測試，以確保blockdim正確地出來，而且，這個程序非常簡單，並且緊跟着例子，我不知道它會有什麼問題。CUDA線索索引爲非常簡單的內核VS 2015更新返回意外的結果3

計算機是將我的線程分成四個塊還是什麼？

__global__ void index_initialize(int* data) 
{ 
    // set data at index to index 
    data[threadIdx.x] = threadIdx.x; 
} 

void zero_initialize(int* data, int size) 
{ 
    for (int i = 0; i < size; i++) 
    { 
     data[i] = 0; 
    } 
} 

void print_array(int* data, int size) 
{ 
    for (int i = 0; i < size; i++) 
    { 
     std::cout << data[i] << " "; 

     if (i % 20 == 0 && i > 0) std::cout << '\n'; 
    } 
} 


int main() 
{ 
    GpuTimer timer; 

    // size 
    int size = 100; 

    // host array 
    int* host = new int[size]; 

    // device array 
    int* device = new int[size]; 

    // zero out device and host 
    zero_initialize(host, size); 
    zero_initialize(device, size); 

    // allocate size ints on device 
    cudaMalloc(&device, size * sizeof(int)); 

    // call kernel on one thread block of size 
    index_initialize<<<1, size>>> (device); 

    // synchronize 
    cudaDeviceSynchronize(); 

    // copy device to host 
    cudaMemcpy(host, device, size, cudaMemcpyDeviceToHost); 

    // reset device 
    cudaDeviceReset(); 

    // print out host 
    print_array(host, size); 

    // free memory 
    cudaFree(device); 

}

該程序做同樣的事情，它停止在25

#include <iostream> 
#include <typeinfo> 

#include "cs344\Lesson Code Snippets\Lesson 2 Code Snippets\gputimer.h" 

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 


__global__ void add_arrays(int* A, int* B, int* C) 
{ 
    C[threadIdx.x] = A[threadIdx.x] + B[threadIdx.x]; 
} 

__global__ void index_initialize(int* data) 
{ 
    // set data at index to index 
    data[threadIdx.x] = threadIdx.x; 
} 

void zero_initialize(int* data, int size) 
{ 
    for (int i = 0; i < size; i++) 
    { 
     data[i] = 0; 
    } 
} 

void print_array(int* data, int size) 
{ 
    for (int i = 0; i < size; i++) 
    { 
     std::cout << data[i] << " "; 

     if (i % 20 == 0 && i > 0) std::cout << '\n'; 
    } 
} 


int main() 
{ 
    GpuTimer timer; 

    // size 
    int size = 100; 

    // host arrays 
    int* hostA = new int[size]; 
    int* hostB = new int[size]; 
    int* hostC = new int[size]; 

    // device arrays 
    int* deviceA = new int[size]; 
    int* deviceB = new int[size]; 
    int* deviceC = new int[size]; 

    // zero out host 
    zero_initialize(hostA, size); 
    zero_initialize(hostB, size); 
    zero_initialize(hostC, size); 

    // set to index 
    for (int i = 0; i < size; i++) 
    { 
     hostB[i] = i; 
    } 

    // allocate size ints on device 
    cudaMalloc(&deviceA, size * sizeof(int)); 
    cudaMalloc(&deviceB, size * sizeof(int)); 
    cudaMalloc(&deviceC, size * sizeof(int)); 

    cudaMemcpy(deviceA, hostA, size, cudaMemcpyHostToDevice); 
    cudaMemcpy(deviceB, hostB, size, cudaMemcpyHostToDevice); 

    // call kernel on one thread block of size 
    //index_initialize<<<1, size>>> (device); 

    // call add kernel 
    add_arrays<<< 1, size >>> (deviceA, deviceB, deviceC); 

    // synchronize 
    cudaDeviceSynchronize(); 

    // copy device to host 
    cudaMemcpy(hostC, deviceC, size, cudaMemcpyDeviceToHost); 

    // reset device 
    cudaDeviceReset(); 

    // print out host 
    print_array(hostC, size); 

    // free memory 
    cudaFree(deviceA); 
    cudaFree(deviceB); 
    cudaFree(deviceC); 

}

來源

2017-08-08 Grant Swalwell

'cudaMemcpy（主機，設備，大小，cudaMemcpyDeviceToHost）;' - 你只能複製四分之一的數組。 [SO]不是一個免費的小錯誤發現服務，請不要把它當作一個。 – talonmies

謝謝，我意識到我在做什麼 –

cudaMemcpy()取入可變的字節大小而不是元素的數量

cudaMemcpy(host, device, size, cudaMemcpyDevicetoHost)

僅複製了分配給100個整數的400個字節的前100個。正確的複製語句將是

cudaMemcpy(host, device, size * sizeof(int), cudaMemcpyDevicetoHost)

來源

2017-08-08 10:15:33

CUDA線索索引爲非常簡單的內核VS 2015更新返回意外的結果3

回答

相關問題