我使用以下CUDA內核:這是CUDA中的錯誤嗎? (非法的內存訪問時遇到)
__global__
void sum_worker(int *data, int *sum_ptr)
{
__shared__ int block_sum;
int idx = threadIdx.x;
int thread_sum = 0;
if (threadIdx.x == 0)
block_sum = 2;
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
thread_sum += data[i];
__syncthreads();
atomicAdd(&block_sum, thread_sum);
__syncthreads();
if (threadIdx.x == 0)
*sum_ptr = block_sum;
}
它使用此代碼展開:
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
而且它工作正常(無運行時錯誤併產生正確的結果)。但是,如果我改變i += blockDim.x
到i += 32
我得到一個錯誤,下一次我打電話cudaDeviceSynchronize()
:
Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97
與cuda-memcheck
運行的內核:
========= Invalid __global__ read of size 4
========= at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*)
========= by thread (31,0,0) in block (0,0,0)
========= Address 0x703b70d7c is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad]
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3]
========= Host Frame:./perfmeas [0x17c7]
========= Host Frame:./perfmeas [0x16b7]
========= Host Frame:./perfmeas [0x16e2]
========= Host Frame:./perfmeas [0x153f]
========= Host Frame:./perfmeas [0xdcd]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830]
========= Host Frame:./perfmeas [0xf39]
....
地址0x703b70d7c的確是出界了data
組成:數組從0x703b40000開始,並具有MAX_INDEX元素。此測試中MAX_INDEX爲50000。 (0x703b70d7c - 0x703b40000)/ 4 = 50015.
添加額外的檢查,對i >= 50000
使得以神奇的問題消失:
for (int i = idx; i < MAX_INDEX; i += 32) {
if (i >= MAX_INDEX)
printf("WTF!\n");
thread_sum += data[i];
}
這是CUDA的錯誤還是我在這裏做一些愚蠢的事?
我在Ubuntu 2016.04上使用CUDA 7.5。的nvcc --version
輸出:
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2015 NVIDIA Corporation
Built on Tue_Aug_11_14:27:32_CDT_2015
Cuda compilation tools, release 7.5, V7.5.17
這個測試案例的完整源代碼可以在這裏找到:。
http://svn.clifford.at/handicraft/2016/perfmeas
(有選項-gx
此版本使用i += blockDim.x
運行改變,要i += 32
重現問題。)
編輯:@njuffa在評論中表示,他不想跟隨鏈接關閉堆棧因爲他「太害怕[他的]計算機可能會捕獲某些東西」,並且更喜歡測試用例,他可以直接從堆棧溢出複製&粘貼。所以這裏有雲:
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <math.h>
#define MAX_PRIMES 100000
#define MAX_INDEX (MAX_PRIMES/2)
__global__
void primes_worker(int *data)
{
int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx >= MAX_INDEX)
return;
int p = 2*idx+1;
for (int i = 3; i*i <= p; i += 2) {
if (p % i == 0) {
data[idx] = 0;
return;
}
}
data[idx] = idx ? p : 0;
}
__global__
void sum_worker(int *data, int *sum_ptr)
{
__shared__ int block_sum;
int idx = threadIdx.x;
int thread_sum = 0;
if (threadIdx.x == 0)
block_sum = 2;
#ifdef ENABLE_BUG
for (int i = idx; i < MAX_INDEX; i += 32)
thread_sum += data[i];
#else
for (int i = idx; i < MAX_INDEX; i += blockDim.x)
thread_sum += data[i];
#endif
__syncthreads();
atomicAdd(&block_sum, thread_sum);
__syncthreads();
if (threadIdx.x == 0)
*sum_ptr = block_sum;
}
int *primes_or_zeros;
int *sum_buffer;
void primes_gpu_init()
{
cudaError_t err;
err = cudaMalloc((void**)&primes_or_zeros, sizeof(int)*MAX_INDEX);
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaMallocHost((void**)&sum_buffer, sizeof(int));
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
void primes_gpu_done()
{
cudaError_t err;
err = cudaFree(primes_or_zeros);
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
err = cudaFreeHost(sum_buffer);
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
}
int primes_gpu()
{
int num_blocks = (MAX_INDEX + 31)/32;
int num_treads = 32;
primes_worker<<<num_blocks, num_treads>>>(primes_or_zeros);
sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer);
cudaError_t err = cudaDeviceSynchronize();
if (err != cudaSuccess)
printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__);
return *sum_buffer;
}
int main()
{
primes_gpu_init();
int result = primes_gpu();
printf("Result: %d\n", result);
if (result != 454396537) {
printf("Incorrect result!\n");
return 1;
}
primes_gpu_done();
return 0;
}
用法:
$ nvcc -o demo demo.cu
$ ./demo
Result: 454396537
$ nvcc -D ENABLE_BUG -o demo demo.cu
$ ./demo
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99
Result: 0
Incorrect result!
而不包含最小的,完整的和可驗證的示例](http://stackoverflow.com/help/mcve)你是不可能找到任何人在這裏誰,甚至將嘗試幫助調試你的代碼。 – njuffa
@njuffa你甚至可以看看我發佈的鏈接上的代碼嗎?這當然是完整的和可驗證的。它也接近最低限度。 (一切都小得多,要麼缺乏適當的錯誤處理,要麼不會檢查結果的正確性。) – CliffordVienna
MCVE基本上意味着:用戶可以從源代碼中刪除SO,粘貼到他們最喜歡的編輯器中,保存文件,編譯並運行。最好我可以告訴,這是不可能的與上面張貼的代碼。我通常不會跟蹤非現場鏈接(我的電腦可能會捕捉到某些東西,所以我很害怕),但我並不這麼認爲。 – njuffa