2016-09-21 249 views
1

這是 CUDA 中的錯誤嗎?(遇到「非法的內存訪問」)我使用以下 CUDA 內核:

__global__ 
// Single-block reduction kernel: sums data[0 .. MAX_INDEX) into *sum_ptr.
// Expected launch configuration: <<<1, 32>>> (one block; see the launch below).
void sum_worker(int *data, int *sum_ptr) 
{ 
     __shared__ int block_sum;   // block-wide accumulator (uninitialized until thread 0 writes it)
     int idx = threadIdx.x; 
     int thread_sum = 0;         // per-thread partial sum, kept in a register

     // Thread 0 seeds the accumulator with 2 (presumably to account for the
     // prime 2, which the data array never contains — see primes_worker).
     // The __syncthreads() below orders this write before the atomicAdds.
     if (threadIdx.x == 0) 
       block_sum = 2; 

     // Strided loop: thread t sums elements t, t+blockDim.x, t+2*blockDim.x, ...
     // The prose below discusses replacing blockDim.x with the constant 32.
     for (int i = idx; i < MAX_INDEX; i += blockDim.x) 
       thread_sum += data[i]; 

     __syncthreads(); 

     // Fold each thread's partial sum into the shared accumulator.
     atomicAdd(&block_sum, thread_sum); 

     __syncthreads(); 

     // Thread 0 publishes the final block-wide total to global memory.
     if (threadIdx.x == 0) 
       *sum_ptr = block_sum; 
} 

它使用此代碼啓動:

sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer); 

而且它工作正常(無運行時錯誤並產生正確的結果)。但是,如果我把 i += blockDim.x 改爲 i += 32,那麼下一次調用 cudaDeviceSynchronize() 時就會得到一個錯誤:

Cuda error 'an illegal memory access was encountered' in primes_gpu.cu at line 97 

在 cuda-memcheck 下運行該內核的輸出:

========= Invalid __global__ read of size 4 
=========  at 0x00000108 in /home/clifford/Work/handicraft/2016/perfmeas/primes_gpu.cu:35:sum_worker(int*, int*) 
=========  by thread (31,0,0) in block (0,0,0) 
=========  Address 0x703b70d7c is out of bounds 
=========  Saved host backtrace up to driver entry point at kernel launch time 
=========  Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x472225] 
=========  Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 [0x146ad] 
=========  Host Frame:/usr/lib/x86_64-linux-gnu/libcudart.so.7.5 (cudaLaunch + 0x143) [0x2ece3] 
=========  Host Frame:./perfmeas [0x17c7] 
=========  Host Frame:./perfmeas [0x16b7] 
=========  Host Frame:./perfmeas [0x16e2] 
=========  Host Frame:./perfmeas [0x153f] 
=========  Host Frame:./perfmeas [0xdcd] 
=========  Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf0) [0x20830] 
=========  Host Frame:./perfmeas [0xf39] 
.... 

地址 0x703b70d7c 對 data 而言的確越界了:該數組從 0x703b40000 開始,共有 MAX_INDEX 個元素。此測試中 MAX_INDEX 爲 50000。(0x703b70d7c - 0x703b40000)/4 = 50015。

添加一個對 i >= MAX_INDEX 的額外檢查,問題就神奇地消失了:

for (int i = idx; i < MAX_INDEX; i += 32) { 
      if (i >= MAX_INDEX) 
        printf("WTF!\n"); 
      thread_sum += data[i]; 
    } 

這是CUDA的錯誤還是我在這裏做一些愚蠢的事?

我在 Ubuntu 16.04 上使用 CUDA 7.5。nvcc --version 的輸出:

nvcc: NVIDIA (R) Cuda compiler driver 
Copyright (c) 2005-2015 NVIDIA Corporation 
Built on Tue_Aug_11_14:27:32_CDT_2015 
Cuda compilation tools, release 7.5, V7.5.17 

這個測試案例的完整源代碼可以在這裏找到:
http://svn.clifford.at/handicraft/2016/perfmeas

(有選項-gx此版本使用i += blockDim.x運行改變,要i += 32重現問題。)


編輯:@njuffa 在評論中表示,他不願跟隨指向 Stack Overflow 站外的鏈接,因爲他「太害怕[他的]計算機可能會感染某些東西」,並且更喜歡可以直接從 Stack Overflow 複製粘貼的測試用例。所以代碼如下:

#include <string.h> 
#include <stdio.h> 
#include <stdbool.h> 
#include <math.h> 

#define MAX_PRIMES 100000 
#define MAX_INDEX (MAX_PRIMES/2) 

__global__ 
// For each index, tests whether p = 2*idx + 1 is prime by trial division and
// writes p (if prime) or 0 (if composite) to data[idx].
// idx == 0 corresponds to p == 1, which is stored as 0 since 1 is not prime.
// Note the even prime 2 never appears here; sum_worker compensates for it.
void primes_worker(int *data) 
{ 
    int idx = threadIdx.x + blockIdx.x * blockDim.x; 

    // Grid-tail guard: the launch rounds the block count up, so trailing
    // threads past MAX_INDEX must not touch the array.
    if (idx >= MAX_INDEX) 
     return; 

    int p = 2*idx+1; 
    // Trial division by odd candidate factors up to sqrt(p).
    for (int i = 3; i*i <= p; i += 2) { 
     if (p % i == 0) { 
      data[idx] = 0; 
      return; 
     } 
    } 

    data[idx] = idx ? p : 0; 
} 

__global__ 
// Single-block reduction: sums data[0 .. MAX_INDEX) plus an initial 2 into
// *sum_ptr. Launched as <<<1, 32>>> (see primes_gpu below).
// The ENABLE_BUG variant differs ONLY in hard-coding the loop stride to 32
// instead of using blockDim.x — which is what triggers the misbehavior this
// question is about (the launch uses 32 threads, so both loops are logically
// identical).
void sum_worker(int *data, int *sum_ptr) 
{ 
    __shared__ int block_sum;   // block-wide accumulator
    int idx = threadIdx.x; 
    int thread_sum = 0;         // per-thread partial sum

    // Thread 0 seeds the total with 2 (presumably the prime 2, which
    // primes_worker never stores); ordered before the atomicAdds by the
    // first __syncthreads().
    if (threadIdx.x == 0) 
     block_sum = 2; 

#ifdef ENABLE_BUG 
    // Constant stride: lets the compiler compute the trip count and unroll.
    for (int i = idx; i < MAX_INDEX; i += 32) 
     thread_sum += data[i]; 
#else 
    // Variable stride: trip count unknown at compile time; works correctly.
    for (int i = idx; i < MAX_INDEX; i += blockDim.x) 
     thread_sum += data[i]; 
#endif 

    __syncthreads(); 

    // Fold each thread's partial sum into the shared accumulator.
    atomicAdd(&block_sum, thread_sum); 

    __syncthreads(); 

    // Thread 0 publishes the result to the (pinned) host-visible buffer.
    if (threadIdx.x == 0) 
     *sum_ptr = block_sum; 
} 

int *primes_or_zeros; 
int *sum_buffer; 

// Allocates the device array for per-index prime values and a pinned host
// buffer for the one-int result. Errors are printed but not fatal, so later
// calls may fail with null pointers if an allocation failed here.
void primes_gpu_init() 
{ 
    cudaError_t err; 

    // Device-side array: one int per odd candidate (MAX_INDEX elements).
    err = cudaMalloc((void**)&primes_or_zeros, sizeof(int)*MAX_INDEX); 

    if (err != cudaSuccess) 
     printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); 

    // Pinned host memory so the kernel's result store is directly host-visible.
    err = cudaMallocHost((void**)&sum_buffer, sizeof(int)); 

    if (err != cudaSuccess) 
     printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); 
} 

// Releases the allocations made by primes_gpu_init (device array via cudaFree,
// pinned host buffer via cudaFreeHost). Errors are reported to stdout only.
void primes_gpu_done() 
{ 
    cudaError_t err; 

    err = cudaFree(primes_or_zeros); 

    if (err != cudaSuccess) 
     printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); 

    err = cudaFreeHost(sum_buffer); 

    if (err != cudaSuccess) 
     printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); 
} 

// Launches the sieve kernel over all MAX_INDEX candidates, then the one-block
// reduction, and returns the sum read from the pinned host buffer.
int primes_gpu() 
{ 
    int num_blocks = (MAX_INDEX + 31)/32;   // ceil-div so the tail is covered
    int num_treads = 32;                    // NOTE(review): typo for "num_threads"

    primes_worker<<<num_blocks, num_treads>>>(primes_or_zeros); 
    sum_worker<<<1, 32>>>(primes_or_zeros, sum_buffer); 
    // Kernel launches are asynchronous; any in-kernel fault (e.g. the illegal
    // memory access discussed above) surfaces at this synchronization point —
    // this is the line the error messages in the question refer to.
    cudaError_t err = cudaDeviceSynchronize(); 

    if (err != cudaSuccess) 
     printf("Cuda error '%s' in %s at line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); 

    // Safe to read directly: sum_buffer is pinned host memory and the device
    // is synchronized (on the error path this reads whatever is in the buffer).
    return *sum_buffer; 
} 

// Driver: allocate buffers, compute the sum of all primes below MAX_PRIMES on
// the GPU, validate against the known expected value, and clean up.
// Returns 0 on success, 1 on an incorrect result.
int main() 
{ 
    primes_gpu_init(); 

    int result = primes_gpu(); 
    printf("Result: %d\n", result); 

    int status = 0; 
    if (result != 454396537) { 
     printf("Incorrect result!\n"); 
     status = 1; 
    } 

    // Fix: the original returned early on the error path, skipping
    // primes_gpu_done() and leaking the device array and the pinned host
    // buffer. Clean up on every path before exiting.
    primes_gpu_done(); 
    return status; 
} 

用法:

$ nvcc -o demo demo.cu 
$ ./demo 
Result: 454396537 

$ nvcc -D ENABLE_BUG -o demo demo.cu 
$ ./demo 
Cuda error 'an illegal memory access was encountered' in demo.cu at line 99 
Result: 0 
Incorrect result! 
+0

而不包含最小的,完整的和可驗證的示例](http://stackoverflow.com/help/mcve)你是不可能找到任何人在這裏誰,甚至將嘗試幫助調試你的代碼。 – njuffa

+0

@njuffa你甚至可以看看我發佈的鏈接上的代碼嗎?這當然是完整的和可驗證的。它也接近最低限度。 (一切都小得多,要麼缺乏適當的錯誤處理,要麼不會檢查結果的正確性。) – CliffordVienna

+0

MCVE基本上意味着:用戶可以從源代碼中刪除SO,粘貼到他們最喜歡的編輯器中,保存文件,編譯並運行。最好我可以告訴,這是不可能的與上面張貼的代碼。我通常不會跟蹤非現場鏈接(我的電腦可能會捕捉到某些東西,所以我很害怕),但我並不這麼認爲。 – njuffa

回答

4

TL;DR:所觀察到的行爲很可能是由 CUDA 7.5 工具鏈中 ptxas 組件的一個錯誤引起的,特別是其中的循環展開器(loop unroller)。該 bug 已在 CUDA 8.0 RC 中修復,該版本現已公開可用。

我能夠在 64 位 Windows 7 平臺上,用 Quadro K2200 GPU(sm_50 設備)重現問題中報告的行爲。定義 ENABLE_BUG 後生成的機器碼(SASS)的主要區別在於循環被展開了四倍。這是把循環增量從變量 blockDim.x 改爲編譯期常量 32 的直接結果:該常量使編譯器能在編譯時計算出循環次數。

有趣的是,在中間的 PTX 層面,即使增量爲 32,循環仍保持未展開(rolled)的形式:

BB7_4: 
ld.global.u32 %r12, [%rd10]; 
add.s32 %r16, %r12, %r16; 
add.s64 %rd10, %rd10, 128; 
add.s32 %r15, %r15, 32; 
setp.lt.s32  %p3, %r15, 50000; 
@%p3 bra BB7_4; 

由於循環在機器碼中被展開了,必定是 ptxas 的展開器應用了這一轉換。

如果我把 ptxas 的優化級別降到 -O1(在 nvcc 命令行上指定 -Xptxas -O1),代碼就能正常工作。如果我針對 sm_30 構建代碼(在 sm_50 設備上運行時會觸發 JIT 重新編譯),在最新的 Windows 驅動 369.26 下代碼也能按預期工作。這強烈表明 CUDA 7.5 的 ptxas 組件的展開器中存在一個錯誤,而該錯誤後來已被修復——因爲 CUDA 驅動程序內置的 ptxas 組件比 CUDA 7.5 工具鏈中的要新得多。

在循環前直接放置 #pragma unroll 4 也能解決這個問題,因爲這種情況下展開是由編譯器的 nvvm 組件執行的,這意味着到了 PTX 層面循環已經是展開過的了:

#if ENABLE_BUG 
#pragma unroll 4 
    for (int i = idx; i < MAX_INDEX; i += 32) 
     thread_sum += data[i]; 
#else 

得到的PTX:

BB7_5: 
.pragma "nounroll"; 
ld.global.u32 %r34, [%rd14]; 
add.s32 %r35, %r34, %r45; 
ld.global.u32 %r36, [%rd14+128]; 
add.s32 %r37, %r36, %r35; 
ld.global.u32 %r38, [%rd14+256]; 
add.s32 %r39, %r38, %r37; 
ld.global.u32 %r40, [%rd14+384]; 
add.s32 %r45, %r40, %r39; 
add.s64 %rd14, %rd14, 512; 
add.s32 %r44, %r44, 128; 
setp.lt.s32  %p5, %r44, %r3; 
@%p5 bra BB7_5;