// Device-global flag polled by the kernel below; statically initialised to 0.
__device__ int tag = 0;

/*
 * Spin-wait snippet: every thread whose threadIdx.x is 0 polls `tag` until it
 * observes a non-zero value; all other threads fall straight through.
 *
 * NOTE(review): nothing in this snippet ever stores to `tag`, so unless some
 * agent that is not shown here sets it, the polling thread never leaves the
 * loop.
 */
__global__ void kernel() {
    if (threadIdx.x == 0) {
        volatile int v;
        do {
            // Read `tag` through a volatile lvalue so the load is re-issued on
            // every pass; a volatile *destination* alone does not stop the
            // compiler from hoisting the load of the non-volatile global.
            v = *(volatile int *)&tag;
        } while (v == 0);
    }
    return;
}








使用原子讀/寫沒有幫助? –


不是。我指的是一些利用原子操作的取巧(hack)做法,但它們都依賴於在執行條件分支時已知的執行順序。而且它們並不通用,因爲它們要求網格中的每個塊都已被調度並處於活動狀態,而這取決於硬件。 – talonmies


那麼你可以深入瞭解爲什麼我提交的代碼在5795+開始失敗?由於你提到的任何原因,不知道它是失敗的。 –



#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

/*
 * Inter-block hand-off experiment: every block except the last publishes one
 * value, and the last block waits for all of them and then sums them.
 *
 * Layout (per thread t = threadIdx.x):
 *   count[t]                 one arrival counter per thread; it must start at
 *                            0 so that the wait below covers every producer
 *   data[t * gridDim.x + b]  one float slot per block b
 *
 * Blocks 0 .. gridDim.x-2 ("producers") store b + 1 into their slot, bump the
 * counter and exit. Block gridDim.x-1 spins until the counter reads exactly
 * its own index (== number of producers), then writes
 * (i + 1) + sum(data[0 .. i-1]) into its own slot.
 *
 * Caveats:
 *   - The spin only terminates if every producer block eventually runs while
 *     the last block is busy-waiting; the launch configuration does not
 *     guarantee that, so this is not a general-purpose grid barrier.
 *   - The running sum is a float, so totals above 2^24 are no longer exact
 *     integers.
 */
__global__ static
void kernel(int *count, float *data)
{
    count += threadIdx.x;
    data += gridDim.x * threadIdx.x;

    int i = blockIdx.x;
    if (i < gridDim.x - 1) {
        data[i] = i + 1;
        // Make the store above visible device-wide before announcing arrival.
        __threadfence();
        atomicAdd(count, 1);
        // Producers are done; only the last block goes on to wait and sum.
        return;
    }

    // atomicMin returns the previous counter value and never lets the counter
    // exceed i, so the loop exits once the counter has been seen at exactly i.
    while (atomicMin(count, i) != i);
    // Order the loads below after the counter observation.
    __threadfence();

    float tmp = i + 1;
    for (int j = 0; j < i; j++) tmp += data[j];

    data[i] = tmp;
}

/* Print a diagnostic and terminate if a CUDA runtime call did not succeed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Host driver for the inter-block hand-off kernel.
 *
 * Usage: prog [num]   -- num is the number of blocks (default 100, must be >= 1).
 *
 * Launches `num` blocks of 32 threads and prints, for each of the 32 threads,
 * the value the last block stored in that thread's final slot.
 */
int main(int argc, char **args)
{
    const int lanes = 32;

    int num = 100;
    if (argc >= 2) num = atoi(args[1]);
    if (num < 1) {
        // num == 0 is an invalid grid size and would index h_data[-1] below.
        fprintf(stderr, "number of blocks must be a positive integer\n");
        return EXIT_FAILURE;
    }

    // size_t keeps the byte count from overflowing int for large num.
    size_t total = (size_t)num * lanes;
    size_t bytes = total * sizeof(float);

    float *h_data = (float *)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "out of host memory\n");
        return EXIT_FAILURE;
    }
    for (size_t k = 0; k < total; k++) h_data[k] = -1; // Being safe

    // Every arrival counter starts at 0: the last block leaves its wait when a
    // counter equals the number of producer blocks, so a non-zero seed would
    // let that thread proceed one producer early.
    int h_count[32] = {0};

    float *d_data = NULL;
    int *d_count = NULL;
    checkCuda(cudaMalloc((void **)&d_data, bytes), "cudaMalloc d_data");
    checkCuda(cudaMalloc((void **)&d_count, sizeof(h_count)), "cudaMalloc d_count");
    checkCuda(cudaMemcpy(d_count, h_count, sizeof(h_count), cudaMemcpyHostToDevice),
              "copy counters to device");
    checkCuda(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice),
              "copy data to device");

    kernel<<<num, lanes>>>(d_count, d_data);
    checkCuda(cudaGetLastError(), "kernel launch");

    // The blocking copies below also surface any fault raised inside the kernel.
    checkCuda(cudaMemcpy(h_count, d_count, sizeof(h_count), cudaMemcpyDeviceToHost),
              "copy counters to host");
    checkCuda(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost),
              "copy data to host");

    // Each thread owns `num` consecutive floats; its result is the last one.
    for (int i = 0; i < lanes; i++) {
        printf("sum of first %d from thread %d is %d \n", num, i,
               (int)h_data[(size_t)i * num + (num - 1)]);
    }

    checkCuda(cudaFree(d_count), "cudaFree d_count");
    checkCuda(cudaFree(d_data), "cudaFree d_data");
    free(h_data);
    return 0;
}

我不能保證這將始終有效。不過,在我的顯卡(320M)上,臨界點似乎是 num = 5796。也許這是某種因顯卡而異的硬件限制?


這個問題的答案是:對於 $n > 5795$,有 $n(n+1)/2 > 2^{24}$(即單精度浮點數可精確表示連續整數的上限)。超出這個範圍後,整數值不再保證能被精確表示。感謝 talonmies 指出這一點。

./a.out 5795 
sum of first 5795 from thread 0 is 16793910 
sum of first 5795 from thread 1 is 16793910 
sum of first 5795 from thread 2 is 16793910 
sum of first 5795 from thread 3 is 16793910 
sum of first 5795 from thread 4 is 16793910 
sum of first 5795 from thread 5 is 16793910 
sum of first 5795 from thread 6 is 16793910 
sum of first 5795 from thread 7 is 16793910 
sum of first 5795 from thread 8 is 16793910 
sum of first 5795 from thread 9 is 16793910 
sum of first 5795 from thread 10 is 16793910 
sum of first 5795 from thread 11 is 16793910 
sum of first 5795 from thread 12 is 16793910 
sum of first 5795 from thread 13 is 16793910 
sum of first 5795 from thread 14 is 16793910 
sum of first 5795 from thread 15 is 16793910 
sum of first 5795 from thread 16 is 16793910 
sum of first 5795 from thread 17 is 16793910 
sum of first 5795 from thread 18 is 16793910 
sum of first 5795 from thread 19 is 16793910 
sum of first 5795 from thread 20 is 16793910 
sum of first 5795 from thread 21 is 16793910 
sum of first 5795 from thread 22 is 16793910 
sum of first 5795 from thread 23 is 16793910 
sum of first 5795 from thread 24 is 16793910 
sum of first 5795 from thread 25 is 16793910 
sum of first 5795 from thread 26 is 16793910 
sum of first 5795 from thread 27 is 16793910 
sum of first 5795 from thread 28 is 16793910 
sum of first 5795 from thread 29 is 16793910 
sum of first 5795 from thread 30 is 16793910 
sum of first 5795 from thread 31 is 16793910 






@Kun,如果你真的要嘗試這種做法,請記住它會極其緩慢。我同意 talonmies 的看法:你需要換一種算法,或者使用兩個內核。我做這個實驗只是出於求知。 –


您看到的 5796 這個限制是單精度浮點數造成的假象(artifact),與任何硬件參數無關 —— 16777216 = 2^24,即 IEEE 32 位浮點數 24 位尾數的上限。另外,當每個塊只運行 1 個線程時,我不認爲你的例子能有效地演示真實場景中的同步…… – talonmies


啊忘了!困腦袋想着。 –
