這是一種取巧(hack)的做法,我只是想試試看它是否行得通。
#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
/*
 * Proof-of-concept inter-block hand-off: every block except the last one
 * publishes one value, and the last block waits for them and sums the values.
 *
 * Layout, as established by the pointer arithmetic below:
 *   - count: one int counter per thread of a block (indexed by threadIdx.x).
 *   - data : one row of gridDim.x floats per thread (row index threadIdx.x).
 *
 * Block i < gridDim.x - 1 stores i + 1 into data[i] of its thread's row and
 * increments that thread's counter. The last block spins until the counter
 * equals gridDim.x - 1 and then writes the sum 1 + 2 + ... + gridDim.x into
 * the last slot of the row.
 *
 * Preconditions:
 *   - The wait ends when the counter equals gridDim.x - 1, so it only means
 *     "all producer blocks are done" if every counter starts at 0.
 *   - NOTE(review): the spin-wait assumes the producer blocks are able to run
 *     while the last block is spinning; this depends on how the hardware
 *     schedules blocks and is not guaranteed in general.
 *   - The sum is accumulated in float, so it is only exact while it stays
 *     within the range where float represents integers exactly (2^24).
 */
__global__ static
void kernel(int *count, float *data)
{
    // Select this thread's private counter and private row of gridDim.x floats.
    count += threadIdx.x;
    data += gridDim.x * threadIdx.x;
    int i = blockIdx.x;

    if (i < gridDim.x - 1) {
        // Producer block: publish the value, then signal completion.
        data[i] = i + 1;
        // Make the store to data[i] visible device-wide before the counter
        // increment can be observed by the waiting block.
        __threadfence();
        atomicAdd(count, 1);
        return;
    }

    // Last block: spin until the counter equals i (= gridDim.x - 1).
    // atomicMin returns the old value and clamps the counter to at most i.
    while (atomicMin(count, i) != i);

    // Order the reads below after the observed counter value, and read through
    // a volatile pointer so the loads are not served from stale cached values.
    __threadfence();
    const volatile float *published = data;

    float tmp = i + 1;
    for (int j = 0; j < i; j++) tmp += published[j];
    data[i] = tmp;
}
/* Print a readable message and terminate when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Launches `kernel` with `num` blocks of 32 threads and prints, for each of
 * the 32 threads, the value the last block stored in the final slot of that
 * thread's row (the sum 1 + 2 + ... + num, as computed in float).
 *
 * Usage: prog [num]   (num defaults to 100 and must be positive)
 */
int main(int argc, char **args)
{
    const int threads = 32;

    int num = 100;
    if (argc >= 2) num = atoi(args[1]);
    if (num <= 0) {
        // A zero/negative block count is an invalid launch and would also
        // index h_data[num - 1] out of bounds below.
        fprintf(stderr, "number of blocks must be positive, got %d\n", num);
        return EXIT_FAILURE;
    }

    // size_t arithmetic so large block counts cannot overflow an int.
    const size_t total = (size_t)num * threads;
    const size_t bytes = total * sizeof(float);

    float *h_data = (float *)malloc(bytes);
    if (h_data == NULL) {
        fprintf(stderr, "failed to allocate %lu bytes of host memory\n", (unsigned long)bytes);
        return EXIT_FAILURE;
    }
    for (size_t k = 0; k < total; k++) h_data[k] = -1; // Being safe

    float *d_data = NULL;
    int *d_count = NULL;
    checkCuda(cudaMalloc((void **)&d_data, bytes), "cudaMalloc(d_data)");
    checkCuda(cudaMalloc((void **)&d_count, threads * sizeof(int)), "cudaMalloc(d_count)");

    // Every per-thread counter must start at 0: the kernel's last block waits
    // for the counter to reach num - 1, i.e. one increment per producer block.
    checkCuda(cudaMemset(d_count, 0, threads * sizeof(int)), "cudaMemset(d_count)");
    checkCuda(cudaMemcpy(d_data, h_data, bytes, cudaMemcpyHostToDevice), "copy data to device");

    kernel<<<num, threads>>>(d_count, d_data);
    // Launch-configuration errors (e.g. too many blocks) are reported here.
    checkCuda(cudaGetLastError(), "kernel launch");
    // The blocking copy waits for the kernel, so execution errors surface here.
    checkCuda(cudaMemcpy(h_data, d_data, bytes, cudaMemcpyDeviceToHost), "copy data to host");

    for (int t = 0; t < threads; t++) {
        // Row t holds num floats; the last slot carries the sum for thread t.
        const float *row = h_data + (size_t)t * num;
        printf("sum of first %d from thread %d is %d \n", num, t, (int)row[num - 1]);
    }

    checkCuda(cudaFree(d_count), "cudaFree(d_count)");
    checkCuda(cudaFree(d_data), "cudaFree(d_data)");
    free(h_data);
    return 0;
}
我不能保證這始終有效。不過,在我的顯示卡(320M)上,臨界點似乎是 num = 5796。也許這是某種因顯示卡而異的硬件限制?
EDIT
這個問題的答案是:當 n > 5795 時,$n(n+1)/2 > 2^{24}$(單精度浮點數能精確表示連續整數的上限)。超過這個值之後,整數值無法保證被精確表示。感謝 talonmies 指出這一點。
./a.out 5795
sum of first 5795 from thread 0 is 16793910
sum of first 5795 from thread 1 is 16793910
sum of first 5795 from thread 2 is 16793910
sum of first 5795 from thread 3 is 16793910
sum of first 5795 from thread 4 is 16793910
sum of first 5795 from thread 5 is 16793910
sum of first 5795 from thread 6 is 16793910
sum of first 5795 from thread 7 is 16793910
sum of first 5795 from thread 8 is 16793910
sum of first 5795 from thread 9 is 16793910
sum of first 5795 from thread 10 is 16793910
sum of first 5795 from thread 11 is 16793910
sum of first 5795 from thread 12 is 16793910
sum of first 5795 from thread 13 is 16793910
sum of first 5795 from thread 14 is 16793910
sum of first 5795 from thread 15 is 16793910
sum of first 5795 from thread 16 is 16793910
sum of first 5795 from thread 17 is 16793910
sum of first 5795 from thread 18 is 16793910
sum of first 5795 from thread 19 is 16793910
sum of first 5795 from thread 20 is 16793910
sum of first 5795 from thread 21 is 16793910
sum of first 5795 from thread 22 is 16793910
sum of first 5795 from thread 23 is 16793910
sum of first 5795 from thread 24 is 16793910
sum of first 5795 from thread 25 is 16793910
sum of first 5795 from thread 26 is 16793910
sum of first 5795 from thread 27 is 16793910
sum of first 5795 from thread 28 is 16793910
sum of first 5795 from thread 29 is 16793910
sum of first 5795 from thread 30 is 16793910
sum of first 5795 from thread 31 is 16793910
-
我編輯了之前只使用一個塊的代碼。現在這個版本更能代表真實世界中的線程/塊配置(內存訪問方式很奇怪,而且會慢得要命,但這樣寫只是爲了把我的舊測試代碼快速移植成使用多個線程)。
看起來在某些情況下可以在塊之間進行同步,但這主要取決於你事先知道某些事情(在這個特例中,我只是先等前 n-1 個塊完成,然後才在最後一個塊上做一個毫無用處的求和)。
這只是一個概念驗證,請不要把它當成正式的代碼。
使用原子讀/寫沒有幫助? –
不是。我暗指使用原子的哈克方法,但它們都依賴於在執行條件分支時知道執行順序。而且它們不是通用的,因爲它們依賴於網格中的每個塊被調度和激活,這與硬件有關。 – talonmies
那麼你能解釋一下爲什麼我提交的代碼從 5795+ 開始失敗嗎?我不確定它是否是因爲你提到的那些原因而失敗的。 –