cudaMemcpyToSymbol vs cudaMemcpy

我想知道爲什麼 cudaMemcpyToSymbol 對我不起作用。（但 cudaMemcpy 卻可以。）cudaMemcpyToSymbol vs cudaMemcpy

// symbols (declared in device constant memory):
__constant__ float flt[480]; // 480 floats = 1920 bytes
__constant__ int ints[160]; // 160 ints = 640 bytes

// func code follows:
float* pFlts;
cudaMalloc((void**)&pFlts, 1920+640); // one chunk of gpu mem: the floats (1920 bytes) followed by the ints (640 bytes)

// This does NOT work properly:
// Intent: copy the first 1920 bytes of pFlts into flt, then the following 640 bytes into ints.
// NOTE(review): the 4th argument of cudaMemcpyToSymbol is a byte offset from the start of the
// destination symbol, not an offset into pFlts. The second call therefore targets a location
// 1920 bytes into ints, which is only 640 bytes long.
cudaMemcpyToSymbol(flt,pFlts,1920,0,cudaMemcpyDeviceToDevice); // first copy
cudaMemcpyToSymbol(ints,pFlts,640,1920,cudaMemcpyDeviceToDevice); // second copy

第二次複製破壞了第一次複製（flt）的內容，而且第二次複製本身並沒有生效。（如果我移除第二次複製，第一次複製就能正常運作。）

結果：

GpuDumpFloatMemory<<<1,1>>>(0x500500000, 13, 320) TotThrds=1 ** Source of 1st copy 
    0x500500500: float[320]= 1.000 
    0x500500504: float[321]= 0.866 
    0x500500508: float[322]= 0.500 
    0x50050050c: float[323]= -0.000 
    0x500500510: float[324]= -0.500 
    0x500500514: float[325]= -0.866 
    0x500500518: float[326]= -1.000 
    0x50050051c: float[327]= -0.866 
    0x500500520: float[328]= -0.500 
    0x500500524: float[329]= 0.000 
    0x500500528: float[330]= 0.500 
    0x50050052c: float[331]= 0.866 
    0x500500530: float[332]= 1.000 
    GpuDumpFloatMemory<<<1,1>>>(0x500100a98, 13, 320) TotThrds=1  ** Dest of 1st copy 
    0x500100f98: float[320]= 0.000 
    0x500100f9c: float[321]= 0.500 
    0x500100fa0: float[322]= 0.866 
    0x500100fa4: float[323]= 1.000 
    0x500100fa8: float[324]= 0.866 
    0x500100fac: float[325]= 0.500 
    0x500100fb0: float[326]= -0.000 
    0x500100fb4: float[327]= -0.500 
    0x500100fb8: float[328]= -0.866 
    0x500100fbc: float[329]= -1.000 
    0x500100fc0: float[330]= -0.866 
    0x500100fc4: float[331]= -0.500 
    0x500100fc8: float[332]= 0.000 
    GpuDumpIntMemory<<<1,1>>>(0x500500780, 13, 0) TotThrds=1  ** Source of 2nd copy 
    0x500500780: int[0]= 1 
    0x500500784: int[1]= 1 
    0x500500788: int[2]= 1 
    0x50050078c: int[3]= 1 
    0x500500790: int[4]= 1 
    0x500500794: int[5]= 1 
    0x500500798: int[6]= 1 
    0x50050079c: int[7]= 1 
    0x5005007a0: int[8]= 1 
    0x5005007a4: int[9]= 1 
    0x5005007a8: int[10]= 1 
    0x5005007ac: int[11]= 1 
    0x5005007b0: int[12]= 0 
    GpuDumpIntMemory<<<1,1>>>(0x500100818, 13, 0) TotThrds=1  ** Dest of 2nd copy 
    0x500100818: int[0]= 0 
    0x50010081c: int[1]= 0 
    0x500100820: int[2]= 0 
    0x500100824: int[3]= 0 
    0x500100828: int[4]= 0 
    0x50010082c: int[5]= 0 
    0x500100830: int[6]= 0 
    0x500100834: int[7]= 0 
    0x500100838: int[8]= 0 
    0x50010083c: int[9]= 0 
    0x500100840: int[10]= 0 
    0x500100844: int[11]= 0 
    0x500100848: int[12]= 0

以下工作正常：

cudaMemcpyToSymbol(flt,pFlts,1920,0,cudaMemcpyDeviceToDevice); // first copy: first 1920 bytes of pFlts -> flt
int* pTemp; // receives the device address of the __constant__ symbol ints
cudaGetSymbolAddress((void**) &pTemp, ints);
// second copy: pFlts is a float*, so pFlts+480 points 480*sizeof(float) = 1920 bytes past its start.
// The destination must be the device address obtained above (pTemp), not the symbol name itself.
cudaMemcpy(pTemp,pFlts+480,640,cudaMemcpyDeviceToDevice);

結果：

GpuDumpFloatMemory<<<1,1>>>(0x500500000, 13, 320) TotThrds=1 ** Source of first copy 
    0x500500500: float[320]= 1.000 
    0x500500504: float[321]= 0.866 
    0x500500508: float[322]= 0.500 
    0x50050050c: float[323]= -0.000 
    0x500500510: float[324]= -0.500 
    0x500500514: float[325]= -0.866 
    0x500500518: float[326]= -1.000 
    0x50050051c: float[327]= -0.866 
    0x500500520: float[328]= -0.500 
    0x500500524: float[329]= 0.000 
    0x500500528: float[330]= 0.500 
    0x50050052c: float[331]= 0.866 
    0x500500530: float[332]= 1.000 
    GpuDumpFloatMemory<<<1,1>>>(0x500100a98, 13, 320) TotThrds=1 ** Dest of first copy 
    0x500100f98: float[320]= 1.000 
    0x500100f9c: float[321]= 0.866 
    0x500100fa0: float[322]= 0.500 
    0x500100fa4: float[323]= -0.000 
    0x500100fa8: float[324]= -0.500 
    0x500100fac: float[325]= -0.866 
    0x500100fb0: float[326]= -1.000 
    0x500100fb4: float[327]= -0.866 
    0x500100fb8: float[328]= -0.500 
    0x500100fbc: float[329]= 0.000 
    0x500100fc0: float[330]= 0.500 
    0x500100fc4: float[331]= 0.866 
    0x500100fc8: float[332]= 1.000 
    GpuDumpIntMemory<<<1,1>>>(0x500500780, 13, 0) TotThrds=1 ** Source of 2nd copy 
    0x500500780: int[0]= 1 
    0x500500784: int[1]= 1 
    0x500500788: int[2]= 1 
    0x50050078c: int[3]= 1 
    0x500500790: int[4]= 1 
    0x500500794: int[5]= 1 
    0x500500798: int[6]= 1 
    0x50050079c: int[7]= 1 
    0x5005007a0: int[8]= 1 
    0x5005007a4: int[9]= 1 
    0x5005007a8: int[10]= 1 
    0x5005007ac: int[11]= 1 
    0x5005007b0: int[12]= 0 
    GpuDumpIntMemory<<<1,1>>>(0x500100818, 13, 0) TotThrds=1 ** Destination of 2nd copy 
    0x500100818: int[0]= 1 
    0x50010081c: int[1]= 1 
    0x500100820: int[2]= 1 
    0x500100824: int[3]= 1 
    0x500100828: int[4]= 1 
    0x50010082c: int[5]= 1 
    0x500100830: int[6]= 1 
    0x500100834: int[7]= 1 
    0x500100838: int[8]= 1 
    0x50010083c: int[9]= 1 
    0x500100840: int[10]= 1 
    0x500100844: int[11]= 1 
    0x500100848: int[12]= 0

當我查看出錯的情況時，看起來像是符號表出了問題。也就是說，第一次複製目標中的數據看起來非常眼熟：不像是被覆蓋了，而只是被移動了位置，就好像指針指錯了一樣。

來源

2013-03-13 Doug

你有對 CUDA 調用做錯誤檢查嗎？[這裏](http://stackoverflow.com/questions/14968293/copy-symbol-address-to-symbol)已經給過你一個例子。 – 2013-03-13 17:03:32

有的，我只是沒有把檢查用的宏貼出來。沒有回報任何錯誤。（cudaSuccess） – Doug 2013-03-13 17:18:59

偏移量適用於符號，而不是源。那是你的問題。 – talonmies 2013-03-13 17:26:44

在我看來，第二次複製是有問題的。你定義了這個數組：

__constant__ int ints[160]; // 640 bytes

正如註釋所正確指出的，它的長度是 640 字節。

你的第二個副本是這樣的：

cudaMemcpyToSymbol(ints,pFlts,640,1920,cudaMemcpyDeviceToDevice); // second copy

它的意思是：「從 pFlts 數組複製共 640 個字節到 ints 數組，存放位置從 ints 數組起始處往後 1920 個字節開始。」

這是行不通的。ints 數組只有 640 個字節長，你不能把其中偏移 1920 字節的位置當作目的地。

摘自 cudaMemcpyToSymbol 的文檔：

offset - 從符號起始處算起的偏移量，以字節爲單位

在這種情況下，符號是 ints

也許你想要的是：

cudaMemcpyToSymbol(ints,pFlts+480,640,0,cudaMemcpyDeviceToDevice); // second copy

編輯：爲了回應評論中關於錯誤檢查的問題，我寫了這個簡單的測試程序：

#include <stdio.h> 

#define cudaCheckErrors(msg) /* print msg and exit(1) if cudaGetLastError() is not cudaSuccess */ \
    do { /* do/while(0) lets the macro be used like a single statement */ \
        cudaError_t err_ = cudaGetLastError(); /* reads and clears the last runtime error */ \
        if (err_ != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                    msg, cudaGetErrorString(err_), \
                    __FILE__, __LINE__); /* file and line of the macro invocation */ \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

__constant__ int ints[160]; // 160 ints declared with the __constant__ qualifier
// Minimal repro: copy into `ints` with a nonzero symbol offset and show how the error is reported.
int main(){
    // Comments sit on existing lines so the line number printed via __LINE__ stays the same.
    int *d_ints;            // device source buffer; its contents are never initialized here
    cudaError_t mystatus;   // holds the direct return value of cudaMemcpyToSymbol
    // Allocate a device buffer of 160 ints to act as the copy source.
    cudaMalloc((void **)&d_ints, sizeof(int)*160);
    cudaCheckErrors("cudamalloc fail");
    mystatus = cudaMemcpyToSymbol(ints, d_ints, 160*sizeof(int), 1920, cudaMemcpyDeviceToDevice); // 4th argument: offset of 1920 bytes
    if (mystatus != cudaSuccess) printf("returned value was not cudaSuccess\n"); // check 1: the call's return code (prints only)
    cudaCheckErrors("cudamemcpytosymbol fail"); // check 2: cudaGetLastError() via the macro, which exits on error
    // Reached only if the macro above did not exit.
    printf("OK!\n");
    return 0;
}

當我編譯並運行它，我得到以下的輸出：

returned value was not cudaSuccess 
Fatal error: cudamemcpytosymbol fail (invalid argument at t94.cu:26) 
*** FAILED - ABORTING

這表明在這種情況下，cudaMemcpyToSymbol 函數調用的返回值和 cudaGetLastError() 方法都會報告錯誤。如果我在這個測試用例中將 1920 參數更改爲零，則錯誤消失。

來源

2013-03-13 17:24:38

是的，我把 OFFSET 當成了來源端的偏移量，而不是目標端的偏移量。（完全搞反了，Bassackwards）謝謝 – Doug 2013-03-13 17:30:51

cudaMemcpyToSymbol vs cudaMemcpy

回答

相關問題