cudaMemcpy結構設備主機不工作

當我嘗試從設備複製節點數組返回到主機我在Node.m [...]而不是值中得到零，即使當我在內核中打印節點時，它顯示值正確設置。不幸的是，我無法自己發現任何錯誤，所以我懇請您尋求幫助。我用visual studio編譯器和計算能力編譯代碼3.來自this的代碼答案正在爲我工作。cudaMemcpy結構設備主機不工作

我粘貼整個代碼，但唯一有意義的部分是

__global__ void divideLeft(Node * nodes,float * leftSide){...}

和

divideLeft<<<1,1>>>(dNodes,dLeftSide); 
ERRCHECK(cudaDeviceSynchronize()); 
ERRCHECK(cudaGetLastError()); 
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost)); 
printNode(nodes[3]);

#include "cuda_runtime.h" 
#include "device_launch_parameters.h" 

#include <stdio.h> 
#include <conio.h> 
#include <new> 
#include <cmath> 

#define ERRCHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); } 
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true) 
{ 
    if (code != cudaSuccess) 
    { 
     fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line); 
     if (wait) getch(); 
     if (abort) exit(code); 
    } 
} 

#define MSIZE 36 
#define INPUT_SIZE(N) N*5 - 3*2 
#define PARENT(i) (i-1)/2 
#define LEFT(i) 2*i + 1 
#define RIGHT(i) 2*i + 2 
#define BOTTOM_HEAP_NODES_COUNT(N) (N-2)/3 //size of input must be 2+3n,n>1 
#define HEAP_SIZE(N) 2*BOTTOM_HEAP_NODES_COUNT(N)-1 
#define FIRST_LEVEL_SIZE 19 
#define ROW_LENGTH 5 
#define FIRST_LVL_MAT_SIZE 5 
#define XY(x,y) x*6+y 

__constant__ int dHigherTreeLevelThreshold; 
__constant__ int dNodesCount; 
__constant__ int dLeftSize; 
__constant__ int dHeapSize; 
__constant__ int dBottomNodes; 
__constant__ int dRemainingNodes; 
__constant__ int dRightCols; 
__constant__ int dInputCount; 

struct Node 
{ 
    float m[MSIZE]; 
    float *x; 
}; 

__device__ __host__ void printNode(Node node); 
__global__ void divideLeft(Node * nodes,float * leftSide) 
{ 
    int idx = blockIdx.x*blockDim.x + threadIdx.x; 
    if(idx>=dBottomNodes) 
     return; 
    int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes; 
// printf("%d %d\n",idx,nodeIdx); 
    Node node = nodes[nodeIdx]; 
    idx*=5*3; 
    node.m[XY(3,3)] = leftSide[idx+2]/3; 
    node.m[XY(3,2)] = leftSide[idx+3]/2; 
    node.m[XY(3,1)] = leftSide[idx+4]; 

    node.m[XY(2,3)] = leftSide[idx+6]/2; 
    node.m[XY(2,2)] = leftSide[idx+7]*2/3; 
    node.m[XY(2,1)] = leftSide[idx+8]; 
    node.m[XY(2,4)] = leftSide[idx+9]; 

    node.m[XY(1,3)] = leftSide[idx+10]; 
    node.m[XY(1,2)] = leftSide[idx+11]; 
    node.m[XY(1,1)] = leftSide[idx+12]; 
    node.m[XY(1,4)] = leftSide[idx+13]; 
    node.m[XY(1,5)] = leftSide[idx+14]; 

    node.m[XY(4,2)] = leftSide[idx+15]; 
    node.m[XY(4,1)] = leftSide[idx+16]; 
    node.m[XY(4,4)] = leftSide[idx+17]*2/3; 
    node.m[XY(4,5)] = leftSide[idx+18]/2; 

    node.m[XY(5,1)] = leftSide[idx+20]; 
    node.m[XY(5,4)] = leftSide[idx+21]/2; 
    node.m[XY(5,5)] = leftSide[idx+22]/3; 
    printNode(node); 
} 

void leftSideInit(float * leftSide,int size) 
{ 
    for(int i = 0;i<size;i++) 
    { 
     leftSide[i] = 1;//(i+1)%26; 
    } 
} 

int main(){ 
    ERRCHECK(cudaSetDevice(0)); 

    int leftCount = 11; 
    int leftSize = leftCount*5; 
    int rightSize = 10; 
    int heapSize = HEAP_SIZE(leftCount); 
    int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount); 
    int greatestPowerOfTwo = pow(2,(int)log2(bottomNodes)); 
    int remainingNodes = heapSize - greatestPowerOfTwo; 

    ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int))); 
    ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int))); 

    float * leftSide = new float[leftSize]; 
    float * rightSide = new float[rightSize]; 
    Node * nodes = new Node[heapSize]; 
    Node * dNodes = nullptr; 
    float * dLeftSide =nullptr; 
    leftSideInit(leftSide,leftSize); 

    ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize)); 
    ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize)); 
    ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float))); 
    ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice)); 
    divideLeft<<<1,1>>>(dNodes,dLeftSide); 
    ERRCHECK(cudaDeviceSynchronize()); 
    ERRCHECK(cudaGetLastError()); 
    ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost)); 
    printNode(nodes[3]); 
    delete [] nodes; 
    cudaFree(dNodes); 

    ERRCHECK(cudaDeviceReset()); 

    getch(); 
    return 0; 
} 

__device__ __host__ void printNode(Node node) 
{ 
    for(int i= 0;i<6;i++) 
     printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",node.m[XY(i,0)],node.m[XY(i,1)],node.m[XY(i,2)],node.m[XY(i,3)],node.m[XY(i,4)],node.m[XY(i,5)]); 

}

來源

2017-02-11 quirell

根據您的代碼，我無法確定可能出錯的範圍，但您可能想要發佈如何定義Node類/結構。 –

@ Tae-SungShin'struct Node'在發佈代碼中定義。 –

在你的內核，你所做的Node的本地副本，你是工作上：

Node node = nodes[nodeIdx];

內核的其餘部分繼續修改您的本地副本node的元素。

但是，在完成所有修改後，您絕不會將本地副本複製回全局副本，因此全局副本保持不變。

爲了解決這個問題，一種可能是在你的內核的末尾添加一行：

nodes[nodeIdx] = node;

順便說一句，我注意到，你的struct Node包含一個指針變量：

struct Node 
{ 
    float m[MSIZE]; 
    float *x; 
};

您應該意識到使用帶嵌入式指針的結構數組可能會有一些特殊的複雜性。你實際上並沒有使用這個變量（x），所以我只是把它作爲評論。您可能想要參考cuda tag info page獲取關於此概念的規範問題（「在CUDA中使用指針數組」）。

來源

2017-02-11 19:30:37

非常感謝，我沒有使用C++的年齡，所以我沒有注意到這個簡單的問題。 – quirell

cudaMemcpy結構設備主機不工作

回答

相關問題