當我嘗試從設備複製節點數組返回到主機我在Node.m [...]而不是值中得到零,即使當我在內核中打印節點時,它顯示值正確設置。不幸的是,我無法自己發現任何錯誤,所以我懇請您尋求幫助。我用visual studio編譯器和計算能力編譯代碼3.來自this的代碼答案正在爲我工作。cudaMemcpy結構設備主機不工作
我粘貼整個代碼,但唯一有意義的部分是
__global__ void divideLeft(Node * nodes,float * leftSide){...}
和
divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <conio.h>
#include <new>
#include <cmath>
#define ERRCHECK(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true,bool wait=true)
{
if (code != cudaSuccess)
{
fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
if (wait) getch();
if (abort) exit(code);
}
}
#define MSIZE 36
#define INPUT_SIZE(N) N*5 - 3*2
#define PARENT(i) (i-1)/2
#define LEFT(i) 2*i + 1
#define RIGHT(i) 2*i + 2
#define BOTTOM_HEAP_NODES_COUNT(N) (N-2)/3 //size of input must be 2+3n,n>1
#define HEAP_SIZE(N) 2*BOTTOM_HEAP_NODES_COUNT(N)-1
#define FIRST_LEVEL_SIZE 19
#define ROW_LENGTH 5
#define FIRST_LVL_MAT_SIZE 5
#define XY(x,y) x*6+y
__constant__ int dHigherTreeLevelThreshold;
__constant__ int dNodesCount;
__constant__ int dLeftSize;
__constant__ int dHeapSize;
__constant__ int dBottomNodes;
__constant__ int dRemainingNodes;
__constant__ int dRightCols;
__constant__ int dInputCount;
struct Node
{
float m[MSIZE];
float *x;
};
__device__ __host__ void printNode(Node node);
__global__ void divideLeft(Node * nodes,float * leftSide)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if(idx>=dBottomNodes)
return;
int nodeIdx = idx + dRemainingNodes - (idx >= dHigherTreeLevelThreshold)*dBottomNodes;
// printf("%d %d\n",idx,nodeIdx);
Node node = nodes[nodeIdx];
idx*=5*3;
node.m[XY(3,3)] = leftSide[idx+2]/3;
node.m[XY(3,2)] = leftSide[idx+3]/2;
node.m[XY(3,1)] = leftSide[idx+4];
node.m[XY(2,3)] = leftSide[idx+6]/2;
node.m[XY(2,2)] = leftSide[idx+7]*2/3;
node.m[XY(2,1)] = leftSide[idx+8];
node.m[XY(2,4)] = leftSide[idx+9];
node.m[XY(1,3)] = leftSide[idx+10];
node.m[XY(1,2)] = leftSide[idx+11];
node.m[XY(1,1)] = leftSide[idx+12];
node.m[XY(1,4)] = leftSide[idx+13];
node.m[XY(1,5)] = leftSide[idx+14];
node.m[XY(4,2)] = leftSide[idx+15];
node.m[XY(4,1)] = leftSide[idx+16];
node.m[XY(4,4)] = leftSide[idx+17]*2/3;
node.m[XY(4,5)] = leftSide[idx+18]/2;
node.m[XY(5,1)] = leftSide[idx+20];
node.m[XY(5,4)] = leftSide[idx+21]/2;
node.m[XY(5,5)] = leftSide[idx+22]/3;
printNode(node);
}
void leftSideInit(float * leftSide,int size)
{
for(int i = 0;i<size;i++)
{
leftSide[i] = 1;//(i+1)%26;
}
}
int main(){
ERRCHECK(cudaSetDevice(0));
int leftCount = 11;
int leftSize = leftCount*5;
int rightSize = 10;
int heapSize = HEAP_SIZE(leftCount);
int bottomNodes = BOTTOM_HEAP_NODES_COUNT(leftCount);
int greatestPowerOfTwo = pow(2,(int)log2(bottomNodes));
int remainingNodes = heapSize - greatestPowerOfTwo;
ERRCHECK(cudaMemcpyToSymbol(dBottomNodes,&bottomNodes,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dHigherTreeLevelThreshold,&greatestPowerOfTwo,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dRemainingNodes,&remainingNodes,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dRightCols,&rightSize,sizeof(int)));
ERRCHECK(cudaMemcpyToSymbol(dHeapSize,&heapSize,sizeof(int)));
float * leftSide = new float[leftSize];
float * rightSide = new float[rightSize];
Node * nodes = new Node[heapSize];
Node * dNodes = nullptr;
float * dLeftSide =nullptr;
leftSideInit(leftSide,leftSize);
ERRCHECK(cudaMalloc(&dNodes,sizeof(Node)* heapSize));
ERRCHECK(cudaMemset(dNodes,0,sizeof(Node)*heapSize));
ERRCHECK(cudaMalloc(&dLeftSide,leftSize*sizeof(float)));
ERRCHECK(cudaMemcpy(dLeftSide,leftSide,leftSize*sizeof(float),cudaMemcpyHostToDevice));
divideLeft<<<1,1>>>(dNodes,dLeftSide);
ERRCHECK(cudaDeviceSynchronize());
ERRCHECK(cudaGetLastError());
ERRCHECK(cudaMemcpy(nodes,dNodes,sizeof(Node) * heapSize,cudaMemcpyDeviceToHost));
printNode(nodes[3]);
delete [] nodes;
cudaFree(dNodes);
ERRCHECK(cudaDeviceReset());
getch();
return 0;
}
__device__ __host__ void printNode(Node node)
{
for(int i= 0;i<6;i++)
printf("%.3f %.3f %.3f %.3f %.3f %.3f\n",node.m[XY(i,0)],node.m[XY(i,1)],node.m[XY(i,2)],node.m[XY(i,3)],node.m[XY(i,4)],node.m[XY(i,5)]);
}
根據您的代碼,我無法確定可能出錯的範圍,但您可能想要發佈如何定義Node類/結構。 –
@ Tae-SungShin'struct Node'在發佈代碼中定義。 –