不同數量的線程，不同的答案

所以我有一些神經網絡模擬器代碼在CPU上正常工作，並行版本與串行版本至少有6個小數位，與32個線程單個塊在兩個我的CUDA在Win7個人電腦下，但有1個塊和64個線程的Wt產生稍微不同的值。 Wt值通常不會超過3位小數，並且當我嘗試通過在循環中嵌入__syncthreads（）來消除競爭條件時，Wt值在複製回CPU時顯示爲非A號。不同數量的線程，不同的答案

有人可以給我一個提示，我可能做錯了什麼？我已經包括下面的並行代碼，knlBackProp被稱爲與lSampleQtyReq = 10000，O = 1，和Option =「R」：

// device-global variables to facilitate data transfer 
__device__ __constant__ __align__(8) struct rohanContext devSes; 
__device__ __constant__ struct rohanLearningSet devLearn; 
__device__ __align__(16) struct rohanNetwork devNet; 

__device__ double devdReturn[1024*1024]; 
__device__ double devdRMSE=0; 
__device__ int devlReturn[1024*1024]; 
__device__ int devlTrainable=0; 

extern"C" 
int knlBackProp(struct rohanContext& rSes, long lSampleQtyReq, long o, char Option) 
{mIDfunc /*! divides error in yielded values and back-propagates corrections among weights */ 
// Option S - single sample correction only 
// Option E - keep existing weights, count trainable samples only 
// Option R - perform corrections for all trainable samples 
    int lTotal=0; 

    cudaMemcpyToSymbol("devlTrainable", &lTotal, sizeof(int)); // init return value on both sides 
     mCheckCudaWorked 
    cudaEvent_t start, stop; 
    cudaEventCreate(&start); 
    cudaEventCreate(&stop); 

      cudaEventRecord(start, 0); 
     mtkBackPropMT<<< rSes.iBpropBlocks , rSes.iBpropThreads >>>(lSampleQtyReq, o, Option); 
      cudaEventRecord(stop, 0); 
      mCheckCudaWorked 

    cudaMemcpyFromSymbol(&lTotal, "devlTrainable", sizeof(long)); // retrieve return value 
     mCheckCudaWorked 
    cudaEventSynchronize(stop); 
     float elapsedTime; 
     cudaEventElapsedTime(&elapsedTime, start, stop); 
    conPrintf("DEVICE: Time to complete BackProp kernel: %3.1f ms\n", elapsedTime); 
     cudaEventDestroy(start); 
     cudaEventDestroy(stop); 

    return lTotal; 
} 


__global__ __device__ void mtkBackPropMT(long lSampleQtyReq, long o, char Option) 
{/*! divides error in yielded values and back-propagates corrections among weights */ 
// Option S - single sample correction only 
// Option E - keep existing weights, count trainable samples only 
// Option R - perform corrections for all trainable samples 

    if(Option=='E' || Option=='e'){ // 
     devlTrainable=0; // reset global mem trainable counter 
     subkBackPropEoptMT(lSampleQtyReq, o); 
    } 

    if(Option=='S' || Option=='s'){ 
     devlTrainable=0; // reset global mem trainable counter 
     subkBackPropSoptMT(lSampleQtyReq, false, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval); 
    } 

    if(Option=='R' || Option=='r'){ // 
     devlTrainable=0; // reset global mem trainable counter 
     subkBackPropRoptMT(lSampleQtyReq, o); 
    } 
} 


__device__ void subkBackPropRoptMT(long lSampleQtyReq, long o) 
{/*! flags and counts samples meeting */ 
    long OUTROWLEN=devLearn.iOutputQty+1; // prepare array index and width 
    //long tIx = threadIdx.x + devSes.iEvalThreads * blockIdx.x; // tIx is thread index over the kernel 
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel 
    //long lTotalThreads = devSes.iBpropThreads * devSes.iBpropBlocks; // total number of threads 
    double maxSquared = devSes.dMAX * devSes.dMAX ; //needed to compart to stored delta squared values 

    devlTrainable=0; // clear global mem accumulator; out of bound samples will remain at this value 
    for (long s=0; s<lSampleQtyReq; ++s){ // iterate over samples 
     if(devLearn.gpudSE1024[IDX2C(o, s, OUTROWLEN)] > maxSquared){ // if the MAX criterion is exceeded 
      if(tIx==0)++devlTrainable; // increment the counter 
      subkBackPropSoptMT(s, true, devNet, devNet.Signals, devNet.Zs, devNet.Wt, devNet.Deltas, devLearn.gpuXInputs, devLearn.gpuYEval, devLearn.gpudYEval); 
     } 
    } 
} 


__device__ void subkBackPropSoptMT(long s, int o, rohanNetwork& Net, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * Deltas, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval) 
{/*! propagates adjustment of weights backwards preceeding layers from the chosen network output. */ 
    // s is sample's index 
    // o is an optional method selection parameter; print/don't print as of 2/29/12 
    long index, kindex; // for warpwise loops 
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel 
    long lTotalThreads = gridDim.x * blockDim.x; // total number of threads 
    const cuDoubleComplex cdcZero = { 0, 0 }; 

    /* clear all temp values BP0 */ 
    for (long offset=0; (index =offset+tIx)< MAXNEURONS ; offset+=lTotalThreads){ // index stands for i 
     Deltas[index]=cdcZero; 
     Signals[index]=cdcZero; 
     Zs[index]=cdcZero; 
    } 
    /* re-evaluate sample to load temp values. BPI */ 
    subkEvalSampleBetaMT(devSes, s, Net, (s==0), Signals, Zs, Wt, XInputs, YEval, dYEval); 
    /* begin error calculation. BPII */ 
    cuDoubleComplex Deltastar /* measured error at the chosen network output. */ ; 
    /* calc top layer deltas. */ 
    long TOP=Net.iLayerQty-1; 
    int ROWLEN=Net.iNeuronQTY[TOP]; 
    //for(int i=0; i<Net.iNeuronQTY[TOP]; ++i){ 
    for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TOP] ; offset+=lTotalThreads){ // index stands for i 
     // delta-star = D - Y = Desired output minus actual output from evaluation 
     // D is the cplx coords of the sector of the desired answer  Y is the complex result of evaluation of the given sample, unactivated. */ 
     Deltastar = CxSubtractCxUT( 
         devLearn.gpuDOutputs[ IDX2C(index, s, ROWLEN) ], 
         Signals[Net.iNeuronOfst[TOP]+index]); 
     /* divide the correction; delta = alpha * delta-star/n+1 (but alpha is always 1 for now). */ 
     //Deltas[Net.iNeuronOfst[TOP]+index] = CxDivideRlUT(Deltastar, Net.iDendrtQTY[TOP]); 
     Deltas[Net.iNeuronOfst[TOP]+index] = CxMultiplyRlUT(Deltastar, Net.dINV_S[TOP]); 
    } 
    __syncthreads(); 
    /* Now distribute the correction to lower layers if any. BPII.1 */ 
    if (Net.iLayerQty>2){ /* remember layer 0 = inputs, layer 1 = bottom row, layer {2..iLayerQty-2} = middle row, layer iLayerQty-1 = top row. */ 
     for (int L=Net.iLayerQty-1; L>1; --L){ 
      long LAY = L; /* setup access to layers. */ 
      long TRIB = L-1; /* trib for tributary.*/ 
      int iTributQTY=Net.iNeuronQTY[TRIB]; 
      //int Sj=Net.iDendrtQTY[TRIB]; if (TRIB==1) Sj=1; // Sj=1 for firest hidden layer 
      for (int i=1; i<Net.iNeuronQTY[LAY]; ++i) { // skip 0th neuron as its weights are either 1 (div identity) or 0 (div forbidden) and don't change anyway 
       // k index must begin at 1, neuron zero not valid for correction 
       //for (int k=1; k<iTributQTY; ++k) { /* the contribution to ith neuron's kth tributary's delta = i's delta/i's weight k. */ 
       for (long offset=1; (kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k 
            Deltas[Net.iNeuronOfst[TRIB]+kindex] 
        = CxAddCxUT (Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
         CxDivideCxUT( 
          Deltas[Net.iNeuronOfst[LAY]+i] , 
          Wt[IDX2C(Net.iWeightOfst[LAY]+kindex, i, iTributQTY)])); 
       } 
      } 
      for (long offset=1; (kindex =offset+tIx)< iTributQTY ; offset+=lTotalThreads){ // kindex stands for k 
       //cuDoubleComplex preDiv=Deltas[Net.iNeuronOfst[TRIB]+kindex]; // diagnostic purpose only, remove if removing other diags 
       //Deltas[Net.iNeuronOfst[TRIB]+kindex] 
       // = CxDivideRlUT( 
       //  Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
       //  Sj); 
       Deltas[Net.iNeuronOfst[TRIB]+kindex] 
        = CxMultiplyRlUT( 
         Deltas[Net.iNeuronOfst[TRIB]+kindex] , 
         Net.dINV_S[TRIB]); 
      } 
     } 
    } 
    __syncthreads(); 
    /* error distribution completed */ 
    /* and now update the weights BP III */ 
    /* adj weights on first hidden layer. */ 
     int FHID = 1; 
     int SIG = 0; 
     int iSignalQTY=Net.iNeuronQTY[SIG]; //rSes.rLearn->iInputQty+1; 
     int iHidWidth=Net.iNeuronQTY[FHID]; 
    for (int k=1; k<iHidWidth; ++k){ 
     //for (int i=0; i<iSignalQTY; ++i){ 
     for (long offset=0; (index =offset+tIx)< iSignalQTY ; offset+=lTotalThreads){ // index stands for i 
      /* dW=d*xbar/s1/|z|= neuron's delta * input's conjugate/(dendrites+1 * abs of input i). */ 
         Wt[IDX2C(Net.iWeightOfst[FHID]+index, k, iSignalQTY)] 
      =CxAddCxUT(Wt[IDX2C(Net.iWeightOfst[FHID]+index, k, iSignalQTY)] , 
       CxDivideRlUT( 
        CxMultiplyCxUT( 
         Deltas[Net.iNeuronOfst[FHID]+k] , 
         CxConjugateUT(Signals[Net.iNeuronOfst[SIG]+index]) 
        ) , 
        CxAbsUT(Zs[Net.iNeuronOfst[FHID]+k]) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12 
       ) 
      ); 
     } 
    } 
    __syncthreads(); 
    /* re-evaluate sample to update temp values. */ 
    subkEvalSampleBetaMT(devSes, s, Net, false, Signals, Zs, Wt, XInputs, YEval, dYEval); 
    if (Net.iLayerQty>2){ 
     /* now use those outputs' conjugates and the deltas to adjust middle layers. BP III.1 */ 
     for (int L=2; L<Net.iLayerQty-1; ++L){ 
      /* setup access to layers. */ 
      long LAY = L; 
      long TRIB = L-1; 
      //int iLayWidth=Net.iNeuronQTY[LAY]; 
      int iTribWidth=Net.iNeuronQTY[TRIB]; 
      for (int k=1; k<Net.iNeuronQTY[LAY]; ++k){ 
       //for (int i=0; i<Net.iNeuronQTY[TRIB]; ++i){ 
       for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[TRIB] ; offset+=lTotalThreads){ // index stands for i 
        /* the adjustment added to kth neuron's ith trib's weight = k's delta * complex conjugate of i's signal/(abs of k's previous-wt product-sum * dendrites+1) . */ 
           Wt[IDX2C(Net.iWeightOfst[LAY]+index, k, iTribWidth)] 
        =CxAddCxUT(Wt[IDX2C(Net.iWeightOfst[LAY]+index, k, iTribWidth)] , 
         CxDivideRlUT( 
          CxMultiplyCxUT( 
           Deltas[Net.iNeuronOfst[LAY]+k] , 
           CxConjugateUT(Signals[Net.iNeuronOfst[TRIB]+index]) 
          ) , 
          ( 
           CxAbsUT(Zs[Net.iNeuronOfst[LAY]+k]) // N+1 denominator factor is considered redundant - JAW & IA 2/27/12 
          ) 
         ) 
        ); 
       } 
      } 
      /* layer is complete. */ 
      subkEvalSampleBetaMT(devSes, s, Net, true, Signals, Zs, Wt, XInputs, YEval, dYEval); 
     } 
    } 
    __syncthreads(); 

    /* correct output layer BP III.3 */ 
    long SUB = TOP-1; 
    //int iTopWidth=Net.iNeuronQTY[TOP]; 
    int iSubWidth=Net.iNeuronQTY[SUB]; 

    for (int k=1; k<Net.iNeuronQTY[TOP]; ++k){ 
     //for (int i=0; i<Net.iNeuronQTY[SUB]; ++i){ 
     for (long offset=0; (index =offset+tIx)< Net.iNeuronQTY[SUB] ; offset+=lTotalThreads){ // index stands for i 
      /* For last layer only, adjustment to kth neuron's ith weight = k's delta * complex conjugate of i's signal/(dendrites+1) . */ 
         Wt[IDX2C(Net.iWeightOfst[TOP]+index, k, iSubWidth)] 
      =CxAddCxUT(Wt[IDX2C(Net.iWeightOfst[TOP]+index, k, iSubWidth)] , 
       CxMultiplyCxUT( 
        Deltas[Net.iNeuronOfst[TOP]+k] , 
        CxConjugateUT(Signals[Net.iNeuronOfst[SUB]+index]) 
       ) 
      ); // N+1 denominator factor is considered redundant - JAW & IA 2/27/12 
     } 
    } 
    /* backprop is complete. */ 
} 


__device__ void subkEvalSampleBetaMT(rohanContext& Ses, long s, rohanNetwork& Net, int o, cuDoubleComplex * Signals, cuDoubleComplex * Zs, cuDoubleComplex * Wt, cuDoubleComplex * XInputs, cuDoubleComplex * YEval, double * dYEval) 
{// Beta uses fixed length fields instead of nested pointer layers 
    // delta squared is not updated, since they'll be updated when RMSE is checked at the end of a pass through the learning set 
    long index, kindex; // for warpwise loops 
    long tIx = threadIdx.x + blockDim.x * blockIdx.x; // tIx is thread index over the kernel 
    long lTotalThreads = gridDim.x * blockDim.x; // total number of threads 
    const cuDoubleComplex cdcZero = { 0, 0 }; 
    /*! layer zero (inputs) is special. */ 
    long INROWLEN=Net.iNeuronQTY[0];//rSes.rLearn->iInputQty+1; 
    //for (int i=0; i<INROWLEN; ++i){ 
    for (long offset=0; (index =offset+tIx)< INROWLEN ; offset+=lTotalThreads){ // index stands for i 
     Signals[Net.iNeuronOfst[0]+index]= XInputs[IDX2C(index, s, INROWLEN)]; 
    } 
    /*! middle and top layers. */ 
    for (int L=1; L<Net.iLayerQty; ++L){ 
     //struct rohanLayer& lay = Net.rLayer[L]; 
     long LAY=L; 
     int TRIB=L-1; // index of previous layer 
     int iNeuronQTY=Net.iNeuronQTY[LAY]; 
     int iSignalQTY=Net.iDendrtQTY[LAY]; // signal qty depends on size of previous layer 
     //for (int k=0; k<iNeuronQTY; ++k){ //Neuron zero is not skipped, its output should be 1+0i as a check 
     for (long offset=0; (kindex =offset+tIx)< iNeuronQTY ; offset+=lTotalThreads){ // kindex stands for k 
      Zs[Net.iNeuronOfst[LAY]+kindex]=cdcZero; 
      for (int i=0; i<iSignalQTY; ++i){ //walk weights on inputs from previous layer 
          Zs[Net.iNeuronOfst[LAY]+kindex] = 
       CxAddCxUT(Zs[Net.iNeuronOfst[LAY]+kindex] , 
        CxMultiplyCxUT(
         Wt[IDX2C(Net.iWeightOfst[LAY] + i, kindex, iSignalQTY)], 
         Signals[Net.iNeuronOfst[TRIB]+i])) ; 
      } 
      // ACTIVATE // 
      Signals[Net.iNeuronOfst[LAY]+kindex] = CxActivateUT(Zs[Net.iNeuronOfst[LAY]+kindex]); 
     } 
    } 
    /*! last layer values are converted and stored here */ 
    long TOP = Net.iLayerQty-1; 
    long OUTROWLEN=Net.iNeuronQTY[TOP]; 
    //for (int i=0; i<Net.iNeuronQTY[TOP]; ++i){ // continuous conversion begins here 
    for (long offset=0; (index =offset+tIx)< OUTROWLEN ; offset+=lTotalThreads){ // index stands for i 
     YEval[IDX2C(index, s, OUTROWLEN)]= Signals[Net.iNeuronOfst[TOP]+index] ; // store final complex output(s) 
     dYEval[IDX2C(index, s, OUTROWLEN)]=FUnitCxUT(YEval[IDX2C(index, s, OUTROWLEN)]) * Net.iSectorQty; // convert final complex outputs to sectors and store that 
     if(devLearn.iContOutputs==false) // round off decimal if disc activation is set 
      dYEval[IDX2C(index, s, OUTROWLEN)]=int(dYEval[IDX2C(index, s, OUTROWLEN)]); 
    } 
    /*! end of sample evaluation. */ 
} 

__device__ cuDoubleComplex CxActivateUT(const cuDoubleComplex Z) 
{/// applies ContActivation or discrete activation function to cx neuron output and returns Phi(Z) 
    /// This fn should be phased out in favor of a GPU device vector based fn 
    cuDoubleComplex phi; 
    if (devNet.bContActivation) { // apply ContActivation activation function to weighted sum : phi(z)=z/|z| 
     phi = CxDivideRlUT(Z, CxAbsUT(Z)); 
    } 
    else { // apply Discrete activation function to weighted sum : s=int(arctan(z)*k/2pi), phi(z)=(X(s),Y(s)) 
     double theta = atan2(Z.y, Z.x); // theta = arctan y/x 
     int iSector = (int)((theta * devNet.dK_DIV_TWO_PI) + devNet.iSectorQty) % devNet.iSectorQty; 
     phi = devNet.gpuSectorBdry[iSector]; 
     //printf(" %f+%fi %d Activate\n", phi.x, phi.y, threadIdx.x); 
    } 
    return phi; 
}

來源

2012-04-27 jwilson75503

我對cuda並不熟悉，但您發佈的代碼超過300行。曾聽說[sscce.org]（http://sscce.org/）？ – 2012-04-27 05:15:15

你不可能真正期望有人坐下來篩選300多行難以理解的遞歸代碼，這些代碼實際上無法運行，以查找可能是微妙的記憶競賽的內容。如果你不能努力將問題的範圍縮小到其他人可以編譯和運行的緊湊的repro案例，爲什麼你應該指望別人會努力回答你的問題？ – talonmies 2012-04-27 06:29:25

所以，我不會讀所有的代碼，但我可以給你一個強烈的暗示。 warp的大小是32個線程，所以64個線程的case會運行兩個warps/block - 在前一種情況下，你不能有任何基於指令指針的競爭條件，但是，在第二種情況下，你將有兩個組在不同的時間安排不同IP的線程。你可能已經知道很多這個（因此syncthreads），但是上述事實上幾乎可以確定你只是還有一個你尚未解決的競爭條件。

放入同步線程是嘗試隔離它的良好開端。你確定在你的循環中，一個warp的源數據不會被另一個warp覆蓋嗎？如果不嘗試將syncthreads放入內部循環中，僅用於調試目的，以查看可能導致競爭條件的原因。

來源

2012-04-27 06:01:17 Crowley9

雖然這是一個很好的建議，但在內部循環中使用'__syncthreads（）'需要小心。執行模型要求給定warp中的每個線程在warp繼續前執行'bar.sync'指令。如果warp中有分支分歧，以致一個或多個線程圍繞'bar.sync'指令分支，則會導致死鎖。 – talonmies 2012-04-27 06:33:41

CUDA有許多可用的源代碼級調試器。我建議你單步執行代碼並幫助縮小問題的範圍。作爲一種風格，避免長時間使用。它不是便攜式的。 – 2012-04-27 06:48:38

不同數量的線程，不同的答案

回答

相關問題