我使用OpenCL實現了以下Cholesky分解算法。代碼呈現隨機行爲。它只匹配CPU的輸出次數。有人可以幫我弄清楚我的實施有什麼問題。OpenCL Cholesky分解
這裏是算法:
procedure CHOLESKY(A)
int i, j, k;
for k := 0 to n − 1 do /* 1st loop */
/* Obtain the square root of the diagonal element. */
A[k, k] := A[k, k];
for j := k + 1 to n − 1 do /* 2nd loop */
/* The division step. */
A[k, j] := A[k, j]/A[k, k];
end for
for i := k + 1 to n − 1 do /* 3rd loop */
for j := i to n − 1 do /* 4th loop */
/* The elimination step. */
A[i, j] := A[i, j] - A[k, i] × A[k, j];
end for
end for
end for
方法論並行上述算法:
從算法,消除步驟是最昂貴的。所以我在主機代碼中有最外層的循環 ,我在循環中調用內核。內核的一次運行基本上是 對應於第三循環的單次迭代。因此,我推出(n-1) - (k + 1)+ 1個工作組。工作組中的工作項數量設置爲n/2。第二個for循環也在內核中進行計算,但是我只允許第一個工作組來執行它。
有關東道國CODE
// for a 10 X 10 matrix, MATRIX_SIZE = 10
localWorkSize[0] = MATRIX_SIZE/2;
stride = MATRIX_SIZE/2;
cl_event event;
for(k = 0; k < MATRIX_SIZE; k++)
{
int isize = (MATRIX_SIZE-1) - (k+1) + 1;
int num_blocks = isize;
if(num_blocks <= 0)
num_blocks = 1;
globalWorkSize[0] = num_blocks * WA/2;
errcode = clSetKernelArg(clKernel, 0, sizeof(int), (void *)&k);
errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A);
errcode |= clSetKernelArg(clKernel, 2, sizeof(int), (void *)&stride);
errcode = clEnqueueNDRangeKernel(clCommandQueue,
clKernel, 1, NULL, globalWorkSize,
localWorkSize, 0, NULL, &event);
OpenCL_CheckError(errcode, "clEnqueueNDRangeKernel");
clFinish(clCommandQueue);
}
內核代碼
__kernel void
batchedCholesky(__global float *U, int k, int stride)
{
int tx = get_global_id(0);
unsigned int j;
unsigned int num_rows = MATRIX_SIZE;
if(tx==0)
{
// Take the square root of the diagonal element
U[k * num_rows + k] = sqrt(U[k * num_rows + k]);
}
barrier(CLK_GLOBAL_MEM_FENCE);
int offset = (k+1); //From original loop
int jstart = get_local_id(0) + offset;
int jstep = stride;
int jtop = num_rows - 1;
int jbottom = (k + 1);
//Do work for this i iteration
//Division step
if(get_group_id(0) == 0)
{
for(j = jstart; (j >= jbottom) && (j <= jtop); j+=jstep)
{
U[k * num_rows + j] /= U[k * num_rows + k]; // Division step
}
}
barrier(CLK_GLOBAL_MEM_FENCE);
j = 0;
int i = get_group_id(0) + (k+1);
offset = i;
jstart = get_local_id(0) + offset;
jbottom = i;
for(j = jstart; j >= jbottom && j <= jtop; j += jstep)
U[i * num_rows + j] -= U[k * num_rows + i] * U[k * num_rows + j];
barrier(CLK_GLOBAL_MEM_FENCE);
}
我看到三個'CLK_LOCAL_MEM_FENCE'障礙但沒有使用本地內存。你的意思是'CLK_GLOBAL_MEM_FENCE'?或者這不是完整的相關代碼? – 2014-10-07 17:33:30
是的,這是正確的。它應該是CLK_GLOBAL_MEM_FENCE。 – user1274878 2014-10-07 17:45:47