我目前使用AMD GPU。AMD clEnqueueMapBuffer和固定內存
/*device memory*/
pattern_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), NULL, &ret);
text_objA = clCreateBuffer(context, CL_MEM_READ_ONLY, fileLen * sizeof(char)/2, NULL, &ret);
text_objB = clCreateBuffer(context, CL_MEM_READ_ONLY, fileLen * sizeof(char)/2, NULL, &ret);
failure_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
ret_objA = clCreateBuffer(context, CL_MEM_WRITE_ONLY, MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
ret_objB = clCreateBuffer(context, CL_MEM_WRITE_ONLY, MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
/*pinned memory*/
mPattern_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), NULL, &ret);
mText_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, fileLen * sizeof(char), NULL, &ret);
mFailure_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), NULL, &ret);
mRet_obj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, MAX_PATTERN_NUM * sizeof(cl_int) * 2, NULL, &ret);
/*mapped pointer for pinned memory*/
pattern = (cl_char *)clEnqueueMapBuffer(command_queue[0], mPattern_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), 0, NULL, NULL, &ret);
strings = (char *)clEnqueueMapBuffer(command_queue[0], mText_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, fileLen * sizeof(char), 0, NULL, NULL, &ret);
failure = (cl_int *)clEnqueueMapBuffer(command_queue[0], mFailure_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), 0, NULL, NULL, &ret);
matched = (cl_int *)clEnqueueMapBuffer(command_queue[0], mRet_obj, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, MAX_PATTERN_NUM * sizeof(cl_int) * 2, 0, NULL, NULL, &ret);
/*Initialize the mapped pointer*/
ret = clEnqueueWriteBuffer(command_queue[0], pattern_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), (void *)&pattern[0], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[0], text_objA, CL_FALSE, 0, halfSize, (void *)&strings[0], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[0], failure_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), (void *)&failure[0], 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(command_queue[0], kernel[0], 1, NULL, &global_item_size, &local_item_size, 0, NULL, &kmp_event);
clWaitForEvents(1, &kmp_event);
ret = clEnqueueWriteBuffer(command_queue[1], pattern_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_char), (void *)&pattern[0], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[1], text_objB, CL_FALSE, 0, halfSize, (void *)&strings[halfOffset], 0, NULL, NULL);
ret = clEnqueueWriteBuffer(command_queue[1], failure_obj, CL_FALSE, 0, MAX_PATTERN_SIZE * MAX_PATTERN_NUM * sizeof(cl_int), (void *)&failure[0], 0, NULL, NULL);
ret = clEnqueueNDRangeKernel(command_queue[1], kernel[1], 1, NULL, &global_item_size, &local_item_size, 0, NULL, &kmp_event);
clWaitForEvents(1, &kmp_event);
ret = clEnqueueReadBuffer(command_queue[0], ret_objA, CL_FALSE, 0, MAX_PATTERN_NUM * sizeof(cl_int), (void *)&matched[0], 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue[1], ret_objB, CL_FALSE, 0, MAX_PATTERN_NUM * sizeof(cl_int), (void *)&matched[MAX_PATTERN_NUM], 0, NULL, NULL);
這是我固定內存的代碼。
我使用MapBuffer和固定內存來重疊數據傳輸和執行。
我知道我的代碼是用於Nvidia GPU,我認爲Nvidia和AMD之間存在不匹配。
AMD GPU沒有關於重疊傳輸和計算的示例代碼。
所以,我不知道我必須做什麼。
我應該更改哪些代碼才能使我的代碼在AMD GPU中工作?
我在A & B中劃分了text_obj,因爲我想通過將文本拆分爲正面和背面來傳輸文本。
如)文字: 「ababcccc」 text_objA => ABAB和text_objB => CCCC
謝謝!你的意思是如果我嘗試一些選擇,是否有可能在AMD中重疊傳輸和執行?我看到一些提到AMD不支持它的評論。 –
我認爲一些AMD主板確實支持同時計算和傳輸;也許是FirePro。請注意,某些NVIDIA主板具有雙DMA引擎,您可以同時上傳,計算和下載。你需要三個命令隊列來解決這個問題。 – Dithermaster
還有更多問題。 1)我很困惑「下載」。這是否意味着'clEnqueueReadBuffer()'? 2)我參考了AMD優化指南並嘗試了一些選項,但仍然無法正常工作。我有權使用一個緩衝區和兩個命令隊列嗎? 3)創建它們之後,我執行setArg,writeBuffer和NDRangeKernel兩次,並且讀取兩次BufferBuffer。什麼是錯誤的東西? 4)而且,在每次使用writeBuffer和readBuffer後,我都會放入clEnqueueUnmapMemObject()。這樣對嗎? –