我正在調試一段涉及若干 CUDA 操作的冗長代碼。目前在調用 cudaMemcpy(..., ..., cudaMemcpyHostToDevice) 時遇到下面的錯誤,但我不確定它是否真的與這次調用有關:
Check failed: error == cudaSuccess (77 vs. 0) an illegal memory access was encountered
出錯的代碼片段如下:
// Debug probe: allocate a host buffer and a device buffer of the same size,
// copy host -> device, then device -> host, checking every CUDA call.
// The host buffer is not initialized; only the success of the calls matters.
int num_elements = 8294400; // --> I also tried it with "1" here which didn't work either!
const size_t num_bytes = num_elements * sizeof(float);  // size shared by all three calls
float *checkArray_GPU;
float *checkArray = new float[num_elements];
CUDA_CHECK(cudaMalloc(&checkArray_GPU, num_bytes));
CUDA_CHECK(cudaMemcpy(checkArray_GPU, checkArray, num_bytes, cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(checkArray, checkArray_GPU, num_bytes, cudaMemcpyDeviceToHost));
其中 CUDA_CHECK 是一個用來打印任何 CUDA 錯誤的簡單宏(它是現有代碼的一部分,對其他所有 cudaMalloc 和 cudaMemcpy 調用都工作正常,所以它不是問題所在)。奇怪的是,把這個代碼片段放到一個玩具 *.cu
文件中單獨執行時,卻可以正常工作。
所以我的假設是:程序中先前的某些 CUDA 操作產生了未被報告的錯誤,從而導致上面的代碼片段出錯。這可能嗎?有沒有辦法檢查是否存在未被報告的 CUDA 錯誤?
我的另一個猜測是問題可能來自我使用的特定顯卡。我用的是 Nvidia Titan X Pascal、CUDA 8.0 和 cuDNN v5.1。我也嘗試過用一些特殊的編譯器選項來編譯我的代碼,例如
-arch=sm_30 \
-gencode=arch=compute_20,code=sm_20 \
-gencode=arch=compute_30,code=sm_30 \
-gencode=arch=compute_50,code=sm_50 \
-gencode=arch=compute_52,code=sm_52 \
-gencode=arch=compute_52,code=compute_52 \
-gencode=arch=compute_60,code=sm_60 \
-gencode=arch=compute_61,code=sm_61 \
-gencode=arch=compute_62,code=sm_62 \
但到目前為止並沒有幫助。為完整起見,這裏是我目前簡化過的 Makefile:
# Builds $(TARGET).so from $(TARGET).cu with nvcc.
# NOTE: every recipe line below must start with a literal TAB character;
# without it make stops with "missing separator".
NVCC = nvcc
CUDA_INC = -I/usr/local/cuda/include
CUDA_LIB = -L/usr/local/cuda/lib64
TARGET = myProgramm
OPTS = -std=c++11

# Link the object file into a shared library.
$(TARGET).so: $(TARGET).o
	$(NVCC) $(OPTS) -shared $(TARGET).o $(CUDA_LIB) -o $@

# Compile the CUDA source as position-independent code (needed for -shared).
# $< is the first prerequisite, i.e. $(TARGET).cu; the header is listed only
# so that changes to it trigger a rebuild.
$(TARGET).o: $(TARGET).cu headers/some_header.hpp
	$(NVCC) $(OPTS) $(CUDA_INC) -Xcompiler -fPIC -c $< -o $@
有沒有人知道我該如何查明這個問題的根源?
編輯:
使用 CUDA-MEMCHECK 是個好主意。看來錯誤其實發生在更早的 Kernel_set_value
調用過程中:
========= Invalid __global__ write of size 4
========= at 0x00000298 in void Kernel_set_value<float>(unsigned long, unsigned long, float*, float)
========= by thread (480,0,0) in block (30,0,0)
========= Address 0x0005cd00 is out of bounds
========= Saved host backtrace up to driver entry point at kernel launch time
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 (cuLaunchKernel + 0x2c5) [0x209035]
[...]
========= Host Frame:/media/.../myProgramm.so (_ZN5boost6python6detail6invokeIiPFvRKSsENS0_15arg_from_pythonIS4_EEEEP7_objectNS1_11invoke_tag_ILb1ELb0EEERKT_RT0_RT1_ + 0x2d) [0x3e5eb]
[...]
=========
========= Program hit cudaErrorLaunchFailure (error 4) due to "unspecified launch failure" on CUDA API call to cudaMemcpy.
========= Saved host backtrace up to driver entry point at error
========= Host Frame:/usr/lib/x86_64-linux-gnu/libcuda.so.1 [0x2f4e33]
========= Host Frame:/media/.../myProgramm.so [0x7489f]
F0703 16:23:54.840698 26207 myProgramm.cu:411] Check failed: error == cudaSuccess (4 vs. 0) unspecified launch failure
[...]
========= Host Frame:python (Py_Main + 0xb5e) [0x66d92]
========= Host Frame:/lib/x86_64-linux-gnu/libc.so.6 (__libc_start_main + 0xf5) [0x21f45]
========= Host Frame:python [0x177c2e]
=========
*** Check failure stack trace: ***
========= Error: process didn't terminate successfully
========= Internal error (20)
========= No CUDA-MEMCHECK results found
而且 Kernel_set_value
函數在玩具例子中也能正常工作。使用 Kernel_set_value
時有什麼需要特別注意的地方嗎?下面是它的源代碼以及相應的輔助函數。
// Threads per block used by the launch helpers below.
#define CUDA_NUM_THREADS 512
// Upper bound on the number of blocks the helpers will request.
#define MAX_NUM_BLOCKS 2880

/**
 * Number of blocks to launch for N elements: ceil(N / CUDA_NUM_THREADS),
 * clamped to MAX_NUM_BLOCKS.
 *
 * The clamp is applied in size_t *before* narrowing to int, so a very large N
 * can no longer wrap to a negative or garbage block count.
 *
 * @param N  number of elements to process
 * @return   block count in [0, MAX_NUM_BLOCKS]; 0 only when N == 0, in which
 *           case the caller must skip the launch (a grid of 0 blocks is an
 *           invalid launch configuration).
 */
inline int CUDA_GET_BLOCKS(const size_t N) {
    const size_t threads_per_block = size_t(CUDA_NUM_THREADS);
    // Ceil-division written as quotient + remainder test so that it cannot
    // overflow the way (N + threads - 1) can for N close to SIZE_MAX.
    const size_t blocks_needed =
        N / threads_per_block + (N % threads_per_block != 0 ? 1 : 0);
    return blocks_needed < size_t(MAX_NUM_BLOCKS) ? int(blocks_needed)
                                                  : int(MAX_NUM_BLOCKS);
}

/**
 * Number of elements each thread has to handle so that
 * CUDA_GET_BLOCKS(N) * CUDA_NUM_THREADS threads cover all N elements.
 *
 * @param N  number of elements to process
 * @return   ceil(N / total_threads); 0 when N == 0 (no threads, no work)
 *           instead of dividing by zero.
 */
inline size_t CUDA_GET_LOOPS(const size_t N) {
    const size_t total_threads =
        size_t(CUDA_GET_BLOCKS(N)) * size_t(CUDA_NUM_THREADS);
    if (total_threads == 0) {
        return 0;  // N == 0: avoid division by zero
    }
    return N / total_threads + (N % total_threads != 0 ? 1 : 0);
}
/**
 * Sets GPUdst[i] = value for every i in [0, min(N, CUDA_NUM_LOOPS * T)),
 * where T = gridDim.x * blockDim.x is the total number of launched threads.
 *
 * Launch layout: 1-D grid of 1-D blocks; no shared memory is used. The block
 * size is read from blockDim.x, so any block size works as long as
 * CUDA_NUM_LOOPS * T >= N when the whole array is to be filled.
 *
 * Each thread starts at its global thread index and advances by T per
 * iteration, so in every iteration neighbouring threads write neighbouring
 * elements.
 *
 * @param CUDA_NUM_LOOPS  maximum number of elements written by one thread
 * @param N               number of elements in GPUdst
 * @param GPUdst          destination array; it is dereferenced without any
 *                        validation, so it must be a valid device-accessible
 *                        pointer to at least N elements
 * @param value           value stored into each element
 */
template <typename Dtype>
__global__ void Kernel_set_value(size_t CUDA_NUM_LOOPS, size_t N, Dtype* GPUdst, Dtype value){
    // Total number of threads in the launch; used as the per-iteration step.
    const size_t stride = size_t(gridDim.x) * size_t(blockDim.x);
    // Global thread index, computed in size_t to avoid 32-bit overflow.
    size_t idx = size_t(blockDim.x) * size_t(blockIdx.x) + size_t(threadIdx.x);
    // idx only grows, so the first idx >= N ends this thread's work.
    for (size_t pass = 0; pass < CUDA_NUM_LOOPS && idx < N; ++pass, idx += stride){
        GPUdst[idx] = value;
    }
}
'num_crop_voxels' 是否等於 'num_elements'? – sgarizvi
是的,我會編輯 – mcExchange
請用 cuda-memcheck 運行你的代碼。 – talonmies