An empirical method of debugging “illegal memory access” bug in CUDA programming

Recently, I encountered “an illegal memory access was encountered” error during CUDA programming. such as:

cudaSafeCall(cudaMemcpy(h_res, d_res, gParams.n*sizeof(uint32), cudaMemcpyDeviceToHost));

Because the kernel in CUDA programming is executed asynchronously, the code which reports error is not the original culprit usually. After referring this piece of code, I encapsulate a new cudaMemoryTest function:

#define cudaSafeCall(call)  \
        do {\
            cudaError_t err = call;\
            if (cudaSuccess != err) \
            {\
                std::cerr << "CUDA error in " << __FILE__ << "(" << __LINE__ << "): " \
                    << cudaGetErrorString(err);\
                exit(EXIT_FAILURE);\
            }\
        } while(0)

void cudaMemoryTest()
{
    const unsigned int N = 1048576;
    const unsigned int bytes = N * sizeof(int);
    int *h_a = (int*)malloc(bytes);
    int *d_a;
    cudaSafeCall(cudaMalloc((int**)&d_a, bytes));

    memset(h_a, 0, bytes);
    cudaSafeCall(cudaMemcpy(d_a, h_a, bytes, cudaMemcpyHostToDevice));
    cudaSafeCall(cudaMemcpy(h_a, d_a, bytes, cudaMemcpyDeviceToHost));

    return 0;
}

And insert this function call to the suspicious positions:

{
    cudaMemoryTest();
    kernel_A<<<...>>>;

    ......

    cudaMemoryTest();
    kernel_B<<<...>>>;

    ......
}

This method can notify me timely once the CUDA memory access is exceptional, then I can investigate further.

Mostly, the reasons causing this issue is NULL pointer or a pointer points to a already freed memory. But in multiple GPUs environment, you must make sure the memory for one operation is allocated in the same device.

nvidia-smi doesn’t display GPU device in order

Today I find nvidia-smi program doesn’t display GPU devices in order:

$ nvidia-smi -L
GPU 0: Tesla P100-PCIE-16GB (UUID: GPU-...)
GPU 1: Tesla P100-PCIE-16GB (UUID: GPU-...)
GPU 2: Tesla P100-PCIE-16GB (UUID: GPU-...)
GPU 3: Tesla V100-PCIE-16GB (UUID: GPU-...)

The above output displays V100‘s device ID is 3. While CUDA-Z display V100‘s device ID is 0:

Capture

My own lscuda also display it as the 0th device:

$ ./lscuda
CUDA Runtime Version:             9.1
CUDA Driver Version:              9.1
GPU(s):                           4
GPU Device ID:                    0
  Name:                           Tesla V100-PCIE-16GB
......

 

The “invalid argument” error of cudaMemcpyAsync

Check following CUDA code:

#include <iostream>

#define cudaSafeCall(call)  \
        do {\
            cudaError_t err = call;\
            if (cudaSuccess != err) \
            {\
                std::cerr << "CUDA error in " << __FILE__ << "(" << __LINE__ << "): " \
                    << cudaGetErrorString(err) << std::endl;\
                exit(EXIT_FAILURE);\
            }\
        } while(0)

int main(void)
{
    char *a, *d_a;
    cudaStream_t st;
    cudaSafeCall(cudaStreamCreate(&st));
    cudaSafeCall(cudaMallocHost(&a, 10));
    cudaSafeCall(cudaMalloc(&d_a, 4));
    cudaSafeCall(cudaMemcpyAsync(a, d_a, 10, cudaMemcpyHostToDevice, st));
    return 0;
}

The d_a is allocated only 4 bytes, but we want to copy 10 bytes. In this case, cudaMemcpyAsync will complain invalid argument error:

$ nvcc test.cu
$ ./a.out
CUDA error in test.cu(21): invalid argument

Specify compiler when using CUDA

I try to build a project which uses CUDA. The CUDA on my machine is the newest 9.1. The following error is reported:

In file included from /opt/cuda/include/host_config.h:50:0,
                 from /opt/cuda/include/cuda_runtime.h:78,
                 from <command-line>:0:
/opt/cuda/include/crt/host_config.h:121:2: error: #error -- unsupported GNU version! gcc versions later than 6 are not supported!
 #error -- unsupported GNU version! gcc versions later than 6 are not supported!
  ^~~~~

Check /opt/cuda/include/crt/host_config.h:

#if __GNUC__ > 6

#error -- unsupported GNU version! gcc versions later than 6 are not supported!

#endif /* __GNUC__ > 6 */

The solution is to specify the gcc-6 compiler during running cmake command:

# cmake -DCMAKE_C_COMPILER=gcc-6 -DCMAKE_CXX_COMPILER=g++-6 ..

The reference is here.