Build googlemock library

When the tfhe project is built with the ENABLE_TESTS option enabled, the googlemock library is required; otherwise the link step fails with the following errors:

[ 54%] Linking CXX executable unittests-fftw
/usr/bin/ld: cannot find -lgmock
collect2: error: ld returned 1 exit status
make[2]: *** [test/CMakeFiles/unittests-fftw.dir/build.make:229: test/unittests-fftw] Error 1
make[1]: *** [CMakeFiles/Makefile2:1290: test/CMakeFiles/unittests-fftw.dir/all] Error 2
make: *** [Makefile:95: all] Error 2

To build googlemock, you can follow this guide:

(1) Download googletest:

$ git clone https://github.com/google/googletest.git

(2) Execute autoreconf command:

$ cd googletest/googlemock
$ autoreconf -fvi

(3) Create libgmock:

$ cd make
$ make
$ ar -rv libgmock.a gtest-all.o gmock-all.o

(4) Copy libgmock.a into /usr/local/lib:

$ cp libgmock.a /usr/local/lib/

Now you can link against libgmock.a.

Don’t use “-G” compile option for profiling CUDA programs

I use Nsight as an IDE to develop CUDA programs:

capture

Use nvprof to measure the load efficiency and store efficiency of accessing global memory:

$ nvprof --devices 2 --metrics gld_efficiency,gst_efficiency ./cuHE_opt

................... CRT polynomial Terminated ...................

==1443== Profiling application: ./cuHE_opt
==1443== Profiling result:
==1443== Metric result:
Invocations   Metric Name      Metric Description                  Min       Max       Avg
Device "Tesla K80 (2)"
Kernel: gpu_cuHE_crt(unsigned int*, unsigned int*, int, int, int, int)
          1   gld_efficiency   Global Memory Load Efficiency    62.50%    62.50%    62.50%
          1   gst_efficiency   Global Memory Store Efficiency  100.00%   100.00%   100.00%
Kernel: gpu_crt(unsigned int*, unsigned int*, int, int, int, int)
          1   gld_efficiency   Global Memory Load Efficiency    39.77%    39.77%    39.77%
          1   gst_efficiency   Global Memory Store Efficiency  100.00%   100.00%   100.00%

But if I use nvcc to compile the program directly:

 nvcc -arch=sm_37 cuHE_opt.cu  -o cuHE_opt

This time nvprof reports different results:

$ nvprof --devices 2 --metrics gld_efficiency,gst_efficiency ./cuHE_opt
......
................... CRT polynomial Terminated ...................

==1801== Profiling application: ./cuHE_opt
==1801== Profiling result:
==1801== Metric result:
Invocations   Metric Name      Metric Description                  Min       Max       Avg
Device "Tesla K80 (2)"
Kernel: gpu_cuHE_crt(unsigned int*, unsigned int*, int, int, int, int)
          1   gld_efficiency   Global Memory Load Efficiency   100.00%   100.00%   100.00%
          1   gst_efficiency   Global Memory Store Efficiency  100.00%   100.00%   100.00%
Kernel: gpu_crt(unsigned int*, unsigned int*, int, int, int, int)
          1   gld_efficiency   Global Memory Load Efficiency    50.00%    50.00%    50.00%
          1   gst_efficiency   Global Memory Store Efficiency  100.00%   100.00%   100.00%

After some investigation, I found the cause: the -G compile option was used in the first case. As the nvcc documentation states:

--device-debug (-G)
    Generate debug information for device code. Turns off all optimizations.
    Don't use for profiling; use -lineinfo instead.

So don’t use -G compile option for profiling CUDA programs.

Enable C++11 support for NVCC compiler in Nsight

When using Nsight as an IDE to develop CUDA programs, sometimes, the program may require C++11 support, otherwise errors like this will occur:

/usr/lib/gcc/x86_64-pc-linux-gnu/5.4.0/include/c++/bits/c++0x_warning.h:32:2: error: #error This file requires compiler and library support for the ISO C++ 2011 standard. This support must be enabled with the -std=c++11 or -std=gnu++11 compiler options.
 #error This file requires compiler and library support \
  ^
make: *** [src/subdir.mk:20: src/cuHE_opt.o] Error 1

To enable C++11 support, configure the project as follows:
(1) Right-click the project, and select the last item: Properties.

1

(2) Check Settings->Tool Settings->Code Generation->Enable C++11 support (-std=c++11).

2

 

Is the warp size always 32 in CUDA?

Last week, I began to read the awesome Professional CUDA C Programming, and bumped into the following words in GPU Architecture Overview section:

CUDA employs a Single Instruction Multiple Thread (SIMT) architecture to manage and execute threads in groups of 32 called warps.

Since this book was published in 2014, I wondered whether the warp size is still 32 in CUDA regardless of the Compute Capability. To find out, I turned to the official CUDA C Programming Guide and got the answer from the Compute Capability table:

capture

Yep, for all Compute Capabilities, the warp size is always 32.

BTW, you can also use the following program to determine the warp size:

#include <stdio.h>

/*
 * Asks the CUDA runtime for the properties of device 0 and prints the
 * warp size it reports. Returns 1 if the query fails, 0 otherwise.
 */
int main(void) {
        cudaDeviceProp props;

        /* Bail out early when the runtime cannot describe device 0. */
        if (cudaGetDeviceProperties(&props, 0) != cudaSuccess) {
                printf("Get device properties failed.\n");
                return 1;
        }

        printf("The warp size is %d.\n", props.warpSize);
        return 0;
}

The running result in my CUDA box is here:

The warp size is 32.

Resolve “Runtime Error” problem when hacking on HackerRank

A few days ago, I was trying to solve the classic maximum subarray conundrum on HackerRank and bumped into an interesting issue. When I submitted the solution, one testcase would fail. But if I ran the testcase individually, the result was correct. After discussing it on IRC and with Support, it seemed the code didn’t satisfy some memory/time limit constraint in the environment, so I began to optimize my code.

My initial Go code is like this:

package main
import "fmt"
import "math"
import "os"

// MaxNonConArray returns the best sum of a non-empty, not necessarily
// contiguous selection of elements of s: the sum of every positive element,
// or the largest single element when that sum is zero.
// An empty slice yields 0.
func MaxNonConArray(s []int) int {
    if len(s) == 0 {
        return 0
    }

    positiveSum := 0
    for i := 0; i < len(s); i++ {
        if s[i] > 0 {
            positiveSum += s[i]
        }
    }
    if positiveSum != 0 {
        return positiveSum
    }

    // The positive sum is zero: fall back to the largest element.
    largest := s[0]
    for i := 1; i < len(s); i++ {
        if s[i] > largest {
            largest = s[i]
        }
    }
    return largest
}

// MaxConArray returns the largest sum over all non-empty contiguous runs
// of s, or 0 when s is empty.
func MaxConArray(s []int) int {
    if len(s) == 0 {
        return 0
    }

    best, running := s[0], s[0]
    for i := 1; i < len(s); i++ {
        x := s[i]
        // Either extend the run ending at the previous element or restart at x.
        running = int(math.Max(float64(running+x), float64(x)))
        best = int(math.Max(float64(running), float64(best)))
    }
    return best
}


// main reads every test case from STDIN into a two-dimension slice and then
// prints, for each case, the contiguous and the non-contiguous maximum sums.
//
// Expected input: the number of cases, then for each case its length followed
// by that many integers. Malformed, truncated or negative-sized input makes
// the program exit with status 1.
func main() {
    //Enter your code here. Read input from STDIN. Print output to STDOUT
    var num int

    // fmt.Scan treats newlines as ordinary whitespace, so parsing does not
    // depend on the exact line layout (CRLF, blank lines, repeated spaces).
    if _, err := fmt.Scan(&num); err != nil || num < 0 {
        os.Exit(1)
    }

    s := make([][]int, num)
    for i := range s {
        var n int
        // A negative length would make the allocation below panic.
        if _, err := fmt.Scan(&n); err != nil || n < 0 {
            os.Exit(1)
        }

        s[i] = make([]int, n)
        for j := range s[i] {
            if _, err := fmt.Scan(&s[i][j]); err != nil {
                os.Exit(1)
            }
        }
    }

    for _, c := range s {
        fmt.Println(MaxConArray(c), MaxNonConArray(c))
    }
}

The main function would allocate a two-dimension slice to accommodate all the input elements. Suddenly I realized a one-dimension slice should be enough, and it could be reused after the wanted value was calculated. So the main code was changed like this:

// main processes the test cases from STDIN one at a time, printing the
// contiguous and the non-contiguous maximum sums of each case as soon as it
// has been read. A single one-dimension buffer is reused across cases.
//
// Expected input: the number of cases, then for each case its length followed
// by that many integers. Malformed, truncated or negative-length input makes
// the program exit with status 1.
func main() {
    //Enter your code here. Read input from STDIN. Print output to STDOUT
    var num int

    // fmt.Scan treats newlines as ordinary whitespace, so parsing does not
    // depend on the exact line layout (CRLF, blank lines, repeated spaces).
    if _, err := fmt.Scan(&num); err != nil {
        os.Exit(1)
    }

    var buf []int
    for i := 0; i < num; i++ {
        var n int
        // A negative length would make the slicing below panic.
        if _, err := fmt.Scan(&n); err != nil || n < 0 {
            os.Exit(1)
        }

        // Grow the shared buffer only when this case is larger than any
        // previous one; every element of s is overwritten before it is read.
        if cap(buf) < n {
            buf = make([]int, n)
        }
        s := buf[:n]
        for j := range s {
            if _, err := fmt.Scan(&s[j]); err != nil {
                os.Exit(1)
            }
        }
        fmt.Println(MaxConArray(s), MaxNonConArray(s))
    }
}

Unfortunately, the testcase still failed this time. So I resorted to the C programming language:

#include <stdio.h>
#include <string.h>
#include <math.h>
#include <stdlib.h>

/*
 * Returns the best sum of a non-empty, not necessarily contiguous selection
 * from array[0..n-1]: the sum of every positive element, or the largest
 * single element when that sum is zero. Returns 0 when n <= 0.
 */
int maxNonConSubArray(int *array, int n) {
    int positiveSum = 0;
    int largest;
    int *p;

    if (n <= 0) {
        return 0;
    }

    for (p = array; p < array + n; p++) {
        if (*p > 0) {
            positiveSum += *p;
        }
    }
    if (positiveSum != 0) {
        return positiveSum;
    }

    /* The positive sum is zero: fall back to the largest element. */
    largest = array[0];
    for (p = array + 1; p < array + n; p++) {
        if (*p > largest) {
            largest = *p;
        }
    }
    return largest;
}

/*
 * Returns the largest sum over all non-empty contiguous runs of
 * array[0..n-1], or 0 when n <= 0.
 */
int MaxConSubArray(int *array, int n) {
    int best, running, i;

    if (n <= 0) {
        return 0;
    }

    best = running = array[0];
    for (i = 1; i < n; i++) {
        /* Either extend the run ending at i-1 or restart at array[i]. */
        int extended = running + array[i];
        running = extended > array[i] ? extended : array[i];
        if (running > best) {
            best = running;
        }
    }
    return best;
}

/*
 * Reads the number of test cases, then for each case its length followed by
 * that many integers, and prints the contiguous and the non-contiguous
 * maximum sums on one line per case.
 *
 * Returns 1 on malformed, truncated or negative-length input and on
 * allocation failure; returns 0 otherwise.
 */
int main() {
    int num = 0;
    /* Enter your code here. Read input from STDIN. Print output to STDOUT */
    if (scanf("%d", &num) != 1) {
        return 1;
    }

    for (int i = 0; i < num; i++) {
        int n = 0, *array = NULL;

        /* A negative length would convert to a huge size_t in calloc. */
        if (scanf("%d", &n) != 1 || n < 0) {
            return 1;
        }

        /*
         * calloc(0, ...) may legitimately return NULL, so request at least
         * one element; NULL then always means a real allocation failure.
         */
        array = calloc(n > 0 ? (size_t)n : 1, sizeof(int));
        if (array == NULL) {
            return 1;
        }
        for (int j = 0; j < n; j++) {
            if (scanf("%d", array + j) != 1) {
                /* Do not leak the buffer on the error path. */
                free(array);
                return 1;
            }
        }
        printf("%d %d\n",MaxConSubArray(array, n), maxNonConSubArray(array, n));
        free(array);
    }
    return 0;
}

In the C code, I need to manage memory manually, but this time the testcase passed and the submission succeeded.