CUDA Basics

Structure

  1. Meta
  2. CPU vs. GPU
  3. Coding
  4. Optimization

1. Meta

What's the heaviest object known to humankind?

CUDA

Compute Unified Device Architecture

General-Purpose GPU Computing (GPGPU)

Use Cases

2. CPU vs. GPU

Single Instruction, Multiple Data (SIMD)

CUDA Cores?

3. Coding

C

C++

Fortran

Perl

Python

Ruby

Java

.NET

Matlab

R

Mathematica

IDE?

File Extension

.cu

Compiling

nvcc -o helloWorld helloWorld.cu
  • sequential code
  • kernel code
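For example, a minimal helloWorld.cu (a sketch; device-side printf requires compute capability 2.0 or newer):

#include <stdio.h>

// kernel code: runs on the GPU
__global__ void helloWorld() {
    printf("Hello World from thread %d!\n", threadIdx.x);
}

// sequential code: runs on the CPU
int main() {
    helloWorld<<<1, 4>>>();   // 1 block of 4 threads
    cudaDeviceSynchronize();  // wait for the kernel (and its output) to finish
    return 0;
}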


nvcc splits the work: the sequential host code is handed to the system's C compiler (e.g. gcc), while the kernel code is compiled to PTX (Parallel Thread eXecution), NVIDIA's intermediate instruction set.

Important function qualifiers:

  • __host__: runs on the CPU, called from the CPU (the default)
  • __global__: a kernel, runs on the GPU, called from the CPU
  • __device__: runs on the GPU, called from the GPU
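A sketch of how they combine (illustrative names):

// __device__: GPU-only helper, callable from kernels
__device__ float square(float x) {
    return x * x;
}

// __global__: kernel, launched from the CPU, runs on the GPU
__global__ void squares(float *out) {
    out[threadIdx.x] = square((float) threadIdx.x);
}

// __host__: plain CPU function; the qualifier is implicit and usually omitted
__host__ void launch(float *out) {
    squares<<<1, 32>>>(out);
}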

Memory

cudaMalloc (void** devPtr,
            size_t size)

cudaFree   (void* devPtr)

cudaMemcpy (void* dst,
            const void* src,
            size_t count,
            cudaMemcpyKind kind)
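Each of these returns a cudaError_t that should be checked. The example further below wraps every call in gpuErrchk, which comes from buddyKVGSum.cuh and is not shown here; a plausible definition (an assumption, modeled on the widespread gpuAssert idiom) is:

#include <stdio.h>
#include <stdlib.h>

// hypothetical definition -- the real one lives in buddyKVGSum.cuh
#define gpuErrchk(call) { gpuAssert((call), __FILE__, __LINE__); }

inline void gpuAssert(cudaError_t code, const char *file, int line) {
    if (code != cudaSuccess) {
        fprintf(stderr, "GPUassert: %s %s %d\n",
                cudaGetErrorString(code), file, line);
        exit(code);
    }
}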

5 basic steps

[Figure: each step moves data between host RAM and the GPU.]

1. global memory allocation
2. fill global memory
3. call kernel
4. read global memory
5. free global memory
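A minimal sketch of the five steps for an array of n ints (my_kernel, grid and block are placeholders):

int *host = (int *) malloc(n * sizeof(int));
int *dev;

cudaMalloc((void **) &dev, n * sizeof(int));                     // 1. global memory allocation
cudaMemcpy(dev, host, n * sizeof(int), cudaMemcpyHostToDevice);  // 2. fill global memory
my_kernel<<<grid, block>>>(dev, n);                              // 3. call kernel
cudaMemcpy(host, dev, n * sizeof(int), cudaMemcpyDeviceToHost);  // 4. read global memory
cudaFree(dev);                                                   // 5. free global memory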

Kernel Call

my_kernel<<<grid,block>>>

e.g.

dim3 gsize(3, 2, 1);
dim3 bsize(4, 3, 1);
my_kernel<<<gsize, bsize>>>

Dimensions & IDs

gridDim.{x|y|z}
blockDim.{x|y|z}
blockIdx.{x|y|z}
threadIdx.{x|y|z}
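These are typically combined into a unique global index. A common sketch (1D case):

__global__ void my_kernel(int *data, int n) {
    // unique index of this thread across the whole grid
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    if (i < n) {  // guard: the grid may contain more threads than elements
        data[i] *= 2;
    }
}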

#include "buddyKVGSum.cuh"

// greatest common divisor via repeated subtraction (Euclid's algorithm)
__device__ int gcd(int a, int b) {
    if (a == 0) {
        return b;
    } else {
        while (b != 0) {
            if (a > b) {
                a -= b;
            } else {
                b -= a;
            }
        }

        return a;
    }
}

// least common multiple, reusing an already computed gcd
__device__ int lcm(const int a, const int b, const int gcd) {
    return (a / gcd) * b;
}

// 1 if lcm(a, b) reaches the threshold min_lcm, else 0
__device__ int f(const int a, const int b, const int min_lcm) {
    return lcm(a, b, gcd(a, b)) >= min_lcm ? 1 : 0;
}

// one block computes one output cell c(bx, by); its n x n threads
// compare row by of a with row bx of b pairwise
__global__ void buddyKVGSum(const Matrix a,
        const Matrix b,
        Matrix c,
        const int n,
        const int min_lcm) {
    // dynamic shared memory: two rows of n ints plus one counter at [2*n]
    extern __shared__ int shared_copy[];
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // threads (0,0) and (0,1) stage one row of a and b in shared memory
    if (tx == 0) {
        if (ty == 0) {
            for (int i = 0; i < n; i++) {
                shared_copy(0, i) = a(by, i);
            }

            // shared memory is uninitialized, so the counter must be zeroed
            shared_copy[2*n] = 0;
        } else if (ty == 1) {
            for (int i = 0; i < n; i++) {
                shared_copy(1, i) = b(bx, i);
            }
        }
    }

    __syncthreads();

    // each thread tests one pair; atomicAdd prevents lost updates
    if (f(shared_copy(0, tx), shared_copy(1, ty), min_lcm)) {
        atomicAdd(&shared_copy[2*n], 1);
    }

    __syncthreads();

    // a single thread writes the block's count back to global memory
    if (tx == 0 && ty == 0) {
        c(bx, by) = shared_copy[2*n];
    }
}

// host-side helper: print an n x n matrix
void print(const Matrix a, const int n) {
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            printf("%d ", a(i, j));
        }

        printf("\n");
    }
}

void sum(const Matrix a,
        const Matrix b,
        Matrix c,
        const int n,
        const int min_lcm,
        const int input_size,
        const int verbose) {
    Matrix ad, bd, cd;

    // 1. allocate global memory on the device
    gpuErrchk(cudaMalloc((void **)&ad, input_size));
    gpuErrchk(cudaMalloc((void **)&bd, input_size));
    gpuErrchk(cudaMalloc((void **)&cd, input_size));

    // 2. fill global memory (n and min_lcm are passed to the kernel by
    // value, so they need no device copies)
    gpuErrchk(cudaMemcpy(ad, a, input_size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(bd, b, input_size, cudaMemcpyHostToDevice));

    if (0 != (2 & verbose)) {
        print(a, n);
        printf("\n");
        print(b, n);
        printf("\n");
    }

    // one n x n block per output cell; n * n may not exceed the limit
    // of 1024 threads per block
    dim3 gsize(n, n);
    dim3 bsize(n, n);

    // 3. call kernel; the third launch parameter sizes the dynamic
    // shared memory (two rows plus the counter)
    clock_t time = clock();
    buddyKVGSum<<<gsize, bsize, (2*n+1)*sizeof(int)>>>(ad, bd, cd, n, min_lcm);
    gpuErrchk(cudaDeviceSynchronize());
    time = clock() - time;

    // 4. read global memory back to the host
    gpuErrchk(cudaMemcpy(c, cd, input_size, cudaMemcpyDeviceToHost));

    if (0 != (1 & verbose)) {
        print(c, n);
        printf("\n");
    }

    printf("Execution time: %f\n", (float) time / CLOCKS_PER_SEC);

    // 5. free global memory
    gpuErrchk(cudaFree(ad));
    gpuErrchk(cudaFree(bd));
    gpuErrchk(cudaFree(cd));

    cudaDeviceReset();
}

int main(int argc, char **argv) {
    if (argc < 5) {
        printf("Program must be called with at least 4 parameters.");
        return 42;
    }

    const int n = atoi(argv[1]);
    const int m = atoi(argv[2]);
    const int min_lcm = atoi(argv[3]);
    const int seed = atoi(argv[4]);
    const int verbose = argc > 5 ? atoi(argv[5]) : 0;

    const int matrix_size = n * n;
    const int input_size = matrix_size * sizeof(int);

    Matrix a = (Matrix) malloc(input_size),
           b = (Matrix) malloc(input_size),
           c = (Matrix) malloc(input_size);

    srand(seed);

    int i, j;

    // fill a and b with random values in [1, m-1]
    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            a(i, j) = rand() % (m - 1) + 1;
        }
    }

    for (i = 0; i < n; ++i) {
        for (j = 0; j < n; ++j) {
            b(i, j) = rand() % (m - 1) + 1;
        }
    }

    sum(a, b, c, n, min_lcm, input_size, verbose);

    free(a);
    free(b);
    free(c);

    return EXIT_SUCCESS;
}
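Compile and run as shown earlier (the source file name is an assumption based on the header; arguments: n m min_lcm seed [verbose], where verbose bit 1 prints the result and bit 2 the input matrices):

nvcc -o buddyKVGSum buddyKVGSum.cu
./buddyKVGSum 8 100 50 42 3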

4. Optimization

Optimizations

  1. many threads per SM
  2. same flow for all threads of a warp (see the sketch below)
  3. register & shared memory
  4. prefetching
  5. memory coalescing
  6. tiling
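Point 2 refers to warp divergence: the threads of a warp share one instruction stream, so when they branch differently, both paths are executed one after the other. A minimal sketch (not from the talk):

// divergent: neighboring threads of one warp take different paths,
// so the warp runs the if and the else branch serially
__global__ void divergent(int *a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        if (threadIdx.x % 2 == 0) {
            a[i] += 1;
        } else {
            a[i] -= 1;
        }
    }
}

// uniform: the condition is the same for all 32 threads of a warp,
// so each warp executes only one of the two branches
__global__ void uniform(int *a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        if ((threadIdx.x / 32) % 2 == 0) {
            a[i] += 1;
        } else {
            a[i] -= 1;
        }
    }
}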

Memory Coalescing

Task: read the 16 words

... 0 1 2 3 4 5 6 7
8 9 a b c d e f ...

with 4 threads. Should the access to memory look like this?

[Figure: two access patterns. Interleaved: thread 1 reads 0, 4, 8, c; thread 2 reads 1, 5, 9, d; thread 3 reads 2, 6, a, e; thread 4 reads 3, 7, b, f. In each step (T1...T4) the four threads touch adjacent words, which the hardware coalesces into a single memory transaction. Blocked: thread 1 reads 0-3, thread 2 reads 4-7, thread 3 reads 8-b, thread 4 reads c-f. In each step the threads touch words that lie 4 apart, so the accesses cannot be coalesced.]
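In code the difference is only in the indexing. A sketch (not from the talk) with blockDim.x threads reading 4 words each:

// coalesced: in step k, thread i reads data[k * blockDim.x + i] --
// neighboring threads touch neighboring addresses in every step
__global__ void read_coalesced(const int *data, int *out) {
    int i = threadIdx.x;
    int sum = 0;

    for (int k = 0; k < 4; k++) {
        sum += data[k * blockDim.x + i];
    }

    out[i] = sum;
}

// uncoalesced: thread i reads its own contiguous chunk of 4 words --
// neighboring threads touch addresses 4 words apart in every step
__global__ void read_blocked(const int *data, int *out) {
    int i = threadIdx.x;
    int sum = 0;

    for (int k = 0; k < 4; k++) {
        sum += data[i * 4 + k];
    }

    out[i] = sum;
}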

Tiling

[Figure: a large task is split into tiles; each tile is loaded from global memory into fast on-chip shared memory, processed there, and written back.]
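The classic illustration is tiled matrix multiplication (a sketch, not from the talk; assumes n is a multiple of TILE):

#define TILE 16

// each block computes one TILE x TILE patch of C, streaming matching
// tiles of A and B through shared memory; every loaded value is
// reused TILE times instead of being fetched from global memory again
__global__ void matmulTiled(const float *A, const float *B, float *C, int n) {
    __shared__ float As[TILE][TILE];
    __shared__ float Bs[TILE][TILE];

    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;

    for (int t = 0; t < n / TILE; t++) {
        // load one tile of A and B (coalesced), then wait for the block
        As[threadIdx.y][threadIdx.x] = A[row * n + t * TILE + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(t * TILE + threadIdx.y) * n + col];
        __syncthreads();

        // multiply the two tiles from fast shared memory
        for (int k = 0; k < TILE; k++) {
            acc += As[threadIdx.y][k] * Bs[k][threadIdx.x];
        }
        __syncthreads();
    }

    C[row * n + col] = acc;
}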

Summary

  • CPU: sequential
  • GPU: parallel, same operation, much data
  • cores execute threads in blocks in a grid
  • CUDA is C-like

Care for your memory, optimize its usage, think multidimensional!

CUDA Basics

By Jonas Thelemann


What's cool about graphics cards? If you are past the age at which graphics cards only make you think of games and the highest possible FPS, you will know that they are used for a variety of other purposes: here they mine cryptocurrencies, there they calculate hashes for all sorts of character combinations to crack passwords. But how does that work? How can I program my graphics card? Why doesn't every program on my computer that could benefit from such an impressive performance gain use the graphics card? And why, when buying NVIDIA graphics cards, do I always want those with as many CUDA cores as possible, although I have no idea what they are for? This presentation uses NVIDIA's CUDA to shed some light on these questions, points out noteworthy aspects of the architecture, and explains why you should not throw your CPU away. The talk is crowned by a look at optimization strategies, applied in a real code example.
