High-Level GPU Programming with Julia

Peter Cheng

The CUDA approach

Introduction

What is a GPU?

Not (just) for mining coins

What is a GPU?

  • Graphics Processing Unit
  • Designed to rapidly manipulate and alter memory to accelerate the creation of images
  • Highly parallel structure

How a CPU draws

How a GPU draws

GPU programming concept

GPU programming concept

  • Utilize the parallel structure
  • Lots of threads running at the same time

Workflow

  1. Allocate GPU memory
  2. Copy data to the device
  3. Call the kernel
  4. Copy data back to the host
  5. Free the GPU memory

(step 3, the kernel call, is the part that runs in parallel)

What GPU programming looks like in other languages

CUDA C/C++

#include <cuda_runtime.h>

#define N 256

__global__ void vec_add(float *C, float *A, float *B)
{
    int i = threadIdx.x;
    if (i < N)
        C[i] = A[i] + B[i];
}

int main()
{
    size_t size = N * sizeof(float);
    float hA[N], hB[N], hC[N];  /* host buffers (filled elsewhere) */

    /* 1. allocate GPU memory */
    float *dA;
    cudaMalloc((void **)&dA, size);
    float *dB;
    cudaMalloc((void **)&dB, size);
    float *dC;
    cudaMalloc((void **)&dC, size);

    /* 2. copy data to the device */
    cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
    cudaMemcpy(dB, hB, size, cudaMemcpyHostToDevice);

    /* 3. launch the kernel: 1 block of N threads */
    vec_add<<<1, N>>>(dC, dA, dB);

    /* 4. copy the result back to the host */
    cudaMemcpy(hC, dC, size, cudaMemcpyDeviceToHost);

    /* 5. free the GPU memory */
    cudaFree(dA);
    cudaFree(dB);
    cudaFree(dC);

    return 0;
}

Python

PyCUDA

import pycuda.autoinit
import pycuda.driver as drv
import numpy

from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void vec_add(float *c, float *a, float *b)
{
  const int i = threadIdx.x;
  c[i] = a[i] + b[i];
}
""")

vec_add = mod.get_function("vec_add")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)

dest = numpy.zeros_like(a)
vec_add(
        drv.Out(dest), drv.In(a), drv.In(b),
        block=(400,1,1), grid=(1,1))

Why Julia

Why Julia

  • High-level programming language with low-level performance

  • Provides a first-class array implementation

  • Good compiler design

High-level programming language with low-level performance

No need to worry that the non-GPU part is too slow

Provides a first-class array implementation

No need to reimplement array/tensor types

Good compiler design

Allows us to compile native Julia code into GPU code
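As a quick illustration (a sketch, assuming CuArrays is installed and a CUDA GPU is present): broadcasting an ordinary Julia function over a GPU array compiles that function into a GPU kernel, with no GPU-specific annotations.

using CuArrays

f(x) = 3x^2 + 2x + 1             # plain Julia, nothing GPU-specific
y = f.(cu(rand(Float32, 1024)))  # broadcasting compiles f into a GPU kernel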

Julia over GPU

Basic GPU programming

Kernel

  • Code that runs on the GPU
  • Executed by multiple GPU threads

Writing a kernel

  • Pick a function you want to parallelize
  • Split the workload into subsets
  • Let each thread handle one subset

Example

using CUDAnative

function serial_vec_add(C, A, B)
    for i ∈ 1:length(C)
        C[i] = A[i] + B[i]
    end
end

# one thread per element; enough when everything fits in a single block
function parallel_vec_add(C, A, B)
    i = threadIdx().x
    C[i] = A[i] + B[i]
end

# or, with multiple blocks:
function parallel_vec_add(C, A, B)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(C)  # guard: the last block may have spare threads
        C[i] = A[i] + B[i]
    end
end
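A minimal launch of the multi-block version might look like this (a sketch; cld is Base's ceiling division):

using CUDAnative, CuArrays

A = cu(randn(Float32, 1024))
B = cu(randn(Float32, 1024))
C = cu(zeros(Float32, 1024))

# enough blocks of 256 threads to cover all 1024 elements
@cuda threads=256 blocks=cld(1024, 256) parallel_vec_add(C, A, B)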

threads & blocks

Threads & Blocks

  • The available numbers of threads & blocks are hardware dependent
  • Threads and blocks can each be organized as a 1-, 2-, or 3-dimensional grid
    • e.g. 32 threads can be laid out as (4, 8) or (4, 4, 2); see the sketch below
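For example, on most current NVIDIA GPUs a block can hold at most 1024 threads. A sketch of the same 32 threads launched in different shapes, using the greeting kernels from the hello-world example below:

@cuda threads=32 greeting()        # 1-D: one block of 32 threads
@cuda threads=(4, 8) greeting2d()  # 2-D: one block of 4x8 = 32 threads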

GPU programming in Julia

Packages

  • CUDAnative.jl
    • Support for compiling and executing native Julia kernels on CUDA hardware
  • CUDAdrv.jl
    • A wrapper for the CUDA driver API
  • CuArrays.jl
    • Provides a fully-functional GPU array
  • GPUArrays.jl
    • An AbstractArray API for GPUs
  • See https://github.com/JuliaGPU for other packages

GPU hello world

using CUDAnative

# hello world kernel
function greeting()
  @cuprintf("This is block %ld, thread %d speaking: hello\n", 
            blockIdx().x, threadIdx().x)
  return
end

function helloGPU(blocks::Int, threads::Int)
  @cuda blocks=blocks threads=threads greeting()
  return
end

# 2d block/thread version
function greeting2d()
  @cuprintf("This is block (%ld, %ld), thread (%ld, %ld) speaking: hello\n", 
            blockIdx().x, blockIdx().y, threadIdx().x, threadIdx().y)
  return
end

function helloGPU(blocks::Tuple, threads::Tuple)
  @cuda blocks=blocks threads=threads greeting2d()
  return
end

helloGPU(3, 5)         # say hello from 3 blocks of 5 threads each: 15 hellos in total
helloGPU((2,2), (3,3)) # say hello from 2x2 blocks of 3x3 threads each: 36 hellos in total

High-level GPU programming

  • With CuArrays.jl we can use the GPU without any GPU knowledge.
  • It also supports custom types.

GPU-unaware code on the GPU

using CuArrays

w = randn(5,5)
b = randn(5)

x = randn(5,3)

linear(w,x,b) = w * x .+ b

dw = cu(w)
dx = cu(x)
db = cu(b)

linear(w,x,b)    # runs on the CPU
linear(dw,dx,db) # the same code runs on the GPU
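Both calls compute the same result, one on the CPU and one on the GPU; a quick consistency check (a sketch; remember that cu converts to Float32):

isapprox(linear(w,x,b), collect(linear(dw,dx,db)); rtol=1e-5)  # true up to Float32 rounding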

struct Point{T<:Number}
  x::T
  y::T
end

import Base: *, +, adjoint
function Base.:*(a::Point{T}, b::Point{T}) where T
  Point{T}(a.x * b.y + a.y * b.x, a.x * b.x + a.y * b.y)
end
Base.:(+)(p1::Point, p2::Point) = Point(p1.x + p2.x, p1.y + p2.y)
adjoint(a::Point) = a

a = [Point(randn(2)...) for i = 1:10]
b = [Point(randn(2)...) for i = 1:10]

a * b'
cu(a) * cu(b)'
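This works because Point is a plain isbits struct, so the generic GPU kernels behind broadcasting and matrix multiplication can compile our custom * and + directly to device code.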

GPU kernel examples

  • matmul
  • conv
  • gather

matmul

function cpu_matmul(C, A, B)
  for i ∈ 1:size(A, 1)
    for j ∈ 1:size(B, 2)
      for k ∈ 1:size(A, 2)
        C[i, j] += A[i, k] * B[k, j]
      end
    end
  end
  C
end

matmul

using CuArrays, CUDAnative

function matmul_kernel(C, A, B)
  # global (i, j) position of this thread across all blocks
  i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
  j = threadIdx().y + (blockIdx().y - 1) * blockDim().y

  if i <= size(A, 1) && j <= size(B, 2)
    for k ∈ 1:size(A, 2)
      C[i, j] += A[i, k] * B[k, j]
    end
  end
  return
end

function gpu_matmul(C, A, B)
  max_threads = 256  # threads per block (hardware dependent)
  threads_x = min(max_threads, size(C,1))
  threads_y = min(max_threads ÷ threads_x, size(C,2))
  threads = (threads_x, threads_y)
  blocks = ceil.(Int, (size(C,1), size(C,2)) ./ threads)  # enough blocks to cover C

  @cuda blocks=blocks threads=threads matmul_kernel(C, A, B)
  C
end

a = cu(randn(3,5)); b = cu(randn(5,3)); c = cu(zeros(3,3))
gpu_matmul(c, a, b)

# or @cuda threads=(3,3) matmul_kernel(c, a, b)
# or a wasteful version (threads falling outside C do nothing):
# @cuda threads=(2,2) blocks=(2,2) matmul_kernel(c, a, b)
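To sanity-check the kernel, we can compare against the built-in matrix multiply that CuArrays provides (a sketch):

c2 = a * b  # CuArrays' built-in matmul (CUBLAS)
isapprox(collect(c), collect(c2); rtol=1e-4)  # should be true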

Conv2d

Conv2d

function naive_conv2d_kernel(y, x, k)
  i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
  j = threadIdx().y + (blockIdx().y - 1) * blockDim().y

  if i <= size(y, 1) && j <= size(y, 2)
    for k1 ∈ 1:size(k, 1)
      if i + k1 - 1 <= size(x, 1)
        for k2 ∈ 1:size(k, 2)
          if j + k2 - 1 <= size(x, 2)
            y[i,j] += x[i+k1-1, j+k2-1] * k[k1, k2]
          end
        end
      end
    end
  end
  return
end

function gpu_naive_conv2d(y, x, k)
  max_threads = 256
  threads_x = min(max_threads, size(y,1))
  threads_y = min(max_threads ÷ threads_x, size(y,2))
  threads = (threads_x, threads_y)
  blocks = ceil.(Int, (size(y,1), size(y,2)) ./ threads)

  @cuda blocks=blocks threads=threads naive_conv2d_kernel(y, x, k)
  y
end

x = cu(reshape(collect(1:25), 5,5));
k = cu(reshape(collect(1:9), 3,3));
y = cu(zeros(5,5))
gpu_naive_conv2d(y, x, k)
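To check the result, here is a CPU reference mirroring the kernel's border handling (a sketch; cpu_naive_conv2d is a name introduced here for illustration):

function cpu_naive_conv2d(y, x, k)
  for i ∈ 1:size(y,1), j ∈ 1:size(y,2)
    for k1 ∈ 1:size(k,1), k2 ∈ 1:size(k,2)
      if i + k1 - 1 <= size(x,1) && j + k2 - 1 <= size(x,2)
        y[i,j] += x[i+k1-1, j+k2-1] * k[k1, k2]
      end
    end
  end
  y
end

y_ref = cpu_naive_conv2d(zeros(5,5), reshape(collect(1:25), 5,5), reshape(collect(1:9), 3,3))
collect(y) ≈ y_ref  # should be true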

gather

An embedding-style lookup: each entry of xs picks a column of w, i.e. ys[:, j] = w[:, xs[j]].

using CuArrays, CUDAnative

function gather(w::CuMatrix{T}, xs::CuArray{Int}) where T
    ys = CuArray{T}(undef, size(w, 1), size(xs)...)

    function kernel!(ys::CuDeviceArray{T}, w::CuDeviceArray{T}, xs)
        li = threadIdx().y + (blockIdx().y - 1) * blockDim().y
        i = threadIdx().x + (blockIdx().x - 1) * blockDim().x

        # guard both indices: the last blocks may have spare threads
        @inbounds if i <= size(ys, 1) && li <= length(xs)
            ind = Tuple(CartesianIndices(xs)[li])
            ys[i, ind...] = w[i, xs[li]]
        end

        return
    end

    max_threads = 256
    threads_x = min(max_threads, size(ys,1))
    threads_y = min(max_threads ÷ threads_x, length(xs))
    threads = (threads_x, threads_y)
    blocks = ceil.(Int, (size(ys,1), length(xs)) ./ threads)

    @cuda blocks=blocks threads=threads kernel!(ys, w, xs)
    return ys
end

w = cu(randn(10,10))
ind = CuArray(rand(1:10, 3,5))

gather(w, ind)
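The result is a 10×3×5 array. On the CPU the same lookup is plain indexing, which gives a quick check (a sketch):

hw, hind = collect(w), collect(ind)
ref = reshape(hw[:, vec(hind)], 10, 3, 5)  # ref[:, j, k] == hw[:, hind[j, k]]
collect(gather(w, ind)) ≈ ref              # should be true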

Tools

Profiling with nvprof

using CUDAdrv
x = cu(randn(3,5)); y = cu(randn(5,4)); z = cu(zeros(3,4))
CUDAdrv.@profile gpu_matmul(z, x, y)

# run with: $ nvprof --profile-from-start off /path/to/julia file.jl

What gets compiled

@device_code_typed @cuda matmul_kernel(z, x, y)  # typed Julia IR
@device_code_llvm gpu_matmul(z, x, y)            # LLVM IR
@device_code_sass gpu_matmul(z, x, y)            # native SASS assembly
@device_code_ptx @cuda greeting()                # PTX assembly

Conclusion

Reference
