Parallel implementation of Particle-Mesh mapping using CUDA enabled GPU

@brijesh68kumar

Git hub : bit.ly/brijesh_gpu_git

Slides : bit.ly/slide_gpu_indicon2015

A little about me

ICT graduate, class of 2015
Past IAS IEEE chair sb DAIICT Guj | Mercedes Benz
Now SRF at NGO | IEEE YP
@brijesh68kumar| github.com/brijesh68kumar
Community growing (sponsors, speakers)
Previous
- IEEE educational tour to NASA Orlando link
- Anemia detection using modified microscope link
- Tempulsebee : IOT solution for health care link
- Ophthalmology imaging using Raspberry pi link

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

Introduction

Do you like series/manga

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

Download

How

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

⬇ CPU ⬇ GPU

Architecture

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

Latency oriented approach

Throughput oriented approach

Overhead

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

Application code

Rest of

sequential

cpu code

CPU

GPU

computer intensive

of code

// Algorithm to launch the kernel from CPU
Int main()
{
Variable hostGrid, hostParticle;
Variable deviceGrid, deviceParticle;

// initialize host variable using Algorithm 1.
cudaMalloc(device variable space in GPU);
cudaMemcpy( host to device );

// define block dimension
Dim3 bD; //block dimension;
Dim3 gD; //grid dimension;

// bD x gD = total number of particles;
Launch kernel<< bD, gD>>(parameters);
cudaMemcpy(from device to host);
Store data into file;
Free memory;
}

Example: Matrix multiplication

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>

/* change dimension size as needed */
const int dimension = 4096;
struct timeval tv; 

double timestamp()
{
  double t;
  gettimeofday(&tv, NULL);
  t = tv.tv_sec + (tv.tv_usec/1000000.0);
  return t;
}

int main(int argc, char *argv[])
{
  int i, j, k;
  double *A, *B, *C, start, end;

  A = (double*)malloc(dimension*dimension*sizeof(double));
  B = (double*)malloc(dimension*dimension*sizeof(double));
  C = (double*)malloc(dimension*dimension*sizeof(double));

  srand(292);

  for(i = 0; i < dimension; i++)
  for(j = 0; j < dimension; j++)
  {   
    A[dimension*i+j] = (rand()/(RAND_MAX + 1.0));
    B[dimension*i+j] = (rand()/(RAND_MAX + 1.0));
    C[dimension*i+j] = 0.0;
   }   

   start = timestamp();
   for(i = 0; i < dimension; i++)
     for(j = 0; j < dimension; j++)
       for(k = 0; k < dimension; k++)
         C[dimension*i+j] += A[dimension*i+k] *
         B[dimension*k+j];
     end = timestamp();
   printf("\nsecs:%f\n", end-start);

   free(A);
   free(B);
   free(C);

   return 0;
}

#include <stdlib.h>
#include <stdio.h>
#include <sys/time.h>

/* change dimension size as needed */
const int dimension = 512 ;
const int blocksize = 10;
const int K = 1;

struct timeval tv; 

__global__ void gpuMM(float *A, float *B, float *C, int N)
{
  // Matrix multiplication for NxN matrices C=A*B
  // Each thread computes a single element of C
  int row = blockIdx.y*blockDim.y + threadIdx.y;
  int col = blockIdx.x*blockDim.x + threadIdx.x;
  float sum = 0.f;
  for (int n = 0; n < N; ++n)
    sum += A[row*N+n]*B[n*N+col];
  C[row*N+col] = sum;
}


int main(int argc, char *argv[])
{
  cudaEvent_t start_i, stop_i,start_mc_h2d, stop_mc_h2d,start_mc_d2h, stop_mc_d2h,start_pl, stop_pl;
  float time_i,time_mc_h2d,time_mc_d2h,time_pl;

  cudaEventCreate(&start_i);
  cudaEventCreate(&stop_i);
  cudaEventCreate(&start_mc_h2d);
  cudaEventCreate(&stop_mc_h2d);
  cudaEventCreate(&start_mc_d2h);
  cudaEventCreate(&stop_mc_d2h);
  cudaEventCreate(&start_pl);
  cudaEventCreate(&stop_pl);  

  int i, j;
  float *A, *B, *C;// start, end;
  float *Ad, *Bd, *Cd;  
  
cudaEventRecord( start_i, 0 );		
    A = (float*)malloc(dimension*dimension*sizeof(float));
    B = (float*)malloc(dimension*dimension*sizeof(float));
    C = (float*)malloc(dimension*dimension*sizeof(float));				
    srand(292);
    for(i = 0; i < dimension; i++)
      for(j = 0; j < dimension; j++)
      {   
        A[dimension*i+j] = (rand()/(RAND_MAX + 1.0));
        B[dimension*i+j] = (rand()/(RAND_MAX + 1.0));
        C[dimension*i+j] = 0.0;
      }   				
  cudaEventRecord( stop_i, 0 );
  cudaEventSynchronize( stop_i );
  cudaEventElapsedTime( &time_i, start_i, stop_i );
  
  cudaEventRecord( start_mc_h2d, 0 );		
    cudaMalloc( (void**)&Ad, dimension*dimension*sizeof(float) );
      cudaMemcpy( Ad, A, dimension*dimension*sizeof(float), cudaMemcpyHostToDevice );
    cudaMalloc( (void**)&Bd, dimension*dimension*sizeof(float) );
      cudaMemcpy( Bd, B, dimension*dimension*sizeof(float), cudaMemcpyHostToDevice );		
    cudaMalloc( (void**)&Cd, dimension*dimension*sizeof(float) );
  cudaEventRecord( stop_mc_h2d, 0 );
  cudaEventSynchronize( stop_mc_h2d );		
  cudaEventElapsedTime( &time_mc_h2d, start_mc_h2d, stop_mc_h2d );
  
    //start = timestamp();
  cudaEventRecord( start_pl, 0 );
      dim3 threadBlock(blocksize,blocksize);
      dim3 grid(K,K);		
      gpuMM<<<grid,threadBlock>>>( Ad,Bd,Cd,dimension);
    //end = timestamp();
  cudaEventRecord( stop_pl, 0 );
  cudaEventSynchronize( stop_pl );
  cudaEventElapsedTime( &time_pl, start_pl, stop_pl );

  cudaEventRecord( start_mc_d2h, 0 );
    cudaMemcpy(C,Cd,dimension*dimension*sizeof(float),cudaMemcpyDeviceToHost);
  cudaEventRecord( stop_mc_d2h, 0 );
  cudaEventSynchronize( stop_mc_d2h );
  cudaEventElapsedTime( &time_mc_d2h, start_mc_d2h, stop_mc_d2h );

  printf("\n IT : %f   ", time_i);
  printf(" MCT : %f", ( time_mc_d2h + time_mc_h2d ) );
  printf(" PLT:%f ", time_pl);
  printf(" Total:%f.... \n\n", (time_pl + time_mc_d2h + time_mc_h2d+time_i));
  
  cudaEventDestroy( start_i );
  cudaEventDestroy( stop_i );
  cudaEventDestroy( start_mc_d2h );
  cudaEventDestroy( stop_mc_d2h );
  cudaEventDestroy( start_mc_h2d );
  cudaEventDestroy( stop_mc_h2d );
  cudaEventDestroy( start_pl );
  cudaEventDestroy( stop_pl );
		
  free(A);
  free(B);
  free(C);
  cudaFree(Ad);
  cudaFree(Bd);
  cudaFree(Cd);
  return 0;
}

Performance analysis

http://bit.ly/slide_gpu_indicon2015 @brijesh68kumar

Basic Stream

----0----1----2----3----> rx.interval(5000)

$scope.counter = 0;
rx.Observable
 .interval(5000) 
 .take(3)
 .safeApply($scope, function(x) { 
   $scope.counter = x; 
 })
 .subscribe(); // shows 0, 1, 2

http://bit.ly/introduction-to-rxjs @angularjs_labs

Observable

Operations

Basic Stream

http://bit.ly/introduction-to-rxjs @angularjs_labs

Observable
Operations
- merge, concat, map and more!
Observer
- onNext, onError, onCompleted
Subscribe/Unsubscribe

Key concepts

Basic Stream

observable.subscribe(
  function onNext(value){
    console.log(value);
  }, 
  function onError(error){
    console.log(error);
  }, 
  function onCompleted(){
    console.log('Completed');
  });

http://bit.ly/introduction-to-rxjs @angularjs_labs

var observer = rx.Observer.create(
  function onNext(result){ 
    console.log(result); 
  },
  function onError(err){ 
    console.log(err); 
  },
  function onCompleted(){ 
    console.log('Completed'); 
  }
);
observable.subscribe(observer);

Observer

Basic Stream

Demo

http://bit.ly/introduction-to-rxjs @angularjs_labs

var subscriber = observable.subscribe(observer);

$scope.$on("$destroy", function() {
  subscriber.dispose();
});

Unsubscribe

Thanks!

@angularjs_labs

Copy of Parallel implementation of Particle-Mesh mapping using CUDA enabled GPU

By Brijesh Kumar

Copy of Parallel implementation of Particle-Mesh mapping using CUDA enabled GPU

Parallel implementation of Particle-Mesh mapping using CUDA enabled GPU

A little about me

Introduction

Do you like series/manga

Download

How

Architecture

Overhead

Example: Matrix multiplication

Basic Stream

Basic Stream

Key concepts

Basic Stream

Observer

Basic Stream

Unsubscribe

Thanks!

Copy of Parallel implementation of Particle-Mesh mapping using CUDA enabled GPU

More from Brijesh Kumar