CUDA C program for matrix multiplication using shared / non-shared memory



 //Matrix multiplication using a shared-memory kernel and a non-shared (global-memory) kernel



#include <stdio.h>
#include <stdlib.h>   // for system()
#include <math.h>

#define TILE_WIDTH 2

/*matrix multiplication kernels*/

//non-shared (global memory only)
__global__ void
MatrixMul( float *Md , float *Nd , float *Pd , const int WIDTH )
{
           // calculate the row and column of the output element this thread computes
           unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
           unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

           // accumulate locally; device memory is not zero-initialised, so do not += into Pd
           float sum = 0 ;
           for (int k = 0 ; k < WIDTH ; k++ )
           {
                   sum += Md[row * WIDTH + k] * Nd[k * WIDTH + col] ;
           }
           Pd[row * WIDTH + col] = sum ;
}

// shared (tiled)
__global__ void
MatrixMulSh( float *Md , float *Nd , float *Pd , const int WIDTH )
{
          // shared arrays hold one TILE_WIDTH x TILE_WIDTH tile of each input matrix per phase
          __shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;
          __shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;

          // calculate the row and column of the output element this thread computes
          unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
          unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

          float sum = 0 ;
          for (int m = 0 ; m < WIDTH/TILE_WIDTH ; m++ )   // m is the phase (tile) index
          {
              // each thread loads one element of each input tile into shared memory
              Mds[threadIdx.y][threadIdx.x] = Md[row*WIDTH + (m*TILE_WIDTH + threadIdx.x)] ;
              Nds[threadIdx.y][threadIdx.x] = Nd[(m*TILE_WIDTH + threadIdx.y) * WIDTH + col] ;
              __syncthreads() ;   // wait until the whole tile is loaded

              // partial dot product over this tile
              for ( int k = 0 ; k < TILE_WIDTH ; k++ )
                  sum += Mds[threadIdx.y][k] * Nds[k][threadIdx.x] ;
              __syncthreads() ;   // wait before the tile is overwritten in the next phase
          }
          Pd[row*WIDTH + col] = sum ;
}
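
/* Optional variant (not in the original post): both kernels above assume that
   WIDTH is an exact multiple of TILE_WIDTH. The sketch below, under the
   hypothetical name MatrixMulShGuarded, adds bounds checks so that any WIDTH
   works; the launch would then round the grid size up to
   (WIDTH + TILE_WIDTH - 1) / TILE_WIDTH blocks per dimension. */
__global__ void
MatrixMulShGuarded( float *Md , float *Nd , float *Pd , const int WIDTH )
{
          __shared__ float Mds [TILE_WIDTH][TILE_WIDTH] ;
          __shared__ float Nds [TILE_WIDTH][TILE_WIDTH] ;

          unsigned int col = TILE_WIDTH*blockIdx.x + threadIdx.x ;
          unsigned int row = TILE_WIDTH*blockIdx.y + threadIdx.y ;

          float sum = 0 ;
          int phases = (WIDTH + TILE_WIDTH - 1) / TILE_WIDTH ;   // number of tiles, rounded up
          for (int m = 0 ; m < phases ; m++ )
          {
              int mCol = m*TILE_WIDTH + threadIdx.x ;   // column read from Md
              int nRow = m*TILE_WIDTH + threadIdx.y ;   // row read from Nd
              // out-of-range elements are loaded as 0 so they add nothing to the sum
              Mds[threadIdx.y][threadIdx.x] = ( row < WIDTH && mCol < WIDTH ) ? Md[row*WIDTH + mCol] : 0.0f ;
              Nds[threadIdx.y][threadIdx.x] = ( nRow < WIDTH && col < WIDTH ) ? Nd[nRow*WIDTH + col] : 0.0f ;
              __syncthreads() ;

              for ( int k = 0 ; k < TILE_WIDTH ; k++ )
                  sum += Mds[threadIdx.y][k] * Nds[k][threadIdx.x] ;
              __syncthreads() ;
          }
          if ( row < WIDTH && col < WIDTH )
              Pd[row*WIDTH + col] = sum ;
}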

// main routine
int main ()
{
   const int WIDTH = 6 ;
   float array1_h[WIDTH][WIDTH] ,array2_h[WIDTH][WIDTH],
                     result_array_h[WIDTH][WIDTH] ,M_result_array_h[WIDTH][WIDTH]  ;
  float *array1_d , *array2_d ,*result_array_d  ,*M_result_array_d ; // device array
  int i , j ;
  //input in host array
  for ( i = 0 ; i<WIDTH ; i++ )
  {
     for (j = 0 ; j<WIDTH ; j++ )
     {
        array1_h[i][j] = 1 ;
        array2_h[i][j] = 2 ;
     }
  }

  //create device arrays: cudaMalloc ( (void **)&array_name , size_of_matrix_in_bytes ) ;

  cudaMalloc((void **) &array1_d , WIDTH*WIDTH*sizeof (float) ) ;

  cudaMalloc((void **) &array2_d , WIDTH*WIDTH*sizeof (float) ) ;

 

  //copy host arrays to device arrays; cudaMemcpy ( dest , source , size_in_bytes , direction )

  cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;

  cudaMemcpy ( array2_d , array2_h , WIDTH*WIDTH*sizeof (float) , cudaMemcpyHostToDevice ) ;

 

  //allocate memory for the result arrays on the device

  cudaMalloc((void **) &result_array_d , WIDTH*WIDTH*sizeof (float) ) ;

  cudaMalloc((void **) &M_result_array_d , WIDTH*WIDTH*sizeof (float) ) ;

 

  //set up the launch configuration and launch the kernel

  dim3 dimGrid ( WIDTH/TILE_WIDTH , WIDTH/TILE_WIDTH ,1 ) ;

  dim3 dimBlock( TILE_WIDTH, TILE_WIDTH, 1 ) ;
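  // for WIDTH = 6 and TILE_WIDTH = 2 this gives a 3 x 3 grid of 2 x 2 blocks:
  // 36 threads in total, one thread per element of the 6 x 6 output matrix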

// Change the first block to #if 1 (and the second to #if 0) to run the non-shared kernel; as posted, the shared-memory kernel runs
#if 0

                MatrixMul <<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;

#endif
 
#if 1

               MatrixMulSh<<<dimGrid,dimBlock>>> ( array1_d , array2_d ,M_result_array_d , WIDTH) ;

#endif
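
  /* Optional (not in the original listing): check that the launch itself
     succeeded before copying the result back. A minimal sketch using the
     standard runtime-API calls: */
  cudaError_t err = cudaGetLastError() ;
  if ( err != cudaSuccess )
      printf ("kernel launch failed: %s\n", cudaGetErrorString(err)) ;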

  // cudaMemcpy blocks until the kernel has finished, so no explicit synchronization is needed here
  //copy the result back from M_result_array_d to M_result_array_h

  cudaMemcpy(M_result_array_h , M_result_array_d , WIDTH*WIDTH*sizeof(float) ,
                                    cudaMemcpyDeviceToHost) ;

  //print the result array
  for ( i = 0 ; i<WIDTH ; i++ )
  {
      for ( j = 0 ; j < WIDTH ; j++ )
     {
        printf ("%f   ",M_result_array_h[i][j] ) ;
     }
 printf ("\n") ;
}
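
  /* Optional additions (not in the original post): array1 is all 1s and array2
     is all 2s, so every element of the product should equal 2*WIDTH = 12. A
     quick host-side check and the device-memory cleanup could go here. Sketch: */
  int errors = 0 ;
  for ( i = 0 ; i < WIDTH ; i++ )
      for ( j = 0 ; j < WIDTH ; j++ )
          if ( M_result_array_h[i][j] != 2.0f * WIDTH )
              errors++ ;
  printf ( errors == 0 ? "PASSED\n" : "FAILED\n" ) ;

  cudaFree(array1_d) ;   cudaFree(array2_d) ;
  cudaFree(result_array_d) ;   cudaFree(M_result_array_d) ;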
 system("pause") ;
}



Got questions?
Feel free to ask; I'd be happy to walk you through it step by step!

39 comments:

  1. thank u sir..
    it's really helpful...

  2. have you guys done matrix inverse??

  3. Thanks for sharing this code.
    May I ask how would you implement non-square matrix multiplication? For instance multiply a 4x3 by a 3x1
    Thanks.

  4. I need to write a basic CUDA code for multiplying matrix...How would I write it?

  5. When the WIDTH value is 2000, the code hangs. Why?

  6. This comment has been removed by the author.

  7. Hi, I'm trying to use your code, but why does the result keep changing?

    thanks :)

  8. I am getting "Segmentation fault (core dumped)"
    when I try it for dynamic square arrays how do I solve this?

  9. Hi,
    I am trying to build the CUDA code into a .so file and call it from Python; initializing the matrices and converting them to ctypes is done in Python. Does anyone have an idea how to do that?

  10. Great idea for an article! Looking forward to the next part.

  11. What is the difference between the shared and non-shared techniques?

  12. Why are you using 'sizeof(int)' in cudaMemcpy ( array1_d , array1_h , WIDTH*WIDTH*sizeof (int) , cudaMemcpyHostToDevice ) while you have declared floating-point variables? Shouldn't it be sizeof(float)?

  13. Sir, how can I do the matrix multiplication in CUDA without using shared memory? Can you share the code for the same?

  14. Hi, thanks for the code. Could you explain it step by step as you offered in last line?
    Thanks

    1. By the way my e-mail address is chinmay.toekker@gmail.com

  15. I would like to thank you for the great text.

  16. Can someone please paste the output screenshot

