
Thursday 27 December 2012

CUDA Test Code

This program is a quick check that your CUDA device works and that you can write a basic kernel correctly.
Program statement: This program writes the value (1000 * i + j) into each element of an array and then checks that every value was stored correctly. The key point of this code is that the value (1000 * i + j) is written by the kernel (device code), not by the host code.
Here 'i' is the block index and 'j' is the thread index within that block.

// includes, system
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>

// Simple utility function to check for CUDA runtime errors.
// Queries the last CUDA runtime error and, if one is pending, writes it to
// stderr labelled with msg. Declared here so it can be called from main()
// before its definition further down the file.
void checkCUDAError(const char *msg);

// Part 3 of 5: implement the kernel
//
// Each thread writes 1000 * blockIdx.x + threadIdx.x into its own element of
// d_a, so the host can tell which block and thread produced every value.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// d_a must hold at least gridDim.x * blockDim.x ints. There is no bounds
// check, so the launch must not cover more threads than d_a has elements.
__global__ void myFirstKernel( int *d_a )
{
    int bx = blockIdx.x;
    int tx = threadIdx.x;
    int i = blockDim.x * bx + tx;   // flat global index of this thread

    d_a[i] = 1000 * bx + tx;
}


// Program main
//
// Allocates a host and a device array of numBlocks * numThreadsPerBlock ints,
// launches myFirstKernel to fill the device array, copies the result back and
// asserts that element (i * numThreadsPerBlock + j) equals 1000 * i + j.
// Returns 0 on success and 1 if an allocation or a CUDA call fails.
int main( int argc, char** argv)
{
    // Command-line arguments are not used.
    (void)argc;
    (void)argv;

    // pointer for host memory
    int *h_a = NULL;

    // pointer for device memory
    int *d_a = NULL;

    // define grid and block size
    const int numBlocks = 8;
    const int numThreadsPerBlock = 8;

    // Part 1 of 5: allocate host and device memory
    const size_t memSize = (size_t)numBlocks * numThreadsPerBlock * sizeof(int);
    h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n",
                (unsigned long)memSize);
        return 1;
    }

    if (cudaMalloc((void **)&d_a, memSize) != cudaSuccess)
    {
        // Report the failure, then stop regardless of whether the
        // reporting helper returns.
        checkCUDAError("cudaMalloc");
        free(h_a);
        return 1;
    }

    // Part 2 of 5: configure and launch kernel
    // The kernel writes every element, so no host-to-device copy is needed.
    dim3 dimGrid(numBlocks, 1, 1);
    dim3 dimBlock(numThreadsPerBlock, 1, 1);
    myFirstKernel<<< dimGrid, dimBlock >>>(d_a);

    // A launch does not return a status; peek (without clearing) so the
    // error is still available for checkCUDAError to report below.
    cudaError_t launchStatus = cudaPeekAtLastError();

    // block until the device has completed
    cudaError_t syncStatus = cudaDeviceSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel execution");
    if (launchStatus != cudaSuccess || syncStatus != cudaSuccess)
    {
        cudaFree(d_a);
        free(h_a);
        return 1;
    }

    // Part 4 of 5: device to host copy
    cudaError_t copyStatus = cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy");
    if (copyStatus != cudaSuccess)
    {
        cudaFree(d_a);
        free(h_a);
        return 1;
    }

    // Part 5 of 5: verify the data returned to the host is correct
    for (int i = 0; i < numBlocks; i++)
    {
        for (int j = 0; j < numThreadsPerBlock; j++)
        {
            assert(h_a[i * numThreadsPerBlock + j] == 1000 * i + j);
        }
    }

    // free device memory
    cudaFree(d_a);

    // free host memory
    free(h_a);

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors.  Good work!

    return 0;
}

// Reports the most recent CUDA runtime error, if any, and terminates.
//
// msg: caller-supplied label printed in front of the CUDA error string so the
//      failing step can be identified.
//
// Reads the error with cudaGetLastError(). When it is not cudaSuccess, the
// message is written to stderr and the process exits with EXIT_FAILURE, so
// later steps never run on the results of a failed CUDA call. Returns
// normally when no error is pending.
void checkCUDAError(const char *msg)
{
    cudaError_t err = cudaGetLastError();
    if( cudaSuccess != err)
    {
        fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
        exit(EXIT_FAILURE);
    }
}

CUDA C Programming Guide
Programming Massively Parallel Processors, by David B. Kirk and Wen-mei W. Hwu

