Basic Programming Module, You are at Level - 1
CUDA Test Code
This program is a simple check that your CUDA device works and that you can write a correct kernel.
Program statement: This program stores the value (1000 * i + j) in each element of an array and then checks that every value was stored correctly. The key point of this code is that the values are written
by the kernel (device code), not by the host code.
Here 'i' is the block index and 'j' is the thread index within that block.
// includes, system
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);
// Part 3 of 5: implement the kernel
/*
 * myFirstKernel: every thread stores one value into d_a.
 *
 * The thread's flat index is blockDim.x * blockIdx.x + threadIdx.x, and the
 * value written there is 1000 * blockIdx.x + threadIdx.x. Only the x
 * dimension of the launch configuration is used.
 *
 * There is no bounds check: d_a must be dereferenceable from device code and
 * hold at least gridDim.x * blockDim.x ints.
 */
__global__ void myFirstKernel( int *d_a )
{
    const int block  = blockIdx.x;   // which block this thread belongs to
    const int thread = threadIdx.x;  // position of the thread inside its block

    // One array slot per thread, blocks laid out back to back.
    const int slot = blockDim.x * block + thread;

    d_a[slot] = 1000 * block + thread;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
/*
 * Program entry point.
 *
 * Allocates a host and a device array of numBlocks * numThreadsPerBlock ints,
 * launches myFirstKernel with one thread per element, copies the result back
 * to the host and verifies that element (i * numThreadsPerBlock + j) equals
 * 1000 * i + j for every block i and thread j.
 *
 * Returns 0 and prints "Correct!" when every element matches. Returns 1 if
 * host allocation fails or any element is wrong. CUDA runtime errors are
 * reported by checkCUDAError, which terminates the program.
 */
int main( int argc, char** argv)
{
    (void)argc;  // command-line arguments are not used
    (void)argv;

    // define grid and block size
    const int numBlocks = 8;
    const int numThreadsPerBlock = 8;
    const int numElements = numBlocks * numThreadsPerBlock;

    // Part 1 of 5: allocate host and device memory
    const size_t memSize = (size_t)numElements * sizeof(int);

    // pointer for host memory
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n",
                (unsigned long)memSize);
        return 1;
    }

    // pointer for device memory
    int *d_a = NULL;
    cudaMalloc((void **)&d_a, memSize);
    checkCUDAError("cudaMalloc");

    // No host-to-device copy is needed: the kernel overwrites every element.

    // Part 2 of 5: configure and launch kernel
    dim3 dimGrid(numBlocks, 1, 1);
    dim3 dimBlock(numThreadsPerBlock, 1, 1);
    myFirstKernel<<< dimGrid, dimBlock >>>(d_a);

    // A bad launch configuration is reported immediately after the launch.
    checkCUDAError("kernel launch");

    // block until the device has completed
    // (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize)
    cudaDeviceSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel execution");

    // Part 4 of 5: device to host copy
    cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy");

    // Part 5 of 5: verify the data returned to the host is correct.
    // An explicit comparison is used instead of assert() so the check is
    // still performed when the program is compiled with NDEBUG.
    int mismatches = 0;
    for (int i = 0; i < numBlocks; i++)
    {
        for (int j = 0; j < numThreadsPerBlock; j++)
        {
            const int idx = i * numThreadsPerBlock + j;
            const int expected = 1000 * i + j;
            if (h_a[idx] != expected)
            {
                if (mismatches == 0)
                {
                    fprintf(stderr, "Mismatch at h_a[%d]: got %d, expected %d.\n",
                            idx, h_a[idx], expected);
                }
                mismatches++;
            }
        }
    }

    // free device memory
    cudaFree(d_a);

    // free host memory
    free(h_a);

    if (mismatches != 0)
    {
        fprintf(stderr, "%d element(s) incorrect.\n", mismatches);
        return 1;
    }

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors. Good work!
    printf("Correct!\n");
    return 0;
}
/*
 * checkCUDAError: stop the program if the CUDA runtime reports an error.
 *
 * Fetches the most recent runtime error with cudaGetLastError(). If it is
 * anything other than cudaSuccess, prints msg together with the runtime's
 * error string to stderr and exits with status -1. Returns normally when no
 * error is pending.
 *
 * msg: short label naming the operation being checked; used only in the
 *      error message.
 */
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();

    if (status == cudaSuccess)
    {
        return;  // nothing to report
    }

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
Feel free to comment...
References
CUDA C Programming Guide
Programming Massively Parallel Processors By David B. Kirk and Wen-mei W.Hwu
CUDA Test Code
This program is a simple check that your CUDA device works and that you can write a correct kernel.
Program statement: This program stores the value (1000 * i + j) in each element of an array and then checks that every value was stored correctly. The key point of this code is that the values are written
by the kernel (device code), not by the host code.
Here 'i' is the block index and 'j' is the thread index within that block.
// includes, system
#include <stdio.h>
#include <assert.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char *msg);
// Part 3 of 5: implement the kernel
/*
 * myFirstKernel: every thread stores one value into d_a.
 *
 * The thread's flat index is blockDim.x * blockIdx.x + threadIdx.x, and the
 * value written there is 1000 * blockIdx.x + threadIdx.x. Only the x
 * dimension of the launch configuration is used.
 *
 * There is no bounds check: d_a must be dereferenceable from device code and
 * hold at least gridDim.x * blockDim.x ints.
 */
__global__ void myFirstKernel( int *d_a )
{
    const int block  = blockIdx.x;   // which block this thread belongs to
    const int thread = threadIdx.x;  // position of the thread inside its block

    // One array slot per thread, blocks laid out back to back.
    const int slot = blockDim.x * block + thread;

    d_a[slot] = 1000 * block + thread;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
/*
 * Program entry point.
 *
 * Allocates a host and a device array of numBlocks * numThreadsPerBlock ints,
 * launches myFirstKernel with one thread per element, copies the result back
 * to the host and verifies that element (i * numThreadsPerBlock + j) equals
 * 1000 * i + j for every block i and thread j.
 *
 * Returns 0 and prints "Correct!" when every element matches. Returns 1 if
 * host allocation fails or any element is wrong. CUDA runtime errors are
 * reported by checkCUDAError, which terminates the program.
 */
int main( int argc, char** argv)
{
    (void)argc;  // command-line arguments are not used
    (void)argv;

    // define grid and block size
    const int numBlocks = 8;
    const int numThreadsPerBlock = 8;
    const int numElements = numBlocks * numThreadsPerBlock;

    // Part 1 of 5: allocate host and device memory
    const size_t memSize = (size_t)numElements * sizeof(int);

    // pointer for host memory
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "Host allocation of %lu bytes failed.\n",
                (unsigned long)memSize);
        return 1;
    }

    // pointer for device memory
    int *d_a = NULL;
    cudaMalloc((void **)&d_a, memSize);
    checkCUDAError("cudaMalloc");

    // No host-to-device copy is needed: the kernel overwrites every element.

    // Part 2 of 5: configure and launch kernel
    dim3 dimGrid(numBlocks, 1, 1);
    dim3 dimBlock(numThreadsPerBlock, 1, 1);
    myFirstKernel<<< dimGrid, dimBlock >>>(d_a);

    // A bad launch configuration is reported immediately after the launch.
    checkCUDAError("kernel launch");

    // block until the device has completed
    // (cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize)
    cudaDeviceSynchronize();

    // check if kernel execution generated an error
    checkCUDAError("kernel execution");

    // Part 4 of 5: device to host copy
    cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost);

    // Check for any CUDA errors
    checkCUDAError("cudaMemcpy");

    // Part 5 of 5: verify the data returned to the host is correct.
    // An explicit comparison is used instead of assert() so the check is
    // still performed when the program is compiled with NDEBUG.
    int mismatches = 0;
    for (int i = 0; i < numBlocks; i++)
    {
        for (int j = 0; j < numThreadsPerBlock; j++)
        {
            const int idx = i * numThreadsPerBlock + j;
            const int expected = 1000 * i + j;
            if (h_a[idx] != expected)
            {
                if (mismatches == 0)
                {
                    fprintf(stderr, "Mismatch at h_a[%d]: got %d, expected %d.\n",
                            idx, h_a[idx], expected);
                }
                mismatches++;
            }
        }
    }

    // free device memory
    cudaFree(d_a);

    // free host memory
    free(h_a);

    if (mismatches != 0)
    {
        fprintf(stderr, "%d element(s) incorrect.\n", mismatches);
        return 1;
    }

    // If the program makes it this far, then the results are correct and
    // there are no run-time errors. Good work!
    printf("Correct!\n");
    return 0;
}
/*
 * checkCUDAError: stop the program if the CUDA runtime reports an error.
 *
 * Fetches the most recent runtime error with cudaGetLastError(). If it is
 * anything other than cudaSuccess, prints msg together with the runtime's
 * error string to stderr and exits with status -1. Returns normally when no
 * error is pending.
 *
 * msg: short label naming the operation being checked; used only in the
 *      error message.
 */
void checkCUDAError(const char *msg)
{
    const cudaError_t status = cudaGetLastError();

    if (status == cudaSuccess)
    {
        return;  // nothing to report
    }

    fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(status));
    exit(-1);
}
Feel free to comment...
References
CUDA C Programming Guide
Programming Massively Parallel Processors By David B. Kirk and Wen-mei W.Hwu
No comments:
Post a Comment
Help us to improve our quality and become contributor to our blog