Basic Programming Module, You are at Level - 1
How to Reverse Single Block in an Array; CUDA C/C++
Problem Statement:
You are given an array of integer (say), your task to reverse the array content block wise.
Given an input array {a0, a1, …, an-1} in pointer d_a, store the reversed array {an-1, an-2, …, a0} in pointer d_b Start from the “reverseArray_singleblock” template Only one thread block launched, to reverse an array of size
N = numThreads = 256 elements Part 1 (of 1): All you have to do is implement the body of the kernel “reverseArrayBlock()” Each thread moves a single element to reversed position Read input from d_a pointer Store output in reversed location in d_b pointer.
Problem Solution:
This is a simple example, that teach you how kernel (device code) to be write. In order to solve above problem we need a mapping between blocks in Grid of threads.
Lets do it;
let say blocks in my grid is named as "bx" and thread in blocks is named as "tx".
since blockDim.x gives me, number of threads in X direction, since thread indexing start from 0 so the total number of thread in a block is ( blockDim.x - 1 ).
Up to this we came to know that how much threads in a block we have. Next task it to find the index of each thread, in array which is to be operate. which will give by this formula
// includes, system
#include <stdio.h>
#include <assert.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part 1 of 1: implement the kernel
__global__ void reverseArrayBlock(int *d_b, int *d_a )
{
int bx= blockIdx.x ;
int tx = threadIdx.x ;
int i = blockDim.x - 1 - (blockDim.x * bx + tx ) ;
d_b[blockDim.x * bx + tx ] = d_a[i] ;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
// pointer for host memory and size
int *h_a;
int dimA = 256;
// pointer for device memory
int *d_b, *d_a;
// define grid and block size
int numBlocks = 1;
int numThreadsPerBlock = dimA;
// allocate host and device memory
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
h_a = (int *) malloc(memSize);
cudaMalloc( (void **) &d_a, memSize );
cudaMalloc( (void **) &d_b, memSize );
// Initialize input array on host
for (int i = 0; i < dimA; ++i)
{
h_a[i] = i;
}
// Copy host array to device array
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a );
// block until the device has completed
cudaThreadSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// device to host copy
cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// verify the data returned to the host is correct
for (int i = 0; i < dimA; i++)
{
assert(h_a[i] == dimA - 1 - i );
}
// free device memory
cudaFree(d_a);
cudaFree(d_b);
// free host memory
free(h_a);
// If the program makes it this far, then the results are correct and
// there are no run-time errors. Good work!
printf("Correct!\n");
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
Does everything in above code is right? Think.. Think
How to Reverse Single Block in an Array; CUDA C/C++
Problem Statement:
You are given an array of integer (say), your task to reverse the array content block wise.
Given an input array {a0, a1, …, an-1} in pointer d_a, store the reversed array {an-1, an-2, …, a0} in pointer d_b Start from the “reverseArray_singleblock” template Only one thread block launched, to reverse an array of size
N = numThreads = 256 elements Part 1 (of 1): All you have to do is implement the body of the kernel “reverseArrayBlock()” Each thread moves a single element to reversed position Read input from d_a pointer Store output in reversed location in d_b pointer.
Problem Solution:
This is a simple example, that teach you how kernel (device code) to be write. In order to solve above problem we need a mapping between blocks in Grid of threads.
Lets do it;
let say blocks in my grid is named as "bx" and thread in blocks is named as "tx".
since blockDim.x gives me, number of threads in X direction, since thread indexing start from 0 so the total number of thread in a block is ( blockDim.x - 1 ).
Up to this we came to know that how much threads in a block we have. Next task it to find the index of each thread, in array which is to be operate. which will give by this formula
blockDim.x - 1 - (blockDim.x * bx + tx )
Question? how it came;
blockDim.x * bx ; gives you a unique block Id in a grid
blockDim.x * bx + tx ; Gives you unique thread id in a block; which is in operation
blockDim.x - 1 ; already explained above,
let say this term as "i". so ,
i = blockDim.x - 1 - (blockDim.x * bx + tx ) ;
so finally. we have the id of the thread element which is be swapped but booo.. we don't have still from which it is to be swapped, calm down, it is easy to get, blockDim.x * bx + tx gives us the second element Id.
Here is complete Code in CUDA C.
// includes, system
#include <stdio.h>
#include <assert.h>
// Simple utility function to check for CUDA runtime errors
void checkCUDAError(const char* msg);
// Part 1 of 1: implement the kernel
__global__ void reverseArrayBlock(int *d_b, int *d_a )
{
int bx= blockIdx.x ;
int tx = threadIdx.x ;
int i = blockDim.x - 1 - (blockDim.x * bx + tx ) ;
d_b[blockDim.x * bx + tx ] = d_a[i] ;
}
////////////////////////////////////////////////////////////////////////////////
// Program main
////////////////////////////////////////////////////////////////////////////////
int main( int argc, char** argv)
{
// pointer for host memory and size
int *h_a;
int dimA = 256;
// pointer for device memory
int *d_b, *d_a;
// define grid and block size
int numBlocks = 1;
int numThreadsPerBlock = dimA;
// allocate host and device memory
size_t memSize = numBlocks * numThreadsPerBlock * sizeof(int);
h_a = (int *) malloc(memSize);
cudaMalloc( (void **) &d_a, memSize );
cudaMalloc( (void **) &d_b, memSize );
// Initialize input array on host
for (int i = 0; i < dimA; ++i)
{
h_a[i] = i;
}
// Copy host array to device array
cudaMemcpy( d_a, h_a, memSize, cudaMemcpyHostToDevice );
// launch kernel
dim3 dimGrid(numBlocks);
dim3 dimBlock(numThreadsPerBlock);
reverseArrayBlock<<< dimGrid, dimBlock >>>( d_b, d_a );
// block until the device has completed
cudaThreadSynchronize();
// check if kernel execution generated an error
// Check for any CUDA errors
checkCUDAError("kernel invocation");
// device to host copy
cudaMemcpy( h_a, d_b, memSize, cudaMemcpyDeviceToHost );
// Check for any CUDA errors
checkCUDAError("memcpy");
// verify the data returned to the host is correct
for (int i = 0; i < dimA; i++)
{
assert(h_a[i] == dimA - 1 - i );
}
// free device memory
cudaFree(d_a);
cudaFree(d_b);
// free host memory
free(h_a);
// If the program makes it this far, then the results are correct and
// there are no run-time errors. Good work!
printf("Correct!\n");
return 0;
}
void checkCUDAError(const char *msg)
{
cudaError_t err = cudaGetLastError();
if( cudaSuccess != err)
{
fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString( err) );
exit(EXIT_FAILURE);
}
}
No, we did not check boundary condition in kernel. It may be possible the thread Id jumps outside the index of array element which leads to segmentation fault, try to find a way to overcome it....
Feel free to comment...
Exercise
How to Reverse MultiBlock in an Array; CUDA C/C++?
References
CUDA C Programming Guide
Programming Massively Parallel Processors By David B. Kirk and Wen-mei W.Hwu
Feel free to comment...
Exercise
How to Reverse MultiBlock in an Array; CUDA C/C++?
References
CUDA C Programming Guide
Programming Massively Parallel Processors By David B. Kirk and Wen-mei W.Hwu
Reverse an Array in C++
ReplyDeletesimple and easy
Hi,
ReplyDeleteHow blockDim.x - 1 - (blockDim.x * bx + tx ) will work if number of blocks are increased?
Thoughtful blog thanks for posting
ReplyDelete