Graphics Reference
In-Depth Information
#define HALO 8
__global__ void Convolution_2D_Shared ( float ￿ Filter_Response ,
float ￿ Image , int DATA_W , int DATA_H )
{ int x = blockIdx . x ￿ VALID_RESPONSES_X + threadIdx . x ;
int y = blockIdx . y ￿ VALID_RESPONSES_Y + threadIdx . y ;
__shared__ float s_Image [64][96]; // y, x
// Reset shared memory
s_Image [ threadIdx . y ][ threadIdx . x ] = 0.0 f ;
s_Image [ threadIdx . y ][ threadIdx . x + 32] = 0.0 f ;
s_Image [ threadIdx . y ][ threadIdx . x + 64] = 0.0 f ;
s_Image [ threadIdx . y +32][ threadIdx . x ] = 0.0 f ;
s_Image [ threadIdx . y +32][ threadIdx . x + 32] = 0.0 f ;
s_Image [ threadIdx . y +32][ threadIdx . x + 64] = 0.0 f ;
// Read data into shared memory
if ((( x HALO ) > =0)&&(( x HALO ) < DATA_W )
&& ( ( y HALO ) > =0)&&(( y HALO ) < DATA_H ))
s_Image [ threadIdx . y ][ threadIdx . x ]=
Image [ Get2DIndex ( x HALO , y HALO , DATA_W )];
if ((( x +32 HALO ) < DATA_W )
&& ( ( y HALO ) > =0)&&(( y HALO ) < DATA_H ))
s_Image [ threadIdx . y ][ threadIdx . x + 32] =
Image [ Get2DIndex ( x +32 HALO , y HALO , DATA_W )];
if ((( x +64 HALO ) < DATA_W )
&& ( ( y HALO ) > =0)&&(( y HALO ) < DATA_H ))
s_Image [ threadIdx . y ][ threadIdx . x + 64] =
Image [ Get2DIndex ( x +64 HALO , y HALO , DATA_W )];
if ((( x HALO ) > =0)
&& ( ( x HALO ) < DATA_W )&&(( y +32 HALO ) < DATA_H ))
s_Image [ threadIdx . y + 32][ threadIdx . x ]=
Image [ Get2DIndex ( x HALO , y +32
HALO , DATA_W )];
if ((( x +32
HALO ) < DATA_H ))
s_Image [ threadIdx . y + 32][ threadIdx . x + 32] =
Image [ Get2DIndex ( x +32
HALO ) < DATA_W )&&(( y +32
HALO , y +32
HALO , DATA_W )];
if ((( x +64 HALO ) < DATA_W )&&(( y +32 HALO ) < DATA_H ))
s_Image [ threadIdx . y + 32][ threadIdx . x + 64] =
Image [ Get2DIndex ( x +64 HALO , y +32 HALO , DATA_W )];
__syncthreads () ;
Listing 5.2. Non-separable 2D convolution using shared memory. This listing represents
the first part of the kernel, where data is loaded into shared memory. Each thread block
consists of 32 × 32 threads, such that each thread has to read six values into shared
memory (storing 96 × 64 values). The parameter HALO can be changed to control the
size of the largest filter that can be applied ( HALO*2 + 1 ). Before the actual convolution
is started, synchronization of the threads is required to guarantee that all values have
been loaded into shared memory.
Search WWH ::




Custom Search