Graphics Reference
In-Depth Information
#define
HALO 8
__global__
void
Convolution_2D_Shared
(
float
Filter_Response
,
float
Image
,
int
DATA_W
,
int
DATA_H
)
{
int
x
=
blockIdx
.
x
VALID_RESPONSES_X
+
threadIdx
.
x
;
int
y
=
blockIdx
.
y
VALID_RESPONSES_Y
+
threadIdx
.
y
;
__shared__
float
s_Image
[64][96];
// y, x
// Reset shared memory
s_Image
[
threadIdx
.
y
][
threadIdx
.
x
] = 0.0
f
;
s_Image
[
threadIdx
.
y
][
threadIdx
.
x
+ 32] = 0.0
f
;
s_Image
[
threadIdx
.
y
][
threadIdx
.
x
+ 64] = 0.0
f
;
s_Image
[
threadIdx
.
y
+32][
threadIdx
.
x
] = 0.0
f
;
s_Image
[
threadIdx
.
y
+32][
threadIdx
.
x
+ 32] = 0.0
f
;
s_Image
[
threadIdx
.
y
+32][
threadIdx
.
x
+ 64] = 0.0
f
;
// Read data into shared memory
if
(((
x
−
HALO
)
>
=0)&&((
x
−
HALO
)
<
DATA_W
)
&& ( (
y
−
HALO
)
>
=0)&&((
y
−
HALO
)
<
DATA_H
))
s_Image
[
threadIdx
.
y
][
threadIdx
.
x
]=
Image
[
Get2DIndex
(
x
−
HALO
,
y
−
HALO
,
DATA_W
)];
if
(((
x
+32
−
HALO
)
<
DATA_W
)
&& ( (
y
−
HALO
)
>
=0)&&((
y
−
HALO
)
<
DATA_H
))
s_Image
[
threadIdx
.
y
][
threadIdx
.
x
+ 32] =
Image
[
Get2DIndex
(
x
+32
−
HALO
,
y
−
HALO
,
DATA_W
)];
if
(((
x
+64
−
HALO
)
<
DATA_W
)
&& ( (
y
−
HALO
)
>
=0)&&((
y
−
HALO
)
<
DATA_H
))
s_Image
[
threadIdx
.
y
][
threadIdx
.
x
+ 64] =
Image
[
Get2DIndex
(
x
+64
−
HALO
,
y
−
HALO
,
DATA_W
)];
if
(((
x
−
HALO
)
>
=0)
&& ( (
x
−
HALO
)
<
DATA_W
)&&((
y
+32
−
HALO
)
<
DATA_H
))
s_Image
[
threadIdx
.
y
+ 32][
threadIdx
.
x
]=
Image
[
Get2DIndex
(
x
−
HALO
,
y
+32
−
HALO
,
DATA_W
)];
if
(((
x
+32
−
HALO
)
<
DATA_H
))
s_Image
[
threadIdx
.
y
+ 32][
threadIdx
.
x
+ 32] =
Image
[
Get2DIndex
(
x
+32
−
HALO
)
<
DATA_W
)&&((
y
+32
−
HALO
,
y
+32
−
HALO
,
DATA_W
)];
if
(((
x
+64
−
HALO
)
<
DATA_W
)&&((
y
+32
−
HALO
)
<
DATA_H
))
s_Image
[
threadIdx
.
y
+ 32][
threadIdx
.
x
+ 64] =
Image
[
Get2DIndex
(
x
+64
−
HALO
,
y
+32
−
HALO
,
DATA_W
)];
__syncthreads
() ;
Listing 5.2.
Non-separable 2D convolution using shared memory. This listing represents
the first part of the kernel, where data is loaded into shared memory. Each thread block
consists of 32
×
32 threads, such that each thread has to read six values into shared
memory (storing 96
×
64 values). The parameter
HALO
can be changed to control the
size of the largest filter that can be applied (
HALO*2 + 1
). Before the actual convolution
is started, synchronization of the threads is required to guarantee that all values have
been loaded into shared memory.