I'm fairly new to GPU programming and C++ AMP. Can anyone help make a general optimized 2D image convolution filter? My fasted version so far is listed below. Can this be done better with tiling in some way? This version works and is much faster than my CPU implementation but I hope to get it even better.
void FIRFilterCore(array_view<const float, 2> src, array_view<float, 2> dst, array_view<const float, 2> kernel)
{
int vertRadius = kernel.extent[0] / 2;
int horzRadius = kernel.extent[1] / 2;
parallel_for_each(src.extent, [=](index<2> idx) restrict(amp)
{
float sum = 0;
if (idx[0] < vertRadius || idx[1] < horzRadius ||
idx[0] >= src.extent[0] - vertRadius || idx[1] >= src.extent[1] - horzRadius)
{
// Handle borders by duplicating edges
for (int dy = -vertRadius; dy <= vertRadius; dy++)
{
index<2> srcIdx(direct3d::clamp(idx[0] + dy, 0, src.extent[0] - 1), 0);
index<2> kIdx(vertRadius + dy, 0);
for (int dx = -horzRadius; dx <= horzRadius; dx++)
{
srcIdx[1] = direct3d::clamp(idx[1] + dx, 0, src.extent[1] - 1);
sum += src[srcIdx] * kernel[kIdx];
kIdx[1]++;
}
}
}
else // Central part
{
for (int dy = -vertRadius; dy <= vertRadius; dy++)
{
index<2> srcIdx(idx[0] + dy, idx[1] - horzRadius);
index<2> kIdx(vertRadius + dy, 0);
for (int dx = -horzRadius; dx <= horzRadius; dx++)
{
sum += src[srcIdx] * kernel[kIdx];
srcIdx[1]++;
kIdx[1]++;
}
}
}
dst[idx] = sum;
});
}
Another way to go around it would of course be to perform the convolution in the Fourier domain, but I'm not sure it would perform as long as the filter is fairly small compared to the image (which does not have side lengths which are powers of 2 by the way).