cuda结合opencv实现简单的平滑滤波

这次也是使用opencv的mat加载处理图像。唯一与上次有区别的是核函数的编写。
根据cuda的线程分配模型,每一个像素是分配单独的线程处理的。那么有这样的一个疑问:
像平滑滤波这些应用,如何在每一个线程中获取周围的像素呢?
其实,这个问题很好解决。因为,在核函数中,我们能够根据线程id,块id,块尺寸等计算
出当前像素的位置。那么,自然能够得到其邻域的位置。从而实现了平滑滤波。
代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#include <stdlib.h>
#include <stdio.h>
#include <opencv/cv.h>
#include <opencv/highgui.h>
#include <opencv2/opencv.hpp>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#ifdef _DEBUG
#pragma comment(lib, "opencv_core247d.lib")
#pragma comment(lib, "opencv_imgproc247d.lib")
#pragma comment(lib, "opencv_highgui247d.lib")
#else
#pragma comment(lib, "opencv_core247.lib")
#pragma comment(lib, "opencv_imgproc247.lib")
#pragma comment(lib, "opencv_highgui247.lib")
#endif // DEBUG

/**
 * Smooths a 3-channel 8-bit image with a 5-point (cross-shaped) weighted average.
 *
 * Every thread handles one pixel. For each channel the output is
 *     (4 * centre + left + right + top + bottom) / 8
 * truncated to an 8-bit value. Neighbours that would fall outside the image
 * are replaced by the centre pixel itself.
 *
 * Launch layout: a 2D grid of 2D blocks covering at least width x height
 * threads; surplus threads exit without touching memory.
 *
 * @param src    input pixels, indexed as src[x + y * width] (rows are assumed
 *               to be tightly packed, i.e. no padding between rows)
 * @param dst    output pixels, same layout as src; must not be the same buffer
 *               as src because neighbouring threads read src while dst is
 *               being written
 * @param width  image width in pixels
 * @param height image height in pixels
 */
__global__ void smooth_kernel(const uchar3* src, uchar3* dst, int width, int height)
{
    const int x = threadIdx.x + blockIdx.x * blockDim.x;
    const int y = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so some threads lie outside the image.
    if (x >= width || y >= height)
    {
        return;
    }

    const int offset = x + y * width;

    // Out-of-range neighbours fall back to the centre pixel.
    const int left   = (x > 0)          ? offset - 1     : offset;
    const int right  = (x + 1 < width)  ? offset + 1     : offset;
    const int top    = (y > 0)          ? offset - width : offset;
    const int bottom = (y + 1 < height) ? offset + width : offset;

    const uchar3 c = src[offset];
    const uchar3 l = src[left];
    const uchar3 r = src[right];
    const uchar3 t = src[top];
    const uchar3 b = src[bottom];

    // The weights sum to 8 and the largest possible sum is 8 * 255, so integer
    // division by 8 always fits in 8 bits. It yields the same truncated value
    // as multiplying by 0.125 without using double-precision arithmetic.
    uchar3 out;
    out.x = (unsigned char)((4 * c.x + l.x + r.x + t.x + b.x) / 8);
    out.y = (unsigned char)((4 * c.y + l.y + r.y + t.y + b.y) / 8);
    out.z = (unsigned char)((4 * c.z + l.z + r.z + t.z + b.z) / 8);

    dst[offset] = out;
}

/**
 * Launches smooth_kernel over a width x height image and waits for it to finish.
 *
 * The image is covered with 16x16 thread blocks; the grid size is rounded up
 * so that images whose dimensions are not multiples of 16 are fully covered.
 * Launch and execution failures are reported on stderr.
 *
 * @param src    device pointer to the input pixels
 * @param dst    device pointer to the output pixels
 * @param width  image width in pixels
 * @param height image height in pixels
 */
void smooth_caller(const uchar3* src, uchar3* dst, int width, int height)
{
    // A zero-sized grid is an invalid launch configuration, so bail out early.
    if (src == NULL || dst == NULL || width <= 0 || height <= 0)
    {
        fprintf(stderr, "smooth_caller: invalid arguments, kernel not launched\n");
        return;
    }

    const dim3 threads(16, 16);
    const dim3 grids((width + threads.x - 1) / threads.x, (height + threads.y - 1) / threads.y);

    smooth_kernel<<<grids, threads>>>(src, dst, width, height);

    // Kernel launches return nothing; configuration errors show up here.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "smooth_kernel launch failed: %s\n", cudaGetErrorString(err));
        return;
    }

    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() replaces it.
    // Errors raised while the kernel was running are reported by this call.
    err = cudaDeviceSynchronize();
    if (err != cudaSuccess)
    {
        fprintf(stderr, "smooth_kernel execution failed: %s\n", cudaGetErrorString(err));
    }
}

// Prints a diagnostic for a failed CUDA runtime call; returns true on success.
static bool cuda_ok(cudaError_t err, const char* what)
{
    if (err == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Loads "lena.png", smooths it on the GPU and shows the input and the result.
 *
 * @return 0 on success, EXIT_FAILURE if the image cannot be loaded or a CUDA
 *         call fails.
 */
int main()
{
    cv::Mat image = cv::imread("lena.png");
    if (image.empty())
    {
        fprintf(stderr, "failed to load lena.png\n");
        return EXIT_FAILURE;
    }
    // The kernel works on uchar3 pixels, i.e. 8-bit 3-channel data.
    if (image.type() != CV_8UC3)
    {
        fprintf(stderr, "lena.png is not an 8-bit 3-channel image\n");
        return EXIT_FAILURE;
    }
    // The kernel addresses pixels as x + y * width, so rows must not be padded.
    if (!image.isContinuous())
    {
        image = image.clone();
    }
    cv::imshow("src", image);

    // Separate output image so the source pixels are left untouched.
    cv::Mat result(image.size(), image.type());

    const size_t memSize = image.total() * image.elemSize();
    uchar3* d_src = NULL;
    uchar3* d_dst = NULL;

    // cudaMalloc needs the ADDRESS of the pointer it should fill in.
    bool ok = cuda_ok(cudaMalloc((void**)&d_src, memSize), "cudaMalloc(d_src)")
           && cuda_ok(cudaMalloc((void**)&d_dst, memSize), "cudaMalloc(d_dst)")
           && cuda_ok(cudaMemcpy(d_src, image.data, memSize, cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");

    if (ok)
    {
        smooth_caller(d_src, d_dst, image.cols, image.rows);
        ok = cuda_ok(cudaMemcpy(result.data, d_dst, memSize, cudaMemcpyDeviceToHost),
                     "cudaMemcpy(device to host)");
    }

    // cudaFree on a null pointer performs no operation, so this is safe on every path.
    cudaFree(d_src);
    cudaFree(d_dst);

    if (!ok)
    {
        return EXIT_FAILURE;
    }

    cv::imshow("gpu", result);
    cv::waitKey(0);

    return 0;
}

效果如图: