1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86
   | #include <stdlib.h> #include <stdio.h> #include <opencv/cv.h> #include <opencv/highgui.h> #include <opencv2/opencv.hpp>
  #include "cuda_runtime.h" #include "device_launch_parameters.h"
  #ifdef _DEBUG #pragma comment(lib, "opencv_core247d.lib") #pragma comment(lib, "opencv_imgproc247d.lib") #pragma comment(lib, "opencv_highgui247d.lib") #else #pragma comment(lib, "opencv_core247.lib") #pragma comment(lib, "opencv_imgproc247.lib") #pragma comment(lib, "opencv_highgui247.lib") #endif 
  /**
   * Smooths a packed 3-channel, 8-bit image with a plus-shaped 5-point filter.
   *
   * For every channel of every pixel the result is
   *     (4 * centre + left + right + top + bottom) / 8
   * truncated to an integer. A neighbour that would fall outside the image is
   * replaced by the centre pixel itself, so border pixels stay well defined.
   *
   * Launch layout: 2D grid of 2D blocks, one thread per pixel. The grid may
   * overshoot the image; threads outside [0,width) x [0,height) do nothing.
   *
   * Preconditions:
   *  - src and dst each address width * height uchar3 elements laid out row
   *    after row with a stride of exactly `width` elements (no row padding).
   *  - src and dst must not overlap: each thread reads its neighbours' source
   *    pixels while other threads are writing their destination pixels.
   *
   * @param src    input pixels (read only)
   * @param dst    output pixels
   * @param width  image width in pixels
   * @param height image height in pixels
   */
  __global__ void smooth_kernel(const uchar3* src, uchar3* dst, int width, int height)
  {
      const int x = threadIdx.x + blockIdx.x * blockDim.x;
      const int y = threadIdx.y + blockIdx.y * blockDim.y;

      // Both coordinates must be inside the image; the grid is rounded up to
      // whole blocks, so some threads land past the right/bottom edge.
      if (x >= width || y >= height)
      {
          return;
      }

      const int offset = x + y * width;

      // Out-of-range neighbours collapse onto the centre pixel.
      const int left   = (x > 0)          ? offset - 1     : offset;
      const int right  = (x + 1 < width)  ? offset + 1     : offset;
      const int top    = (y > 0)          ? offset - width : offset;
      const int bottom = (y + 1 < height) ? offset + width : offset;

      const uchar3 c = src[offset];
      const uchar3 l = src[left];
      const uchar3 r = src[right];
      const uchar3 t = src[top];
      const uchar3 b = src[bottom];

      // The weighted sum is at most 8 * 255, so integer division by 8 is exact
      // truncation and always fits in an unsigned char; this avoids the
      // double-precision multiply a bare 0.125 literal would introduce.
      uchar3 out;
      out.x = (unsigned char)((4 * c.x + l.x + r.x + t.x + b.x) / 8);
      out.y = (unsigned char)((4 * c.y + l.y + r.y + t.y + b.y) / 8);
      out.z = (unsigned char)((4 * c.z + l.z + r.z + t.z + b.z) / 8);
      dst[offset] = out;
  }
  /**
   * Host-side launcher for smooth_kernel.
   *
   * Covers the image with 16x16 thread blocks (grid rounded up so every pixel
   * gets a thread), launches the kernel on the default stream and blocks until
   * it has finished. Launch and execution errors are reported on stderr; the
   * function itself returns nothing, so callers that need the status can query
   * the CUDA runtime afterwards.
   *
   * @param src    device pointer to the input pixels
   * @param dst    device pointer to the output pixels
   * @param width  image width in pixels
   * @param height image height in pixels
   */
  void smooth_caller(const uchar3* src, uchar3* dst, int width, int height)
  {
      // A non-positive dimension would yield a zero-sized grid, which is an
      // invalid launch configuration; there is nothing to process anyway.
      if (width <= 0 || height <= 0)
      {
          return;
      }

      const dim3 threads(16, 16);
      const dim3 grids((width + threads.x - 1) / threads.x,
                       (height + threads.y - 1) / threads.y);

      smooth_kernel<<<grids, threads>>>(src, dst, width, height);

      // Launch-configuration errors are only visible through cudaGetLastError().
      cudaError_t status = cudaGetLastError();
      if (status != cudaSuccess)
      {
          fprintf(stderr, "smooth_kernel launch failed: %s\n", cudaGetErrorString(status));
          return;
      }

      // cudaDeviceSynchronize() supersedes the deprecated cudaThreadSynchronize()
      // and also surfaces faults raised while the kernel was running.
      status = cudaDeviceSynchronize();
      if (status != cudaSuccess)
      {
          fprintf(stderr, "smooth_kernel execution failed: %s\n", cudaGetErrorString(status));
      }
  }
  /**
   * Reports a failed CUDA runtime call on stderr.
   *
   * @param status return code of the call
   * @param what   short label identifying the call in the message
   * @return true when status is cudaSuccess, false otherwise
   */
  static bool cuda_ok(cudaError_t status, const char* what)
  {
      if (status == cudaSuccess)
      {
          return true;
      }
      fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
      return false;
  }

  /**
   * Loads "lena.png", smooths it on the GPU via smooth_caller and shows the
   * original ("src") and the result ("gpu") in two windows until a key is
   * pressed.
   *
   * @return 0 on success, EXIT_FAILURE if the image cannot be used or a CUDA
   *         call fails
   */
  int main()
  {
      cv::Mat image = cv::imread("lena.png");
      if (image.empty())
      {
          fprintf(stderr, "could not load lena.png\n");
          return EXIT_FAILURE;
      }
      // The device buffers are reinterpreted as uchar3, so the pixels must be
      // three 8-bit channels.
      if (image.type() != CV_8UC3)
      {
          fprintf(stderr, "lena.png is not an 8-bit 3-channel image\n");
          return EXIT_FAILURE;
      }
      // The kernel walks rows with a stride of exactly `cols` pixels; make sure
      // the host buffer has no per-row padding before copying it verbatim.
      if (!image.isContinuous())
      {
          image = image.clone();
      }
      cv::imshow("src", image);

      const size_t memSize = image.step * image.rows;
      uchar3* d_src = NULL;
      uchar3* d_dst = NULL;

      // cudaMalloc writes the device address through its first argument, so it
      // needs the ADDRESS of the pointer variable, not the (NULL) pointer value.
      bool ok = cuda_ok(cudaMalloc((void**)&d_src, memSize), "cudaMalloc(d_src)")
             && cuda_ok(cudaMalloc((void**)&d_dst, memSize), "cudaMalloc(d_dst)")
             && cuda_ok(cudaMemcpy(d_src, image.data, memSize, cudaMemcpyHostToDevice),
                        "cudaMemcpy(host -> device)");

      if (ok)
      {
          smooth_caller(d_src, d_dst, image.cols, image.rows);

          // The result overwrites the host image in place.
          ok = cuda_ok(cudaMemcpy(image.data, d_dst, memSize, cudaMemcpyDeviceToHost),
                       "cudaMemcpy(device -> host)");
      }

      // Released on every path; cudaFree accepts a null pointer, so a buffer
      // that was never allocated is harmless here.
      cudaFree(d_src);
      cudaFree(d_dst);

      if (!ok)
      {
          return EXIT_FAILURE;
      }

      cv::imshow("gpu", image);
      cv::waitKey(0);

      return 0;
  }
   |