/*
 * 2D convolution of a single-channel image with a square filter.
 *
 * Each work-item computes one output pixel: global id 0 is the column,
 * global id 1 is the row.
 *
 * Parameters:
 *   sourceImage  - input image; only the 'x' channel of each pixel is used.
 *   outputImage  - destination image; receives the weighted sum in 'x' and
 *                  0.0f in the remaining channels.
 *   rows, cols   - extent of the region to be written. Work-items whose
 *                  (row, column) fall outside it do nothing.
 *   filter       - filterWidth * filterWidth coefficients, row-major.
 *   filterWidth  - side length of the filter. The filter is positioned so
 *                  that tap (filterWidth/2, filterWidth/2) lies on the
 *                  output pixel; for an even width the window therefore
 *                  extends one pixel further towards negative offsets.
 *                  A value <= 0 yields an all-zero output pixel.
 *   sampler      - sampler used for every read. Integer coordinates are
 *                  passed to read_imagef, so the sampler has to be one that
 *                  is valid for integer coordinates (per the OpenCL C spec:
 *                  unnormalized coordinates, nearest filtering). What is
 *                  returned for coordinates outside the image is decided by
 *                  the sampler's addressing mode, not by this kernel.
 */
__kernel
void convolution(
   __read_only  image2d_t  sourceImage,
   __write_only image2d_t  outputImage, 
   int rows,
   int cols,
   __constant float* filter, 
   int filterWidth,
   sampler_t sampler) 
{
   // Store each work-item's unique row and column
   int column = get_global_id(0);
   int row    = get_global_id(1);

   // Work-items padding the global range past the image produce no
   // output, so leave before doing any image reads. (There are no
   // barriers in this kernel, so an early return is safe.)
   if(row >= rows || column >= cols) {
      return;
   }

   // Offset from the output pixel to the first filter tap
   int halfWidth = filterWidth / 2;

   // Only the 'x' channel carries data, so accumulate a scalar
   float sum = 0.0f;

   int2 coords;  // Coordinates for accessing the image

   // Iterate over exactly filterWidth x filterWidth taps. Indexing the
   // filter explicitly (instead of a running counter over a
   // -halfWidth..halfWidth range) keeps every access inside the
   // coefficient array even when filterWidth is even.
   for(int fy = 0; fy < filterWidth; fy++) {
      coords.y = row + fy - halfWidth;
      int rowBase = fy * filterWidth;

      for(int fx = 0; fx < filterWidth; fx++) {
         coords.x = column + fx - halfWidth;

         // All image reads return a four-element vector; take 'x'
         float4 pixel = read_imagef(sourceImage, sampler, coords);
         sum += pixel.x * filter[rowBase + fx];
      }
   }

   // Write the result for this work-item's pixel
   coords.x = column;
   coords.y = row;
   write_imagef(outputImage, coords, (float4)(sum, 0.0f, 0.0f, 0.0f));
}