#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>

#include "utils.h"

// Rounds value up to the nearest multiple of `multiple`.
//
// value:    the number to round
// multiple: the step size to round up to
// Returns:  the smallest multiple of `multiple` that is >= value, or
//           value itself when it already is a multiple. If `multiple`
//           is 0 the value is returned unchanged. The arithmetic is
//           unsigned, so a result that does not fit wraps around.
unsigned int roundUp(unsigned int value, unsigned int multiple) {

  // `value % 0` is undefined behavior and there is no meaningful
  // rounding target, so hand the value back untouched
  if(multiple == 0) {
    return value;
  }

  // Determine how far past the previous multiple the value is
  unsigned int remainder = value % multiple;

  // Add the difference to reach the next multiple
  if(remainder != 0) {
    value += (multiple - remainder);
  }

  return value;
}

// Reads the entire file at kernelPath into a newly allocated,
// NUL-terminated buffer.
//
// kernelPath: path of the file to load (opened in binary mode)
// Returns:    heap buffer holding the file contents followed by '\0';
//             the caller owns it and releases it with free()
//
// Any failure (open, seek, tell, allocation, read) prints a message and
// terminates the process with exit(-1), so NULL is never returned.
char* readSource(char* kernelPath) {

   FILE *fp;
   char *source;
   long int size;
   size_t bytesRead;

   printf("Program file is: %s\n", kernelPath);

   fp = fopen(kernelPath, "rb");
   if(!fp) {
      printf("Could not open kernel file\n");
      exit(-1);
   }

   // Find the file length: seek to the end and ask for the offset
   if(fseek(fp, 0, SEEK_END) != 0) {
      printf("Error seeking to end of file\n");
      fclose(fp);
      exit(-1);
   }
   size = ftell(fp);
   if(size < 0) {
      printf("Error getting file position\n");
      fclose(fp);
      exit(-1);
   }

   rewind(fp);

   // One extra byte for the string terminator. The result must be
   // checked before the buffer is touched.
   source = (char *)malloc((size_t)size + 1);
   if(source == NULL) {
      printf("Error allocating space for the kernel source\n");
      fclose(fp);
      exit(-1);
   }

   // A short read would leave part of the buffer uninitialized, so
   // treat it as a hard error like every other failure in here
   bytesRead = fread(source, 1, (size_t)size, fp);
   if(bytesRead != (size_t)size) {
      printf("Error reading kernel file\n");
      free(source);
      fclose(fp);
      exit(-1);
   }
   source[size] = '\0';

   fclose(fp);

   return source;
}

// Verifies the result of an OpenCL call.
//
// status: code produced by the call
// cmd:    label for the call, printed in the failure message
//
// Returns normally when status is CL_SUCCESS; otherwise prints
// "<cmd> failed (<status>)" and terminates the process with exit(-1).
void chk(cl_int status, const char* cmd) {

   // Nothing to report on success
   if(status == CL_SUCCESS) {
      return;
   }

   printf("%s failed (%d)\n", cmd, status);
   exit(-1);
}

// Runs a 7x7 convolution over input.bmp on an OpenCL device, writes the
// result to output.bmp, then recomputes the convolution on the host and
// reports whether the two results agree to within 0.01 per pixel.
//
// Returns 0 whenever the end of the function is reached (including when
// a mismatch is reported). Any OpenCL or allocation failure terminates
// the process with exit(-1).
int main() {

   int i, j, k, l;

   // Rows and columns in the input image
   int imageHeight;
   int imageWidth;

   const char* inputFile = "input.bmp";
   const char* outputFile = "output.bmp";

   // Homegrown function to read a BMP from file
   float* inputImage = readImage(inputFile, &imageWidth,
      &imageHeight);
   if(inputImage == NULL) {
      printf("Error reading the input image\n");
      exit(-1);
   }

   // Size in bytes of the input and output images on the host.
   // Computed in size_t so large images do not overflow int.
   size_t dataSize = (size_t)imageHeight*(size_t)imageWidth*sizeof(float);

   // Device result and host reference result
   float* outputImage = (float*)malloc(dataSize);
   float* refImage = (float*)malloc(dataSize);
   if(outputImage == NULL || refImage == NULL) {
      printf("Error allocating space for the host images\n");
      exit(-1);
   }

   // Horizontal-gradient (Sobel-style) 3x3 kernel, zero-padded to 7x7
   float filter[49] =
      {0,      0,      0,      0,      0,      0,      0,
       0,      0,      0,      0,      0,      0,      0,
       0,      0,     -1,      0,      1,      0,      0,
       0,      0,     -2,      0,      2,      0,      0,
       0,      0,     -1,      0,      1,      0,      0,
       0,      0,      0,      0,      0,      0,      0,
       0,      0,      0,      0,      0,      0,      0};

   // The convolution filter is 7x7
   int filterWidth = 7;
   int filterSize  = filterWidth*filterWidth;  // Assume a square kernel

   // Set up the OpenCL environment
   cl_int status;

   // Discover platform
   cl_platform_id platform;
   status = clGetPlatformIDs(1, &platform, NULL);
   chk(status, "clGetPlatformIDs");

   // Discover device. The return value must be captured, otherwise the
   // check below would re-test the previous call's status.
   cl_device_id device;
   status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
   chk(status, "clGetDeviceIDs");

   // Create context
   cl_context_properties props[3] = {CL_CONTEXT_PLATFORM,
       (cl_context_properties)(platform), 0};
   cl_context context;
   context = clCreateContext(props, 1, &device, NULL, NULL, &status);
   chk(status, "clCreateContext");

   // Create command queue
   cl_command_queue queue;
   queue = clCreateCommandQueue(context, device, 0, &status);
   chk(status, "clCreateCommandQueue");

   // The image format describes how the data will be stored in memory
   cl_image_format format;
   format.image_channel_order     = CL_R;     // single channel
   format.image_channel_data_type = CL_FLOAT; // float data type

   // Create space for the source image on the device
   cl_mem d_inputImage = clCreateImage2D(context, 0, &format, imageWidth,
      imageHeight, 0, NULL, &status);
   chk(status, "clCreateImage2D");

   // Create space for the output image on the device
   cl_mem d_outputImage = clCreateImage2D(context, 0, &format, imageWidth,
      imageHeight, 0, NULL, &status);
   chk(status, "clCreateImage2D");

   // Create space for the 7x7 filter on the device
   cl_mem d_filter = clCreateBuffer(context, 0, filterSize*sizeof(float),
      NULL, &status);
   chk(status, "clCreateBuffer");

   // Copy the source image to the device
   size_t origin[3] = {0, 0, 0};  // Offset within the image to copy from
   size_t region[3] = {(size_t)imageWidth, (size_t)imageHeight, 1}; // Elements per dimension
   status = clEnqueueWriteImage(queue, d_inputImage, CL_FALSE, origin, region,
      0, 0, inputImage, 0, NULL, NULL);
   chk(status, "clEnqueueWriteImage");

   // Copy the 7x7 filter to the device
   status = clEnqueueWriteBuffer(queue, d_filter, CL_FALSE, 0,
      filterSize*sizeof(float), filter, 0, NULL, NULL);
   chk(status, "clEnqueueWriteBuffer");

   // Create the image sampler (unnormalized coordinates, clamp to edge,
   // nearest-pixel filtering)
   cl_sampler sampler = clCreateSampler(context, CL_FALSE,
      CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &status);
   chk(status, "clCreateSampler");

   // Load the kernel source; the buffer is heap-allocated and freed below
   char* kernelSource = readSource("kernel.cl");
   const char* sourceList[1] = { kernelSource };

   // Create a program object with source and build it. The error code is
   // requested explicitly so the check reflects this call.
   cl_program program;
   program = clCreateProgramWithSource(context, 1, sourceList, NULL, &status);
   chk(status, "clCreateProgramWithSource");
   status = clBuildProgram(program, 1, &device, NULL, NULL, NULL);
   chk(status, "clBuildProgram");

   // Create the kernel object
   cl_kernel kernel;
   kernel = clCreateKernel(program, "convolution", &status);
   chk(status, "clCreateKernel");

   // Set the kernel arguments
   status  = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage);
   status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage);
   status |= clSetKernelArg(kernel, 2, sizeof(int), &imageHeight);
   status |= clSetKernelArg(kernel, 3, sizeof(int), &imageWidth);
   status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter);
   status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth);
   status |= clSetKernelArg(kernel, 6, sizeof(cl_sampler), &sampler);
   chk(status, "clSetKernelArg");

   // One work item per output pixel
   size_t globalSize[2] = {(size_t)imageWidth, (size_t)imageHeight};
   status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0,
      NULL, NULL);
   chk(status, "clEnqueueNDRange");

   // Read the image back to the host (blocking read)
   status = clEnqueueReadImage(queue, d_outputImage, CL_TRUE, origin,
      region, 0, 0, outputImage, 0, NULL, NULL);
   chk(status, "clEnqueueReadImage");

   // Write the output image to file
   storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile);

   // Compute the reference image on the host. Every pixel is assigned
   // below, so no separate zero-fill pass is needed.
   // NOTE(review): neighbors outside the image are skipped here (zero
   // padding) while the sampler uses CL_ADDRESS_CLAMP_TO_EDGE; confirm
   // the kernel (not visible in this file) applies the same bounds test.
   int halfFilterWidth = filterWidth/2;
   float sum;
   // Iterate over the rows of the source image
   for(i = 0; i < imageHeight; i++) {
      // Iterate over the columns of the source image
      for(j = 0; j < imageWidth; j++) {
         sum = 0; // Reset sum for new source pixel
         // Apply the filter to the neighborhood
         for(k = - halfFilterWidth; k <= halfFilterWidth; k++) {
            for(l = - halfFilterWidth; l <= halfFilterWidth; l++) {
               if(i+k >= 0 && i+k < imageHeight &&
                  j+l >= 0 && j+l < imageWidth) {
                  sum += inputImage[(i+k)*imageWidth + j+l] *
                         filter[(k+halfFilterWidth)*filterWidth +
                            l+halfFilterWidth];
               }
            }
         }
         refImage[i*imageWidth+j] = sum;
      }
   }

   // Compare device and host results, stopping at the first mismatch.
   // fabsf keeps the difference in floating point; integer abs() would
   // truncate any error below 1.0 to zero and hide it.
   int failed = 0;
   for(i = 0; i < imageHeight && !failed; i++) {
      for(j = 0; j < imageWidth && !failed; j++) {
         if(fabsf(outputImage[i*imageWidth+j]-refImage[i*imageWidth+j]) > 0.01f) {
            printf("Results are INCORRECT\n");
            printf("Pixel mismatch at <%d,%d> (%f vs. %f)\n", i, j,
               outputImage[i*imageWidth+j], refImage[i*imageWidth+j]);
            failed = 1;
         }
      }
   }
   if(!failed) {
      printf("Results are correct\n");
   }

   // Release the host buffers allocated in this file.
   // NOTE(review): inputImage comes from readImage (defined elsewhere);
   // its ownership is not visible here, so it is left alone. The OpenCL
   // objects are likewise not explicitly released.
   free(kernelSource);
   free(refImage);
   free(outputImage);

   return 0;
}