/*
 * (c) 2007 The Board of Trustees of the University of Illinois.
 */

#include <parboil.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <CL/cl.h>

#if _POSIX_VERSION >= 200112L
# include <sys/time.h>
#endif

//#include "perfmon.h"

cl_context *clContextPtr;
cl_command_queue *clCommandQueuePtr;

// #define DISABLE_PARBOIL_TIMER

/*****************************************************************************/
/* Timer routines */

static int is_async(enum pb_TimerID timer)
{
  return (timer == pb_TimerID_KERNEL) || 
             (timer == pb_TimerID_COPY_ASYNC);
}

static int is_blocking(enum pb_TimerID timer)
{
  return (timer == pb_TimerID_COPY) || (timer == pb_TimerID_NONE);
}

#define INVALID_TIMERID pb_TimerID_LAST

static int asyncs_outstanding(struct pb_TimerSet* timers)
{
  return (timers->async_markers != NULL) && 
           (timers->async_markers->timerID != INVALID_TIMERID);
}

static struct pb_async_time_marker_list * 
get_last_async(struct pb_TimerSet* timers)
{
  /* Find the last event recorded thus far */
  struct pb_async_time_marker_list * last_event = timers->async_markers;
  if(last_event != NULL && last_event->timerID != INVALID_TIMERID) {
    while(last_event->next != NULL && 
            last_event->next->timerID != INVALID_TIMERID)
      last_event = last_event->next;
    return last_event;
  } else
    return NULL;
}

static void insert_marker(struct pb_TimerSet* tset, enum pb_TimerID timer)
{
  cl_int ciErrNum = CL_SUCCESS;
  struct pb_async_time_marker_list ** new_event = &(tset->async_markers);

  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
    new_event = &((*new_event)->next);
  }

  if(*new_event == NULL) {
    *new_event = (struct pb_async_time_marker_list *) 
      			malloc(sizeof(struct pb_async_time_marker_list));
    (*new_event)->marker = calloc(1, sizeof(cl_event));
    /*
    // I don't think this is needed at all. I believe clEnqueueMarker 'creates' the event
#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
fprintf(stderr, "Creating Marker [%d]\n", timer);
    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Creating User Event Object!\n");
    }
    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Setting User Event Status!\n");
    }
#endif
*/
    (*new_event)->next = NULL;
  }

  /* valid event handle now aquired: insert the event record */
  (*new_event)->label = NULL;
  (*new_event)->timerID = timer;
  ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker);
  if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Enqueueing Marker!\n");
  }

}

static void insert_submarker(struct pb_TimerSet* tset, char *label, enum pb_TimerID timer)
{
  cl_int ciErrNum = CL_SUCCESS;
  struct pb_async_time_marker_list ** new_event = &(tset->async_markers);

  while(*new_event != NULL && (*new_event)->timerID != INVALID_TIMERID) {
    new_event = &((*new_event)->next);
  }

  if(*new_event == NULL) {
    *new_event = (struct pb_async_time_marker_list *) 
      			malloc(sizeof(struct pb_async_time_marker_list));
    (*new_event)->marker = calloc(1, sizeof(cl_event));
    /*
#if ( __OPENCL_VERSION__ >= CL_VERSION_1_1 )
fprintf(stderr, "Creating SubMarker %s[%d]\n", label, timer);
    *((cl_event *)((*new_event)->marker)) = clCreateUserEvent(*clContextPtr, &ciErrNum);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Creating User Event Object!\n");
    }
    ciErrNum = clSetUserEventStatus(*((cl_event *)((*new_event)->marker)), CL_QUEUED);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Setting User Event Status!\n");
    }
#endif
*/
    (*new_event)->next = NULL;
  }

  /* valid event handle now aquired: insert the event record */
  (*new_event)->label = label;
  (*new_event)->timerID = timer;
  ciErrNum = clEnqueueMarker(*clCommandQueuePtr, (cl_event *)(*new_event)->marker);
  if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Enqueueing Marker!\n");
  }

}


/* Assumes that all recorded events have completed */
static pb_Timestamp record_async_times(struct pb_TimerSet* tset)
{
  struct pb_async_time_marker_list * next_interval = NULL;
  struct pb_async_time_marker_list * last_marker = get_last_async(tset);
  pb_Timestamp total_async_time = 0;
  enum pb_TimerID timer;
  
  for(next_interval = tset->async_markers; next_interval != last_marker; 
      next_interval = next_interval->next) {
    cl_ulong command_start=0, command_end=0;
    cl_int ciErrNum = CL_SUCCESS;
    
    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_start, NULL);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error getting first EventProfilingInfo: %d\n", ciErrNum);
    }

    ciErrNum = clGetEventProfilingInfo(*((cl_event *)next_interval->next->marker), CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &command_end, NULL);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error getting second EventProfilingInfo: %d\n", ciErrNum);
    } 
    
    pb_Timestamp interval = (pb_Timestamp) (((double)(command_end - command_start)) / 1e3);
    tset->timers[next_interval->timerID].elapsed += interval;
    if (next_interval->label != NULL) {
      struct pb_SubTimer *subtimer = tset->sub_timer_list[next_interval->timerID]->subtimer_list;
      while (subtimer != NULL) {
        if ( strcmp(subtimer->label, next_interval->label) == 0) {
          subtimer->timer.elapsed += interval;
          break;
        }
        subtimer = subtimer->next;
      }      
    }        
    total_async_time += interval;
    next_interval->timerID = INVALID_TIMERID;
  }

  if(next_interval != NULL)
    next_interval->timerID = INVALID_TIMERID;
  
  return total_async_time;
}

static void
accumulate_time(pb_Timestamp *accum,
		pb_Timestamp start,
		pb_Timestamp end)
{
//#if _POSIX_VERSION >= 200112L
  *accum += end - start;
//#else
//# error "Timestamps not implemented for this system"
//#endif
}

//#if _POSIX_VERSION >= 200112L
static pb_Timestamp get_time()
{
  //struct timeval tv;
  //gettimeofday(&tv, NULL);
  //return (pb_Timestamp) (tv.tv_sec * 1000000LL + tv.tv_usec);
  return 0;
}
//#else
//# error "no supported time libraries are available on this platform"
//#endif

void
pb_ResetTimer(struct pb_Timer *timer)
{
//#ifndef DISABLE_PARBOIL_TIMER
  timer->state = pb_Timer_STOPPED;

//#if _POSIX_VERSION >= 200112L
  timer->elapsed = 0;
//#else
//# error "pb_ResetTimer: not implemented for this system"
//#endif
//#endif
}

void
pb_StartTimer(struct pb_Timer *timer)
{
/*#ifndef DISABLE_PARBOIL_TIMER
  if (timer->state != pb_Timer_STOPPED) {
    fputs("Ignoring attempt to start a running timer\n", stderr);
    return;
  }

  timer->state = pb_Timer_RUNNING;

#if _POSIX_VERSION >= 200112L
  {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    timer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
  }
#else
# error "pb_StartTimer: not implemented for this system"
#endif
#endif*/
}

void
pb_StartTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer)
{
/*#ifndef DISABLE_PARBOIL_TIMER

  unsigned int numNotStopped = 0x3; // 11
  if (timer->state != pb_Timer_STOPPED) {
    fputs("Warning: Timer was not stopped\n", stderr);
    numNotStopped &= 0x1; // Zero out 2^1
  }
  if (subtimer->state != pb_Timer_STOPPED) {
    fputs("Warning: Subtimer was not stopped\n", stderr);
    numNotStopped &= 0x2; // Zero out 2^0
  }
  if (numNotStopped == 0x0) {
    fputs("Ignoring attempt to start running timer and subtimer\n", stderr);
    return;
  }

  timer->state = pb_Timer_RUNNING;
  subtimer->state = pb_Timer_RUNNING;

#if _POSIX_VERSION >= 200112L
  {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    
    if (numNotStopped & 0x2) {
      timer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
    }
  
    if (numNotStopped & 0x1) {
      subtimer->init = tv.tv_sec * 1000000LL + tv.tv_usec;
    }
  }
#else
# error "pb_StartTimer: not implemented for this system"
#endif

#endif*/
}

void
pb_StopTimer(struct pb_Timer *timer)
{
/*#ifndef DISABLE_PARBOIL_TIMER

  pb_Timestamp fini;

  if (timer->state != pb_Timer_RUNNING) {
    fputs("Ignoring attempt to stop a stopped timer\n", stderr);
    return;
  }

  timer->state = pb_Timer_STOPPED;

#if _POSIX_VERSION >= 200112L
  {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    fini = tv.tv_sec * 1000000LL + tv.tv_usec;
  }
#else
# error "pb_StopTimer: not implemented for this system"
#endif

  accumulate_time(&timer->elapsed, timer->init, fini);
  timer->init = fini;

#endif*/
}

void pb_StopTimerAndSubTimer(struct pb_Timer *timer, struct pb_Timer *subtimer) {
/*#ifndef DISABLE_PARBOIL_TIMER

  pb_Timestamp fini;

  unsigned int numNotRunning = 0x3; // 11
  if (timer->state != pb_Timer_RUNNING) {
    fputs("Warning: Timer was not running\n", stderr);
    numNotRunning &= 0x1; // Zero out 2^1
  }
  if (subtimer->state != pb_Timer_RUNNING) {
    fputs("Warning: Subtimer was not running\n", stderr);
    numNotRunning &= 0x2; // Zero out 2^0
  }
  if (numNotRunning == 0x0) {
    fputs("Ignoring attempt to stop stopped timer and subtimer\n", stderr);
    return;
  }


  timer->state = pb_Timer_STOPPED;
  subtimer->state = pb_Timer_STOPPED;

#if _POSIX_VERSION >= 200112L
  {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    fini = tv.tv_sec * 1000000LL + tv.tv_usec;
  }
#else
# error "pb_StopTimer: not implemented for this system"
#endif

  if (numNotRunning & 0x2) {
    accumulate_time(&timer->elapsed, timer->init, fini);
    timer->init = fini;
  }
  
  if (numNotRunning & 0x1) {
    accumulate_time(&subtimer->elapsed, subtimer->init, fini);
    subtimer->init = fini;
  }

#endif*/
}

/* Get the elapsed time in seconds. */
double
pb_GetElapsedTime(struct pb_Timer *timer)
{
  /*double ret;
#ifndef DISABLE_PARBOIL_TIMER

  if (timer->state != pb_Timer_STOPPED) {
    fputs("Elapsed time from a running timer is inaccurate\n", stderr);
  }

#if _POSIX_VERSION >= 200112L
  ret = timer->elapsed / 1e6;
#else
# error "pb_GetElapsedTime: not implemented for this system"
#endif
#endif
  return ret;*/
  return 0;
}

void
pb_InitializeTimerSet(struct pb_TimerSet *timers)
{
/*#ifndef DISABLE_PARBOIL_TIMER
  int n;

  timers->wall_begin = 0; //get_time();
  timers->current = pb_TimerID_NONE;

  timers->async_markers = NULL;
  
  for (n = 0; n < pb_TimerID_LAST; n++) {
    pb_ResetTimer(&timers->timers[n]);
    timers->sub_timer_list[n] = NULL;
  }
#endif*/
}

void pb_SetOpenCL(void *p_clContextPtr, void *p_clCommandQueuePtr) {
  clContextPtr = ((cl_context *)p_clContextPtr);
  clCommandQueuePtr = ((cl_command_queue *)p_clCommandQueuePtr);
}

void
pb_AddSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID pb_Category) {    
/*#ifndef DISABLE_PARBOIL_TIMER
  
  struct pb_SubTimer *subtimer = (struct pb_SubTimer *) malloc
    (sizeof(struct pb_SubTimer));
     
  int len = strlen(label);
    
  subtimer->label = (char *) malloc (sizeof(char)*(len+1));
  sprintf(subtimer->label, "%s\0", label);
  
  pb_ResetTimer(&subtimer->timer);
  subtimer->next = NULL;
  
  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[pb_Category];
  if (subtimerlist == NULL) {
    subtimerlist = (struct pb_SubTimerList *) calloc
      (1, sizeof(struct pb_SubTimerList));
    subtimerlist->subtimer_list = subtimer;
    timers->sub_timer_list[pb_Category] = subtimerlist;
  } else {
    // Append to list
    struct pb_SubTimer *element = subtimerlist->subtimer_list;
    while (element->next != NULL) {
      element = element->next;
    }
    element->next = subtimer;
  }
  
#endif*/
}

void
pb_SwitchToTimer(struct pb_TimerSet *timers, enum pb_TimerID timer)
{
#if 0
#ifndef DISABLE_PARBOIL_TIMER

  /* Stop the currently running timer */
  if (timers->current != pb_TimerID_NONE) {
    struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
    struct pb_SubTimer *currSubTimer = (subtimerlist != NULL) ? subtimerlist->current : NULL;
  
    if (!is_async(timers->current) ) {
      if (timers->current != timer) {
        if (currSubTimer != NULL) {
          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &currSubTimer->timer);
        } else {
          pb_StopTimer(&timers->timers[timers->current]);
        }
      } else {
        if (currSubTimer != NULL) {
          pb_StopTimer(&currSubTimer->timer);
        }
      }
    } else {
      insert_marker(timers, timer);
      if (!is_async(timer)) { // if switching to async too, keep driver going
        pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }
  
  pb_Timestamp currentTime = 0; //get_time();

  /* The only cases we check for asynchronous task completion is 
   * when an overlapping CPU operation completes, or the next 
   * segment blocks on completion of previous async operations */
  if( asyncs_outstanding(timers) && 
      (!is_async(timers->current) || is_blocking(timer) ) ) {

    struct pb_async_time_marker_list * last_event = get_last_async(timers);
    /* CL_COMPLETE if completed */
    
    cl_int ciErrNum = CL_SUCCESS;
    cl_int async_done = CL_COMPLETE;
    
    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Querying EventInfo!\n");
    }
    

    if(is_blocking(timer)) {
      /* Async operations completed after previous CPU operations: 
       * overlapped time is the total CPU time since this set of async 
       * operations were first issued */
       
      // timer to switch to is COPY or NONE 
      if(async_done != CL_COMPLETE) {
        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), 
	                  timers->async_begin,currentTime);
      }	                  
	                  
      /* Wait on async operation completion */
      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
      if (ciErrNum != CL_SUCCESS) {
        fprintf(stderr, "Error Waiting for Events!\n");
      }
      
      pb_Timestamp total_async_time = record_async_times(timers);

      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
      if(async_done == CL_COMPLETE) {
        //fprintf(stderr, "Async_done: total_async_type = %lld\n", total_async_time);
        timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;
      }

    } else 
    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
    // so something is deeper in stack
    if(async_done == CL_COMPLETE ) {
      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
      timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
    }   
  }

  /* Start the new timer */
  if (timer != pb_TimerID_NONE) {
    if(!is_async(timer)) {
      pb_StartTimer(&timers->timers[timer]);
    } else {
      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
      if (!asyncs_outstanding(timers)) {
        /* No asyncs outstanding, insert a fresh async marker */
      
        insert_marker(timers, timer);
        timers->async_begin = currentTime;
      } else if(!is_async(timers->current)) {
        /* Previous asyncs still in flight, but a previous SwitchTo
         * already marked the end of the most recent async operation, 
         * so we can rename that marker as the beginning of this async 
         * operation */
         
        struct pb_async_time_marker_list * last_event = get_last_async(timers);
        last_event->label = NULL;
        last_event->timerID = timer;
      }
      if (!is_async(timers->current)) {
        pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }
  timers->current = timer;

#endif
#endif
}

void
pb_SwitchToSubTimer(struct pb_TimerSet *timers, char *label, enum pb_TimerID category) 
{
#if 0
#ifndef DISABLE_PARBOIL_TIMER
  struct pb_SubTimerList *subtimerlist = timers->sub_timer_list[timers->current];
  struct pb_SubTimer *curr = (subtimerlist != NULL) ? subtimerlist->current : NULL;
  
  if (timers->current != pb_TimerID_NONE) {
    if (!is_async(timers->current) ) {
      if (timers->current != category) {
        if (curr != NULL) {
          pb_StopTimerAndSubTimer(&timers->timers[timers->current], &curr->timer);
        } else {
          pb_StopTimer(&timers->timers[timers->current]);
        }
      } else {
        if (curr != NULL) {
          pb_StopTimer(&curr->timer);
        }
      }
    } else {
      insert_submarker(timers, label, category);
      if (!is_async(category)) { // if switching to async too, keep driver going
        pb_StopTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }

  pb_Timestamp currentTime = 0; //get_time();

  /* The only cases we check for asynchronous task completion is 
   * when an overlapping CPU operation completes, or the next 
   * segment blocks on completion of previous async operations */
  if( asyncs_outstanding(timers) && 
      (!is_async(timers->current) || is_blocking(category) ) ) {

    struct pb_async_time_marker_list * last_event = get_last_async(timers);
    /* CL_COMPLETE if completed */

    cl_int ciErrNum = CL_SUCCESS;
    cl_int async_done = CL_COMPLETE;
    
    ciErrNum = clGetEventInfo(*((cl_event *)last_event->marker), CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &async_done, NULL);
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Querying EventInfo!\n");
    }

    if(is_blocking(category)) {
      /* Async operations completed after previous CPU operations: 
       * overlapped time is the total CPU time since this set of async 
       * operations were first issued */
       
      // timer to switch to is COPY or NONE 
      // if it hasn't already finished, then just take now and use that as the elapsed time in OVERLAP
      // anything happening after now isn't OVERLAP because everything is being stopped to wait for synchronization
      // it seems that the extra sync wall time isn't being recorded anywhere
      if(async_done != CL_COMPLETE) 
        accumulate_time(&(timers->timers[pb_TimerID_OVERLAP].elapsed), 
	                  timers->async_begin,currentTime);

      /* Wait on async operation completion */
      ciErrNum = clWaitForEvents(1, (cl_event *)last_event->marker);
      if (ciErrNum != CL_SUCCESS) {
        fprintf(stderr, "Error Waiting for Events!\n");
      }
      pb_Timestamp total_async_time = record_async_times(timers);

      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
       // If it did finish, then accumulate all the async time that did happen into OVERLAP
       // the immediately preceding EventSynchronize theoretically didn't have any effect since it was already completed.
      if(async_done == CL_COMPLETE /*cudaSuccess*/)
        timers->timers[pb_TimerID_OVERLAP].elapsed += total_async_time;

    } else 
    /* implies (!is_async(timers->current) && asyncs_outstanding(timers)) */
    // i.e. Current Not Async (not KERNEL/COPY_ASYNC) but there are outstanding
    // so something is deeper in stack
    if(async_done == CL_COMPLETE /*cudaSuccess*/) {
      /* Async operations completed before previous CPU operations: 
       * overlapped time is the total async time */
      timers->timers[pb_TimerID_OVERLAP].elapsed += record_async_times(timers);
    }   
    // else, this isn't blocking, so just check the next time around
  }
  
  subtimerlist = timers->sub_timer_list[category];
  struct pb_SubTimer *subtimer = NULL;
  
  if (label != NULL) {
    subtimer = subtimerlist->subtimer_list;
    while (subtimer != NULL) {
      if (strcmp(subtimer->label, label) == 0) {
        break;
      } else {
        subtimer = subtimer->next;
      }
    }
  }

  /* Start the new timer */
  if (category != pb_TimerID_NONE) {
    if(!is_async(category)) {  
      if (subtimerlist != NULL) {
        subtimerlist->current = subtimer;
      }
    
      if (category != timers->current && subtimer != NULL) {
        pb_StartTimerAndSubTimer(&timers->timers[category], &subtimer->timer);
      } else if (subtimer != NULL) {
        pb_StartTimer(&subtimer->timer);
      } else {
        pb_StartTimer(&timers->timers[category]);
      }
    } else {
      if (subtimerlist != NULL) {
        subtimerlist->current = subtimer;
      }
    
      // toSwitchTo Is Async (KERNEL/COPY_ASYNC)
      if (!asyncs_outstanding(timers)) {
        /* No asyncs outstanding, insert a fresh async marker */
        insert_submarker(timers, label, category);
        timers->async_begin = currentTime;
      } else if(!is_async(timers->current)) {
        /* Previous asyncs still in flight, but a previous SwitchTo
         * already marked the end of the most recent async operation, 
         * so we can rename that marker as the beginning of this async 
         * operation */
                  
        struct pb_async_time_marker_list * last_event = get_last_async(timers);
        last_event->timerID = category;
        last_event->label = label;
      } // else, marker for switchToThis was already inserted
      
      //toSwitchto is already asynchronous, but if current/prev state is async too, then DRIVER is already running
      if (!is_async(timers->current)) {
        pb_StartTimer(&timers->timers[pb_TimerID_DRIVER]);
      }
    }
  }
  
  timers->current = category;
#endif
#endif
}

void
pb_PrintTimerSet(struct pb_TimerSet *timers)
{
#if 0
#ifndef DISABLE_PARBOIL_TIMER
  pb_Timestamp wall_end = 0; //get_time();

  struct pb_Timer *t = timers->timers;
  struct pb_SubTimer* sub = NULL;
  
  int maxSubLength;
    
  const char *categories[] = {
    "IO", "Kernel", "Copy", "Driver", "Copy Async", "Compute"
  };
  
  const int maxCategoryLength = 10;
  
  int i;
  for(i = 1; i < pb_TimerID_LAST-1; ++i) { // exclude NONE and OVRELAP from this format
    if(pb_GetElapsedTime(&t[i]) != 0) {
    
      // Print Category Timer
      printf("%-*s: %f\n", maxCategoryLength, categories[i-1], pb_GetElapsedTime(&t[i]));
      
      if (timers->sub_timer_list[i] != NULL) {
        sub = timers->sub_timer_list[i]->subtimer_list;
        maxSubLength = 0;
        while (sub != NULL) {
          // Find longest SubTimer label
          if (strlen(sub->label) > maxSubLength) {
            maxSubLength = strlen(sub->label);
          }
          sub = sub->next;
        }
        
        // Fit to Categories
        if (maxSubLength <= maxCategoryLength) {
         maxSubLength = maxCategoryLength;
        }
        
        sub = timers->sub_timer_list[i]->subtimer_list;
        
        // Print SubTimers
        while (sub != NULL) {
          printf(" -%-*s: %f\n", maxSubLength, sub->label, pb_GetElapsedTime(&sub->timer));
          sub = sub->next;
        }
      }
    }
  }
  
  if(pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]) != 0)
    printf("CPU/Kernel Overlap: %f\n", pb_GetElapsedTime(&t[pb_TimerID_OVERLAP]));
        
  float walltime = (wall_end - timers->wall_begin)/ 1e6;
  printf("Timer Wall Time: %f\n", walltime);
  
#endif
#endif
}

void pb_DestroyTimerSet(struct pb_TimerSet * timers)
{
#ifndef DISABLE_PARBOIL_TIMER
  /* clean up all of the async event markers */
  struct pb_async_time_marker_list* event = timers->async_markers;
  while(event != NULL) {

    cl_int ciErrNum = CL_SUCCESS;
    ciErrNum = clWaitForEvents(1, (cl_event *)(event)->marker);
    if (ciErrNum != CL_SUCCESS) {
      //fprintf(stderr, "Error Waiting for Events!\n");
    }
    
    ciErrNum = clReleaseEvent( *((cl_event *)(event)->marker) );
    if (ciErrNum != CL_SUCCESS) {
      fprintf(stderr, "Error Release Events!\n");
    }
    
    free((event)->marker);
    struct pb_async_time_marker_list* next = ((event)->next);

    free(event);

    // (*event) = NULL;
    event = next;
  }

  int i = 0;
  for(i = 0; i < pb_TimerID_LAST; ++i) {    
    if (timers->sub_timer_list[i] != NULL) {
      struct pb_SubTimer *subtimer = timers->sub_timer_list[i]->subtimer_list;
      struct pb_SubTimer *prev = NULL;
      while (subtimer != NULL) {
        free(subtimer->label);
        prev = subtimer;
        subtimer = subtimer->next;
        free(prev);
      }
      free(timers->sub_timer_list[i]);
    }
  }
#endif
}

static pb_Platform** ptr = NULL;

// verbosely print out list of platforms and their devices to the console.
pb_Platform**
pb_GetPlatforms() {
  if (ptr == NULL) {
    cl_uint num_platforms;
    clGetPlatformIDs(0, NULL, &num_platforms);
    if (num_platforms == 0) return NULL;

    ptr = (pb_Platform **) malloc(sizeof(pb_Platform *) * (num_platforms + 1));
    cl_platform_id* ids = (cl_platform_id *) malloc(num_platforms * sizeof(cl_platform_id));
    clGetPlatformIDs(num_platforms, ids, NULL);

    unsigned int i;
    for (i = 0; i < num_platforms; i++) {
      ptr[i] = (pb_Platform *) malloc(sizeof(pb_Platform));
      ptr[i]->clPlatform = ids[i];
      ptr[i]->contexts = NULL;
      ptr[i]->in_use = 0;
      ptr[i]->devices = NULL;

      size_t sz;
      clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, 0, NULL, &sz);
      char* name = (char *) malloc(sz + 1);
      clGetPlatformInfo(ids[i], CL_PLATFORM_NAME, sz, name, NULL);
      name[sz] = '\0';
      ptr[i]->name = name;

      clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, 0, NULL, &sz);
      char* version = (char *) malloc(sz + 1);
      clGetPlatformInfo(ids[i], CL_PLATFORM_VERSION, sz, version, NULL);
      version[sz] = '\0';
      ptr[i]->version = version;
    }
    ptr[i] = NULL;

    free(ids);
  }

  return (pb_Platform**) ptr;
}

pb_Context* 
createContext(pb_Platform* pb_platform, pb_Device* pb_device) {
  pb_Context* c = (pb_Context*) malloc(sizeof(pb_Context));
  cl_int clStatus;
  cl_context_properties clCps[3] = {
    CL_CONTEXT_PLATFORM, (cl_context_properties)(pb_platform->clPlatform), 0
  };
  c->clContext =
    clCreateContext(clCps, 1, (cl_device_id*)&pb_device->clDevice, NULL, NULL, &clStatus);
  c->clPlatformId = pb_platform->clPlatform;
  c->clDeviceId = pb_device->clDevice;
  c->pb_platform = pb_platform;
  c->pb_device = pb_device;
  pb_platform->in_use = 1;
  pb_device->in_use = 1;
  unsigned int i = 0;
  if (pb_platform->contexts == NULL) {
    pb_platform->contexts = (pb_Context**) malloc(2*sizeof(pb_Context*));
  } else {
    for (i = 0; pb_platform->contexts[i] != NULL; i++) {};
    pb_platform->contexts = (pb_Context**) realloc(pb_platform->contexts,
                                                   (i+1)*sizeof(pb_Context*));
  }
  pb_platform->contexts[i+1] = NULL;
  pb_platform->contexts[i] = c;
  return c;
}

// choose a platform by name.
pb_Platform*
pb_GetPlatformByName(const char* name) {
  pb_Platform** ps = (pb_Platform **) pb_GetPlatforms();
  if (ps == NULL) return NULL;
  if (name == NULL) {
    return *ps;
  }

  while (*ps) {
    if (strstr((*ps)->name, name)) break;
    ps++;
  }
  return (pb_Platform*) *ps;
}

pb_Device**
pb_GetDevices(pb_Platform* pb_platform) {
  if (pb_platform->devices == NULL) {
    cl_uint num_devs;
    cl_device_id* dev_ids;
    clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform,
                    CL_DEVICE_TYPE_ALL, 0, NULL, &num_devs);
    if (num_devs == 0) return NULL;

    pb_platform->devices =
      (pb_Device **) malloc((num_devs + 1) * sizeof(pb_Device *));
    dev_ids = (cl_device_id *) malloc(sizeof(cl_device_id) * num_devs);
    clGetDeviceIDs((cl_platform_id) pb_platform->clPlatform,
                    CL_DEVICE_TYPE_ALL, num_devs, dev_ids, NULL);

    unsigned int i;
    for (i = 0; i < num_devs; i++) {
      pb_platform->devices[i] = (pb_Device *) malloc(sizeof(pb_Device));

      pb_platform->devices[i]->clDevice = dev_ids[i];
      pb_platform->devices[i]->id = i;

      size_t sz;
      clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, 0, NULL, &sz);
      char* name = (char *) malloc(sz + 1);
      clGetDeviceInfo(dev_ids[i], CL_DEVICE_NAME, sz, name, NULL);
      name[sz] = '\0';
      pb_platform->devices[i]->name = (char *) name;

      cl_bool available;
      clGetDeviceInfo(dev_ids[i], CL_DEVICE_AVAILABLE, sizeof(cl_bool), &available, NULL); 
      pb_platform->devices[i]->available = (int) available;

      pb_platform->devices[i]->in_use = 0;
    }
    pb_platform->devices[i] = NULL;
  }
  return (pb_Device **) pb_platform->devices;
}

// choose a device by name.
static pb_Device*
pb_SelectDeviceByName(pb_Device **ds, const char* name) {
  if (ds == NULL) return NULL;
  if (name == NULL) return *ds;
  while (*ds) {
    if (strstr((*ds)->name, name)) break;
    ds++;
  }

  return *ds;
}

// choose a device by name and set the device's 'in_use' flag.
pb_Device*
pb_GetDeviceByName(pb_Platform* pb_platform, const char* name) {
  pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform);
  pb_Device *d = pb_SelectDeviceByName(ds, name);

  if (d) d->in_use = 1;

  return d;
}

void
pb_ReleasePlatforms() {
  if (!ptr) return;
  pb_Platform** cur_ptr = ptr;
  while (*cur_ptr) {
    pb_Platform* pfptr = *cur_ptr++;
    if (pfptr->devices) {
      pb_Device** dvptr = pfptr->devices;
      while (*dvptr) {
        pb_Device* d = *dvptr++;
        free(d->name);
        free(d);
      }
      free(pfptr->devices);
    }
    if (pfptr->contexts) {
      pb_Context** cptr = pfptr->contexts;
      while (*cptr) {
        free(*cptr++);
      }
      free(pfptr->contexts);
    }
    free(pfptr->name);
    free(pfptr);
  }
  free(ptr);
  ptr = NULL;
}

pb_Platform*
pb_GetPlatformByNameAndVersion(const char* name, const char* version) {
  pb_Platform** ps = (pb_Platform **) pb_GetPlatforms();
  if (ps == NULL) return NULL;
  if (name == NULL) return *ps;
  while (*ps) {
    if (strstr((*ps)->name, name) && strstr((*ps)->version, version)) break;
    ps++;
  }
  return (pb_Platform*) *ps;
}

/* Return a pointer to the device at the specified index, or NULL.
 * Used by pb_GetDevice. */
static pb_Device *
select_device_by_index(pb_Device** ds, int id)
{
  int i = 0;
  pb_Device** p = ds;
  while (*p && (i < id)) { p++; i++; }
  return *p;
}

/* Return a pointer to the device with the specified type, or NULL.
 * Used by pb_GetDevice. */
static pb_Device *
select_device_by_type(pb_Device** ds,
                      enum pb_DeviceSelectionCriterion criterion)
{
  cl_device_type sought_type;

  /* Determine the OpenCL device type to search for */
  switch(criterion) {
  case pb_Device_CPU:
    sought_type = CL_DEVICE_TYPE_CPU;
    break;
  case pb_Device_GPU:
    sought_type = CL_DEVICE_TYPE_GPU;
    break;
  case pb_Device_ACCELERATOR:
    sought_type = CL_DEVICE_TYPE_ACCELERATOR;
    break;
  default:
    fprintf(stderr, "pb_GetDevice: Invalid device type");
    exit(-1);
  }

  /* Find the device */
  {
    pb_Device** p = ds;
    cl_device_type type;
    while (*p) {
      clGetDeviceInfo(((cl_device_id) ((*p)->clDevice)), CL_DEVICE_TYPE,
                      sizeof(cl_device_type), &type, NULL);
      if (type == sought_type) break;
    }

    return *p;
  }
}

pb_Device*
pb_GetDevice(pb_Platform* pb_platform, struct pb_DeviceParam *device)
{
  pb_Device** ds = (pb_Device **) pb_GetDevices(pb_platform);

  // The list of devices must be nonempty
  if (ds == NULL || *ds == NULL) {
    fprintf(stderr, "Error: No device is found in platform: name = %s, version = %s\n.", pb_platform->name, pb_platform->version);
    exit(-1);
  }

  pb_Device *selected_device = NULL;

  if (device != NULL) {
    /* Use 'device' to select and return a device.
     * If unable to select a device, fall
     * back on the default selection mechanism. */
    switch(device->criterion) {
    case pb_Device_INDEX:
      selected_device = select_device_by_index(ds, device->index);
      break;
    case pb_Device_GPU:
    case pb_Device_CPU:
    case pb_Device_ACCELERATOR:
      selected_device = select_device_by_type(ds, device->criterion);
      break;
    case pb_Device_NAME:
      selected_device = pb_SelectDeviceByName(ds, device->name);
      break;
    default:
      fprintf(stderr, "pb_GetDevice: Invalid argument");
      exit(-1);
    }
  }

  /* By default or if user-specified selection failed,
   * select the first device */
  if (selected_device == NULL)
    selected_device = *ds;

  /* Set the in_use flag */
  selected_device->in_use = 1;

  return selected_device;
}

pb_Device*
pb_GetDeviceByEnvVars(pb_Platform* pb_platform) {

  /* Convert environment variables to a 'pb_DeviceParam' */
  struct pb_DeviceParam *param = NULL;

  char* device_num = getenv("PARBOIL_DEVICE_NUMBER");
  if (device_num && strcmp(device_num, "")) {
    int id = atoi(device_num);
    param = pb_DeviceParam_index(id);
  }
  else {
    char* device_name = getenv("PARBOIL_DEVICE_NAME");
    if (device_name && strcmp(device_name, "")) {
      param = pb_DeviceParam_name(strdup(device_name));
    }
    else {
      char* device_type = getenv("PARBOIL_DEVICE_TYPE");
      if (device_type && strcmp(device_type, "")) {
        if (strcmp(device_type, "CPU") == 0)
          param = pb_DeviceParam_cpu();
        else if (strcmp(device_type, "GPU") == 0)
          param = pb_DeviceParam_gpu();
        else if (strcmp(device_type, "ACCELERATOR") == 0)
          param = pb_DeviceParam_accelerator();
      }
    }
  }

  /* Get a device */
  pb_Device *d = pb_GetDevice(pb_platform, param);
  pb_FreeDeviceParam(param);

  return d;
}

pb_Platform*
pb_GetPlatformByEnvVars() {
  char* name = getenv("PARBOIL_PLATFORM_NAME");
  char* version = getenv("PARBOIL_PLATFORM_VERSION");

  /* Create a pb_PlatformParam object (or NULL) representing the data from the
   * environment variables */
  struct pb_PlatformParam *platform;

  if (name) {
    if (version) {
      platform = pb_PlatformParam(strdup(name), strdup(version));
    }
    else {
      platform = pb_PlatformParam(strdup(name), NULL);
    }
  }
  else {
    platform = NULL;
  }

  /* Convert to a platform */
  pb_Platform *p = pb_GetPlatform(platform);
  pb_FreePlatformParam(platform);

  return p;
}

/* Choose an OpenCL platform based on the given command-line parameters.
 * If NULL, use the default OpenCL platform. */
pb_Platform*
pb_GetPlatform(struct pb_PlatformParam *platform) {
  if (platform != NULL) {
    /* Try to use command-line parameters to choose platform */
    char *name = platform->name;
    char *version = platform->version;

    if (!name) {
      fprintf(stderr, "Internal error: NULL pointer");
      exit(-1);
    }

    if (version) {
      pb_Platform* p = pb_GetPlatformByNameAndVersion(name, version);
      if (p) return p;
    }

    pb_Platform* p = pb_GetPlatformByName(name);
    if (p) return p;
  }

  pb_Platform* p = pb_GetPlatformByName(NULL);
  if (p == NULL) {
    fprintf(stderr, "Error: No OpenCL platform in this system. Exiting.");
    exit(-1);
  }
  return p;
}

//extern void perf_init();
//extern void mxpa_scheduler_init();

pb_Context*
pb_InitOpenCLContext(struct pb_Parameters* parameters) {
#if 0
  pb_Platform* ps = pb_GetPlatform(parameters->platform);
  if (!ps) return NULL;
  pb_Device* ds = pb_GetDevice(ps, parameters->device);
  if (!ds) return NULL;

  /* HERE INITIALIZE TIMER */
  //perf_init();
  //mxpa_scheduler_init();

  pb_Context* c = createContext(ps, ds);
  pb_PrintPlatformInfo(c);
  return c;
#endif
  cl_int _err;
  cl_platform_id platform_id;
  cl_device_id device_id;
  cl_context context;
  clGetPlatformIDs(1, &platform_id, NULL);
  clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL);
  context = clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err);

  pb_Context* c = (pb_Context*)malloc(sizeof(pb_Context));
  c->clContext = context;
  c->clDeviceId = device_id;
  c->clPlatformId = platform_id;
  c->pb_platform = (pb_Platform*)malloc(sizeof(pb_Platform));
  c->pb_device = (pb_Device*)malloc(sizeof(pb_Device));  
  c->pb_platform->devices = (pb_Device**)malloc(sizeof(pb_Device*) * 2);  
  c->pb_platform->devices[0] = c->pb_device;
  c->pb_platform->devices[1] = NULL;
  c->pb_platform->contexts = (pb_Context**)malloc(sizeof(pb_Context*) * 2);  
  c->pb_platform->contexts[0] = c;
  c->pb_platform->contexts[1] = NULL;
  c->pb_platform->in_use = 1;
  c->pb_device->in_use = 1;  
  return c;
}

void
pb_ReleaseOpenCLContext(pb_Context* c) {
  pb_ReleasePlatforms();
}

void
pb_PrintPlatformInfo(pb_Context* c) {
  /*pb_Platform** ps = pb_GetPlatforms();
  if (!ps) {
    fprintf (stderr, "No platform found");
    return;
  }

  printf ("********************************************************\n");
  printf ("DETECTED OPENCL PLATFORMS AND DEVICES:\n");
  printf ("--------------------------------------------------------\n");

  while (*ps) {
    printf ("PLATFORM = %s, %s", (*ps)->name, (*ps)->version); 
    if (c->pb_platform == *ps) printf (" (SELECTED)");
    printf ("\n");

    pb_Device** ds = (pb_Device **) pb_GetDevices((*ps));
    if (ds == NULL) {
      printf ("  + (No devices)\n");
    } else {
      while (*ds) {
        printf ("  + %d: %s", (*ds)->id, (*ds)->name);
        if (c->pb_device == *ds) printf (" (SELECTED)");
        printf ("\n");
        ds++;
      }
    }

    ps++;
  }
  printf ("********************************************************\n");*/
}

#ifdef MEASURE_KERNEL_TIME

#undef clEnqueueNDRangeKernel

//extern void pin_trace_enable(char*);
//extern void pin_trace_disable(char*);

cl_int
pb_clEnqueueNDRangeKernel(cl_command_queue  q/* command_queue */,
                       cl_kernel            k/* kernel */,
                       cl_uint              d/* work_dim */,
                       const size_t *       o/* global_work_offset */,
                       const size_t *       gws/* global_work_size */,
                       const size_t *       lws/* local_work_size */,
                       cl_uint              n/* num_events_in_wait_list */,
                       const cl_event *     w/* event_wait_list */,
                       cl_event *           e/* event */) {

  char buf[128];
  struct timeval begin, end;
  clGetKernelInfo(k, CL_KERNEL_FUNCTION_NAME, 128, buf, NULL);

#if 0
  int i;
  for (i = 0; i  < d; i++) {
    printf ("%s: %d: %d / %d\n", buf, i, gws[i], (lws == NULL ? 0 : lws[i]));
  }
#endif

  clFinish(q); clFlush(q);
  //pin_trace_enable(buf);
  //gettimeofday(&begin, NULL);
  cl_int result = clEnqueueNDRangeKernel(q, k, d, o, gws, lws, n, w, e);
  clFinish(q); clFlush(q);
  //gettimeofday(&end, NULL);
  //pin_trace_disable(buf);
  //float t = (float)(end.tv_sec - begin.tv_sec) + (end.tv_usec - begin.tv_usec) / 1000000.0f;
  fflush(stdout);
  fflush(stderr);
  //printf ("PBTIMER: %s: %f\n", buf, t);
  return result;
}

#endif

void
pb_sig_float(char* c, float* p, int sz) {
  int i;
  double s = 0.0;
  for (i = 0; i < sz; i++) s += p[i] * (float)(i+1);
  printf ("[Signature] %s = %lf\n", c, s);
}

void
pb_sig_double(char* c, double* p, int sz) {
  int i;
  double s = 0.0;
  for (i = 0; i < sz; i++) s += p[i];
  printf ("[Signature] %s = %lf\n", c, s);
}

void
pb_sig_short(char* c, short* p, int sz) {
  int i;
  long long int s = 0;
  for (i = 0; i < sz; i++) s += p[i];
  printf ("[Signature] %s = %lld\n", c, s);
}

void
pb_sig_int(char* c, int* p, int sz) {
  int i;
  long long int s = 0;
  for (i = 0; i < sz; i++) s += p[i];
  printf ("[Signature] %s = %lld\n", c, s);
}

void
pb_sig_uchar(char* c, unsigned char* p, unsigned int sz) {
  int i;
  unsigned long long int s = 0;
  for (i = 0; i < sz; i++) s += p[i];
  printf ("[Signature] %s = %lld\n", c, s);
}

void pb_sig_clmem(char* s, cl_command_queue command_queue, cl_mem memobj, int ty) {
  size_t sz;
  if (clGetMemObjectInfo(memobj, CL_MEM_SIZE, sizeof(size_t), &sz, NULL) != CL_SUCCESS) {
    printf ("Something wrong.\n");
    assert(0);
  } else {
    printf ("size = %d\n", sz);
  }
  char* hp; // = (char*) malloc(sz);
  //posix_memalign((void**)&hp, 64, sz);
  hp = (char*)malloc(sz);

  clEnqueueReadBuffer (command_queue,
  memobj,
  CL_TRUE,
  0,
  sz,
  hp,
  0,
  NULL,
  NULL);

  if (ty == T_FLOAT) pb_sig_float(s, (float*)hp, sz/sizeof(float));
  if (ty == T_DOUBLE) pb_sig_double(s, (double*)hp, sz/sizeof(double));
  if (ty == T_INT) pb_sig_int(s, (int*)hp, sz/sizeof(int));
  if (ty == T_SHORT) pb_sig_short(s, (short*)hp, sz/sizeof(short));
  if (ty == T_UCHAR) pb_sig_uchar(s, (unsigned char*)hp, sz/sizeof(char));

  free(hp);
}