change d2d to d2h & h2d

2025-04-24 22:07:41 -04:00 · 2024-04-24 11:11:50 +09:00 · 2024-04-24 11:11:50 +09:00 · 80c81fc77d
commit 80c81fc77d
parent ea26d69751
2 changed files with 1437 additions and 1438 deletions
--- a/tests/opencl/cfd/CLHelper.h
+++ b/tests/opencl/cfd/CLHelper.h
--- a/tests/opencl/cfd/euler3d.cc
+++ b/tests/opencl/cfd/euler3d.cc
@ -12,25 +12,25 @@
 	on 24/03/2011
 ********************************************************************/
-#include <iostream>
+#include "CLHelper.h"
 #include <fstream>
 #include <iostream>
 #include <math.h>
-#include "CLHelper.h" 
+
 /*
 * Options 
 * 
- */ 
+ */
 #define GAMMA 1.4f
 #define iterations 2000
 #ifndef block_length
-	#define block_length 192
+#define block_length 192
 #endif
 #define NDIM 3
 #define NNB 4
-#define RK 3	// 3rd order RK
+#define RK 3 // 3rd order RK
 #define ff_mach 1.2f
 #define deg_angle_of_attack 0.0f
@ -38,381 +38,391 @@
 * not options
 */
 #if block_length > 128
 #warning "the kernels may fail too launch on some systems if the block length is too large"
 #endif
 #define VAR_DENSITY 0
-#define VAR_MOMENTUM  1
+#define VAR_MOMENTUM 1
-#define VAR_DENSITY_ENERGY (VAR_MOMENTUM+NDIM)
+#define VAR_DENSITY_ENERGY (VAR_MOMENTUM + NDIM)
-#define NVAR (VAR_DENSITY_ENERGY+1)
+#define NVAR (VAR_DENSITY_ENERGY + 1)
 //self-defined user type
-typedef struct{
+typedef struct {
-	float x;
+  float x;
-	float y;
+  float y;
-	float z;
+  float z;
 } float3;
 /*
 * Generic functions
 */
 template <typename T>
-cl_mem alloc(int N){
+cl_mem alloc(int N)
-	cl_mem mem_d = _clMalloc(sizeof(T)*N);
+{
-	return mem_d;
+  cl_mem mem_d = _clMalloc(sizeof(T) * N);
  return mem_d;
 }
 template <typename T>
-void dealloc(cl_mem array){
+void dealloc(cl_mem array)
-	_clFree(array);
+{
  _clFree(array);
 }
 template <typename T>
-void copy(cl_mem dst, cl_mem src, int N){
+void copy(cl_mem dst, cl_mem src, int N)
-	_clMemcpyD2D(dst, src, N*sizeof(T));
+{
  _clMemcpyD2D(dst, src, N * sizeof(T));
 }
 template <typename T>
-void upload(cl_mem dst, T* src, int N){
+void upload(cl_mem dst, T* src, int N)
-	_clMemcpyH2D(dst, src, N*sizeof(T));
+{
  _clMemcpyH2D(dst, src, N * sizeof(T));
 }
 template <typename T>
-void download(T* dst, cl_mem src, int N){
+void download(T* dst, cl_mem src, int N)
-	_clMemcpyD2H(dst, src, N*sizeof(T));
+{
  _clMemcpyD2H(dst, src, N * sizeof(T));
 }
-void dump(cl_mem variables, int nel, int nelr){
+void dump(cl_mem variables, int nel, int nelr)
-	float* h_variables = new float[nelr*NVAR];
+{
-	download(h_variables, variables, nelr*NVAR);
+  float* h_variables = new float[nelr * NVAR];
  download(h_variables, variables, nelr * NVAR);
-	{
+  {
-		std::ofstream file("density.txt");
+    std::ofstream file("density.txt");
-		file << nel << " " << nelr << std::endl;
+    file << nel << " " << nelr << std::endl;
-		for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY*nelr] << std::endl;
+    for (int i = 0; i < nel; i++)
-	}
+      file << h_variables[i + VAR_DENSITY * nelr] << std::endl;
  }
  {
    std::ofstream file("momentum.txt");
    file << nel << " " << nelr << std::endl;
    for (int i = 0; i < nel; i++) {
      for (int j = 0; j != NDIM; j++)
        file << h_variables[i + (VAR_MOMENTUM + j) * nelr] << " ";
      file << std::endl;
    }
  }
-	{
+  {
-		std::ofstream file("momentum.txt");
+    std::ofstream file("density_energy.txt");
-		file << nel << " " << nelr << std::endl;
+    file << nel << " " << nelr << std::endl;
-		for(int i = 0; i < nel; i++)
+    for (int i = 0; i < nel; i++)
-		{
+      file << h_variables[i + VAR_DENSITY_ENERGY * nelr] << std::endl;
-			for(int j = 0; j != NDIM; j++)
+  }
-				file << h_variables[i + (VAR_MOMENTUM+j)*nelr] << " ";
+  delete[] h_variables;
 			file << std::endl;
 		}
 	}
 	{
 		std::ofstream file("density_energy.txt");
 		file << nel << " " << nelr << std::endl;
 		for(int i = 0; i < nel; i++) file << h_variables[i + VAR_DENSITY_ENERGY*nelr] << std::endl;
 	}
 	delete[] h_variables;
 }
-void initialize_variables(int nelr, cl_mem variables, cl_mem ff_variable) throw(string){
+void initialize_variables(int nelr, cl_mem variables, cl_mem ff_variable) throw(string)
 {
-	int work_items = nelr;
+  int work_items = nelr;
-	int work_group_size = BLOCK_SIZE_1;
+  int work_group_size = BLOCK_SIZE_1;
-	int kernel_id = 1;
+  int kernel_id = 1;
-	int arg_idx = 0;	
+  int arg_idx = 0;
-	_clSetArgs(kernel_id, arg_idx++, variables);
+  _clSetArgs(kernel_id, arg_idx++, variables);
-	_clSetArgs(kernel_id, arg_idx++, ff_variable);
+  _clSetArgs(kernel_id, arg_idx++, ff_variable);
-	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+  _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
-	_clInvokeKernel(kernel_id, work_items, work_group_size);
+  _clInvokeKernel(kernel_id, work_items, work_group_size);
 }
-void compute_step_factor(int nelr, cl_mem variables, cl_mem areas, cl_mem step_factors){
+void compute_step_factor(int nelr, cl_mem variables, cl_mem areas, cl_mem step_factors)
 {
-	int work_items = nelr;
+  int work_items = nelr;
-	int work_group_size = BLOCK_SIZE_2;
+  int work_group_size = BLOCK_SIZE_2;
-	int kernel_id = 2;
+  int kernel_id = 2;
-	int arg_idx = 0;
+  int arg_idx = 0;
-	_clSetArgs(kernel_id, arg_idx++, variables);
+  _clSetArgs(kernel_id, arg_idx++, variables);
-	_clSetArgs(kernel_id, arg_idx++, areas);
+  _clSetArgs(kernel_id, arg_idx++, areas);
-	_clSetArgs(kernel_id, arg_idx++, step_factors);
+  _clSetArgs(kernel_id, arg_idx++, step_factors);
-	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+  _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
-	_clInvokeKernel(kernel_id, work_items, work_group_size);
+  _clInvokeKernel(kernel_id, work_items, work_group_size);
 }
-void compute_flux(int nelr, cl_mem elements_surrounding_elements, cl_mem normals, cl_mem variables, cl_mem ff_variable, \
+void compute_flux(int nelr, cl_mem elements_surrounding_elements, cl_mem normals, cl_mem variables, cl_mem ff_variable,
-			cl_mem fluxes, cl_mem ff_flux_contribution_density_energy,
+    cl_mem fluxes, cl_mem ff_flux_contribution_density_energy,
-			cl_mem ff_flux_contribution_momentum_x,
+    cl_mem ff_flux_contribution_momentum_x,
-			cl_mem ff_flux_contribution_momentum_y,
+    cl_mem ff_flux_contribution_momentum_y,
-			cl_mem ff_flux_contribution_momentum_z){
+    cl_mem ff_flux_contribution_momentum_z)
 {
-	int work_items = nelr;
+  int work_items = nelr;
-	int work_group_size = BLOCK_SIZE_3;
+  int work_group_size = BLOCK_SIZE_3;
-	int kernel_id = 3;
+  int kernel_id = 3;
-	int arg_idx = 0;
+  int arg_idx = 0;
-	_clSetArgs(kernel_id, arg_idx++, elements_surrounding_elements);
+  _clSetArgs(kernel_id, arg_idx++, elements_surrounding_elements);
-	_clSetArgs(kernel_id, arg_idx++, normals);
+  _clSetArgs(kernel_id, arg_idx++, normals);
-	_clSetArgs(kernel_id, arg_idx++, variables);
+  _clSetArgs(kernel_id, arg_idx++, variables);
-	_clSetArgs(kernel_id, arg_idx++, ff_variable);
+  _clSetArgs(kernel_id, arg_idx++, ff_variable);
-	_clSetArgs(kernel_id, arg_idx++, fluxes);
+  _clSetArgs(kernel_id, arg_idx++, fluxes);
-	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_density_energy);
+  _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_density_energy);
-	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_x);
+  _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_x);
-	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_y);
+  _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_y);
-	_clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_z);
+  _clSetArgs(kernel_id, arg_idx++, ff_flux_contribution_momentum_z);
-	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+  _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
-	_clInvokeKernel(kernel_id, work_items, work_group_size);
+  _clInvokeKernel(kernel_id, work_items, work_group_size);
 }
-void time_step(int j, int nelr, cl_mem old_variables, cl_mem variables, cl_mem step_factors, cl_mem fluxes){
+void time_step(int j, int nelr, cl_mem old_variables, cl_mem variables, cl_mem step_factors, cl_mem fluxes)
 {
-	int work_items = nelr;
+  int work_items = nelr;
-	int work_group_size = BLOCK_SIZE_4;
+  int work_group_size = BLOCK_SIZE_4;
-	int kernel_id = 4;
+  int kernel_id = 4;
-	int arg_idx = 0;
+  int arg_idx = 0;
-	_clSetArgs(kernel_id, arg_idx++, &j, sizeof(int));
+  _clSetArgs(kernel_id, arg_idx++, &j, sizeof(int));
-	_clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
+  _clSetArgs(kernel_id, arg_idx++, &nelr, sizeof(int));
-	_clSetArgs(kernel_id, arg_idx++, old_variables);
+  _clSetArgs(kernel_id, arg_idx++, old_variables);
-	_clSetArgs(kernel_id, arg_idx++, variables);
+  _clSetArgs(kernel_id, arg_idx++, variables);
-	_clSetArgs(kernel_id, arg_idx++, step_factors);
+  _clSetArgs(kernel_id, arg_idx++, step_factors);
-	_clSetArgs(kernel_id, arg_idx++, fluxes);
+  _clSetArgs(kernel_id, arg_idx++, fluxes);
-	
+
-	_clInvokeKernel(kernel_id, work_items, work_group_size);
+  _clInvokeKernel(kernel_id, work_items, work_group_size);
 }
 inline void compute_flux_contribution(float& density, float3& momentum, float& density_energy, float& pressure, float3& velocity, float3& fc_momentum_x, float3& fc_momentum_y, float3& fc_momentum_z, float3& fc_density_energy)
 {
-	fc_momentum_x.x = velocity.x*momentum.x + pressure;
+  fc_momentum_x.x = velocity.x * momentum.x + pressure;
-	fc_momentum_x.y = velocity.x*momentum.y;
+  fc_momentum_x.y = velocity.x * momentum.y;
-	fc_momentum_x.z = velocity.x*momentum.z;
+  fc_momentum_x.z = velocity.x * momentum.z;
 	fc_momentum_y.x = fc_momentum_x.y;
 	fc_momentum_y.y = velocity.y*momentum.y + pressure;
 	fc_momentum_y.z = velocity.y*momentum.z;
-	fc_momentum_z.x = fc_momentum_x.z;
+  fc_momentum_y.x = fc_momentum_x.y;
-	fc_momentum_z.y = fc_momentum_y.z;
+  fc_momentum_y.y = velocity.y * momentum.y + pressure;
-	fc_momentum_z.z = velocity.z*momentum.z + pressure;
+  fc_momentum_y.z = velocity.y * momentum.z;
-	float de_p = density_energy+pressure;
+  fc_momentum_z.x = fc_momentum_x.z;
-	fc_density_energy.x = velocity.x*de_p;
+  fc_momentum_z.y = fc_momentum_y.z;
-	fc_density_energy.y = velocity.y*de_p;
+  fc_momentum_z.z = velocity.z * momentum.z + pressure;
-	fc_density_energy.z = velocity.z*de_p;
+
  float de_p = density_energy + pressure;
  fc_density_energy.x = velocity.x * de_p;
  fc_density_energy.y = velocity.y * de_p;
  fc_density_energy.z = velocity.z * de_p;
 }
 /*
 * Main function
 */
-int main(int argc, char** argv){
+int main(int argc, char** argv)
 {
  printf("WG size of kernel:initialize = %d, WG size of kernel:compute_step_factor = %d, WG size of kernel:compute_flux = %d, WG size of kernel:time_step = %d\n", BLOCK_SIZE_1, BLOCK_SIZE_2, BLOCK_SIZE_3, BLOCK_SIZE_4);
-	if (argc < 2){
+  if (argc < 2) {
-		//std::cout << "specify data file name and [device type] [device id]" << std::endl;
+    //std::cout << "specify data file name and [device type] [device id]" << std::endl;
-		return 0;
+    printf("specify data file name and [device type] [device id]\n");
-	}
+    return 0;
-	const char* data_file_name = argv[1];
+  }
-	_clCmdParams(argc, argv);
+  const char* data_file_name = argv[1];
-	cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z,  ff_flux_contribution_density_energy;
+  _clCmdParams(argc, argv);
-	cl_mem areas, elements_surrounding_elements, normals;
+  cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z, ff_flux_contribution_density_energy;
-	cl_mem variables, old_variables, fluxes, step_factors;
+  cl_mem areas, elements_surrounding_elements, normals;
-	float h_ff_variable[NVAR];
+  cl_mem variables, old_variables, fluxes, step_factors;
  float h_ff_variable[NVAR];
-	try{		
+  try {
-		_clInit(device_type, device_id);		
+    _clInit(device_type, device_id);
-		// set far field conditions and load them into constant memory on the gpu
+    // set far field conditions and load them into constant memory on the gpu
-		{
+    {
-			//float h_ff_variable[NVAR];
+      //float h_ff_variable[NVAR];
-			const float angle_of_attack = float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
+      const float angle_of_attack = float(3.1415926535897931 / 180.0f) * float(deg_angle_of_attack);
 			h_ff_variable[VAR_DENSITY] = float(1.4);
 			float ff_pressure = float(1.0f);
 			float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]);
 			float ff_speed = float(ff_mach)*ff_speed_of_sound;
 			float3 ff_velocity;
 			ff_velocity.x = ff_speed*float(cos((float)angle_of_attack));
 			ff_velocity.y = ff_speed*float(sin((float)angle_of_attack));
 			ff_velocity.z = 0.0f;
 			h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
 			h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
 			h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;
 			h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*(float(0.5f)*(ff_speed*ff_speed)) + (ff_pressure / float(GAMMA-1.0f));
-			float3 h_ff_momentum;
+      h_ff_variable[VAR_DENSITY] = float(1.4);
 			h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0);
 			h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1);
 			h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2);
 			float3 h_ff_flux_contribution_momentum_x;
 			float3 h_ff_flux_contribution_momentum_y;
 			float3 h_ff_flux_contribution_momentum_z;
 			float3 h_ff_flux_contribution_density_energy;
 			compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, h_ff_flux_contribution_momentum_x, h_ff_flux_contribution_momentum_y, h_ff_flux_contribution_momentum_z, h_ff_flux_contribution_density_energy);
-			// copy far field conditions to the gpu
+      float ff_pressure = float(1.0f);
-			//cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z,  ff_flux_contribution_density_energy;
+      float ff_speed_of_sound = sqrt(GAMMA * ff_pressure / h_ff_variable[VAR_DENSITY]);
-			ff_variable = _clMalloc(NVAR*sizeof(float));
+      float ff_speed = float(ff_mach) * ff_speed_of_sound;
 			ff_flux_contribution_momentum_x = _clMalloc(sizeof(float3));
 			ff_flux_contribution_momentum_y = _clMalloc(sizeof(float3));
 			ff_flux_contribution_momentum_z = _clMalloc(sizeof(float3));
 			ff_flux_contribution_density_energy = _clMalloc(sizeof(float3));
 			_clMemcpyH2D(ff_variable,          h_ff_variable,          NVAR*sizeof(float));
 			_clMemcpyH2D(ff_flux_contribution_momentum_x, &h_ff_flux_contribution_momentum_x, sizeof(float3));
 			_clMemcpyH2D(ff_flux_contribution_momentum_y, &h_ff_flux_contribution_momentum_y, sizeof(float3));
 			_clMemcpyH2D(ff_flux_contribution_momentum_z, &h_ff_flux_contribution_momentum_z, sizeof(float3));		
 			_clMemcpyH2D(ff_flux_contribution_density_energy, &h_ff_flux_contribution_density_energy, sizeof(float3));
 			_clFinish();
 		}
 		int nel;
 		int nelr;
 		// read in domain geometry
 		//float* areas;
 		//int* elements_surrounding_elements;
 		//float* normals;
 		{
 			std::ifstream file(data_file_name);
 			if(!file.good()){
 				throw(string("can not find/open file!"));
 			}
 			file >> nel;
 			nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length));
 			////std::cout<<"--cambine: nel="<<nel<<", nelr="<<nelr<<std::endl;
 			float* h_areas = new float[nelr];
 			int* h_elements_surrounding_elements = new int[nelr*NNB];
 			float* h_normals = new float[nelr*NDIM*NNB];
-					
+      float3 ff_velocity;
-			// read in data
+      ff_velocity.x = ff_speed * float(cos((float)angle_of_attack));
-			for(int i = 0; i < nel; i++)
+      ff_velocity.y = ff_speed * float(sin((float)angle_of_attack));
-			{
+      ff_velocity.z = 0.0f;
 				file >> h_areas[i];
 				for(int j = 0; j < NNB; j++)
 				{
 					file >> h_elements_surrounding_elements[i + j*nelr];
 					if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1;
 					h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering				
 					for(int k = 0; k < NDIM; k++)
 					{
 						file >> h_normals[i + (j + k*NNB)*nelr];
 						h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr];
 					}
 				}
 			}
 			// fill in remaining data
 			int last = nel-1;
 			for(int i = nel; i < nelr; i++)
 			{
 				h_areas[i] = h_areas[last];
 				for(int j = 0; j < NNB; j++)
 				{
 					// duplicate the last element
 					h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr];	
 					for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr];
 				}
 			}
 			areas = alloc<float>(nelr);
 			upload<float>(areas, h_areas, nelr);
-			elements_surrounding_elements = alloc<int>(nelr*NNB);
+      h_ff_variable[VAR_MOMENTUM + 0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x;
-			upload<int>(elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB);
+      h_ff_variable[VAR_MOMENTUM + 1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y;
      h_ff_variable[VAR_MOMENTUM + 2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z;
-			normals = alloc<float>(nelr*NDIM*NNB);
+      h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY] * (float(0.5f) * (ff_speed * ff_speed)) + (ff_pressure / float(GAMMA - 1.0f));
 			upload<float>(normals, h_normals, nelr*NDIM*NNB);
 			delete[] h_areas;
 			delete[] h_elements_surrounding_elements;
 			delete[] h_normals;
 		}
 		// Create arrays and set initial conditions
 		variables = alloc<float>(nelr*NVAR);				
 		int tp = 0;
 		initialize_variables(nelr, variables, ff_variable);			
 		old_variables = alloc<float>(nelr*NVAR);   	
 		fluxes = alloc<float>(nelr*NVAR);
 		step_factors = alloc<float>(nelr); 
 		// make sure all memory is floatly allocated before we start timing
 		initialize_variables(nelr, old_variables, ff_variable);	
 		initialize_variables(nelr, fluxes, ff_variable);		
 		_clMemset(step_factors, 0, sizeof(float)*nelr);
 		// make sure CUDA isn't still doing something before we start timing
 		_clFinish();
 		// these need to be computed the first time in order to compute time step
 		printf( "Starting...\n");
-		// Begin iterations
+      float3 h_ff_momentum;
-		for(int i = 0; i < iterations; i++){
+      h_ff_momentum.x = *(h_ff_variable + VAR_MOMENTUM + 0);
-			copy<float>(old_variables, variables, nelr*NVAR);
+      h_ff_momentum.y = *(h_ff_variable + VAR_MOMENTUM + 1);
-			// for the first iteration we compute the time step
+      h_ff_momentum.z = *(h_ff_variable + VAR_MOMENTUM + 2);
-			compute_step_factor(nelr, variables, areas, step_factors);
+      float3 h_ff_flux_contribution_momentum_x;
-			for(int j = 0; j < RK; j++){
+      float3 h_ff_flux_contribution_momentum_y;
-				compute_flux(nelr, elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy, \
+      float3 h_ff_flux_contribution_momentum_z;
-				ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z);
+      float3 h_ff_flux_contribution_density_energy;
-			
+      compute_flux_contribution(h_ff_variable[VAR_DENSITY], h_ff_momentum, h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, ff_velocity, h_ff_flux_contribution_momentum_x, h_ff_flux_contribution_momentum_y, h_ff_flux_contribution_momentum_z, h_ff_flux_contribution_density_energy);
-				time_step(j, nelr, old_variables, variables, step_factors, fluxes);
+
-			}
+      // copy far field conditions to the gpu
-		}
+      //cl_mem ff_variable, ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y,ff_flux_contribution_momentum_z,  ff_flux_contribution_density_energy;
-		_clFinish();
+      ff_variable = _clMalloc(NVAR * sizeof(float));
-		//std::cout << "Saving solution..." << std::endl;
+      ff_flux_contribution_momentum_x = _clMalloc(sizeof(float3));
-		dump(variables, nel, nelr);
+      ff_flux_contribution_momentum_y = _clMalloc(sizeof(float3));
-		//std::cout << "Saved solution..." << std::endl;
+      ff_flux_contribution_momentum_z = _clMalloc(sizeof(float3));
-		_clStatistics();
+      ff_flux_contribution_density_energy = _clMalloc(sizeof(float3));
-		//std::cout << "Cleaning up..." << std::endl;
+      _clMemcpyH2D(ff_variable, h_ff_variable, NVAR * sizeof(float));
-		
+      _clMemcpyH2D(ff_flux_contribution_momentum_x, &h_ff_flux_contribution_momentum_x, sizeof(float3));
-		//--release resources
+      _clMemcpyH2D(ff_flux_contribution_momentum_y, &h_ff_flux_contribution_momentum_y, sizeof(float3));
-		_clFree(ff_variable);
+      _clMemcpyH2D(ff_flux_contribution_momentum_z, &h_ff_flux_contribution_momentum_z, sizeof(float3));
-		_clFree(ff_flux_contribution_momentum_x);
+      _clMemcpyH2D(ff_flux_contribution_density_energy, &h_ff_flux_contribution_density_energy, sizeof(float3));
-		_clFree(ff_flux_contribution_momentum_y);
+      _clFinish();
-		_clFree(ff_flux_contribution_momentum_z);
+    }
-		_clFree(ff_flux_contribution_density_energy);
+    int nel;
-		_clFree(areas);
+    int nelr;
-		_clFree(elements_surrounding_elements);
+    // read in domain geometry
-		_clFree(normals);
+    //float* areas;
-		_clFree(variables);
+    //int* elements_surrounding_elements;
-		_clFree(old_variables);
+    //float* normals;
-		_clFree(fluxes);
+    {
-		_clFree(step_factors);
+      std::ifstream file(data_file_name);
-		_clRelease();
+      if ((file.rdstate() & std::ifstream::failbit) != 0) {
-		printf("Done...\n");
+        throw(string("can not find/open file!"));
-		_clPrintTiming();
+      }
-	}
+      file >> nel;
-	catch(string msg){
+      nelr = block_length * ((nel / block_length) + std::min(1, nel % block_length));
-		printf("--cambine:( an exception catched in main body ->%s\n", msg.c_str());		
+      //std::cout << "--cambine: nel=" << nel << ", nelr=" << nelr << std::endl;
-		_clFree(ff_variable);
+      float* h_areas = new float[nelr];
-		_clFree(ff_flux_contribution_momentum_x);
+      int* h_elements_surrounding_elements = new int[nelr * NNB];
-		_clFree(ff_flux_contribution_momentum_y);
+      float* h_normals = new float[nelr * NDIM * NNB];
-		_clFree(ff_flux_contribution_momentum_z);
+
-		_clFree(ff_flux_contribution_density_energy);
+      // read in data
-		_clFree(areas);
+      for (int i = 0; i < nel; i++) {
-		_clFree(elements_surrounding_elements);
+        file >> h_areas[i];
-		_clFree(normals);
+        for (int j = 0; j < NNB; j++) {
-		_clFree(variables);
+          file >> h_elements_surrounding_elements[i + j * nelr];
-		_clFree(old_variables);
+          if (h_elements_surrounding_elements[i + j * nelr] < 0)
-		_clFree(fluxes);
+            h_elements_surrounding_elements[i + j * nelr] = -1;
-		_clFree(step_factors);
+          h_elements_surrounding_elements[i + j * nelr]--; //it's coming in with Fortran numbering
-		_clRelease();		
+
-	}
+          for (int k = 0; k < NDIM; k++) {
-	catch(...){
+            file >> h_normals[i + (j + k * NNB) * nelr];
-		//std::cout<<"--cambine:( unknow exceptions in main body..."<<std::endl;		
+            h_normals[i + (j + k * NNB) * nelr] = -h_normals[i + (j + k * NNB) * nelr];
-		_clFree(ff_variable);
+          }
-		_clFree(ff_flux_contribution_momentum_x);
+        }
-		_clFree(ff_flux_contribution_momentum_y);
+      }
-		_clFree(ff_flux_contribution_momentum_z);
+
-		_clFree(ff_flux_contribution_density_energy);
+      // fill in remaining data
-		_clFree(areas);
+      int last = nel - 1;
-		_clFree(elements_surrounding_elements);
+      for (int i = nel; i < nelr; i++) {
-		_clFree(normals);
+        h_areas[i] = h_areas[last];
-		_clFree(variables);
+        for (int j = 0; j < NNB; j++) {
-		_clFree(old_variables);
+          // duplicate the last element
-		_clFree(fluxes);
+          h_elements_surrounding_elements[i + j * nelr] = h_elements_surrounding_elements[last + j * nelr];
-		_clFree(step_factors);
+          for (int k = 0; k < NDIM; k++)
-		_clRelease();		
+            h_normals[last + (j + k * NNB) * nelr] = h_normals[last + (j + k * NNB) * nelr];
-	}
+        }
-		
+      }
-	return 0;
+
      areas = alloc<float>(nelr);
      upload<float>(areas, h_areas, nelr);
      elements_surrounding_elements = alloc<int>(nelr * NNB);
      upload<int>(elements_surrounding_elements, h_elements_surrounding_elements, nelr * NNB);
      normals = alloc<float>(nelr * NDIM * NNB);
      upload<float>(normals, h_normals, nelr * NDIM * NNB);
      delete[] h_areas;
      delete[] h_elements_surrounding_elements;
      delete[] h_normals;
    }
    // Create arrays and set initial conditions
    variables = alloc<float>(nelr * NVAR);
    int tp = 0;
    initialize_variables(nelr, variables, ff_variable);
    old_variables = alloc<float>(nelr * NVAR);
    fluxes = alloc<float>(nelr * NVAR);
    step_factors = alloc<float>(nelr);
    // make sure all memory is floatly allocated before we start timing
    initialize_variables(nelr, old_variables, ff_variable);
    initialize_variables(nelr, fluxes, ff_variable);
    _clMemset(step_factors, 0, sizeof(float) * nelr);
    // make sure CUDA isn't still doing something before we start timing
    _clFinish();
    // these need to be computed the first time in order to compute time step
    //std::cout << "Starting..." << std::endl;
    printf("starting .. \n");
    // Begin iterations
    for (int i = 0; i < iterations; i++) {
      //copy<float>(old_variables, variables, nelr * NVAR);
      //download(ff_variable, old_variables, nelr * NVAR);
      // upload(ff_variable, old_variables, nelr * NVAR);
      _clMemcpyH2D(ff_variable, h_ff_variable, NVAR * sizeof(float));
      _clMemcpyD2H(h_ff_variable, variables, NVAR * sizeof(float));
      // for the first iteration we compute the time step
      compute_step_factor(nelr, variables, areas, step_factors);
      for (int j = 0; j < RK; j++) {
        compute_flux(nelr, elements_surrounding_elements, normals, variables, ff_variable, fluxes, ff_flux_contribution_density_energy,
            ff_flux_contribution_momentum_x, ff_flux_contribution_momentum_y, ff_flux_contribution_momentum_z);
        time_step(j, nelr, old_variables, variables, step_factors, fluxes);
      }
    }
    _clFinish();
    //std::cout << "Saving solution..." << std::endl;
    dump(variables, nel, nelr);
    //std::cout << "Saved solution..." << std::endl;
    _clStatistics();
    //std::cout << "Cleaning up..." << std::endl;
    printf("Cleaning up ...\n");
    //--release resources
    _clFree(ff_variable);
    _clFree(ff_flux_contribution_momentum_x);
    _clFree(ff_flux_contribution_momentum_y);
    _clFree(ff_flux_contribution_momentum_z);
    _clFree(ff_flux_contribution_density_energy);
    _clFree(areas);
    _clFree(elements_surrounding_elements);
    _clFree(normals);
    _clFree(variables);
    _clFree(old_variables);
    _clFree(fluxes);
    _clFree(step_factors);
    _clRelease();
    //std::cout << "Done..." << std::endl;
    _clPrintTiming();
  } catch (string msg) {
    //std::cout << "--cambine:( an exception catched in main body ->" << msg << std::endl;
    printf("--cambine:( an exception catched in main body\n");
    _clFree(ff_variable);
    _clFree(ff_flux_contribution_momentum_x);
    _clFree(ff_flux_contribution_momentum_y);
    _clFree(ff_flux_contribution_momentum_z);
    _clFree(ff_flux_contribution_density_energy);
    _clFree(areas);
    _clFree(elements_surrounding_elements);
    _clFree(normals);
    _clFree(variables);
    _clFree(old_variables);
    _clFree(fluxes);
    _clFree(step_factors);
    _clRelease();
  } catch (...) {
    printf("--cambine:( unknow exceptions in main body...\n");
    //std::cout << "--cambine:( unknow exceptions in main body..." << std::endl;
    _clFree(ff_variable);
    _clFree(ff_flux_contribution_momentum_x);
    _clFree(ff_flux_contribution_momentum_y);
    _clFree(ff_flux_contribution_momentum_z);
    _clFree(ff_flux_contribution_density_energy);
    _clFree(areas);
    _clFree(elements_surrounding_elements);
    _clFree(normals);
    _clFree(variables);
    _clFree(old_variables);
    _clFree(fluxes);
    _clFree(step_factors);
    _clRelease();
  }
  return 0;
 }