Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop

2025-04-23 21:39:10 -04:00 · 2023-11-18 00:27:46 -08:00 · 2023-11-18 00:27:46 -08:00 · 11752b2562
commit 11752b2562
parent 64dc5e1667 43154cf738
67 changed files with 1234 additions and 956 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -38,7 +38,7 @@ jobs:
        - rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
        - rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
        - make -C $HOME/build32
-        - XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64
+        - XLEN=64 make -C $HOME/build64
    - stage: test 
      name: unittest
      script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
@ -47,13 +47,13 @@ jobs:
      script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
    - stage: test 
      name: isa64
-      script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa
+      script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
    - stage: test 
      name: regression
      script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
    - stage: test 
      name: regression64
-      script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression
+      script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
    - stage: test 
      name: opencl
      script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl    
--- a/ci/toolchain_env.sh
+++ b/ci/toolchain_env.sh
@ -16,14 +16,11 @@

 TOOLDIR=${TOOLDIR:=/opt}

-export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
-export LLVM_POCL=$TOOLDIR/llvm-pocl
-export LLVM_VORTEX=$TOOLDIR/llvm-vortex
 export VERILATOR_ROOT=$TOOLDIR/verilator
 export PATH=$VERILATOR_ROOT/bin:$PATH
+
 export SV2V_PATH=$TOOLDIR/sv2v
 export PATH=$SV2V_PATH/bin:$PATH
+
 export YOSYS_PATH=$TOOLDIR/yosys
 export PATH=$YOSYS_PATH/bin:$PATH
-export POCL_CC_PATH=$TOOLDIR/pocl/compiler
-export POCL_RT_PATH=$TOOLDIR/pocl/runtime
--- a/docs/fpga_setup.md
+++ b/docs/fpga_setup.md
@ -9,9 +9,6 @@ OPAE Environment Setup
    $ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
    $ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
    $ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
-    $ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
-    $ export PATH=:/opt/verilator/bin:$PATH
-    $ export VERILATOR_ROOT=/opt/verilator

 OPAE Build
 ------------------
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -223,18 +223,18 @@

 // Number of ALU units
 `ifndef NUM_ALU_LANES
-`define NUM_ALU_LANES   `UP(`NUM_THREADS / 2)
+`define NUM_ALU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_ALU_BLOCKS
-`define NUM_ALU_BLOCKS  `UP(`ISSUE_WIDTH / 1)
+`define NUM_ALU_BLOCKS  `ISSUE_WIDTH
 `endif

 // Number of FPU units
 `ifndef NUM_FPU_LANES
-`define NUM_FPU_LANES   `UP(`NUM_THREADS / 2)
+`define NUM_FPU_LANES   `NUM_THREADS
 `endif
 `ifndef NUM_FPU_BLOCKS
-`define NUM_FPU_BLOCKS  `UP(`ISSUE_WIDTH / 1)
+`define NUM_FPU_BLOCKS  `ISSUE_WIDTH
 `endif

 // Number of LSU units
@ -407,7 +407,7 @@

 // Number of Associative Ways
 `ifndef ICACHE_NUM_WAYS
-`define ICACHE_NUM_WAYS 2
+`define ICACHE_NUM_WAYS 1
 `endif

 // Dcache Configurable Knobs //////////////////////////////////////////////////
@ -461,7 +461,7 @@

 // Number of Associative Ways
 `ifndef DCACHE_NUM_WAYS
-`define DCACHE_NUM_WAYS 2
+`define DCACHE_NUM_WAYS 1
 `endif

 // SM Configurable Knobs //////////////////////////////////////////////////////
@ -520,7 +520,7 @@

 // Number of Associative Ways
 `ifndef L2_NUM_WAYS
-`define L2_NUM_WAYS 4
+`define L2_NUM_WAYS 2
 `endif

 // L3cache Configurable Knobs /////////////////////////////////////////////////
--- a/hw/rtl/afu/xrt/VX_afu_wrap.sv
+++ b/hw/rtl/afu/xrt/VX_afu_wrap.sv
@ -262,7 +262,7 @@ module VX_afu_wrap #(
 		.m_axi_awready	(m_axi_mem_awready_a),
 		.m_axi_awaddr	(m_axi_mem_awaddr_w),
 		.m_axi_awid		(m_axi_mem_awid_a),
-		`UNUSED_PIN (m_axi_awlen),
+		.m_axi_awlen    (m_axi_mem_awlen_a),
 		`UNUSED_PIN (m_axi_awsize),
 		`UNUSED_PIN (m_axi_awburst),
 		`UNUSED_PIN (m_axi_awlock),
--- a/hw/rtl/cache/VX_cache_cluster_top.sv
+++ b/hw/rtl/cache/VX_cache_cluster_top.sv
@ -1,190 +0,0 @@
-// Copyright © 2019-2023
-// 
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-// http://www.apache.org/licenses/LICENSE-2.0
-// 
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-`include "VX_cache_define.vh"
-
-module VX_cache_cluster_top import VX_gpu_pkg::*; #(
-    parameter `STRING INSTANCE_ID    = "",
-
-    parameter NUM_UNITS             = 2,
-    parameter NUM_INPUTS            = 4,
-    parameter TAG_SEL_IDX           = 0,
-
-    // Number of Word requests per cycle
-    parameter NUM_REQS              = 4,
-
-    // Size of cache in bytes
-    parameter CACHE_SIZE            = 16384, 
-    // Size of line inside a bank in bytes
-    parameter LINE_SIZE             = 16, 
-    // Number of banks
-    parameter NUM_BANKS             = 4,
-    // Number of associative ways
-    parameter NUM_WAYS              = 4,
-    // Size of a word in bytes
-    parameter WORD_SIZE             = 4, 
-
-    // Core Response Queue Size
-    parameter CRSQ_SIZE             = 2,
-    // Miss Reserv Queue Knob
-    parameter MSHR_SIZE             = 16, 
-    // Memory Response Queue Size
-    parameter MRSQ_SIZE             = 0,
-    // Memory Request Queue Size
-    parameter MREQ_SIZE             = 4,
-
-    // Enable cache writeable
-    parameter WRITE_ENABLE          = 1,
-
-    // Request debug identifier
-    parameter UUID_WIDTH            = 0,
-
-    // core request tag size
-    parameter TAG_WIDTH             = UUID_WIDTH + 16,
-
-    // enable bypass for non-cacheable addresses
-    parameter NC_ENABLE             = 1,
-
-    // Core response output register
-    parameter CORE_OUT_REG          = 2,
-
-    // Memory request output register
-    parameter MEM_OUT_REG           = 2,
-
-    parameter NUM_CACHES = `UP(NUM_UNITS),
-    parameter PASSTHRU   = (NUM_UNITS == 0),
-    parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
-    parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) : 
-                                                       `CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) : 
-                                          (NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
-                                                       `CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
-    parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
- ) (    
-    input wire clk,
-    input wire reset,
-
-// PERF
-`ifdef PERF_ENABLE
-    output cache_perf_t cache_perf,
-`endif
-
-    // Core request
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_req_valid,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_req_rw,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0]  core_req_byteen,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0]  core_req_tag,
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_req_ready,
-
-    // Core response
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_rsp_valid,
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
-    output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0]  core_rsp_tag,
-    input  wire [NUM_INPUTS-1:0][NUM_REQS-1:0]                 core_rsp_ready,
-
-    // Memory request
-    output wire                    mem_req_valid,
-    output wire                    mem_req_rw, 
-    output wire [LINE_SIZE-1:0]    mem_req_byteen,
-    output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
-    output wire [`CS_LINE_WIDTH-1:0] mem_req_data, 
-    output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag, 
-    input  wire                    mem_req_ready,
-    
-    // Memory response
-    input  wire                    mem_rsp_valid, 
-    input  wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
-    input  wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag, 
-    output wire                    mem_rsp_ready
-);
-    VX_mem_bus_if #(
-        .DATA_SIZE (WORD_SIZE),
-        .TAG_WIDTH (TAG_WIDTH)
-    ) core_bus_if[NUM_INPUTS * NUM_REQS]();
-
-    VX_mem_bus_if #(
-        .DATA_SIZE (LINE_SIZE),
-        .TAG_WIDTH (MEM_TAG_X_WIDTH)
-    ) mem_bus_if();
-
-    // Core request
-    for (genvar i = 0; i < NUM_INPUTS; ++i) begin
-        for (genvar r = 0; r < NUM_REQS; ++r) begin
-            assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
-            assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
-            assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
-        end
-    end
-
-    // Core response
-    for (genvar i = 0; i < NUM_INPUTS; ++i) begin
-        for (genvar r = 0; r < NUM_REQS; ++r) begin
-            assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
-            assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
-            assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
-            assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
-        end
-    end
-
-    // Memory request
-    assign mem_req_valid = mem_bus_if.req_valid;
-    assign mem_req_rw = mem_bus_if.req_data.rw; 
-    assign mem_req_byteen = mem_bus_if.req_data.byteen;
-    assign mem_req_addr = mem_bus_if.req_data.addr;
-    assign mem_req_data = mem_bus_if.req_data.data;  
-    assign mem_req_tag = mem_bus_if.req_data.tag; 
-    assign mem_bus_if.req_ready = mem_req_ready;
-    
-    // Memory response
-    assign mem_bus_if.rsp_valid = mem_rsp_valid;    
-    assign mem_bus_if.rsp_data.data = mem_rsp_data;
-    assign mem_bus_if.rsp_data.tag = mem_rsp_tag; 
-    assign mem_rsp_ready = mem_bus_if.rsp_ready;
-
-    VX_cache_cluster #(
-        .INSTANCE_ID    (INSTANCE_ID),
-        .NUM_UNITS      (NUM_UNITS),
-        .NUM_INPUTS     (NUM_INPUTS),
-        .TAG_SEL_IDX    (TAG_SEL_IDX),
-        .NUM_REQS       (NUM_REQS),
-        .CACHE_SIZE     (CACHE_SIZE),        
-        .LINE_SIZE      (LINE_SIZE),
-        .NUM_BANKS      (NUM_BANKS),
-        .NUM_WAYS       (NUM_WAYS),
-        .WORD_SIZE      (WORD_SIZE),
-        .CRSQ_SIZE      (CRSQ_SIZE),
-        .MSHR_SIZE      (MSHR_SIZE),
-        .MRSQ_SIZE      (MRSQ_SIZE),
-        .MREQ_SIZE      (MREQ_SIZE),
-        .WRITE_ENABLE   (WRITE_ENABLE),
-        .UUID_WIDTH     (UUID_WIDTH),
-        .TAG_WIDTH      (TAG_WIDTH),
-        .NC_ENABLE      (NC_ENABLE),
-        .CORE_OUT_REG   (CORE_OUT_REG),
-        .MEM_OUT_REG    (MEM_OUT_REG)
-    ) cache (
-    `ifdef PERF_ENABLE
-        .cache_perf     (cache_perf),
-    `endif
-        .clk            (clk),
-        .reset          (reset),
-        .core_bus_if    (core_bus_if),
-        .mem_bus_if     (mem_bus_if)
-    );
-
-endmodule
--- a/hw/rtl/cache/VX_cache_data.sv
+++ b/hw/rtl/cache/VX_cache_data.sv
@ -93,7 +93,7 @@ module VX_cache_data #(
        assign wren  = fill;
    end
    
-    wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
+    wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;

    VX_onehot_encoder #(
        .N (NUM_WAYS)
--- a/hw/rtl/cache/VX_cache_top.sv
+++ b/hw/rtl/cache/VX_cache_top.sv
@ -22,7 +22,7 @@ module VX_cache_top #(
    // Size of cache in bytes
    parameter CACHE_SIZE            = 16384, 
    // Size of line inside a bank in bytes
-    parameter LINE_SIZE             = 16, 
+    parameter LINE_SIZE             = 64, 
    // Number of banks
    parameter NUM_BANKS             = 4,
    // Number of associative ways
--- a/hw/rtl/core/VX_core.sv
+++ b/hw/rtl/core/VX_core.sv
@ -267,7 +267,7 @@ module VX_core import VX_gpu_pkg::*; #(

    wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;    

-    wire perf_icache_pending_read_cycle;
+    wire [1:0] perf_icache_pending_read_cycle;
    wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;

    reg  [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
--- a/hw/rtl/core/VX_fpu_unit.sv
+++ b/hw/rtl/core/VX_fpu_unit.sv
--- a/hw/rtl/core/VX_muldiv_unit.sv
+++ b/hw/rtl/core/VX_muldiv_unit.sv
@ -220,8 +220,13 @@ module VX_muldiv_unit #(
    wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
+    `ifdef XLEN_64
        assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
        assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
+    `else
+        assign div_in1[i] = execute_if.data.rs1_data[i];
+        assign div_in2[i] = execute_if.data.rs2_data[i];
+    `endif        
    end

 `ifdef IDIV_DPI    
--- a/hw/rtl/core/VX_operands.sv
+++ b/hw/rtl/core/VX_operands.sv
@ -38,9 +38,12 @@ module VX_operands import VX_gpu_pkg::*; #(
        reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
        reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;

-        reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
-        reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
-        reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
+        reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
+        reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
+        reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
+        reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
+        reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
+        reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
        reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;

        reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
@ -160,11 +163,8 @@ module VX_operands import VX_gpu_pkg::*; #(
                    end
                    cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
                    cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
-                    if (writeback_if[i].data.sop) begin
-                        cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
-                    end else begin
-                        cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
-                    end                
+                    cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask : 
+                                                                    (cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
                end
            end
        end
@ -175,7 +175,6 @@ module VX_operands import VX_gpu_pkg::*; #(
                gpr_rd_rid  <= '0;
                gpr_rd_wis  <= '0;
                cache_eop   <= {ISSUE_RATIO{1'b1}};
-                cache_reg   <= '0;
                data_ready  <= 0;
            end else begin
                state       <= state_n;
--- a/hw/rtl/core/VX_scoreboard.sv
+++ b/hw/rtl/core/VX_scoreboard.sv
@ -107,6 +107,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
            .ready_out (scoreboard_if[i].ready)
        );

+    `ifdef SIMULATION
        reg [31:0] timeout_ctr;
    
        always @(posedge clk) begin
@ -134,6 +135,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
        `RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
            ("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
                $time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
+    `endif
+    
    end    

 endmodule
--- a/hw/rtl/core/VX_sfu_unit.sv
+++ b/hw/rtl/core/VX_sfu_unit.sv
@ -170,7 +170,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
        .NUM_INPUTS (RSP_ARB_SIZE),
        .DATAW      (RSP_ARB_DATAW),
        .ARBITER    ("R"),
-        .OUT_REG    (1)
+        .OUT_REG    (3)
    ) rsp_arb (
        .clk       (clk),
        .reset     (commit_reset), 
@ -186,7 +186,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
    VX_gather_unit #(
        .BLOCK_SIZE (BLOCK_SIZE),
        .NUM_LANES  (NUM_LANES),
-        .OUT_REG    (3)
+        .OUT_REG    (1)
    ) gather_unit (
        .clk           (clk),
        .reset         (commit_reset),
--- a/hw/rtl/fpu/VX_fpu_cvt.sv
+++ b/hw/rtl/fpu/VX_fpu_cvt.sv
@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
 
    localparam MAN_BITS = 23;
    localparam EXP_BITS = 8;
-    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;    
-
-    localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
-    localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
+    localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
    
    // Use 32-bit integer
-    localparam MAX_INT_WIDTH = 32;
+    localparam INT_WIDTH = 32;

    // The internal mantissa includes normal bit or an entire integer
-    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
+    localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);

    // The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
    localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);

    // The internal exponent must be able to represent the smallest denormal input value as signed
    // or the number of bits in an integer
-    localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
-
-    // shift amount for denormalization
-    localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
+    localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;

    localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
    localparam NUM_FP_STICKY  = 2 * INT_MAN_WIDTH - MAN_BITS - 1;   // removed mantissa, 1. and R
-    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH;  // removed int and R
+    localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH;  // removed int and R
    
    // Input processing
    
@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
            .EXP_BITS (EXP_BITS),
            .MAN_BITS (MAN_BITS)
        ) fp_class (
-            .exp_i  (dataa[i][30:23]),
-            .man_i  (dataa[i][22:0]),
+            .exp_i  (dataa[i][INT_WIDTH-2:MAN_BITS]),
+            .man_i  (dataa[i][MAN_BITS-1:0]),
            .clss_o (fclass[i])
        );
    end
@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
    wire [NUM_LANES-1:0]                    input_sign;
    
    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [INT_MAN_WIDTH-1:0] int_mantissa;
-        wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
-        wire fmt_sign        = dataa[i][31];
-        wire int_sign        = dataa[i][31] && is_signed;
-        assign int_mantissa  = int_sign ? (-dataa[i]) : dataa[i];
-        assign fmt_mantissa  = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
+        wire i2f_sign = dataa[i][INT_WIDTH-1];
+        wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
+        wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
+        wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
        assign input_exp[i]  = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
-        assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
-        assign input_sign[i] = is_itof ? int_sign : fmt_sign;
+        assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
+        assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
    end

    // Pipeline stage0
    
-    wire                    valid_in_s0;
-    wire [NUM_LANES-1:0]    lane_mask_s0;
-    wire [TAGW-1:0]         tag_in_s0;
-    wire                    is_itof_s0;
-    wire                    unsigned_s0;
-    wire [2:0]              rnd_mode_s0;
+    wire                 valid_in_s0;
+    wire [NUM_LANES-1:0] lane_mask_s0;
+    wire [TAGW-1:0]      tag_in_s0;
+    wire                 is_itof_s0;
+    wire                 is_signed_s0;
+    wire [2:0]           rnd_mode_s0;
    fclass_t [NUM_LANES-1:0] fclass_s0;
-    wire [NUM_LANES-1:0]    input_sign_s0;
+    wire [NUM_LANES-1:0] input_sign_s0;
    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;

@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
-        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
+        .data_in  ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
+        .data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
    );
    
    // Normalization
@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];

        // Unbias exponent and compensate for shift
-        wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
-        wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
-        assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
+        wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
+        assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
    end

    // Pipeline stage1

-    wire                    valid_in_s1;
-    wire [NUM_LANES-1:0]    lane_mask_s1;
-    wire [TAGW-1:0]         tag_in_s1;
-    wire                    is_itof_s1;
-    wire                    unsigned_s1;
-    wire [2:0]              rnd_mode_s1;
+    wire                 valid_in_s1;
+    wire [NUM_LANES-1:0] lane_mask_s1;
+    wire [TAGW-1:0]      tag_in_s1;
+    wire                 is_itof_s1;
+    wire                 is_signed_s1;
+    wire [2:0]           rnd_mode_s1;
    fclass_t [NUM_LANES-1:0] fclass_s1;
-    wire [NUM_LANES-1:0]    input_sign_s1;
-    wire [NUM_LANES-1:0]    mant_is_zero_s1;
+    wire [NUM_LANES-1:0] input_sign_s1;
+    wire [NUM_LANES-1:0] mant_is_zero_s1;
    wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;

@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
-        .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
+        .data_in  ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
+        .data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
    );

    // Perform adjustments to mantissa and exponent

    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
-    wire [NUM_LANES-1:0]                    of_before_round_s1;
+    wire [NUM_LANES-1:0] of_before_round_s1;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        reg [2*INT_MAN_WIDTH:0] preshift_mant_s1;   // mantissa before final shift                
-        reg [SHAMT_BITS-1:0]    denorm_shamt_s1;    // shift amount for denormalization
-        reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1;   // after eventual adjustments
-        reg                     of_before_round_tmp_s1;
-
+        wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
+        wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
+        wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
+        reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
        always @(*) begin
-            final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
-            preshift_mant_s1 = {input_mant_s1[i], 33'b0};
-            denorm_shamt_s1  = '0;
-            of_before_round_tmp_s1 = 1'b0;
-
-            if (is_itof_s1) begin                   
-                if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
-                    // Overflow or infinities (for proper rounding)
-                    final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
-                    preshift_mant_s1 = ~0;  // largest normal value and RS bits set
-                    of_before_round_tmp_s1 = 1'b1;
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
-                    // Limit the shift to retain sticky bits
-                    final_exp_tmp_s1 = '0; // denormal result
-                    denorm_shamt_s1  = (2 + MAN_BITS); // to sticky                
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
-                    // Denormalize underflowing values
-                    final_exp_tmp_s1 = '0; // denormal result
-                    denorm_shamt_s1  = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting               
-                end
+            if (overflow) begin
+                denorm_shamt_q = '0;
+            end else if (underflow) begin
+                denorm_shamt_q = INT_WIDTH+1;
            end else begin
-                if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
-                    // overflow: when converting to unsigned the range is larger by one
-                    of_before_round_tmp_s1 = 1'b1;                
-                end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
-                    // underflow
-                    denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
-                end else begin
-                    // By default right shift mantissa to be an integer
-                    denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
-                end              
+                denorm_shamt_q = denorm_shamt;
            end
        end
-
-        assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
-        assign final_exp_s1[i]        = final_exp_tmp_s1;
-        assign of_before_round_s1[i]  = of_before_round_tmp_s1;
+        assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
+        assign final_exp_s1[i]        = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
+        assign of_before_round_s1[i]  = overflow;
    end

    // Pipeline stage2
    
-    wire                    valid_in_s2;
-    wire [NUM_LANES-1:0]    lane_mask_s2;
-    wire [TAGW-1:0]         tag_in_s2;
-    wire                    is_itof_s2;
-    wire                    unsigned_s2;
-    wire [2:0]              rnd_mode_s2;
+    wire                 valid_in_s2;
+    wire [NUM_LANES-1:0] lane_mask_s2;
+    wire [TAGW-1:0]      tag_in_s2;
+    wire                 is_itof_s2;
+    wire                 is_signed_s2;
+    wire [2:0]           rnd_mode_s2;
    fclass_t [NUM_LANES-1:0] fclass_s2;   
-    wire [NUM_LANES-1:0]    mant_is_zero_s2;
-    wire [NUM_LANES-1:0]    input_sign_s2;
+    wire [NUM_LANES-1:0] mant_is_zero_s2;
+    wire [NUM_LANES-1:0] input_sign_s2;
    wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
    wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
-    wire [NUM_LANES-1:0]    of_before_round_s2;
+    wire [NUM_LANES-1:0] of_before_round_s2;
    
    VX_pipe_register #(
        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
-        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
+        .data_in  ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
+        .data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
    );

-    wire [NUM_LANES-1:0]       rounded_sign_s2;
-    wire [NUM_LANES-1:0][31:0] rounded_abs_s2;      // absolute value of result after rounding
-    wire [NUM_LANES-1:0]       int_round_has_sticky_s2;
-    wire [NUM_LANES-1:0]       fp_round_has_sticky_s2;
+    wire [NUM_LANES-1:0] rounded_sign_s2;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
+    wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
+    wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
    
    // Rouding and classification
   
    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        wire [MAN_BITS-1:0]      final_mant_s2;        // mantissa after adjustments
-        wire [MAX_INT_WIDTH-1:0] final_int_s2;         // integer shifted in position
-        wire [1:0]               round_sticky_bits_s2;
-        wire [31:0]              fmt_pre_round_abs_s2;
-        wire [31:0]              pre_round_abs_s2;
-        wire [1:0]               int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
+        wire [MAN_BITS-1:0]  final_mant_s2;  // mantissa after adjustments
+        wire [INT_WIDTH-1:0] final_int_s2;   // integer shifted in position
+        wire [1:0]           round_sticky_bits_s2;
+        wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
+        wire [INT_WIDTH-1:0] pre_round_abs_s2;
+        wire [1:0]           f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;

        // Extract final mantissa and round bit, discard the normal bit (for FP)
-        assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
-        assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (MAX_INT_WIDTH+1) + 1];
+        assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
+        assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH   : 2*INT_MAN_WIDTH   - (INT_WIDTH+1) + 1];

        // Collapse sticky bits
-        assign fp_round_sticky_bits_s2[0]  = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
-        assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
-        assign fp_round_has_sticky_s2[i]   = (| fp_round_sticky_bits_s2);
-        assign int_round_has_sticky_s2[i]  = (| int_round_sticky_bits_s2);
+        assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
+        assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
+        assign i2f_round_has_sticky_s2[i]  = (| i2f_round_sticky_bits_s2);
+        assign f2i_round_has_sticky_s2[i]  = (| f2i_round_sticky_bits_s2);

        // select RS bits for destination operation
-        assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
+        assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;

        // Pack exponent and mantissa into proper rounding form
        assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
    wire [NUM_LANES-1:0] lane_mask_s3;
    wire [TAGW-1:0]      tag_in_s3;
    wire                 is_itof_s3;
-    wire                 unsigned_s3;
+    wire                 is_signed_s3;
    fclass_t [NUM_LANES-1:0] fclass_s3;   
    wire [NUM_LANES-1:0] mant_is_zero_s3;
    wire [NUM_LANES-1:0] input_sign_s3;
    wire [NUM_LANES-1:0] rounded_sign_s3;
-    wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
    wire [NUM_LANES-1:0] of_before_round_s3;   
-    wire [NUM_LANES-1:0] int_round_has_sticky_s3;
-    wire [NUM_LANES-1:0] fp_round_has_sticky_s3; 
+    wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
+    wire [NUM_LANES-1:0] i2f_round_has_sticky_s3; 

    VX_pipe_register #(
        .DATAW  (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@ -339,105 +304,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall),
-        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
-        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
+        .data_in  ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
+        .data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
    );
     
-    wire [NUM_LANES-1:0] of_after_round_s3;
-    wire [NUM_LANES-1:0] uf_after_round_s3;
-    wire [NUM_LANES-1:0][31:0] fmt_result_s3;
-    wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
    wire [NUM_LANES-1:0] rounded_int_res_zero_s3;  // after rounding

    for (genvar i = 0; i < NUM_LANES; ++i) begin
        // Assemble regular result, nan box short ones. Int zeroes need to be detected
-        assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
-
-        // Classification after rounding select by destination format
-        assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0);  // denormal
-        assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
+        assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};

        // Negative integer result needs to be brought into two's complement
        assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
        assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
    end

-    // FP Special case handling
+    // F2I Special case handling

-    wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
-    fflags_t [NUM_LANES-1:0]   fp_special_status_s3;
-    wire [NUM_LANES-1:0]       fp_result_is_special_s3;
-
-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        // Detect special case from source format, I2F casts don't produce a special result
-        assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
-
-        // Signalling input NaNs raise invalid flag, otherwise no flags set
-        assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
-
-        // Assemble result according to destination format
-        assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
-                                                              : {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
-    end
-
-    // INT Special case handling
-
-    reg [NUM_LANES-1:0][31:0] int_special_result_s3;
-    fflags_t [NUM_LANES-1:0]  int_special_status_s3;
-    wire [NUM_LANES-1:0]      int_result_is_special_s3;
+    reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
+    fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
+    wire [NUM_LANES-1:0] f2i_result_is_special_s3;

    for (genvar i = 0; i < NUM_LANES; ++i) begin
         // Assemble result according to destination format
        always @(*) begin
            if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
-                int_special_result_s3[i][30:0] = '0;            // alone yields 2**(31)-1
-                int_special_result_s3[i][31]   = ~unsigned_s3;  // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-2:0] = '0;            // alone yields 2**(31)-1
+                f2i_special_result_s3[i][INT_WIDTH-1]   = is_signed_s3;  // for unsigned casts yields 2**31
            end else begin
-                int_special_result_s3[i][30:0] = 2**(31) - 1;   // alone yields 2**(31)-1
-                int_special_result_s3[i][31]   = unsigned_s3;   // for unsigned casts yields 2**31
+                f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1;   // alone yields 2**(31)-1
+                f2i_special_result_s3[i][INT_WIDTH-1]   = ~is_signed_s3;   // for unsigned casts yields 2**31
            end
        end            

        // Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
-        assign int_result_is_special_s3[i] = fclass_s3[i].is_nan 
+        assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan 
                                           | fclass_s3[i].is_inf
                                           | of_before_round_s3[i]
-                                           | (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
+                                           | (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
                                        
        // All integer special cases are invalid
-        assign int_special_status_s3[i] = {1'b1, 4'h0};
+        assign f2i_special_status_s3[i] = {1'b1, 4'h0};
    end

    // Result selection and Output handshake

    fflags_t [NUM_LANES-1:0] tmp_fflags_s3;    
-    wire [NUM_LANES-1:0][31:0] tmp_result_s3;
+    wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;

-    for (genvar i = 0; i < NUM_LANES; ++i) begin
-        fflags_t fp_regular_status_s3, int_regular_status_s3;
-        fflags_t fp_status_s3, int_status_s3;    
-        wire [31:0] fp_result_s3, int_result_s3;
+    for (genvar i = 0; i < NUM_LANES; ++i) begin                                  
+        fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
+        fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;

-        wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;        
-                                     : (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
-                                  
-        assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
-        assign fp_regular_status_s3.DZ = 1'b0; // no divisions
-        assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
-        assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
-        assign fp_regular_status_s3.NX = inexact_s3;
+        fflags_t i2f_status_s3 = i2f_regular_status_s3;
+        fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;

-        assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
+        wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
+        wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];

-        assign fp_result_s3  = fp_result_is_special_s3[i]  ? fp_special_result_s3[i]  : fmt_result_s3[i];        
-        assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
-
-        assign fp_status_s3  = fp_result_is_special_s3[i]  ? fp_special_status_s3[i]  : fp_regular_status_s3;
-        assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
-
-        // Select output depending on special case detection
-        assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
-        assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
+        assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
+        assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
    end

    assign stall = ~ready_out && valid_out;
@ -457,7 +385,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
    );

    assign ready_in = ~stall;
-
    assign has_fflags = 1'b1;

 endmodule
--- a/hw/rtl/fpu/VX_fpu_rounding.sv
+++ b/hw/rtl/fpu/VX_fpu_rounding.sv
@ -54,7 +54,6 @@ module VX_fpu_rounding #(
                      2'b01: round_up = 1'b0;            // < ulp/2 away, round down
                      2'b10: round_up = abs_value_i[0];  // = ulp/2 away, round towards even result
                      2'b11: round_up = 1'b1;            // > ulp/2 away, round up
-                    default: round_up = 1'bx;
                endcase
            `INST_FRM_RTZ: round_up = 1'b0; // always round down
            `INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i;  // to 0 if +, away if -
--- a/hw/syn/altera/quartus/cache/Makefile
+++ b/hw/syn/altera/quartus/cache/Makefile
@ -1,6 +1,6 @@
-PROJECT = VX_cache_cluster_top
+PROJECT = VX_cache_top
 TOP_LEVEL_ENTITY = $(PROJECT)
-SRC_FILE = VX_cache_cluster.sv
+SRC_FILE = $(PROJECT).sv

 include ../../common.mk

--- a/hw/syn/altera/quartus/core/Makefile
+++ b/hw/syn/altera/quartus/core/Makefile
@ -1,6 +1,6 @@
 PROJECT = VX_core_top
 TOP_LEVEL_ENTITY = $(PROJECT)
-SRC_FILE = VX_core.sv
+SRC_FILE = $(PROJECT).sv

 include ../../common.mk

--- a/hw/syn/xilinx/test/kernel/Makefile
+++ b/hw/syn/xilinx/test/kernel/Makefile
@ -1,10 +1,11 @@
 XLEN ?= 32
+TOOLDIR ?= /opt

 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 CFLAGS += -march=rv64imafd -mabi=lp64d
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 CFLAGS += -march=rv32imaf -mabi=ilp32f
 endif

--- a/hw/unittest/top_modules/Makefile
+++ b/hw/unittest/top_modules/Makefile
@ -56,7 +56,6 @@ PROJECT = top_modules
 all: build
 	
 build: $(SRCS)
-	verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
 	verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
 	verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'

--- a/kernel/Makefile
+++ b/kernel/Makefile
@ -1,17 +1,18 @@
 XLEN ?= 32
+TOOLDIR ?= /opt

 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 CFLAGS += -march=rv64imafd -mabi=lp64d
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 CFLAGS += -march=rv32imaf -mabi=ilp32f
 endif

 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
 RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)

-LLVM_VORTEX ?= /opt/llvm-vortex
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex

 LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
 LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
--- a/kernel/src/vx_spawn.c
+++ b/kernel/src/vx_spawn.c
@ -51,9 +51,8 @@ inline char is_log2(int x) {
  return ((x & (x-1)) == 0);
 }

-inline int fast_log2(int x) {
-  float f = x;
-  return (*(int*)(&f)>>23) - 127;
+inline int log2_fast(int x) {
+  return 31 - __builtin_clz (x);
 }

 static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
@ -286,8 +285,8 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {

  // fast path handling
  char isXYpow2 = is_log2(XY);
-  char log2XY   = fast_log2(XY);
-  char log2X    = fast_log2(X);
+  char log2XY   = log2_fast(XY);
+  char log2X    = log2_fast(X);

  wspawn_kernel_args_t wspawn_args = { 
    ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X
--- a/tests/kernel/common.mk
+++ b/tests/kernel/common.mk
@ -0,0 +1,49 @@
+XLEN ?= 32
+TOOLDIR ?= /opt
+
+ifeq ($(XLEN),64)
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
+CFLAGS += -march=rv64imafd -mabi=lp64d
+else
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
+CFLAGS += -march=rv32imaf -mabi=ilp32f
+endif
+
+RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
+
+VORTEX_KN_PATH ?= $(realpath ../../../kernel)
+
+CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
+AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
+DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
+CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
+
+SIM_DIR = ../../../sim
+
+CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
+CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
+
+LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
+
+all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
+
+$(PROJECT).dump: $(PROJECT).elf
+	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
+
+$(PROJECT).bin: $(PROJECT).elf
+	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
+
+$(PROJECT).elf: $(SRCS)
+	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
+
+run-rtlsim: $(PROJECT).bin
+	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
+
+run-simx: $(PROJECT).bin
+	$(SIM_DIR)/simx/simx $(PROJECT).bin
+
+.depend: $(SRCS)
+	$(CC) $(CFLAGS) -MM $^ > .depend;
+
+clean:
+	rm -rf *.elf *.bin *.dump .depend 
--- a/tests/kernel/conform/Makefile
+++ b/tests/kernel/conform/Makefile
@ -1,52 +1,5 @@
-XLEN ?= 32
-
-ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
-CFLAGS += -march=rv64imafd -mabi=lp64d
-else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
-CFLAGS += -march=rv32imaf -mabi=ilp32f
-endif
-
-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
-
-VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-
-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
-
-SIM_DIR = ../../../sim
-
-CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
-
-LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
-
 PROJECT = conform

 SRCS = main.cpp tests.cpp

-all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).elf
-	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
-
-$(PROJECT).bin: $(PROJECT).elf
-	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
-
-$(PROJECT).elf: $(SRCS)
-	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
-
-run-rtlsim: $(PROJECT).bin
-	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
-
-run-simx: $(PROJECT).bin
-	$(SIM_DIR)/simx/simx $(PROJECT).bin
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
-
-clean:
-	rm -rf *.elf *.bin *.dump .depend 
+include ../common.mk
--- a/tests/kernel/fibonacci/Makefile
+++ b/tests/kernel/fibonacci/Makefile
@ -1,52 +1,5 @@
-XLEN ?= 32
-
-ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
-CFLAGS += -march=rv64imafd -mabi=lp64d
-else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
-CFLAGS += -march=rv32imaf -mabi=ilp32f
-endif
-
-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
-
-VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-
-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
-
-SIM_DIR = ../../../sim
-
-CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
-
-LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
-
 PROJECT = fibonacci

 SRCS = main.cpp

-all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).elf
-	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
-
-$(PROJECT).bin: $(PROJECT).elf
-	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
-
-$(PROJECT).elf: $(SRCS)
-	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
-
-run-rtlsim: $(PROJECT).bin
-	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
-
-run-simx: $(PROJECT).bin
-	$(SIM_DIR)/simx/simx $(PROJECT).bin
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
-
-clean:
-	rm -rf *.elf *.bin *.dump .depend 
+include ../common.mk
--- a/tests/kernel/hello/Makefile
+++ b/tests/kernel/hello/Makefile
@ -1,52 +1,5 @@
-XLEN ?= 32
-
-ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
-CFLAGS += -march=rv64imafd -mabi=lp64d
-else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
-CFLAGS += -march=rv32imaf -mabi=ilp32f
-endif
-
-RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
-
-VORTEX_KN_PATH ?= $(realpath ../../../kernel)
-
-CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
-AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
-DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
-CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
-
-SIM_DIR = ../../../sim
-
-CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
-CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
-
-LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
-
 PROJECT = hello

 SRCS = main.cpp

-all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
-
-$(PROJECT).dump: $(PROJECT).elf
-	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
-
-$(PROJECT).bin: $(PROJECT).elf
-	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
-
-$(PROJECT).elf: $(SRCS)
-	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
-
-run-rtlsim: $(PROJECT).bin
-	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
-
-run-simx: $(PROJECT).bin
-	$(SIM_DIR)/simx/simx $(PROJECT).bin
-
-.depend: $(SRCS)
-	$(CC) $(CFLAGS) -MM $^ > .depend;
-
-clean:
-	rm -rf *.elf *.bin *.dump .depend 
+include ../common.mk
--- a/tests/opencl/Makefile
+++ b/tests/opencl/Makefile
@ -18,6 +18,7 @@ all:
 	$(MAKE) -C oclprintf
 	$(MAKE) -C blackscholes
 	$(MAKE) -C matmul
+	$(MAKE) -C convolution

 run-simx:
 	$(MAKE) -C vecadd run-simx
@ -30,15 +31,16 @@ run-simx:
 	$(MAKE) -C dotproduct run-simx
 	$(MAKE) -C kmeans run-simx
 	$(MAKE) -C spmv run-simx
-	$(MAKE) -C cutcp run-simx	
 	$(MAKE) -C stencil run-simx
 	$(MAKE) -C lbm run-simx
 	$(MAKE) -C oclprintf run-simx
 	$(MAKE) -C blackscholes run-simx
-	$(MAKE) -C matmul run-simx
 	$(MAKE) -C transpose run-simx
-#	$(MAKE) -C vectorhypot run-simx	
-#	$(MAKE) -C mri-q run-simx
+	$(MAKE) -C convolution run-simx
+	$(MAKE) -C cutcp run-simx
+	$(MAKE) -C matmul run-simx
+	$(MAKE) -C vectorhypot run-simx	
+	$(MAKE) -C mri-q run-simx

 run-rtlsim:
 	$(MAKE) -C vecadd run-rtlsim
@ -52,12 +54,13 @@ run-rtlsim:
 	$(MAKE) -C kmeans run-rtlsim
 	$(MAKE) -C spmv run-rtlsim
 	$(MAKE) -C transpose run-rtlsim
-	$(MAKE) -C cutcp run-rtlsim
 	$(MAKE) -C stencil run-rtlsim
 	$(MAKE) -C lbm run-rtlsim
 	$(MAKE) -C oclprintf run-rtlsim
 	$(MAKE) -C blackscholes run-rtlsim
-	$(MAKE) -C matmul run-rtlsim
+	$(MAKE) -C convolution run-rtlsim
+#	$(MAKE) -C cutcp run-rtlsim
+#	$(MAKE) -C matmul run-rtlsim
 #	$(MAKE) -C vectorhypot run-rtlsim
 #	$(MAKE) -C mri-q run-rtlsim

@ -73,12 +76,13 @@ run-opae:
 	$(MAKE) -C kmeans run-opae
 	$(MAKE) -C spmv run-opae
 	$(MAKE) -C transpose run-opae
-	$(MAKE) -C cutcp run-opae	
 	$(MAKE) -C stencil run-opae
 	$(MAKE) -C lbm run-opae
 	$(MAKE) -C oclprintf run-opae
 	$(MAKE) -C blackscholes run-opae
-	$(MAKE) -C matmul run-opae
+	$(MAKE) -C convolution run-opae
+#	$(MAKE) -C cutcp run-opae
+#	$(MAKE) -C matmul run-opae
 #	$(MAKE) -C vectorhypot run-opae
 #	$(MAKE) -C mri-q run-opae

@ -102,6 +106,7 @@ clean:
 	$(MAKE) -C oclprintf clean
 	$(MAKE) -C blackscholes clean
 	$(MAKE) -C matmul clean
+	$(MAKE) -C convolution clean

 clean-all:
 	$(MAKE) -C vecadd clean-all
@ -124,3 +129,4 @@ clean-all:
 	$(MAKE) -C oclprintf clean-all
 	$(MAKE) -C blackscholes clean-all
 	$(MAKE) -C matmul clean-all
+	$(MAKE) -C convolution clean-all
--- a/tests/opencl/common.mk
+++ b/tests/opencl/common.mk
@ -1,4 +1,5 @@
 XLEN ?= 32
+TOOLDIR ?= /opt

 TARGET ?= opaesim

@ -6,12 +7,12 @@ XRT_SYN_DIR  ?= ../../../hw/syn/xilinx/xrt
 XRT_DEVICE_INDEX ?= 0

 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 VX_CFLAGS += -march=rv64imafd -mabi=lp64d
 K_CFLAGS += -march=rv64imafd -mabi=ilp64d
 STARTUP_ADDR ?= 0x180000000
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
 K_CFLAGS += -march=rv32imaf -mabi=ilp32f
 STARTUP_ADDR ?= 0x80000000
@ -20,16 +21,16 @@ endif
 RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
 RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)

-POCL_CC_PATH ?= /opt/pocl/compiler
-POCL_RT_PATH ?= /opt/pocl/runtime
+POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
+POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime

 VORTEX_RT_PATH ?= $(realpath ../../../runtime)
 VORTEX_KN_PATH ?= $(realpath ../../../kernel)

 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae

-LLVM_VORTEX ?= /opt/llvm-vortex
-LLVM_POCL ?= /opt/llvm-vortex
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
+LLVM_POCL ?= $(TOOLDIR)/llvm-vortex

 K_CFLAGS   += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
 K_CFLAGS   += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
@ -40,13 +41,12 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
 CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
 CXXFLAGS += -pthread
 CXXFLAGS += -I$(POCL_RT_PATH)/include
-LDFLAGS  += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex

 ifdef HOSTGPU
 	CXXFLAGS += -DHOSTGPU
 	LDFLAGS += -lOpenCL
 else
-	LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so	
+	LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
 endif

 # Debugigng
--- a/tests/opencl/convolution/Makefile
+++ b/tests/opencl/convolution/Makefile
@ -0,0 +1,7 @@
+PROJECT = convolution
+
+SRCS = main.cc
+
+OPTS ?= -n32
+
+include ../common.mk
--- a/tests/opencl/convolution/kernel.cl
+++ b/tests/opencl/convolution/kernel.cl
@ -0,0 +1,32 @@
+__kernel void conv3x3(__global float* output, 
+                      __global float* input,
+                      __global float* weights, 
+                      const int width, 
+                      const int height) 
+{
+    int x = get_global_id(0);
+    int y = get_global_id(1);
+
+    // Adjust for padded borders
+    int paddedWidth = width + 2;
+    int paddedX = x + 1;
+    int paddedY = y + 1;
+
+    // Compute the convolution sum
+    float sum = 0.0f;
+
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left
+    sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1];       // Top-center
+    sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right
+
+    sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3];       // Middle-left
+    sum += input[paddedY * paddedWidth + paddedX] * weights[4];             // Center
+    sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5];       // Middle-right
+
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left
+    sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7];       // Bottom-center
+    sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right
+
+    // Store the result in the output array
+    output[y * width + x] = sum;
+}
--- a/tests/opencl/convolution/main.cc
+++ b/tests/opencl/convolution/main.cc
@ -0,0 +1,258 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <CL/opencl.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h> 
+#include <chrono>
+#include <vector>
+
+#define FLOAT_ULP 6
+
+#define KERNEL_NAME "conv3x3"
+
+#define CL_CHECK(_expr)                                                \
+   do {                                                                \
+     cl_int _err = _expr;                                              \
+     if (_err == CL_SUCCESS)                                           \
+       break;                                                          \
+     printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err);   \
+	 cleanup();			                                                     \
+     exit(-1);                                                         \
+   } while (0)
+
+#define CL_CHECK2(_expr)                                               \
+   ({                                                                  \
+     cl_int _err = CL_INVALID_VALUE;                                   \
+     decltype(_expr) _ret = _expr;                                     \
+     if (_err != CL_SUCCESS) {                                         \
+       printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
+	   cleanup();			                                                   \
+       exit(-1);                                                       \
+     }                                                                 \
+     _ret;                                                             \
+   })
+
+static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
+  if (nullptr == filename || nullptr == data || 0 == size)
+    return -1;
+
+  FILE* fp = fopen(filename, "r");
+  if (NULL == fp) {
+    fprintf(stderr, "Failed to load kernel.");
+    return -1;
+  }
+  
+  fseek(fp , 0 , SEEK_END);
+  long fsize = ftell(fp);
+  rewind(fp);
+
+  *data = (uint8_t*)malloc(fsize);
+  *size = fread(*data, 1, fsize, fp);
+  
+  fclose(fp);
+  
+  return 0;
+}
+
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
+  fi_t fa, fb;
+  fa.f = a;
+  fb.f = b;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
+}
+
+static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
+  int paddedWidth = width + 2;
+  for (int32_t y = 0; y < height; ++y) {
+    for (int32_t x = 0; x < width; ++x) {
+      int paddedY = y + 1;
+      int paddedX = x + 1;
+      float sum = 0.0f;
+      for (int32_t ky = -1; ky <= 1; ++ky) {
+        for (int32_t kx = -1; kx <= 1; ++kx) {
+          int32_t iy = paddedY + ky;
+          int32_t ix = paddedX + kx;
+          float value = I[iy * paddedWidth + ix];
+          float weight = W[(ky + 1) * 3 + (kx + 1)];
+          sum += value * weight;
+        }
+      }
+      O[y * width + x] = sum;
+    }
+  }
+}
+
+cl_device_id device_id = NULL;
+cl_context context = NULL;
+cl_command_queue commandQueue = NULL;
+cl_program program = NULL;
+cl_kernel kernel = NULL;
+cl_mem i_memobj = NULL;
+cl_mem w_memobj = NULL;
+cl_mem o_memobj = NULL;
+uint8_t* kernel_bin = NULL;
+
+static void cleanup() {
+  if (commandQueue) clReleaseCommandQueue(commandQueue);
+  if (kernel) clReleaseKernel(kernel);
+  if (program) clReleaseProgram(program);
+  if (i_memobj) clReleaseMemObject(i_memobj);
+  if (w_memobj) clReleaseMemObject(w_memobj);
+  if (o_memobj) clReleaseMemObject(o_memobj);
+  if (context) clReleaseContext(context);
+  if (device_id) clReleaseDevice(device_id);  
+  if (kernel_bin) free(kernel_bin);
+}
+
+int size = 32;
+
+static void show_usage() {
+  printf("Usage: [-n size] [-h: help]\n");
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+int main (int argc, char **argv) {
+  // parse command arguments
+  parse_args(argc, argv);
+
+  printf("Matrix size=%d\n", size);
+
+  uint32_t o_points = size * size;
+  uint32_t i_points = (size+2) * (size+2);
+  uint32_t w_points = 3 * 3;
+  
+  cl_platform_id platform_id;
+  size_t kernel_size;
+  
+  // Getting platform and device information
+  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
+  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
+
+  printf("Create context\n");
+  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));
+
+  char device_string[1024];
+  clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
+  printf("Using device: %s\n", device_string);
+
+  printf("Allocate device buffers\n");  
+  size_t i_nbytes = i_points * sizeof(float);
+  size_t w_nbytes = w_points * sizeof(float);
+  size_t o_nbytes = o_points * sizeof(float);
+  i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
+  w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
+  o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
+
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif  
+  if (program == NULL) {
+    cleanup();
+    return -1;
+  }
+
+  // Build program
+  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
+  
+  // Create kernel
+  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
+
+  size_t global_size[2] = {size, size};
+
+  // Set kernel arguments
+  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));	
+  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
+  CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
+  CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
+
+  // Allocate memories for input arrays and output arrays.
+  std::vector<float> h_i(i_points);
+  std::vector<float> h_w(w_points);
+  std::vector<float> h_o(o_points, 0.0f);
+	
+  // Generate input values
+  for (int32_t y = -1; y < size+1; ++y) {
+    for (int32_t x = -1; x < size+1; ++x) {
+      if (x >= 0 && x < size && y >= 0 && y < size) {
+        h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
+      } else {
+        h_i[(y+1) * (size+2) + (x+1)] = 0;
+      }
+    }
+  }
+  for (uint32_t i = 0; i < w_points; ++i) {
+    h_w[i] = static_cast<float>(rand()) / RAND_MAX;
+  }
+
+  // Creating command queue
+  commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));  
+
+	printf("Upload source buffers\n");
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
+  CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
+
+  printf("Execute the kernel\n");
+  auto time_start = std::chrono::high_resolution_clock::now();
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
+  CL_CHECK(clFinish(commandQueue));
+  auto time_end = std::chrono::high_resolution_clock::now();
+  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
+  printf("Elapsed time: %lg ms\n", elapsed);
+
+  printf("Download destination buffer\n");
+  CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
+
+  printf("Verify result\n");
+  std::vector<float> ref_vec(o_points);
+  convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
+  int errors = 0;
+  for (uint32_t i = 0; i < o_points; ++i) {
+    if (!compare_equal(h_o[i], ref_vec[i])) {
+      if (errors < 100) 
+        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
+      ++errors;
+    }
+  }
+  if (errors != 0) {
+    printf("FAILED! - %d errors\n", errors);    
+  } else {
+    printf("PASSED!\n");
+  }
+
+  // Clean up		
+  cleanup();  
+
+  return errors;
+}
--- a/tests/opencl/lbm/main.cc
+++ b/tests/opencl/lbm/main.cc
@ -173,14 +173,10 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {

 	pb_SwitchToTimer(&timers, pb_TimerID_COPY);

-	printf("OK+\n");
-
  // Setup DEVICE datastructures
  OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
  OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);

-	printf("OK-\n");
-
  // Initialize DEVICE datastructures
  OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
  OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
--- a/tests/opencl/matmul/Makefile
+++ b/tests/opencl/matmul/Makefile
@ -2,6 +2,6 @@ PROJECT = matmul

 SRCS = main.cc

-OPTS ?= -n16
+OPTS ?= -n32

 include ../common.mk
--- a/tests/opencl/matmul/kernel.cl
+++ b/tests/opencl/matmul/kernel.cl
@ -5,10 +5,10 @@ __kernel void matmul(__global float *A,
                     __local float *localA, 
                     __local float *localB)
 {
-    int row = get_global_id(1);
-    int col = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int globalRow = get_global_id(1);
+    int globalCol = get_global_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);
    int localSize = get_local_size(0);  // assuming square local size

    float sum = 0.0f;
@ -16,10 +16,10 @@ __kernel void matmul(__global float *A,
    // Loop over all blocks of both matrices
    for (int k = 0; k < N; k += localSize) {
        // Load block of matrix A to local memory
-        localA[localRow * localSize + localCol] = A[row * N + k + localCol];
+        localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];

        // Load block of matrix B to local memory, adjusting for column-major access
-        localB[localRow * localSize + localCol] = B[(k + localRow) * N + col];
+        localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];

        // Synchronize to make sure the tiles are loaded
        barrier(CLK_LOCAL_MEM_FENCE);
@ -28,20 +28,20 @@ __kernel void matmul(__global float *A,
        for (int j = 0; j < localSize; j++) {
            sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
        }
-
-        // Synchronize before loading the next block
-        barrier(CLK_LOCAL_MEM_FENCE);
    }

-    C[row * N + col] = sum;
+    C[globalRow * N + globalCol] = sum;
 }

-/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
+/*__kernel void matmul(__global float *A, 
+                       __global float *B, 
+                       __global float *C, 
+                       const unsigned int N)
 {
    int globalRow = get_global_id(1);
    int globalCol = get_global_id(0);
-    int localRow = get_local_id(1);
-    int localCol = get_local_id(0);
+    int localRow  = get_local_id(1);
+    int localCol  = get_local_id(0);

    // Static local memory declaration
    __local float localA[16][16];
@ -64,9 +64,6 @@ __kernel void matmul(__global float *A,
        for (int j = 0; j < 16; j++) {
            sum += localA[localRow][j] * localB[j][localCol];
        }
-
-        // Wait until all threads have computed before loading the next block
-        barrier(CLK_LOCAL_MEM_FENCE);
    }

    C[globalRow * N + globalCol] = sum;
--- a/tests/opencl/matmul/main.cc
+++ b/tests/opencl/matmul/main.cc
@ -10,6 +10,8 @@

 #define LOCAL_SIZE 16

+#define FLOAT_ULP 6
+
 #define KERNEL_NAME "matmul"

 #define CL_CHECK(_expr)                                                \
@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
+static bool compare_equal(float a, float b) {
+  union fi_t { float f; int32_t i; };
  fi_t fa, fb;
  fa.f = a;
  fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
+  auto d = std::abs(fa.i - fb.i);
+  return d <= FLOAT_ULP;
 }

-static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
+static void matmul_cpu(float *C, float *A, float *B, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
@ -98,7 +101,7 @@ static void cleanup() {
  if (kernel_bin) free(kernel_bin);
 }

-int size = 64;
+int size = 32;

 static void show_usage() {
  printf("Usage: [-n size] [-h: help]\n");
@ -106,7 +109,7 @@ static void show_usage() {

 static void parse_args(int argc, char **argv) {
  int c;
-  while ((c = getopt(argc, argv, "fn:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:h?")) != -1) {
    switch (c) {
    case 'n':
      size = atoi(optarg);
@ -127,6 +130,8 @@ int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);

+  uint32_t num_points = size * size;
+
  printf("Matrix size=%d\n", size);
  if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
    printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
@ -148,7 +153,7 @@ int main (int argc, char **argv) {
  printf("Using device: %s\n", device_string);

  printf("Allocate device buffers\n");
-  size_t nbytes = size * size * sizeof(float);
+  size_t nbytes = num_points * sizeof(float);
  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@ -176,8 +181,8 @@ int main (int argc, char **argv) {
  // Create kernel
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
  size_t global_size[2] = {size, size};
+  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};  

  // Set kernel arguments
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));	
@ -188,20 +193,14 @@ int main (int argc, char **argv) {
  CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));

 // Allocate memories for input arrays and output arrays.
- std::vector<float> h_a(size * size);
- std::vector<float> h_b(size * size);
- std::vector<float> h_c(size * size);
+ std::vector<float> h_a(num_points);
+ std::vector<float> h_b(num_points);
+ std::vector<float> h_c(num_points);
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = static_cast<float>(rand()) / RAND_MAX;
+    h_b[i] = static_cast<float>(rand()) / RAND_MAX;
  }

  // Creating command queue
@ -223,10 +222,10 @@ int main (int argc, char **argv) {
  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));

  printf("Verify result\n");
-  std::vector<float> ref_vec(size * size);
-  matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
+  std::vector<float> ref_vec(num_points);
+  matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
  int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
+  for (uint32_t i = 0; i < num_points; ++i) {
    if (!compare_equal(h_c[i], ref_vec[i])) {
      if (errors < 100) 
        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
--- a/tests/opencl/oclprintf/main.cc
+++ b/tests/opencl/oclprintf/main.cc
@ -106,11 +106,6 @@ int main (int argc, char **argv) {
  
  cl_platform_id platform_id;
  size_t kernel_size;
-  cl_int binary_status;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
  
  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -124,12 +119,17 @@ int main (int argc, char **argv) {
  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");
-  program = CL_CHECK2(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
    return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -143,7 +143,7 @@ int main (int argc, char **argv) {
  // Allocate memories for input arrays and output arrays.    
  h_a = (int*)malloc(nbytes);
 	
-  // Initialize values for array members.  
+  // Generate input values
  for (int i = 0; i < size; ++i) {
    h_a[i] = -1 + i;
  }
--- a/tests/opencl/psort/main.cc
+++ b/tests/opencl/psort/main.cc
@ -115,11 +115,6 @@ int main (int argc, char **argv) {
  
  cl_platform_id platform_id;
  size_t kernel_size;
-  cl_int binary_status;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
  
  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -134,12 +129,17 @@ int main (int argc, char **argv) {
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");
-  program = CL_CHECK2(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
    return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -155,17 +155,16 @@ int main (int argc, char **argv) {
  h_a = (int*)malloc(nbytes);
  h_c = (int*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
  for (int i = 0; i < size; ++i) {
-    h_c[i] = 0xdeadbeef;
    if (float_enable) {
      float value = sinf(i)*sinf(i);
-      h_a[i] = *(int*)&value;
-      printf("*** [%d]: h_a=%f\n", i, value);
+      ((float*)h_a)[i] = value;
+      printf("*** [%d]: %f\n", i, value);
    } else {      
      int value = size*sinf(i);
      h_a[i] = value;
-      printf("*** [%d]: h_a=%d\n", i, value);
+      printf("*** [%d]: %d\n", i, value);
    }
  }

@ -189,38 +188,44 @@ int main (int argc, char **argv) {
  CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));

  printf("Verify result\n");
-  for (int i = 0; i < size; ++i) {
-    int value = h_c[i];
+  for (int i = 0; i < size; ++i) {    
    if (float_enable) {
-      printf("*** [%d]: h_a=%f\n", i, *(float*)&value);
+      float value = ((float*)h_c)[i];
+      printf("*** [%d]: %f\n", i, value);
    } else {
-      printf("*** [%d]: h_a=%d\n", i, value);
+      int value = h_c[i];
+      printf("*** [%d]: %d\n", i, value);
    }
  }
  int errors = 0;
-  for (int i = 0; i < size; ++i) {
-    int ref = h_a[i];
-    float ref_f = *(float*)&ref;
+  for (int i = 0; i < size; ++i) {    
    int pos = 0;
-    for (int j = 0; j < size; ++j) {
-      int cur = h_a[j];
-      if (float_enable) {
-        float cur_f = *(float*)&cur;
-        pos += (cur_f < ref_f) || (cur_f == ref_f && j < i);
-      } else {
+    if (float_enable) {    
+      float ref = ((float*)h_a)[i];
+      for (int j = 0; j < size; ++j) {        
+        float cur = ((float*)h_a)[j];
+        pos += (cur < ref) || (cur == ref && j < i);
+      }        
+      float value = ((float*)h_c)[pos];
+      if (value != ref) {
+        if (errors < 100) {        
+          printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref, value);
+        }        
+        ++errors;
+      }
+    } else {      
+      int ref = h_a[i];
+      for (int j = 0; j < size; ++j) {        
+        int cur = h_a[j];
        pos += (cur < ref) || (cur == ref && j < i);
      }
-    }
-    int value = h_c[pos];
-    if (value != ref) {
-      if (errors < 100) {
-        if (float_enable) {
-          printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref_f, *(float*)&value);
-        } else {
+      int value = h_c[pos];
+      if (value != ref) {
+        if (errors < 100) {        
          printf("*** error: [%d] expected=%d, actual=%d\n", pos, ref, value);
        }
+        ++errors;
      }
-      ++errors;
    }
  }
  if (0 == errors) {
--- a/tests/opencl/saxpy/main.cc
+++ b/tests/opencl/saxpy/main.cc
@ -151,16 +151,12 @@ int main(int argc, char **argv) {
  
  cl_platform_id platform_id;
  cl_device_id device_id;
+  cl_program program;
  cl_mem input_buffer;
  cl_mem output_buffer;
  size_t kernel_size;
  cl_context context;
  cl_command_queue queue;
-  cl_int binary_status = 0;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;

  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -172,19 +168,18 @@ int main(int argc, char **argv) {
  cl_kernel kernel = 0;
  cl_mem memObjects[2] = {0, 0};

-  // Create OpenCL program - first attempt to load cached binary.
-  //  If that is not available, then create the program from source
-  //  and store the binary for future use.
-  printf("create program from binary...\n");
-  cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    std::cerr << "Failed to write program binary" << std::endl;
-    Cleanup(device_id, context, queue, program, kernel, memObjects);
-    return 1;
-  } else {
-    printf("Read program from binary.\n");
-  }
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
--- a/tests/opencl/sfilter/main.cc
+++ b/tests/opencl/sfilter/main.cc
@ -149,14 +149,10 @@ int main(int argc, char **argv) {

  cl_platform_id platform_id;
  cl_device_id device_id;
+  cl_program program;
  size_t kernel_size;
-  cl_int binary_status = 0;
  uint8_t *kernel_bin = NULL;

-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
-
  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
  CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
@ -170,19 +166,18 @@ int main(int argc, char **argv) {
  cl_kernel kernel = 0;
  cl_mem memObjects[2] = {0, 0};

-  // Create OpenCL program - first attempt to load cached binary.
-  // If that is not available, then create the program from source
-  // and store the binary for future use.
-  printf("create program from binary...\n");
-  cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    std::cerr << "Failed to write program binary" << std::endl;
-    Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
-    return 1;
-  } else {
-    printf("Read program from binary.");
-  }
+  printf("Create program from kernel source\n");
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK_ERR(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
--- a/tests/opencl/sgemm/common.h
+++ b/tests/opencl/sgemm/common.h
@ -1,12 +1,8 @@
 #ifndef COMMON_H
 #define COMMON_H

-#define USE_FLOAT
-
-#ifdef USE_FLOAT
+#ifndef TYPE
 #define TYPE float
-#else
-#define TYPE int
 #endif

 #endif // COMMON_H
--- a/tests/opencl/sgemm/main.cc
+++ b/tests/opencl/sgemm/main.cc
@ -11,6 +11,8 @@

 #define KERNEL_NAME "sgemm"

+#define FLOAT_ULP 6
+
 #define CL_CHECK(_expr)                                                \
   do {                                                                \
     cl_int _err = _expr;                                              \
@ -33,6 +35,66 @@
     _ret;                                                             \
   })

+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
+  for (int m = 0; m < M; ++m) {
+    for (int n = 0; n < N; ++n) {
+      TYPE acc = 0;
+      for (int k = 0; k < K; ++k) {
+          acc += A[k * M + m] * B[n * K + k];
+      }
+      C[n * M + m] = acc;
+    }
+  }
+}*/
+
 static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
  if (nullptr == filename || nullptr == data || 0 == size)
    return -1;
@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
  return 0;
 }

-/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      TYPE acc = 0;
-      for (int k = 0; k < K; ++k) {
-          acc += A[k * M + m] * B[n * K + k];
-      }
-      C[n * M + m] = acc;
-    }
-  }
-}*/
-
-#ifdef USE_FLOAT
-static bool compare_equal(float a, float b, int ulp = 21) {
-  union fi_t { int i; float f; };
-  fi_t fa, fb;
-  fa.f = a;
-  fb.f = b;
-  return std::abs(fa.i - fb.i) <= ulp;
-}
-#else
-static bool compare_equal(int a, int b, int ulp = 21) {
-  return (a == b);
-}
-#endif
-
 cl_device_id device_id = NULL;
 cl_context context = NULL;
 cl_command_queue commandQueue = NULL;
@ -145,15 +181,12 @@ int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);

+  uint32_t num_points = size * size;
+
  cl_platform_id platform_id;
  size_t kernel_size;
-  cl_int binary_status;

  srand(50);
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
  
  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -163,18 +196,23 @@ int main (int argc, char **argv) {
  context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL,  &_err));

  // Allocate device buffers
-  size_t nbytes = size * size * sizeof(TYPE);
+  size_t nbytes = num_points * sizeof(TYPE);
  a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");
-  program = CL_CHECK2(clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
    return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -194,23 +232,17 @@ int main (int argc, char **argv) {
  h_b = (TYPE*)malloc(nbytes);
  h_c = (TYPE*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
-  for (int i = 0; i < (size * size); ++i) {
-  #ifdef USE_FLOAT
-    h_a[i] = (float)rand() / (float)RAND_MAX;
-    h_b[i] = (float)rand() / (float)RAND_MAX;
-  #else
-    h_a[i] = rand();
-    h_b[i] = rand();
-  #endif
-    h_c[i] = 0xdeadbeef;
+  // Generate input values 
+  for (uint32_t i = 0; i < num_points; ++i) {
+    h_a[i] = Comparator<TYPE>::generate();
+    h_b[i] = Comparator<TYPE>::generate();
  }

  size_t global_offset[2] = {0, 0};
  size_t global_work_size[2] = {size, size};
  size_t local_work_size[2] = {1, 1};

-  std::vector<float> ref_vec(size * size);
+  std::vector<float> ref_vec(num_points);

  // reference generation
  size_t num_groups_y = global_work_size[1] / local_work_size[1];
@ -228,12 +260,7 @@ int main (int argc, char **argv) {
          TYPE acc = 0;
          for (int k = 0; k < width; k++) {
            acc += h_a[k * width + r] * h_b[c * width + k];
-          }
-        /*#ifdef USE_FLOAT
-          printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
-        #else
-          printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
-        #endif*/                   
+          }                  
          ref_vec[c * width + r] = acc;         
        }
      }
@ -260,14 +287,8 @@ int main (int argc, char **argv) {

  printf("Verify result\n");
  int errors = 0;
-  for (int i = 0; i < (size * size); i++) {
-    if (!compare_equal(h_c[i], ref_vec[i])) {
-      if (errors < 100) 
-      #ifdef USE_FLOAT
-        printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
-      #else
-        printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
-      #endif
+  for (uint32_t i = 0; i < num_points; ++i) {
+    if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
      ++errors;
    }
  }
--- a/tests/opencl/spmv/convert_dataset.c
+++ b/tests/opencl/spmv/convert_dataset.c
@ -91,15 +91,11 @@ int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
  if ((f = fopen(mtx_filename, "r")) == NULL)
    exit(1);

-  printf("OK**\n");
-
  if (mm_read_banner(f, &matcode) != 0) {
    printf("Could not process Matrix Market banner.\n");
    exit(1);
  }

-	printf("OK**\n");
-
  /*  This is how one can screen matrix types if their application */
  /*  only supports a subset of the Matrix Market data types.      */

--- a/tests/opencl/spmv/main.cc
+++ b/tests/opencl/spmv/main.cc
@ -148,7 +148,6 @@ int main(int argc, char **argv) {
  //    &h_data, &h_indices, &h_ptr,
  //    &h_perm, &h_nzcnt);
  int col_count;
-	printf("OK--\n");
  coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
             1,                       // row padding
             pad,                     // warp size
@ -159,8 +158,6 @@ int main(int argc, char **argv) {
             &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
             &len, &nzcnt_len, &depth);

-	printf("OK++\n");
-
  //	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
  h_Ax_vector = (float *)malloc(sizeof(float) * dim);
  h_x_vector = (float *)malloc(sizeof(float) * dim);
--- a/tests/opencl/stencil/main.cc
+++ b/tests/opencl/stencil/main.cc
@ -157,9 +157,7 @@ int main(int argc, char** argv) {
 	CHECK_ERROR("clBuildProgram")

 	cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
-	CHECK_ERROR("clCreateKernel") 
-
-  printf("OK+\n");
+	CHECK_ERROR("clCreateKernel")

 	//host data
 	float *h_A0;
@ -177,15 +175,11 @@ int main(int argc, char** argv) {
 	h_Anext=(float*)malloc(sizeof(float)*size);
 	pb_SwitchToTimer(&timers, pb_TimerID_IO);
 	//FILE *fp = fopen(parameters->inpFiles[0], "rb");
-	printf("OK+\n");
 	read_data(h_A0, nx,ny,nz,NULL);
-	printf("OK+\n");
-  //fclose(fp);
-  memcpy (h_Anext,h_A0,sizeof(float)*size);
+  	//fclose(fp);
+ 	memcpy (h_Anext,h_A0,sizeof(float)*size);

 	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
-
-	printf("OK+\n");
 	
 	//memory allocation
 	d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
@ -201,18 +195,16 @@ int main(int argc, char** argv) {

 	pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);

-	printf("OK+\n");
-
 	//only use 1D thread block
-  int tx = 128;
+  	int tx = 128;
 	size_t block[3] = {tx,1,1};
 	size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
-  //size_t grid[3] = {nx-2,ny-2,nz-2};
-  size_t offset[3] = {1,1,1};
-  printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
+  	//size_t grid[3] = {nx-2,ny-2,nz-2};
+  	size_t offset[3] = {1,1,1};
+  	printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
 	printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]);
  
-  printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
+  	printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));

 	clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
 	clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
@ -226,14 +218,10 @@ int main(int argc, char** argv) {
 	//main execution
 	pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);

-	printf("OK+0\n");
-
 	int t;
 	for(t=0;t<iteration;t++)
 	{
 		clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
-		printf("OK+0\n");
-
    //printf("iteration %d\n",t)
 		CHECK_ERROR("clEnqueueNDRangeKernel")
    
@ -244,11 +232,9 @@ int main(int argc, char** argv) {
    clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
 	}

-  printf("OK+1\n");
-
-  cl_mem d_temp = d_A0;
-  d_A0 = d_Anext;
-  d_Anext = d_temp;
+	cl_mem d_temp = d_A0;
+	d_A0 = d_Anext;
+	d_Anext = d_temp;

 	pb_SwitchToTimer(&timers, pb_TimerID_COPY);
 	clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
@ -260,8 +246,6 @@ int main(int argc, char** argv) {
 	clStatus = clReleaseCommandQueue(clCommandQueue);
 	clStatus = clReleaseContext(clContext);
 	CHECK_ERROR("clReleaseContext")
-
-	printf("OK+2\n");
 
 	if (parameters->outFile) {
 		pb_SwitchToTimer(&timers, pb_TimerID_IO);
--- a/tests/opencl/vecadd/main.cc
+++ b/tests/opencl/vecadd/main.cc
@ -122,11 +122,6 @@ int main (int argc, char **argv) {
  
  cl_platform_id platform_id;
  size_t kernel_size;
-  cl_int binary_status;
-
-  // read kernel binary from file  
-  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
-    return -1;
  
  // Getting platform and device information
  CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -142,13 +137,17 @@ int main (int argc, char **argv) {
  c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));

  printf("Create program from kernel source\n");
-  cl_int _err;
-  program = clCreateProgramWithBinary(
-    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
-  if (program == NULL) {
-    cleanup();
+#ifdef HOSTGPU
+  if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
    return -1;
-  }
+  program = CL_CHECK2(clCreateProgramWithSource(
+    context, 1, (const char**)&kernel_bin, &kernel_size, &_err));  
+#else
+  if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
+    return -1;
+  program = CL_CHECK2(clCreateProgramWithBinary(
+    context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
+#endif

  // Build program
  CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -166,12 +165,10 @@ int main (int argc, char **argv) {
  h_b = (float*)malloc(nbytes);
  h_c = (float*)malloc(nbytes);	
 	
-  // Initialize values for array members.  
+  // Generate input values
  for (int i = 0; i < size; ++i) {
    h_a[i] = sinf(i)*sinf(i);
    h_b[i] = cosf(i)*cosf(i);
-    h_c[i] = 0xdeadbeef;
-    //printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
  }

  // Creating command queue
--- a/tests/regression/Makefile
+++ b/tests/regression/Makefile
@ -10,6 +10,7 @@ all:
 	$(MAKE) -C fence
 	$(MAKE) -C no_mf_ext
 	$(MAKE) -C no_smem
+	$(MAKE) -C tensor

 run-simx:
 	$(MAKE) -C basic run-simx
@ -23,6 +24,7 @@ run-simx:
 	$(MAKE) -C fence run-simx
 	$(MAKE) -C no_mf_ext run-simx
 	$(MAKE) -C no_smem run-simx
+	$(MAKE) -C tensor run-simx

 run-rtlsim:
 	$(MAKE) -C basic run-rtlsim
@ -36,6 +38,7 @@ run-rtlsim:
 	$(MAKE) -C fence run-rtlsim
 	$(MAKE) -C no_mf_ext run-rtlsim
 	$(MAKE) -C no_smem run-rtlsim
+	$(MAKE) -C tensor run-rtlsim

 run-opae:
 	$(MAKE) -C basic run-opae
@ -49,6 +52,7 @@ run-opae:
 	$(MAKE) -C fence run-opae
 	$(MAKE) -C no_mf_ext run-opae
 	$(MAKE) -C no_smem run-opae
+	$(MAKE) -C tensor run-opae

 clean:
 	$(MAKE) -C basic clean
@ -62,6 +66,7 @@ clean:
 	$(MAKE) -C fence clean
 	$(MAKE) -C no_mf_ext clean
 	$(MAKE) -C no_smem clean
+	$(MAKE) -C tensor clean

 clean-all:
 	$(MAKE) -C basic clean-all
@ -75,3 +80,4 @@ clean-all:
 	$(MAKE) -C fence clean-all
 	$(MAKE) -C no_mf_ext clean-all
 	$(MAKE) -C no_smem clean-all
+	$(MAKE) -C tensor clean-all
--- a/tests/regression/basic/main.cpp
+++ b/tests/regression/basic/main.cpp
@ -262,11 +262,8 @@ int main(int argc, char *argv[]) {

    // upload kernel argument
    std::cout << "upload kernel argument" << std::endl;
-    {
-      auto buf_ptr = (void*)staging_buf.data();
-      memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-      RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-    }
+    memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

    std::cout << "run kernel test" << std::endl;
    RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
--- a/tests/regression/common.mk
+++ b/tests/regression/common.mk
@ -1,16 +1,18 @@
 XLEN ?= 32

+TOOLDIR ?= /opt
+
 TARGET ?= opaesim

 XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
 XRT_DEVICE_INDEX ?= 0

 ifeq ($(XLEN),64)
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
 VX_CFLAGS += -march=rv64imafd -mabi=lp64d
 STARTUP_ADDR ?= 0x180000000
 else
-RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
+RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
 VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
 STARTUP_ADDR ?= 0x80000000
 endif
@ -23,7 +25,7 @@ VORTEX_KN_PATH ?= $(realpath ../../../kernel)

 FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae

-LLVM_VORTEX ?= /opt/llvm-vortex
+LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex

 LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
 LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
--- a/tests/regression/demo/common.h
+++ b/tests/regression/demo/common.h
@ -3,6 +3,10 @@

 #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

+#ifndef TYPE
+#define TYPE float
+#endif
+
 typedef struct {
  uint32_t num_tasks;
  uint32_t task_size;
@ -11,4 +15,4 @@ typedef struct {
  uint64_t dst_addr;  
 } kernel_arg_t;

-#endif
+#endif
--- a/tests/regression/demo/kernel.cpp
+++ b/tests/regression/demo/kernel.cpp
@ -4,11 +4,11 @@
 #include "common.h"

 void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
-	uint32_t count    = arg->task_size;
-	int32_t* src0_ptr = (int32_t*)arg->src0_addr;
-	int32_t* src1_ptr = (int32_t*)arg->src1_addr;
-	int32_t* dst_ptr  = (int32_t*)arg->dst_addr;
+	auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
+	auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
+	auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
 	
+	uint32_t count = arg->task_size;
 	uint32_t offset = task_id * count;

 	for (uint32_t i = 0; i < count; ++i) {
--- a/tests/regression/demo/main.cpp
+++ b/tests/regression/demo/main.cpp
@ -5,6 +5,8 @@
 #include <vortex.h>
 #include "common.h"

+#define FLOAT_ULP 6
+
 #define RT_CHECK(_expr)                                         \
   do {                                                         \
     int _ret = _expr;                                          \
@ -17,10 +19,61 @@

 ///////////////////////////////////////////////////////////////////////////////

+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+template <>
+class Comparator<float> {
+private:
+  union Float_t { float f; int i; };
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) {     
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
 const char* kernel_file = "kernel.bin";
-uint32_t count = 0;
+uint32_t count = 16;

 vx_device_h device = nullptr;
+std::vector<TYPE> source_data;
 std::vector<uint8_t> staging_buf;
 kernel_arg_t kernel_arg = {};

@ -79,13 +132,11 @@ int run_test(const kernel_arg_t& kernel_arg,
  std::cout << "verify result" << std::endl;  
  {
    int errors = 0;
-    auto buf_ptr = (int32_t*)staging_buf.data();
+    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < num_points; ++i) {
-      int ref = i + i; 
-      int cur = buf_ptr[i];
-      if (cur != ref) {
-        std::cout << "error at result #" << std::dec << i
-                  << std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
+      auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
+      auto cur = buf_ptr[i];
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
        ++errors;
      }
    }
@ -103,9 +154,7 @@ int main(int argc, char *argv[]) {
  // parse command arguments
  parse_args(argc, argv);

-  if (count == 0) {
-    count = 1;
-  }
+  std::srand(50);

  // open device connection
  std::cout << "open device connection" << std::endl;  
@ -118,8 +167,9 @@ int main(int argc, char *argv[]) {

  uint32_t num_tasks  = num_cores * num_warps * num_threads;
  uint32_t num_points = count * num_tasks;
-  uint32_t buf_size   = num_points * sizeof(int32_t);
+  uint32_t buf_size   = num_points * sizeof(TYPE);

+  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
  std::cout << "number of points: " << num_points << std::endl;
  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;

@ -147,18 +197,21 @@ int main(int argc, char *argv[]) {
  
  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+  // generate source data
+  source_data.resize(2 * num_points);
+  for (uint32_t i = 0; i < source_data.size(); ++i) {
+    source_data[i] = Comparator<TYPE>::generate();
  }

  // upload source buffer0
  {
    std::cout << "upload source buffer0" << std::endl;
-    auto buf_ptr = (int32_t*)staging_buf.data();
+    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < num_points; ++i) {
-      buf_ptr[i] = i-1;
+      buf_ptr[i] = source_data[2 * i + 0];
    }
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
  }
@ -166,23 +219,18 @@ int main(int argc, char *argv[]) {
  // upload source buffer1
  {
    std::cout << "upload source buffer1" << std::endl;
-    auto buf_ptr = (int32_t*)staging_buf.data();
+    auto buf_ptr = (TYPE*)staging_buf.data();
    for (uint32_t i = 0; i < num_points; ++i) {
-      buf_ptr[i] = i+1;
+      buf_ptr[i] = source_data[2 * i + 1];
    }   
    RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
  }

  // clear destination buffer
-  {
-    std::cout << "clear destination buffer" << std::endl;      
-    auto buf_ptr = (int32_t*)staging_buf.data();
-    for (uint32_t i = 0; i < num_points; ++i) {
-      buf_ptr[i] = 0xdeadbeef;
-    }  
-    RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));  
-  }
-
+  std::cout << "clear destination buffer" << std::endl;
+  memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
+  RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));  
+  
  // run tests
  std::cout << "run tests" << std::endl;
  RT_CHECK(run_test(kernel_arg, buf_size, num_points));
--- a/tests/regression/diverge/main.cpp
+++ b/tests/regression/diverge/main.cpp
@ -233,11 +233,8 @@ int main(int argc, char *argv[]) {
  
  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // upload source buffer
  {
--- a/tests/regression/fence/main.cpp
+++ b/tests/regression/fence/main.cpp
@ -147,11 +147,8 @@ int main(int argc, char *argv[]) {
  
  // upload kernel argument
  std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // upload source buffer0
  {
--- a/tests/regression/io_addr/main.cpp
+++ b/tests/regression/io_addr/main.cpp
@ -190,13 +190,10 @@ int main(int argc, char *argv[]) {
  staging_buf.resize(staging_buf_size);
  
  // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
  // upload test address data
  {
    std::cout << "upload test address data" << std::endl;
--- a/tests/regression/mstress/main.cpp
+++ b/tests/regression/mstress/main.cpp
@ -236,13 +236,10 @@ int main(int argc, char *argv[]) {
  staging_buf.resize(staging_buf_size);
  
  // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
  // upload source buffer0
  {
    std::cout << "upload address buffer" << std::endl;
--- a/tests/regression/no_mf_ext/main.cpp
+++ b/tests/regression/no_mf_ext/main.cpp
@ -136,13 +136,10 @@ int main(int argc, char *argv[]) {
  staging_buf.resize(alloc_size);
  
  // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
  // upload source buffer0
  {
    std::cout << "upload source buffer" << std::endl;
--- a/tests/regression/no_smem/main.cpp
+++ b/tests/regression/no_smem/main.cpp
@ -135,13 +135,10 @@ int main(int argc, char *argv[]) {
  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
  staging_buf.resize(alloc_size);
  
-  // upload kernel argument
+  // upload kernel argument  
  std::cout << "upload kernel argument" << std::endl;
-  {
-    auto buf_ptr = (int*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));

  // upload source buffer0
  {
--- a/tests/regression/printf/main.cpp
+++ b/tests/regression/printf/main.cpp
@ -110,13 +110,10 @@ int main(int argc, char *argv[]) {
  staging_buf.resize(alloc_size);
  
  // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = (void*)staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
  // upload source buffer0
  {
    std::cout << "upload source buffer" << std::endl;
--- a/tests/regression/sort/common.h
+++ b/tests/regression/sort/common.h
@ -3,11 +3,7 @@

 #define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000

-#define FP_ENABLE 
-
-#ifdef FP_ENABLE
-#define TYPE float
-#else
+#ifndef TYPE
 #define TYPE int
 #endif

@ -17,4 +13,4 @@ typedef struct {
  uint64_t dst_addr;  
 } kernel_arg_t;

-#endif
+#endif
--- a/tests/regression/sort/kernel.cpp
+++ b/tests/regression/sort/kernel.cpp
@ -5,14 +5,14 @@

 void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
 	uint32_t num_points = arg->num_points;
-	TYPE* src_ptr = (TYPE*)arg->src_addr;
-	TYPE* dst_ptr = (TYPE*)arg->dst_addr;
+	auto src_ptr = (TYPE*)arg->src_addr;
+	auto dst_ptr = (TYPE*)arg->dst_addr;

-	TYPE ref_value = src_ptr[task_id];
+	auto ref_value = src_ptr[task_id];

 	uint32_t pos = 0;
 	for (uint32_t i = 0; i < num_points; ++i) {
-		TYPE cur_value = src_ptr[i];		
+		auto cur_value = src_ptr[i];		
 		pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
 	}
 	dst_ptr[pos] = ref_value;
--- a/tests/regression/sort/main.cpp
+++ b/tests/regression/sort/main.cpp
@ -66,8 +66,8 @@ void gen_input_data(uint32_t num_points) {
  src_data.resize(num_points);

  for (uint32_t i = 0; i < num_points; ++i) {
-    float r = static_cast<float>(std::rand()) / RAND_MAX;
-    TYPE value = r * num_points;
+    auto r = static_cast<float>(std::rand()) / RAND_MAX;
+    auto value = static_cast<TYPE>(r * num_points);
    src_data[i] = value;
    std::cout << std::dec << i << ": value=" << value << std::endl;
  }  
@ -172,19 +172,16 @@ int main(int argc, char *argv[]) {
  {
    std::cout << "allocate staging buffer" << std::endl;    
    uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
-                                    std::max<uint32_t>(dst_buf_size, 
-                                      sizeof(kernel_arg_t)));
+                                  std::max<uint32_t>(dst_buf_size, 
+                                    sizeof(kernel_arg_t)));
    staging_buf.resize(staging_buf_size);
  }
  
  // upload kernel argument  
-  {
-    std::cout << "upload kernel argument" << std::endl;
-    auto buf_ptr = staging_buf.data();
-    memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
-    RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
-  }
-
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+  
  // upload source buffer
  {
    std::cout << "upload source buffer" << std::endl;
--- a/tests/regression/tensor/Makefile
+++ b/tests/regression/tensor/Makefile
@ -0,0 +1,9 @@
+PROJECT = tensor
+
+SRCS = main.cpp
+
+VX_SRCS = kernel.cpp
+
+OPTS ?= -n32
+
+include ../common.mk
--- a/tests/regression/tensor/common.h
+++ b/tests/regression/tensor/common.h
@ -0,0 +1,18 @@
+#ifndef _COMMON_H_
+#define _COMMON_H_
+
+#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
+
+#ifndef TYPE
+#define TYPE float
+#endif
+
+typedef struct {
+  uint32_t num_tasks;
+  uint32_t size;
+  uint64_t A_addr;
+  uint64_t B_addr;
+  uint64_t C_addr;  
+} kernel_arg_t;
+
+#endif
--- a/tests/regression/tensor/kernel.cpp
+++ b/tests/regression/tensor/kernel.cpp
@ -0,0 +1,41 @@
+#include <stdint.h>
+#include <vx_intrinsics.h>
+#include <vx_spawn.h>
+#include "common.h"
+
+inline char is_log2(uint32_t x) {
+    return ((x & (x-1)) == 0);
+}
+
+inline uint32_t log2_fast(uint32_t x) { 
+    return 31 - __builtin_clz (x);
+}
+
+void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
+	auto A = reinterpret_cast<TYPE*>(arg->A_addr);
+	auto B = reinterpret_cast<TYPE*>(arg->B_addr);
+	auto C = reinterpret_cast<TYPE*>(arg->C_addr);
+    auto size  = arg->size;
+
+    uint32_t row, col;
+    if (is_log2(size)) {
+        uint32_t log_size = log2_fast(size);
+        row = task_id >> log_size;
+        col = task_id & (size-1);
+    } else {
+        row = task_id / size;
+        col = task_id % size;
+    }
+
+    TYPE sum (0);
+    for (int e = 0; e < size; ++e) {
+        sum += A[row * size + e] * B[e * size + col];
+    }
+    C[row * size + col] = sum;
+}
+
+int main() {
+	kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
+	vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
+	return 0;
+}
--- a/tests/regression/tensor/main.cpp
+++ b/tests/regression/tensor/main.cpp
@ -0,0 +1,254 @@
+#include <iostream>
+#include <unistd.h>
+#include <string.h>
+#include <vector>
+#include <vortex.h>
+#include "common.h"
+
+#define FLOAT_ULP 6
+
+#define RT_CHECK(_expr)                                         \
+   do {                                                         \
+     int _ret = _expr;                                          \
+     if (0 == _ret)                                             \
+       break;                                                   \
+     printf("Error: '%s' returned %d!\n", #_expr, (int)_ret);   \
+	 cleanup();			                                              \
+     exit(-1);                                                  \
+   } while (false)
+
+///////////////////////////////////////////////////////////////////////////////
+
+template <typename Type>
+class Comparator {};
+
+template <>
+class Comparator<int> {
+public:
+  static const char* type_str() {
+    return "integer";
+  }
+  static int generate() { 
+    return rand(); 
+  }
+  static bool compare(int a, int b, int index, int errors) { 
+    if (a != b) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+template <>
+class Comparator<float> {
+public:
+  static const char* type_str() {
+    return "float";
+  }
+  static int generate() { 
+    return static_cast<float>(rand()) / RAND_MAX;
+  }
+  static bool compare(float a, float b, int index, int errors) { 
+    union fi_t { float f; int32_t i; };
+    fi_t fa, fb;
+    fa.f = a;
+    fb.f = b;
+    auto d = std::abs(fa.i - fb.i);
+    if (d > FLOAT_ULP) {
+      if (errors < 100) {
+        printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
+      }
+      return false;
+    }
+    return true;
+  }  
+};
+
+static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
+  for (uint32_t row = 0; row < height; ++row) {
+    for (uint32_t col = 0; col < width; ++col) {
+      TYPE sum(0);
+      for (uint32_t e = 0; e < width; ++e) {
+          sum += A[row * width + e] * B[e * width + col];
+      }
+      out[row * width + col] = sum;
+    }
+  }
+}
+
+const char* kernel_file = "kernel.bin";
+uint32_t size = 32;
+
+vx_device_h device = nullptr;
+std::vector<uint8_t> staging_buf;
+kernel_arg_t kernel_arg = {};
+
+static void show_usage() {
+   std::cout << "Vortex Test." << std::endl;
+   std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
+}
+
+static void parse_args(int argc, char **argv) {
+  int c;
+  while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
+    switch (c) {
+    case 'n':
+      size = atoi(optarg);
+      break;
+    case 'k':
+      kernel_file = optarg;
+      break;
+    case 'h':
+    case '?': {
+      show_usage();
+      exit(0);
+    } break;
+    default:
+      show_usage();
+      exit(-1);
+    }
+  }
+}
+
+void cleanup() {
+  if (device) {    
+    vx_mem_free(device, kernel_arg.A_addr);
+    vx_mem_free(device, kernel_arg.B_addr);
+    vx_mem_free(device, kernel_arg.C_addr);
+    vx_dev_close(device);
+  }
+}
+
+int run_test(const kernel_arg_t& kernel_arg,
+             uint32_t buf_size, 
+             const std::vector<TYPE>& refs) {              
+  // start device
+  std::cout << "start device" << std::endl;
+  RT_CHECK(vx_start(device));
+
+  // wait for completion
+  std::cout << "wait for completion" << std::endl;
+  RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
+
+  // download destination buffer
+  std::cout << "download destination buffer" << std::endl;
+  RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
+
+  // verify result
+  std::cout << "verify result" << std::endl;  
+  {
+    int errors = 0;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < refs.size(); ++i) {
+      auto ref = refs[i];
+      auto cur = buf_ptr[i];
+      if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
+        ++errors;
+      }
+    }
+    if (errors != 0) {
+      std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
+      std::cout << "FAILED!" << std::endl;
+      return 1;  
+    }
+  }
+
+  return 0;
+}
+
+int main(int argc, char *argv[]) {  
+  // parse command arguments
+  parse_args(argc, argv);
+
+  std::srand(50);
+
+  // open device connection
+  std::cout << "open device connection" << std::endl;  
+  RT_CHECK(vx_dev_open(&device));
+
+  uint32_t num_points = size * size;
+  uint32_t buf_size = num_points * sizeof(TYPE);
+
+  std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
+  std::cout << "matrix size: " << size << "x" << size << std::endl;
+  std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
+
+  // upload program
+  std::cout << "upload program" << std::endl;  
+  RT_CHECK(vx_upload_kernel_file(device, kernel_file));
+
+  // allocate device memory
+  std::cout << "allocate device memory" << std::endl;
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr));
+  RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr));
+
+  kernel_arg.num_tasks = num_points;
+  kernel_arg.size = size;
+
+  std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl;
+  std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl;
+  std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl;
+  
+  // allocate staging buffer  
+  std::cout << "allocate staging buffer" << std::endl;    
+  uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
+  staging_buf.resize(alloc_size);
+  
+  // upload kernel argument
+  std::cout << "upload kernel argument" << std::endl;
+  memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
+  RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
+
+  // generate source data
+  std::vector<TYPE> src_A(num_points);
+  std::vector<TYPE> src_B(num_points);
+  std::vector<TYPE> refs(num_points);
+  for (uint32_t i = 0; i < num_points; ++i) {
+    auto a = static_cast<float>(std::rand()) / RAND_MAX;
+    auto b = static_cast<float>(std::rand()) / RAND_MAX;
+    src_A[i] = static_cast<TYPE>(a * size);
+    src_B[i] = static_cast<TYPE>(b * size);
+  }
+  matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
+
+  // upload source buffer0
+  {
+    std::cout << "upload source buffer0" << std::endl;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = src_A[i];
+    }
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
+  }
+
+  // upload source buffer1
+  {
+    std::cout << "upload source buffer1" << std::endl;
+    auto buf_ptr = (TYPE*)staging_buf.data();
+    for (uint32_t i = 0; i < num_points; ++i) {
+      buf_ptr[i] = src_B[i];
+    }   
+    RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
+  }
+
+  // clear destination buffer
+  std::cout << "clear destination buffer" << std::endl;
+  memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
+  RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));  
+  
+  // run tests
+  std::cout << "run tests" << std::endl;
+  RT_CHECK(run_test(kernel_arg, buf_size, refs));
+
+  // cleanup
+  std::cout << "cleanup" << std::endl;  
+  cleanup();
+
+  std::cout << "PASSED!" << std::endl;
+
+  return 0;
+}
--- a/tests/unittest/common.mk
+++ b/tests/unittest/common.mk
@ -0,0 +1,30 @@
+VORTEX_RT_PATH ?= $(realpath ../../../runtime)
+
+CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
+
+CXXFLAGS += -I$(VORTEX_RT_PATH)/common
+
+# Debugigng
+ifdef DEBUG
+	CXXFLAGS += -g -O0
+else    
+	CXXFLAGS += -O2 -DNDEBUG
+endif
+
+all: $(PROJECT)
+
+$(PROJECT): $(SRCS)
+	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
+
+run:
+	./$(PROJECT)
+
+clean:
+	rm -rf $(PROJECT) *.o .depend
+
+clean-all: clean
+	rm -rf *.elf *.bin *.dump
+
+ifneq ($(MAKECMDGOALS),clean)
+    -include .depend
+endif
--- a/tests/unittest/vx_malloc/Makefile
+++ b/tests/unittest/vx_malloc/Makefile
@ -1,34 +1,5 @@
-VORTEX_RT_PATH ?= $(realpath ../../../runtime)
-
-CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
-
-CXXFLAGS += -I$(VORTEX_RT_PATH)/common
-
-# Debugigng
-ifdef DEBUG
-	CXXFLAGS += -g -O0
-else    
-	CXXFLAGS += -O2 -DNDEBUG
-endif
-
 PROJECT = vx_malloc

 SRCS = main.cpp

-all: $(PROJECT)
-
-$(PROJECT): $(SRCS)
-	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
-
-run:
-	./$(PROJECT)
-
-clean:
-	rm -rf $(PROJECT) *.o .depend
-
-clean-all: clean
-	rm -rf *.elf *.bin *.dump
-
-ifneq ($(MAKECMDGOALS),clean)
-    -include .depend
-endif
+include ../common.mk