Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop

This commit is contained in:
Blaise Tine 2023-11-18 00:27:46 -08:00
commit 11752b2562
67 changed files with 1234 additions and 956 deletions

View file

@ -38,7 +38,7 @@ jobs:
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
- make -C $HOME/build32
- XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64
- XLEN=64 make -C $HOME/build64
- stage: test
name: unittest
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
@ -47,13 +47,13 @@ jobs:
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test
name: isa64
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
- stage: test
name: regression
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test
name: regression64
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
- stage: test
name: opencl
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl

View file

@ -16,14 +16,11 @@
TOOLDIR=${TOOLDIR:=/opt}
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
export LLVM_POCL=$TOOLDIR/llvm-pocl
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
export VERILATOR_ROOT=$TOOLDIR/verilator
export PATH=$VERILATOR_ROOT/bin:$PATH
export SV2V_PATH=$TOOLDIR/sv2v
export PATH=$SV2V_PATH/bin:$PATH
export YOSYS_PATH=$TOOLDIR/yosys
export PATH=$YOSYS_PATH/bin:$PATH
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
export POCL_RT_PATH=$TOOLDIR/pocl/runtime

View file

@ -9,9 +9,6 @@ OPAE Environment Setup
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
$ export PATH=:/opt/verilator/bin:$PATH
$ export VERILATOR_ROOT=/opt/verilator
OPAE Build
------------------

View file

@ -223,18 +223,18 @@
// Number of ALU units
`ifndef NUM_ALU_LANES
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
`define NUM_ALU_LANES `NUM_THREADS
`endif
`ifndef NUM_ALU_BLOCKS
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
`define NUM_ALU_BLOCKS `ISSUE_WIDTH
`endif
// Number of FPU units
`ifndef NUM_FPU_LANES
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
`define NUM_FPU_LANES `NUM_THREADS
`endif
`ifndef NUM_FPU_BLOCKS
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
`define NUM_FPU_BLOCKS `ISSUE_WIDTH
`endif
// Number of LSU units
@ -407,7 +407,7 @@
// Number of Associative Ways
`ifndef ICACHE_NUM_WAYS
`define ICACHE_NUM_WAYS 2
`define ICACHE_NUM_WAYS 1
`endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
@ -461,7 +461,7 @@
// Number of Associative Ways
`ifndef DCACHE_NUM_WAYS
`define DCACHE_NUM_WAYS 2
`define DCACHE_NUM_WAYS 1
`endif
// SM Configurable Knobs //////////////////////////////////////////////////////
@ -520,7 +520,7 @@
// Number of Associative Ways
`ifndef L2_NUM_WAYS
`define L2_NUM_WAYS 4
`define L2_NUM_WAYS 2
`endif
// L3cache Configurable Knobs /////////////////////////////////////////////////

View file

@ -262,7 +262,7 @@ module VX_afu_wrap #(
.m_axi_awready (m_axi_mem_awready_a),
.m_axi_awaddr (m_axi_mem_awaddr_w),
.m_axi_awid (m_axi_mem_awid_a),
`UNUSED_PIN (m_axi_awlen),
.m_axi_awlen (m_axi_mem_awlen_a),
`UNUSED_PIN (m_axi_awsize),
`UNUSED_PIN (m_axi_awburst),
`UNUSED_PIN (m_axi_awlock),

View file

@ -1,190 +0,0 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_cache_define.vh"
module VX_cache_cluster_top import VX_gpu_pkg::*; #(
parameter `STRING INSTANCE_ID = "",
parameter NUM_UNITS = 2,
parameter NUM_INPUTS = 4,
parameter TAG_SEL_IDX = 0,
// Number of Word requests per cycle
parameter NUM_REQS = 4,
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways
parameter NUM_WAYS = 4,
// Size of a word in bytes
parameter WORD_SIZE = 4,
// Core Response Queue Size
parameter CRSQ_SIZE = 2,
// Miss Reserv Queue Knob
parameter MSHR_SIZE = 16,
// Memory Response Queue Size
parameter MRSQ_SIZE = 0,
// Memory Request Queue Size
parameter MREQ_SIZE = 4,
// Enable cache writeable
parameter WRITE_ENABLE = 1,
// Request debug identifier
parameter UUID_WIDTH = 0,
// core request tag size
parameter TAG_WIDTH = UUID_WIDTH + 16,
// enable bypass for non-cacheable addresses
parameter NC_ENABLE = 1,
// Core response output register
parameter CORE_OUT_REG = 2,
// Memory request output register
parameter MEM_OUT_REG = 2,
parameter NUM_CACHES = `UP(NUM_UNITS),
parameter PASSTHRU = (NUM_UNITS == 0),
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
) (
input wire clk,
input wire reset,
// PERF
`ifdef PERF_ENABLE
output cache_perf_t cache_perf,
`endif
// Core request
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
// Core response
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
// Memory request
output wire mem_req_valid,
output wire mem_req_rw,
output wire [LINE_SIZE-1:0] mem_req_byteen,
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag,
input wire mem_req_ready,
// Memory response
input wire mem_rsp_valid,
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag,
output wire mem_rsp_ready
);
VX_mem_bus_if #(
.DATA_SIZE (WORD_SIZE),
.TAG_WIDTH (TAG_WIDTH)
) core_bus_if[NUM_INPUTS * NUM_REQS]();
VX_mem_bus_if #(
.DATA_SIZE (LINE_SIZE),
.TAG_WIDTH (MEM_TAG_X_WIDTH)
) mem_bus_if();
// Core request
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
end
end
// Core response
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
for (genvar r = 0; r < NUM_REQS; ++r) begin
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
end
end
// Memory request
assign mem_req_valid = mem_bus_if.req_valid;
assign mem_req_rw = mem_bus_if.req_data.rw;
assign mem_req_byteen = mem_bus_if.req_data.byteen;
assign mem_req_addr = mem_bus_if.req_data.addr;
assign mem_req_data = mem_bus_if.req_data.data;
assign mem_req_tag = mem_bus_if.req_data.tag;
assign mem_bus_if.req_ready = mem_req_ready;
// Memory response
assign mem_bus_if.rsp_valid = mem_rsp_valid;
assign mem_bus_if.rsp_data.data = mem_rsp_data;
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
assign mem_rsp_ready = mem_bus_if.rsp_ready;
VX_cache_cluster #(
.INSTANCE_ID (INSTANCE_ID),
.NUM_UNITS (NUM_UNITS),
.NUM_INPUTS (NUM_INPUTS),
.TAG_SEL_IDX (TAG_SEL_IDX),
.NUM_REQS (NUM_REQS),
.CACHE_SIZE (CACHE_SIZE),
.LINE_SIZE (LINE_SIZE),
.NUM_BANKS (NUM_BANKS),
.NUM_WAYS (NUM_WAYS),
.WORD_SIZE (WORD_SIZE),
.CRSQ_SIZE (CRSQ_SIZE),
.MSHR_SIZE (MSHR_SIZE),
.MRSQ_SIZE (MRSQ_SIZE),
.MREQ_SIZE (MREQ_SIZE),
.WRITE_ENABLE (WRITE_ENABLE),
.UUID_WIDTH (UUID_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.NC_ENABLE (NC_ENABLE),
.CORE_OUT_REG (CORE_OUT_REG),
.MEM_OUT_REG (MEM_OUT_REG)
) cache (
`ifdef PERF_ENABLE
.cache_perf (cache_perf),
`endif
.clk (clk),
.reset (reset),
.core_bus_if (core_bus_if),
.mem_bus_if (mem_bus_if)
);
endmodule

View file

@ -93,7 +93,7 @@ module VX_cache_data #(
assign wren = fill;
end
wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
VX_onehot_encoder #(
.N (NUM_WAYS)

View file

@ -22,7 +22,7 @@ module VX_cache_top #(
// Size of cache in bytes
parameter CACHE_SIZE = 16384,
// Size of line inside a bank in bytes
parameter LINE_SIZE = 16,
parameter LINE_SIZE = 64,
// Number of banks
parameter NUM_BANKS = 4,
// Number of associative ways

View file

@ -267,7 +267,7 @@ module VX_core import VX_gpu_pkg::*; #(
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
wire perf_icache_pending_read_cycle;
wire [1:0] perf_icache_pending_read_cycle;
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;

View file

@ -220,8 +220,13 @@ module VX_muldiv_unit #(
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
for (genvar i = 0; i < NUM_LANES; ++i) begin
`ifdef XLEN_64
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
`else
assign div_in1[i] = execute_if.data.rs1_data[i];
assign div_in2[i] = execute_if.data.rs2_data[i];
`endif
end
`ifdef IDIV_DPI

View file

@ -38,9 +38,12 @@ module VX_operands import VX_gpu_pkg::*; #(
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
@ -160,11 +163,8 @@ module VX_operands import VX_gpu_pkg::*; #(
end
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
if (writeback_if[i].data.sop) begin
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
end else begin
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
end
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
end
end
end
@ -175,7 +175,6 @@ module VX_operands import VX_gpu_pkg::*; #(
gpr_rd_rid <= '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}};
cache_reg <= '0;
data_ready <= 0;
end else begin
state <= state_n;

View file

@ -107,6 +107,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
.ready_out (scoreboard_if[i].ready)
);
`ifdef SIMULATION
reg [31:0] timeout_ctr;
always @(posedge clk) begin
@ -134,6 +135,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
`endif
end
endmodule

View file

@ -170,7 +170,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.NUM_INPUTS (RSP_ARB_SIZE),
.DATAW (RSP_ARB_DATAW),
.ARBITER ("R"),
.OUT_REG (1)
.OUT_REG (3)
) rsp_arb (
.clk (clk),
.reset (commit_reset),
@ -186,7 +186,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_REG (3)
.OUT_REG (1)
) gather_unit (
.clk (clk),
.reset (commit_reset),

View file

@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
localparam MAN_BITS = 23;
localparam EXP_BITS = 8;
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
// Use 32-bit integer
localparam MAX_INT_WIDTH = 32;
localparam INT_WIDTH = 32;
// The internal mantissa includes normal bit or an entire integer
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
// The internal exponent must be able to represent the smallest denormal input value as signed
// or the number of bits in an integer
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
// shift amount for denormalization
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
// Input processing
@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.EXP_BITS (EXP_BITS),
.MAN_BITS (MAN_BITS)
) fp_class (
.exp_i (dataa[i][30:23]),
.man_i (dataa[i][22:0]),
.exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
.man_i (dataa[i][MAN_BITS-1:0]),
.clss_o (fclass[i])
);
end
@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] input_sign;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [INT_MAN_WIDTH-1:0] int_mantissa;
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
wire fmt_sign = dataa[i][31];
wire int_sign = dataa[i][31] && is_signed;
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
wire i2f_sign = dataa[i][INT_WIDTH-1];
wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
end
// Pipeline stage0
wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0;
wire unsigned_s0;
wire [2:0] rnd_mode_s0;
wire valid_in_s0;
wire [NUM_LANES-1:0] lane_mask_s0;
wire [TAGW-1:0] tag_in_s0;
wire is_itof_s0;
wire is_signed_s0;
wire [2:0] rnd_mode_s0;
fclass_t [NUM_LANES-1:0] fclass_s0;
wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0] input_sign_s0;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
.data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
);
// Normalization
@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
// Unbias exponent and compensate for shift
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
end
// Pipeline stage1
wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1;
wire unsigned_s1;
wire [2:0] rnd_mode_s1;
wire valid_in_s1;
wire [NUM_LANES-1:0] lane_mask_s1;
wire [TAGW-1:0] tag_in_s1;
wire is_itof_s1;
wire is_signed_s1;
wire [2:0] rnd_mode_s1;
fclass_t [NUM_LANES-1:0] fclass_s1;
wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0] input_sign_s1;
wire [NUM_LANES-1:0] mant_is_zero_s1;
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
);
// Perform adjustments to mantissa and exponent
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
wire [NUM_LANES-1:0] of_before_round_s1;
wire [NUM_LANES-1:0] of_before_round_s1;
for (genvar i = 0; i < NUM_LANES; ++i) begin
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
reg of_before_round_tmp_s1;
wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
always @(*) begin
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
denorm_shamt_s1 = '0;
of_before_round_tmp_s1 = 1'b0;
if (is_itof_s1) begin
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
// Overflow or infinities (for proper rounding)
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
preshift_mant_s1 = ~0; // largest normal value and RS bits set
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
// Limit the shift to retain sticky bits
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
// Denormalize underflowing values
final_exp_tmp_s1 = '0; // denormal result
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
end
if (overflow) begin
denorm_shamt_q = '0;
end else if (underflow) begin
denorm_shamt_q = INT_WIDTH+1;
end else begin
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
// overflow: when converting to unsigned the range is larger by one
of_before_round_tmp_s1 = 1'b1;
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
// underflow
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
end else begin
// By default right shift mantissa to be an integer
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
end
denorm_shamt_q = denorm_shamt;
end
end
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
assign final_exp_s1[i] = final_exp_tmp_s1;
assign of_before_round_s1[i] = of_before_round_tmp_s1;
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
assign of_before_round_s1[i] = overflow;
end
// Pipeline stage2
wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2;
wire unsigned_s2;
wire [2:0] rnd_mode_s2;
wire valid_in_s2;
wire [NUM_LANES-1:0] lane_mask_s2;
wire [TAGW-1:0] tag_in_s2;
wire is_itof_s2;
wire is_signed_s2;
wire [2:0] rnd_mode_s2;
fclass_t [NUM_LANES-1:0] fclass_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0] mant_is_zero_s2;
wire [NUM_LANES-1:0] input_sign_s2;
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
wire [NUM_LANES-1:0] of_before_round_s2;
wire [NUM_LANES-1:0] of_before_round_s2;
VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
);
wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
wire [NUM_LANES-1:0] rounded_sign_s2;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
// Rouding and classification
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2;
wire [31:0] fmt_pre_round_abs_s2;
wire [31:0] pre_round_abs_s2;
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
wire [1:0] round_sticky_bits_s2;
wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
wire [INT_WIDTH-1:0] pre_round_abs_s2;
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
// Extract final mantissa and round bit, discard the normal bit (for FP)
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
// Collapse sticky bits
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
// select RS bits for destination operation
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
// Pack exponent and mantissa into proper rounding form
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0] lane_mask_s3;
wire [TAGW-1:0] tag_in_s3;
wire is_itof_s3;
wire unsigned_s3;
wire is_signed_s3;
fclass_t [NUM_LANES-1:0] fclass_s3;
wire [NUM_LANES-1:0] mant_is_zero_s3;
wire [NUM_LANES-1:0] input_sign_s3;
wire [NUM_LANES-1:0] rounded_sign_s3;
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
wire [NUM_LANES-1:0] of_before_round_s3;
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
VX_pipe_register #(
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
@ -339,105 +304,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
.clk (clk),
.reset (reset),
.enable (~stall),
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
);
wire [NUM_LANES-1:0] of_after_round_s3;
wire [NUM_LANES-1:0] uf_after_round_s3;
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble regular result, nan box short ones. Int zeroes need to be detected
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Classification after rounding select by destination format
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
// Negative integer result needs to be brought into two's complement
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
end
// FP Special case handling
// F2I Special case handling
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
wire [NUM_LANES-1:0] fp_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Detect special case from source format, I2F casts don't produce a special result
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
// Signalling input NaNs raise invalid flag, otherwise no flags set
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
// Assemble result according to destination format
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
end
// INT Special case handling
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
fflags_t [NUM_LANES-1:0] int_special_status_s3;
wire [NUM_LANES-1:0] int_result_is_special_s3;
reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
wire [NUM_LANES-1:0] f2i_result_is_special_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
// Assemble result according to destination format
always @(*) begin
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
end else begin
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
end
end
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
| fclass_s3[i].is_inf
| of_before_round_s3[i]
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
| (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
// All integer special cases are invalid
assign int_special_status_s3[i] = {1'b1, 4'h0};
assign f2i_special_status_s3[i] = {1'b1, 4'h0};
end
// Result selection and Output handshake
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t fp_regular_status_s3, int_regular_status_s3;
fflags_t fp_status_s3, int_status_s3;
wire [31:0] fp_result_s3, int_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
assign fp_regular_status_s3.NX = inexact_s3;
fflags_t i2f_status_s3 = i2f_regular_status_s3;
fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
// Select output depending on special case detection
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
end
assign stall = ~ready_out && valid_out;
@ -457,7 +385,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
);
assign ready_in = ~stall;
assign has_fflags = 1'b1;
endmodule

View file

@ -54,7 +54,6 @@ module VX_fpu_rounding #(
2'b01: round_up = 1'b0; // < ulp/2 away, round down
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
2'b11: round_up = 1'b1; // > ulp/2 away, round up
default: round_up = 1'bx;
endcase
`INST_FRM_RTZ: round_up = 1'b0; // always round down
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -

View file

@ -1,6 +1,6 @@
PROJECT = VX_cache_cluster_top
PROJECT = VX_cache_top
TOP_LEVEL_ENTITY = $(PROJECT)
SRC_FILE = VX_cache_cluster.sv
SRC_FILE = $(PROJECT).sv
include ../../common.mk

View file

@ -1,6 +1,6 @@
PROJECT = VX_core_top
TOP_LEVEL_ENTITY = $(PROJECT)
SRC_FILE = VX_core.sv
SRC_FILE = $(PROJECT).sv
include ../../common.mk

View file

@ -1,10 +1,11 @@
XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif

View file

@ -56,7 +56,6 @@ PROJECT = top_modules
all: build
build: $(SRCS)
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'

View file

@ -1,17 +1,18 @@
XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)

View file

@ -51,9 +51,8 @@ inline char is_log2(int x) {
return ((x & (x-1)) == 0);
}
inline int fast_log2(int x) {
float f = x;
return (*(int*)(&f)>>23) - 127;
inline int log2_fast(int x) {
return 31 - __builtin_clz (x);
}
static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
@ -286,8 +285,8 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
// fast path handling
char isXYpow2 = is_log2(XY);
char log2XY = fast_log2(XY);
char log2X = fast_log2(X);
char log2XY = log2_fast(XY);
char log2X = log2_fast(X);
wspawn_kernel_args_t wspawn_args = {
ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X

49
tests/kernel/common.mk Normal file
View file

@ -0,0 +1,49 @@
XLEN ?= 32
TOOLDIR ?= /opt
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend

View file

@ -1,52 +1,5 @@
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = conform
SRCS = main.cpp tests.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend
include ../common.mk

View file

@ -1,52 +1,5 @@
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = fibonacci
SRCS = main.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend
include ../common.mk

View file

@ -1,52 +1,5 @@
XLEN ?= 32
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
CFLAGS += -march=rv64imafd -mabi=lp64d
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
CFLAGS += -march=rv32imaf -mabi=ilp32f
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
SIM_DIR = ../../../sim
CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
PROJECT = hello
SRCS = main.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
$(PROJECT).dump: $(PROJECT).elf
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
$(PROJECT).bin: $(PROJECT).elf
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
run-rtlsim: $(PROJECT).bin
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
run-simx: $(PROJECT).bin
$(SIM_DIR)/simx/simx $(PROJECT).bin
.depend: $(SRCS)
$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
rm -rf *.elf *.bin *.dump .depend
include ../common.mk

View file

@ -18,6 +18,7 @@ all:
$(MAKE) -C oclprintf
$(MAKE) -C blackscholes
$(MAKE) -C matmul
$(MAKE) -C convolution
run-simx:
$(MAKE) -C vecadd run-simx
@ -30,15 +31,16 @@ run-simx:
$(MAKE) -C dotproduct run-simx
$(MAKE) -C kmeans run-simx
$(MAKE) -C spmv run-simx
$(MAKE) -C cutcp run-simx
$(MAKE) -C stencil run-simx
$(MAKE) -C lbm run-simx
$(MAKE) -C oclprintf run-simx
$(MAKE) -C blackscholes run-simx
$(MAKE) -C matmul run-simx
$(MAKE) -C transpose run-simx
# $(MAKE) -C vectorhypot run-simx
# $(MAKE) -C mri-q run-simx
$(MAKE) -C convolution run-simx
$(MAKE) -C cutcp run-simx
$(MAKE) -C matmul run-simx
$(MAKE) -C vectorhypot run-simx
$(MAKE) -C mri-q run-simx
run-rtlsim:
$(MAKE) -C vecadd run-rtlsim
@ -52,12 +54,13 @@ run-rtlsim:
$(MAKE) -C kmeans run-rtlsim
$(MAKE) -C spmv run-rtlsim
$(MAKE) -C transpose run-rtlsim
$(MAKE) -C cutcp run-rtlsim
$(MAKE) -C stencil run-rtlsim
$(MAKE) -C lbm run-rtlsim
$(MAKE) -C oclprintf run-rtlsim
$(MAKE) -C blackscholes run-rtlsim
$(MAKE) -C matmul run-rtlsim
$(MAKE) -C convolution run-rtlsim
# $(MAKE) -C cutcp run-rtlsim
# $(MAKE) -C matmul run-rtlsim
# $(MAKE) -C vectorhypot run-rtlsim
# $(MAKE) -C mri-q run-rtlsim
@ -73,12 +76,13 @@ run-opae:
$(MAKE) -C kmeans run-opae
$(MAKE) -C spmv run-opae
$(MAKE) -C transpose run-opae
$(MAKE) -C cutcp run-opae
$(MAKE) -C stencil run-opae
$(MAKE) -C lbm run-opae
$(MAKE) -C oclprintf run-opae
$(MAKE) -C blackscholes run-opae
$(MAKE) -C matmul run-opae
$(MAKE) -C convolution run-opae
# $(MAKE) -C cutcp run-opae
# $(MAKE) -C matmul run-opae
# $(MAKE) -C vectorhypot run-opae
# $(MAKE) -C mri-q run-opae
@ -102,6 +106,7 @@ clean:
$(MAKE) -C oclprintf clean
$(MAKE) -C blackscholes clean
$(MAKE) -C matmul clean
$(MAKE) -C convolution clean
clean-all:
$(MAKE) -C vecadd clean-all
@ -124,3 +129,4 @@ clean-all:
$(MAKE) -C oclprintf clean-all
$(MAKE) -C blackscholes clean-all
$(MAKE) -C matmul clean-all
$(MAKE) -C convolution clean-all

View file

@ -1,4 +1,5 @@
XLEN ?= 32
TOOLDIR ?= /opt
TARGET ?= opaesim
@ -6,12 +7,12 @@ XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
@ -20,16 +21,16 @@ endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
POCL_CC_PATH ?= /opt/pocl/compiler
POCL_RT_PATH ?= /opt/pocl/runtime
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_POCL ?= /opt/llvm-vortex
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
@ -40,13 +41,12 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
CXXFLAGS += -pthread
CXXFLAGS += -I$(POCL_RT_PATH)/include
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex
ifdef HOSTGPU
CXXFLAGS += -DHOSTGPU
LDFLAGS += -lOpenCL
else
LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
endif
# Debugigng

View file

@ -0,0 +1,7 @@
PROJECT = convolution
SRCS = main.cc
OPTS ?= -n32
include ../common.mk

View file

@ -0,0 +1,32 @@
__kernel void conv3x3(__global float* output,
__global float* input,
__global float* weights,
const int width,
const int height)
{
int x = get_global_id(0);
int y = get_global_id(1);
// Adjust for padded borders
int paddedWidth = width + 2;
int paddedX = x + 1;
int paddedY = y + 1;
// Compute the convolution sum
float sum = 0.0f;
sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left
sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1]; // Top-center
sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right
sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3]; // Middle-left
sum += input[paddedY * paddedWidth + paddedX] * weights[4]; // Center
sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5]; // Middle-right
sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left
sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7]; // Bottom-center
sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right
// Store the result in the output array
output[y * width + x] = sum;
}

View file

@ -0,0 +1,258 @@
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <CL/opencl.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <chrono>
#include <vector>
#define FLOAT_ULP 6
#define KERNEL_NAME "conv3x3"
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
if (_err == CL_SUCCESS) \
break; \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} while (0)
#define CL_CHECK2(_expr) \
({ \
cl_int _err = CL_INVALID_VALUE; \
decltype(_expr) _ret = _expr; \
if (_err != CL_SUCCESS) { \
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
cleanup(); \
exit(-1); \
} \
_ret; \
})
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
FILE* fp = fopen(filename, "r");
if (NULL == fp) {
fprintf(stderr, "Failed to load kernel.");
return -1;
}
fseek(fp , 0 , SEEK_END);
long fsize = ftell(fp);
rewind(fp);
*data = (uint8_t*)malloc(fsize);
*size = fread(*data, 1, fsize, fp);
fclose(fp);
return 0;
}
static bool compare_equal(float a, float b) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
return d <= FLOAT_ULP;
}
static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
int paddedWidth = width + 2;
for (int32_t y = 0; y < height; ++y) {
for (int32_t x = 0; x < width; ++x) {
int paddedY = y + 1;
int paddedX = x + 1;
float sum = 0.0f;
for (int32_t ky = -1; ky <= 1; ++ky) {
for (int32_t kx = -1; kx <= 1; ++kx) {
int32_t iy = paddedY + ky;
int32_t ix = paddedX + kx;
float value = I[iy * paddedWidth + ix];
float weight = W[(ky + 1) * 3 + (kx + 1)];
sum += value * weight;
}
}
O[y * width + x] = sum;
}
}
}
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
cl_mem i_memobj = NULL;
cl_mem w_memobj = NULL;
cl_mem o_memobj = NULL;
uint8_t* kernel_bin = NULL;
static void cleanup() {
if (commandQueue) clReleaseCommandQueue(commandQueue);
if (kernel) clReleaseKernel(kernel);
if (program) clReleaseProgram(program);
if (i_memobj) clReleaseMemObject(i_memobj);
if (w_memobj) clReleaseMemObject(w_memobj);
if (o_memobj) clReleaseMemObject(o_memobj);
if (context) clReleaseContext(context);
if (device_id) clReleaseDevice(device_id);
if (kernel_bin) free(kernel_bin);
}
int size = 32;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
printf("Matrix size=%d\n", size);
uint32_t o_points = size * size;
uint32_t i_points = (size+2) * (size+2);
uint32_t w_points = 3 * 3;
cl_platform_id platform_id;
size_t kernel_size;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
printf("Create context\n");
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
char device_string[1024];
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
printf("Using device: %s\n", device_string);
printf("Allocate device buffers\n");
size_t i_nbytes = i_points * sizeof(float);
size_t w_nbytes = w_points * sizeof(float);
size_t o_nbytes = o_points * sizeof(float);
i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
if (program == NULL) {
cleanup();
return -1;
}
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t global_size[2] = {size, size};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
// Allocate memories for input arrays and output arrays.
std::vector<float> h_i(i_points);
std::vector<float> h_w(w_points);
std::vector<float> h_o(o_points, 0.0f);
// Generate input values
for (int32_t y = -1; y < size+1; ++y) {
for (int32_t x = -1; x < size+1; ++x) {
if (x >= 0 && x < size && y >= 0 && y < size) {
h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
} else {
h_i[(y+1) * (size+2) + (x+1)] = 0;
}
}
}
for (uint32_t i = 0; i < w_points; ++i) {
h_w[i] = static_cast<float>(rand()) / RAND_MAX;
}
// Creating command queue
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
printf("Upload source buffers\n");
CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
printf("Execute the kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
printf("Download destination buffer\n");
CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
printf("Verify result\n");
std::vector<float> ref_vec(o_points);
convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
int errors = 0;
for (uint32_t i = 0; i < o_points; ++i) {
if (!compare_equal(h_o[i], ref_vec[i])) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
++errors;
}
}
if (errors != 0) {
printf("FAILED! - %d errors\n", errors);
} else {
printf("PASSED!\n");
}
// Clean up
cleanup();
return errors;
}

View file

@ -173,14 +173,10 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
printf("OK+\n");
// Setup DEVICE datastructures
OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
printf("OK-\n");
// Initialize DEVICE datastructures
OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);

View file

@ -2,6 +2,6 @@ PROJECT = matmul
SRCS = main.cc
OPTS ?= -n16
OPTS ?= -n32
include ../common.mk

View file

@ -5,10 +5,10 @@ __kernel void matmul(__global float *A,
__local float *localA,
__local float *localB)
{
int row = get_global_id(1);
int col = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
int globalRow = get_global_id(1);
int globalCol = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
int localSize = get_local_size(0); // assuming square local size
float sum = 0.0f;
@ -16,10 +16,10 @@ __kernel void matmul(__global float *A,
// Loop over all blocks of both matrices
for (int k = 0; k < N; k += localSize) {
// Load block of matrix A to local memory
localA[localRow * localSize + localCol] = A[row * N + k + localCol];
localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
// Load block of matrix B to local memory, adjusting for column-major access
localB[localRow * localSize + localCol] = B[(k + localRow) * N + col];
localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
// Synchronize to make sure the tiles are loaded
barrier(CLK_LOCAL_MEM_FENCE);
@ -28,20 +28,20 @@ __kernel void matmul(__global float *A,
for (int j = 0; j < localSize; j++) {
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
}
// Synchronize before loading the next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[row * N + col] = sum;
C[globalRow * N + globalCol] = sum;
}
/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
/*__kernel void matmul(__global float *A,
__global float *B,
__global float *C,
const unsigned int N)
{
int globalRow = get_global_id(1);
int globalCol = get_global_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
int localRow = get_local_id(1);
int localCol = get_local_id(0);
// Static local memory declaration
__local float localA[16][16];
@ -64,9 +64,6 @@ __kernel void matmul(__global float *A,
for (int j = 0; j < 16; j++) {
sum += localA[localRow][j] * localB[j][localCol];
}
// Wait until all threads have computed before loading the next block
barrier(CLK_LOCAL_MEM_FENCE);
}
C[globalRow * N + globalCol] = sum;

View file

@ -10,6 +10,8 @@
#define LOCAL_SIZE 16
#define FLOAT_ULP 6
#define KERNEL_NAME "matmul"
#define CL_CHECK(_expr) \
@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
static bool compare_equal(float a, float b, int ulp = 21) {
union fi_t { int i; float f; };
static bool compare_equal(float a, float b) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
return std::abs(fa.i - fb.i) <= ulp;
auto d = std::abs(fa.i - fb.i);
return d <= FLOAT_ULP;
}
static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
static void matmul_cpu(float *C, float *A, float *B, int N) {
for (int i = 0; i < N; i++) {
for (int j = 0; j < N; j++) {
float sum = 0.0f;
@ -98,7 +101,7 @@ static void cleanup() {
if (kernel_bin) free(kernel_bin);
}
int size = 64;
int size = 32;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
@ -106,7 +109,7 @@ static void show_usage() {
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "fn:h?")) != -1) {
while ((c = getopt(argc, argv, "n:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
@ -127,6 +130,8 @@ int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
uint32_t num_points = size * size;
printf("Matrix size=%d\n", size);
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
@ -148,7 +153,7 @@ int main (int argc, char **argv) {
printf("Using device: %s\n", device_string);
printf("Allocate device buffers\n");
size_t nbytes = size * size * sizeof(float);
size_t nbytes = num_points * sizeof(float);
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
@ -176,8 +181,8 @@ int main (int argc, char **argv) {
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
size_t global_size[2] = {size, size};
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
@ -188,20 +193,14 @@ int main (int argc, char **argv) {
CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
// Allocate memories for input arrays and output arrays.
std::vector<float> h_a(size * size);
std::vector<float> h_b(size * size);
std::vector<float> h_c(size * size);
std::vector<float> h_a(num_points);
std::vector<float> h_b(num_points);
std::vector<float> h_c(num_points);
// Initialize values for array members.
for (int i = 0; i < (size * size); ++i) {
#ifdef USE_FLOAT
h_a[i] = (float)rand() / (float)RAND_MAX;
h_b[i] = (float)rand() / (float)RAND_MAX;
#else
h_a[i] = rand();
h_b[i] = rand();
#endif
h_c[i] = 0xdeadbeef;
// Generate input values
for (uint32_t i = 0; i < num_points; ++i) {
h_a[i] = static_cast<float>(rand()) / RAND_MAX;
h_b[i] = static_cast<float>(rand()) / RAND_MAX;
}
// Creating command queue
@ -223,10 +222,10 @@ int main (int argc, char **argv) {
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
printf("Verify result\n");
std::vector<float> ref_vec(size * size);
matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
std::vector<float> ref_vec(num_points);
matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
int errors = 0;
for (int i = 0; i < (size * size); i++) {
for (uint32_t i = 0; i < num_points; ++i) {
if (!compare_equal(h_c[i], ref_vec[i])) {
if (errors < 100)
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);

View file

@ -106,11 +106,6 @@ int main (int argc, char **argv) {
cl_platform_id platform_id;
size_t kernel_size;
cl_int binary_status;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -124,12 +119,17 @@ int main (int argc, char **argv) {
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
cleanup();
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
}
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -143,7 +143,7 @@ int main (int argc, char **argv) {
// Allocate memories for input arrays and output arrays.
h_a = (int*)malloc(nbytes);
// Initialize values for array members.
// Generate input values
for (int i = 0; i < size; ++i) {
h_a[i] = -1 + i;
}

View file

@ -115,11 +115,6 @@ int main (int argc, char **argv) {
cl_platform_id platform_id;
size_t kernel_size;
cl_int binary_status;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -134,12 +129,17 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
cleanup();
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
}
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -155,17 +155,16 @@ int main (int argc, char **argv) {
h_a = (int*)malloc(nbytes);
h_c = (int*)malloc(nbytes);
// Initialize values for array members.
// Generate input values
for (int i = 0; i < size; ++i) {
h_c[i] = 0xdeadbeef;
if (float_enable) {
float value = sinf(i)*sinf(i);
h_a[i] = *(int*)&value;
printf("*** [%d]: h_a=%f\n", i, value);
((float*)h_a)[i] = value;
printf("*** [%d]: %f\n", i, value);
} else {
int value = size*sinf(i);
h_a[i] = value;
printf("*** [%d]: h_a=%d\n", i, value);
printf("*** [%d]: %d\n", i, value);
}
}
@ -189,38 +188,44 @@ int main (int argc, char **argv) {
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));
printf("Verify result\n");
for (int i = 0; i < size; ++i) {
int value = h_c[i];
for (int i = 0; i < size; ++i) {
if (float_enable) {
printf("*** [%d]: h_a=%f\n", i, *(float*)&value);
float value = ((float*)h_c)[i];
printf("*** [%d]: %f\n", i, value);
} else {
printf("*** [%d]: h_a=%d\n", i, value);
int value = h_c[i];
printf("*** [%d]: %d\n", i, value);
}
}
int errors = 0;
for (int i = 0; i < size; ++i) {
int ref = h_a[i];
float ref_f = *(float*)&ref;
for (int i = 0; i < size; ++i) {
int pos = 0;
for (int j = 0; j < size; ++j) {
int cur = h_a[j];
if (float_enable) {
float cur_f = *(float*)&cur;
pos += (cur_f < ref_f) || (cur_f == ref_f && j < i);
} else {
if (float_enable) {
float ref = ((float*)h_a)[i];
for (int j = 0; j < size; ++j) {
float cur = ((float*)h_a)[j];
pos += (cur < ref) || (cur == ref && j < i);
}
float value = ((float*)h_c)[pos];
if (value != ref) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref, value);
}
++errors;
}
} else {
int ref = h_a[i];
for (int j = 0; j < size; ++j) {
int cur = h_a[j];
pos += (cur < ref) || (cur == ref && j < i);
}
}
int value = h_c[pos];
if (value != ref) {
if (errors < 100) {
if (float_enable) {
printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref_f, *(float*)&value);
} else {
int value = h_c[pos];
if (value != ref) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", pos, ref, value);
}
++errors;
}
++errors;
}
}
if (0 == errors) {

View file

@ -151,16 +151,12 @@ int main(int argc, char **argv) {
cl_platform_id platform_id;
cl_device_id device_id;
cl_program program;
cl_mem input_buffer;
cl_mem output_buffer;
size_t kernel_size;
cl_context context;
cl_command_queue queue;
cl_int binary_status = 0;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -172,19 +168,18 @@ int main(int argc, char **argv) {
cl_kernel kernel = 0;
cl_mem memObjects[2] = {0, 0};
// Create OpenCL program - first attempt to load cached binary.
// If that is not available, then create the program from source
// and store the binary for future use.
printf("create program from binary...\n");
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(device_id, context, queue, program, kernel, memObjects);
return 1;
} else {
printf("Read program from binary.\n");
}
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -149,14 +149,10 @@ int main(int argc, char **argv) {
cl_platform_id platform_id;
cl_device_id device_id;
cl_program program;
size_t kernel_size;
cl_int binary_status = 0;
uint8_t *kernel_bin = NULL;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
@ -170,19 +166,18 @@ int main(int argc, char **argv) {
cl_kernel kernel = 0;
cl_mem memObjects[2] = {0, 0};
// Create OpenCL program - first attempt to load cached binary.
// If that is not available, then create the program from source
// and store the binary for future use.
printf("create program from binary...\n");
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
std::cerr << "Failed to write program binary" << std::endl;
Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
return 1;
} else {
printf("Read program from binary.");
}
printf("Create program from kernel source\n");
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK_ERR(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));

View file

@ -1,12 +1,8 @@
#ifndef COMMON_H
#define COMMON_H
#define USE_FLOAT
#ifdef USE_FLOAT
#ifndef TYPE
#define TYPE float
#else
#define TYPE int
#endif
#endif // COMMON_H

View file

@ -11,6 +11,8 @@
#define KERNEL_NAME "sgemm"
#define FLOAT_ULP 6
#define CL_CHECK(_expr) \
do { \
cl_int _err = _expr; \
@ -33,6 +35,66 @@
_ret; \
})
template <typename Type>
class Comparator {};
template <>
class Comparator<int> {
public:
static const char* type_str() {
return "integer";
}
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
}
return false;
}
return true;
}
};
template <>
class Comparator<float> {
public:
static const char* type_str() {
return "float";
}
static int generate() {
return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
}
return false;
}
return true;
}
};
/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
TYPE acc = 0;
for (int k = 0; k < K; ++k) {
acc += A[k * M + m] * B[n * K + k];
}
C[n * M + m] = acc;
}
}
}*/
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
if (nullptr == filename || nullptr == data || 0 == size)
return -1;
@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
return 0;
}
/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
for (int m = 0; m < M; ++m) {
for (int n = 0; n < N; ++n) {
TYPE acc = 0;
for (int k = 0; k < K; ++k) {
acc += A[k * M + m] * B[n * K + k];
}
C[n * M + m] = acc;
}
}
}*/
#ifdef USE_FLOAT
static bool compare_equal(float a, float b, int ulp = 21) {
union fi_t { int i; float f; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
return std::abs(fa.i - fb.i) <= ulp;
}
#else
static bool compare_equal(int a, int b, int ulp = 21) {
return (a == b);
}
#endif
cl_device_id device_id = NULL;
cl_context context = NULL;
cl_command_queue commandQueue = NULL;
@ -145,15 +181,12 @@ int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
uint32_t num_points = size * size;
cl_platform_id platform_id;
size_t kernel_size;
cl_int binary_status;
srand(50);
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -163,18 +196,23 @@ int main (int argc, char **argv) {
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
// Allocate device buffers
size_t nbytes = size * size * sizeof(TYPE);
size_t nbytes = num_points * sizeof(TYPE);
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
if (program == NULL) {
cleanup();
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
}
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -194,23 +232,17 @@ int main (int argc, char **argv) {
h_b = (TYPE*)malloc(nbytes);
h_c = (TYPE*)malloc(nbytes);
// Initialize values for array members.
for (int i = 0; i < (size * size); ++i) {
#ifdef USE_FLOAT
h_a[i] = (float)rand() / (float)RAND_MAX;
h_b[i] = (float)rand() / (float)RAND_MAX;
#else
h_a[i] = rand();
h_b[i] = rand();
#endif
h_c[i] = 0xdeadbeef;
// Generate input values
for (uint32_t i = 0; i < num_points; ++i) {
h_a[i] = Comparator<TYPE>::generate();
h_b[i] = Comparator<TYPE>::generate();
}
size_t global_offset[2] = {0, 0};
size_t global_work_size[2] = {size, size};
size_t local_work_size[2] = {1, 1};
std::vector<float> ref_vec(size * size);
std::vector<float> ref_vec(num_points);
// reference generation
size_t num_groups_y = global_work_size[1] / local_work_size[1];
@ -228,12 +260,7 @@ int main (int argc, char **argv) {
TYPE acc = 0;
for (int k = 0; k < width; k++) {
acc += h_a[k * width + r] * h_b[c * width + k];
}
/*#ifdef USE_FLOAT
printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
#else
printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
#endif*/
}
ref_vec[c * width + r] = acc;
}
}
@ -260,14 +287,8 @@ int main (int argc, char **argv) {
printf("Verify result\n");
int errors = 0;
for (int i = 0; i < (size * size); i++) {
if (!compare_equal(h_c[i], ref_vec[i])) {
if (errors < 100)
#ifdef USE_FLOAT
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
#else
printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
#endif
for (uint32_t i = 0; i < num_points; ++i) {
if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
++errors;
}
}

View file

@ -91,15 +91,11 @@ int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
if ((f = fopen(mtx_filename, "r")) == NULL)
exit(1);
printf("OK**\n");
if (mm_read_banner(f, &matcode) != 0) {
printf("Could not process Matrix Market banner.\n");
exit(1);
}
printf("OK**\n");
/* This is how one can screen matrix types if their application */
/* only supports a subset of the Matrix Market data types. */

View file

@ -148,7 +148,6 @@ int main(int argc, char **argv) {
// &h_data, &h_indices, &h_ptr,
// &h_perm, &h_nzcnt);
int col_count;
printf("OK--\n");
coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
1, // row padding
pad, // warp size
@ -159,8 +158,6 @@ int main(int argc, char **argv) {
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
&len, &nzcnt_len, &depth);
printf("OK++\n");
// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
h_Ax_vector = (float *)malloc(sizeof(float) * dim);
h_x_vector = (float *)malloc(sizeof(float) * dim);

View file

@ -157,9 +157,7 @@ int main(int argc, char** argv) {
CHECK_ERROR("clBuildProgram")
cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
CHECK_ERROR("clCreateKernel")
printf("OK+\n");
CHECK_ERROR("clCreateKernel")
//host data
float *h_A0;
@ -177,15 +175,11 @@ int main(int argc, char** argv) {
h_Anext=(float*)malloc(sizeof(float)*size);
pb_SwitchToTimer(&timers, pb_TimerID_IO);
//FILE *fp = fopen(parameters->inpFiles[0], "rb");
printf("OK+\n");
read_data(h_A0, nx,ny,nz,NULL);
printf("OK+\n");
//fclose(fp);
memcpy (h_Anext,h_A0,sizeof(float)*size);
//fclose(fp);
memcpy (h_Anext,h_A0,sizeof(float)*size);
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
printf("OK+\n");
//memory allocation
d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
@ -201,18 +195,16 @@ int main(int argc, char** argv) {
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
printf("OK+\n");
//only use 1D thread block
int tx = 128;
int tx = 128;
size_t block[3] = {tx,1,1};
size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
//size_t grid[3] = {nx-2,ny-2,nz-2};
size_t offset[3] = {1,1,1};
printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
//size_t grid[3] = {nx-2,ny-2,nz-2};
size_t offset[3] = {1,1,1};
printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]);
printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
@ -226,14 +218,10 @@ int main(int argc, char** argv) {
//main execution
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
printf("OK+0\n");
int t;
for(t=0;t<iteration;t++)
{
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
printf("OK+0\n");
//printf("iteration %d\n",t)
CHECK_ERROR("clEnqueueNDRangeKernel")
@ -244,11 +232,9 @@ int main(int argc, char** argv) {
clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
}
printf("OK+1\n");
cl_mem d_temp = d_A0;
d_A0 = d_Anext;
d_Anext = d_temp;
cl_mem d_temp = d_A0;
d_A0 = d_Anext;
d_Anext = d_temp;
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
@ -260,8 +246,6 @@ int main(int argc, char** argv) {
clStatus = clReleaseCommandQueue(clCommandQueue);
clStatus = clReleaseContext(clContext);
CHECK_ERROR("clReleaseContext")
printf("OK+2\n");
if (parameters->outFile) {
pb_SwitchToTimer(&timers, pb_TimerID_IO);

View file

@ -122,11 +122,6 @@ int main (int argc, char **argv) {
cl_platform_id platform_id;
size_t kernel_size;
cl_int binary_status;
// read kernel binary from file
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
// Getting platform and device information
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
@ -142,13 +137,17 @@ int main (int argc, char **argv) {
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
printf("Create program from kernel source\n");
cl_int _err;
program = clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
if (program == NULL) {
cleanup();
#ifdef HOSTGPU
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
return -1;
}
program = CL_CHECK2(clCreateProgramWithSource(
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
#else
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
return -1;
program = CL_CHECK2(clCreateProgramWithBinary(
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
#endif
// Build program
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
@ -166,12 +165,10 @@ int main (int argc, char **argv) {
h_b = (float*)malloc(nbytes);
h_c = (float*)malloc(nbytes);
// Initialize values for array members.
// Generate input values
for (int i = 0; i < size; ++i) {
h_a[i] = sinf(i)*sinf(i);
h_b[i] = cosf(i)*cosf(i);
h_c[i] = 0xdeadbeef;
//printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
}
// Creating command queue

View file

@ -10,6 +10,7 @@ all:
$(MAKE) -C fence
$(MAKE) -C no_mf_ext
$(MAKE) -C no_smem
$(MAKE) -C tensor
run-simx:
$(MAKE) -C basic run-simx
@ -23,6 +24,7 @@ run-simx:
$(MAKE) -C fence run-simx
$(MAKE) -C no_mf_ext run-simx
$(MAKE) -C no_smem run-simx
$(MAKE) -C tensor run-simx
run-rtlsim:
$(MAKE) -C basic run-rtlsim
@ -36,6 +38,7 @@ run-rtlsim:
$(MAKE) -C fence run-rtlsim
$(MAKE) -C no_mf_ext run-rtlsim
$(MAKE) -C no_smem run-rtlsim
$(MAKE) -C tensor run-rtlsim
run-opae:
$(MAKE) -C basic run-opae
@ -49,6 +52,7 @@ run-opae:
$(MAKE) -C fence run-opae
$(MAKE) -C no_mf_ext run-opae
$(MAKE) -C no_smem run-opae
$(MAKE) -C tensor run-opae
clean:
$(MAKE) -C basic clean
@ -62,6 +66,7 @@ clean:
$(MAKE) -C fence clean
$(MAKE) -C no_mf_ext clean
$(MAKE) -C no_smem clean
$(MAKE) -C tensor clean
clean-all:
$(MAKE) -C basic clean-all
@ -75,3 +80,4 @@ clean-all:
$(MAKE) -C fence clean-all
$(MAKE) -C no_mf_ext clean-all
$(MAKE) -C no_smem clean-all
$(MAKE) -C tensor clean-all

View file

@ -262,11 +262,8 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (void*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
std::cout << "run kernel test" << std::endl;
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));

View file

@ -1,16 +1,18 @@
XLEN ?= 32
TOOLDIR ?= /opt
TARGET ?= opaesim
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
XRT_DEVICE_INDEX ?= 0
ifeq ($(XLEN),64)
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
STARTUP_ADDR ?= 0x180000000
else
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
STARTUP_ADDR ?= 0x80000000
endif
@ -23,7 +25,7 @@ VORTEX_KN_PATH ?= $(realpath ../../../kernel)
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
LLVM_VORTEX ?= /opt/llvm-vortex
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)

View file

@ -3,6 +3,10 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif
typedef struct {
uint32_t num_tasks;
uint32_t task_size;
@ -11,4 +15,4 @@ typedef struct {
uint64_t dst_addr;
} kernel_arg_t;
#endif
#endif

View file

@ -4,11 +4,11 @@
#include "common.h"
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t count = arg->task_size;
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
uint32_t count = arg->task_size;
uint32_t offset = task_id * count;
for (uint32_t i = 0; i < count; ++i) {

View file

@ -5,6 +5,8 @@
#include <vortex.h>
#include "common.h"
#define FLOAT_ULP 6
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
@ -17,10 +19,61 @@
///////////////////////////////////////////////////////////////////////////////
template <typename Type>
class Comparator {};
template <>
class Comparator<int> {
public:
static const char* type_str() {
return "integer";
}
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
}
return false;
}
return true;
}
};
template <>
class Comparator<float> {
private:
union Float_t { float f; int i; };
public:
static const char* type_str() {
return "float";
}
static int generate() {
return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
}
return false;
}
return true;
}
};
const char* kernel_file = "kernel.bin";
uint32_t count = 0;
uint32_t count = 16;
vx_device_h device = nullptr;
std::vector<TYPE> source_data;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
@ -79,13 +132,11 @@ int run_test(const kernel_arg_t& kernel_arg,
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (int32_t*)staging_buf.data();
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
int ref = i + i;
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at result #" << std::dec << i
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
@ -103,9 +154,7 @@ int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
if (count == 0) {
count = 1;
}
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
@ -118,8 +167,9 @@ int main(int argc, char *argv[]) {
uint32_t num_tasks = num_cores * num_warps * num_threads;
uint32_t num_points = count * num_tasks;
uint32_t buf_size = num_points * sizeof(int32_t);
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "number of points: " << num_points << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
@ -147,18 +197,21 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
source_data.resize(2 * num_points);
for (uint32_t i = 0; i < source_data.size(); ++i) {
source_data[i] = Comparator<TYPE>::generate();
}
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i-1;
buf_ptr[i] = source_data[2 * i + 0];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
}
@ -166,23 +219,18 @@ int main(int argc, char *argv[]) {
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = i+1;
buf_ptr[i] = source_data[2 * i + 1];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
{
std::cout << "clear destination buffer" << std::endl;
auto buf_ptr = (int32_t*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = 0xdeadbeef;
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
}
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, num_points));

View file

@ -233,11 +233,8 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer
{

View file

@ -147,11 +147,8 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{

View file

@ -190,13 +190,10 @@ int main(int argc, char *argv[]) {
staging_buf.resize(staging_buf_size);
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload test address data
{
std::cout << "upload test address data" << std::endl;

View file

@ -236,13 +236,10 @@ int main(int argc, char *argv[]) {
staging_buf.resize(staging_buf_size);
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload address buffer" << std::endl;

View file

@ -136,13 +136,10 @@ int main(int argc, char *argv[]) {
staging_buf.resize(alloc_size);
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;

View file

@ -135,13 +135,10 @@ int main(int argc, char *argv[]) {
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
auto buf_ptr = (int*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{

View file

@ -110,13 +110,10 @@ int main(int argc, char *argv[]) {
staging_buf.resize(alloc_size);
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = (void*)staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer0
{
std::cout << "upload source buffer" << std::endl;

View file

@ -3,11 +3,7 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#define FP_ENABLE
#ifdef FP_ENABLE
#define TYPE float
#else
#ifndef TYPE
#define TYPE int
#endif
@ -17,4 +13,4 @@ typedef struct {
uint64_t dst_addr;
} kernel_arg_t;
#endif
#endif

View file

@ -5,14 +5,14 @@
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
uint32_t num_points = arg->num_points;
TYPE* src_ptr = (TYPE*)arg->src_addr;
TYPE* dst_ptr = (TYPE*)arg->dst_addr;
auto src_ptr = (TYPE*)arg->src_addr;
auto dst_ptr = (TYPE*)arg->dst_addr;
TYPE ref_value = src_ptr[task_id];
auto ref_value = src_ptr[task_id];
uint32_t pos = 0;
for (uint32_t i = 0; i < num_points; ++i) {
TYPE cur_value = src_ptr[i];
auto cur_value = src_ptr[i];
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
}
dst_ptr[pos] = ref_value;

View file

@ -66,8 +66,8 @@ void gen_input_data(uint32_t num_points) {
src_data.resize(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
float r = static_cast<float>(std::rand()) / RAND_MAX;
TYPE value = r * num_points;
auto r = static_cast<float>(std::rand()) / RAND_MAX;
auto value = static_cast<TYPE>(r * num_points);
src_data[i] = value;
std::cout << std::dec << i << ": value=" << value << std::endl;
}
@ -172,19 +172,16 @@ int main(int argc, char *argv[]) {
{
std::cout << "allocate staging buffer" << std::endl;
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
std::max<uint32_t>(dst_buf_size,
sizeof(kernel_arg_t)));
staging_buf.resize(staging_buf_size);
}
// upload kernel argument
{
std::cout << "upload kernel argument" << std::endl;
auto buf_ptr = staging_buf.data();
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
}
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// upload source buffer
{
std::cout << "upload source buffer" << std::endl;

View file

@ -0,0 +1,9 @@
PROJECT = tensor
SRCS = main.cpp
VX_SRCS = kernel.cpp
OPTS ?= -n32
include ../common.mk

View file

@ -0,0 +1,18 @@
#ifndef _COMMON_H_
#define _COMMON_H_
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
#ifndef TYPE
#define TYPE float
#endif
typedef struct {
uint32_t num_tasks;
uint32_t size;
uint64_t A_addr;
uint64_t B_addr;
uint64_t C_addr;
} kernel_arg_t;
#endif

View file

@ -0,0 +1,41 @@
#include <stdint.h>
#include <vx_intrinsics.h>
#include <vx_spawn.h>
#include "common.h"
inline char is_log2(uint32_t x) {
return ((x & (x-1)) == 0);
}
inline uint32_t log2_fast(uint32_t x) {
return 31 - __builtin_clz (x);
}
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
auto size = arg->size;
uint32_t row, col;
if (is_log2(size)) {
uint32_t log_size = log2_fast(size);
row = task_id >> log_size;
col = task_id & (size-1);
} else {
row = task_id / size;
col = task_id % size;
}
TYPE sum (0);
for (int e = 0; e < size; ++e) {
sum += A[row * size + e] * B[e * size + col];
}
C[row * size + col] = sum;
}
int main() {
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
return 0;
}

View file

@ -0,0 +1,254 @@
#include <iostream>
#include <unistd.h>
#include <string.h>
#include <vector>
#include <vortex.h>
#include "common.h"
#define FLOAT_ULP 6
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
cleanup(); \
exit(-1); \
} while (false)
///////////////////////////////////////////////////////////////////////////////
template <typename Type>
class Comparator {};
template <>
class Comparator<int> {
public:
static const char* type_str() {
return "integer";
}
static int generate() {
return rand();
}
static bool compare(int a, int b, int index, int errors) {
if (a != b) {
if (errors < 100) {
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
}
return false;
}
return true;
}
};
template <>
class Comparator<float> {
public:
static const char* type_str() {
return "float";
}
static int generate() {
return static_cast<float>(rand()) / RAND_MAX;
}
static bool compare(float a, float b, int index, int errors) {
union fi_t { float f; int32_t i; };
fi_t fa, fb;
fa.f = a;
fb.f = b;
auto d = std::abs(fa.i - fb.i);
if (d > FLOAT_ULP) {
if (errors < 100) {
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
}
return false;
}
return true;
}
};
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
for (uint32_t row = 0; row < height; ++row) {
for (uint32_t col = 0; col < width; ++col) {
TYPE sum(0);
for (uint32_t e = 0; e < width; ++e) {
sum += A[row * width + e] * B[e * width + col];
}
out[row * width + col] = sum;
}
}
}
const char* kernel_file = "kernel.bin";
uint32_t size = 32;
vx_device_h device = nullptr;
std::vector<uint8_t> staging_buf;
kernel_arg_t kernel_arg = {};
static void show_usage() {
std::cout << "Vortex Test." << std::endl;
std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'k':
kernel_file = optarg;
break;
case 'h':
case '?': {
show_usage();
exit(0);
} break;
default:
show_usage();
exit(-1);
}
}
}
void cleanup() {
if (device) {
vx_mem_free(device, kernel_arg.A_addr);
vx_mem_free(device, kernel_arg.B_addr);
vx_mem_free(device, kernel_arg.C_addr);
vx_dev_close(device);
}
}
int run_test(const kernel_arg_t& kernel_arg,
uint32_t buf_size,
const std::vector<TYPE>& refs) {
// start device
std::cout << "start device" << std::endl;
RT_CHECK(vx_start(device));
// wait for completion
std::cout << "wait for completion" << std::endl;
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
// download destination buffer
std::cout << "download destination buffer" << std::endl;
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
// verify result
std::cout << "verify result" << std::endl;
{
int errors = 0;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < refs.size(); ++i) {
auto ref = refs[i];
auto cur = buf_ptr[i];
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
++errors;
}
}
if (errors != 0) {
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
std::cout << "FAILED!" << std::endl;
return 1;
}
}
return 0;
}
int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
std::srand(50);
// open device connection
std::cout << "open device connection" << std::endl;
RT_CHECK(vx_dev_open(&device));
uint32_t num_points = size * size;
uint32_t buf_size = num_points * sizeof(TYPE);
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
std::cout << "matrix size: " << size << "x" << size << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// upload program
std::cout << "upload program" << std::endl;
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
// allocate device memory
std::cout << "allocate device memory" << std::endl;
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr));
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr));
kernel_arg.num_tasks = num_points;
kernel_arg.size = size;
std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl;
std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl;
std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl;
// allocate staging buffer
std::cout << "allocate staging buffer" << std::endl;
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
staging_buf.resize(alloc_size);
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
// generate source data
std::vector<TYPE> src_A(num_points);
std::vector<TYPE> src_B(num_points);
std::vector<TYPE> refs(num_points);
for (uint32_t i = 0; i < num_points; ++i) {
auto a = static_cast<float>(std::rand()) / RAND_MAX;
auto b = static_cast<float>(std::rand()) / RAND_MAX;
src_A[i] = static_cast<TYPE>(a * size);
src_B[i] = static_cast<TYPE>(b * size);
}
matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
// upload source buffer0
{
std::cout << "upload source buffer0" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_A[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
}
// upload source buffer1
{
std::cout << "upload source buffer1" << std::endl;
auto buf_ptr = (TYPE*)staging_buf.data();
for (uint32_t i = 0; i < num_points; ++i) {
buf_ptr[i] = src_B[i];
}
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
}
// clear destination buffer
std::cout << "clear destination buffer" << std::endl;
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));
// run tests
std::cout << "run tests" << std::endl;
RT_CHECK(run_test(kernel_arg, buf_size, refs));
// cleanup
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "PASSED!" << std::endl;
return 0;
}

30
tests/unittest/common.mk Normal file
View file

@ -0,0 +1,30 @@
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/common
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
all: $(PROJECT)
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run:
./$(PROJECT)
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif

View file

@ -1,34 +1,5 @@
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
CXXFLAGS += -I$(VORTEX_RT_PATH)/common
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
else
CXXFLAGS += -O2 -DNDEBUG
endif
PROJECT = vx_malloc
SRCS = main.cpp
all: $(PROJECT)
$(PROJECT): $(SRCS)
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
run:
./$(PROJECT)
clean:
rm -rf $(PROJECT) *.o .depend
clean-all: clean
rm -rf *.elf *.bin *.dump
ifneq ($(MAKECMDGOALS),clean)
-include .depend
endif
include ../common.mk