mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop
This commit is contained in:
commit
11752b2562
67 changed files with 1234 additions and 956 deletions
|
@ -38,7 +38,7 @@ jobs:
|
|||
- rm -rf $HOME/build32 && cp -r $PWD $HOME/build32
|
||||
- rm -rf $HOME/build64 && cp -r $PWD $HOME/build64
|
||||
- make -C $HOME/build32
|
||||
- XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain make -C $HOME/build64
|
||||
- XLEN=64 make -C $HOME/build64
|
||||
- stage: test
|
||||
name: unittest
|
||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --unittest
|
||||
|
@ -47,13 +47,13 @@ jobs:
|
|||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --isa
|
||||
- stage: test
|
||||
name: isa64
|
||||
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --isa
|
||||
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --isa
|
||||
- stage: test
|
||||
name: regression
|
||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --regression
|
||||
- stage: test
|
||||
name: regression64
|
||||
script: cp -r $HOME/build64 build && cd build && XLEN=64 RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv64-gnu-toolchain ./ci/travis_run.py ./ci/regression.sh --regression
|
||||
script: cp -r $HOME/build64 build && cd build && XLEN=64 ./ci/travis_run.py ./ci/regression.sh --regression
|
||||
- stage: test
|
||||
name: opencl
|
||||
script: cp -r $HOME/build32 build && cd build && ./ci/travis_run.py ./ci/regression.sh --opencl
|
||||
|
|
|
@ -16,14 +16,11 @@
|
|||
|
||||
TOOLDIR=${TOOLDIR:=/opt}
|
||||
|
||||
export RISCV_TOOLCHAIN_PATH=$TOOLDIR/riscv-gnu-toolchain
|
||||
export LLVM_POCL=$TOOLDIR/llvm-pocl
|
||||
export LLVM_VORTEX=$TOOLDIR/llvm-vortex
|
||||
export VERILATOR_ROOT=$TOOLDIR/verilator
|
||||
export PATH=$VERILATOR_ROOT/bin:$PATH
|
||||
|
||||
export SV2V_PATH=$TOOLDIR/sv2v
|
||||
export PATH=$SV2V_PATH/bin:$PATH
|
||||
|
||||
export YOSYS_PATH=$TOOLDIR/yosys
|
||||
export PATH=$YOSYS_PATH/bin:$PATH
|
||||
export POCL_CC_PATH=$TOOLDIR/pocl/compiler
|
||||
export POCL_RT_PATH=$TOOLDIR/pocl/runtime
|
||||
|
|
|
@ -9,9 +9,6 @@ OPAE Environment Setup
|
|||
$ export C_INCLUDE_PATH=$OPAE_HOME/include:$C_INCLUDE_PATH
|
||||
$ export LIBRARY_PATH=$OPAE_HOME/lib:$LIBRARY_PATH
|
||||
$ export LD_LIBRARY_PATH=$OPAE_HOME/lib:$LD_LIBRARY_PATH
|
||||
$ export RISCV_TOOLCHAIN_PATH=/opt/riscv-gnu-toolchain
|
||||
$ export PATH=:/opt/verilator/bin:$PATH
|
||||
$ export VERILATOR_ROOT=/opt/verilator
|
||||
|
||||
OPAE Build
|
||||
------------------
|
||||
|
|
|
@ -223,18 +223,18 @@
|
|||
|
||||
// Number of ALU units
|
||||
`ifndef NUM_ALU_LANES
|
||||
`define NUM_ALU_LANES `UP(`NUM_THREADS / 2)
|
||||
`define NUM_ALU_LANES `NUM_THREADS
|
||||
`endif
|
||||
`ifndef NUM_ALU_BLOCKS
|
||||
`define NUM_ALU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
||||
`define NUM_ALU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
// Number of FPU units
|
||||
`ifndef NUM_FPU_LANES
|
||||
`define NUM_FPU_LANES `UP(`NUM_THREADS / 2)
|
||||
`define NUM_FPU_LANES `NUM_THREADS
|
||||
`endif
|
||||
`ifndef NUM_FPU_BLOCKS
|
||||
`define NUM_FPU_BLOCKS `UP(`ISSUE_WIDTH / 1)
|
||||
`define NUM_FPU_BLOCKS `ISSUE_WIDTH
|
||||
`endif
|
||||
|
||||
// Number of LSU units
|
||||
|
@ -407,7 +407,7 @@
|
|||
|
||||
// Number of Associative Ways
|
||||
`ifndef ICACHE_NUM_WAYS
|
||||
`define ICACHE_NUM_WAYS 2
|
||||
`define ICACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// Dcache Configurable Knobs //////////////////////////////////////////////////
|
||||
|
@ -461,7 +461,7 @@
|
|||
|
||||
// Number of Associative Ways
|
||||
`ifndef DCACHE_NUM_WAYS
|
||||
`define DCACHE_NUM_WAYS 2
|
||||
`define DCACHE_NUM_WAYS 1
|
||||
`endif
|
||||
|
||||
// SM Configurable Knobs //////////////////////////////////////////////////////
|
||||
|
@ -520,7 +520,7 @@
|
|||
|
||||
// Number of Associative Ways
|
||||
`ifndef L2_NUM_WAYS
|
||||
`define L2_NUM_WAYS 4
|
||||
`define L2_NUM_WAYS 2
|
||||
`endif
|
||||
|
||||
// L3cache Configurable Knobs /////////////////////////////////////////////////
|
||||
|
|
|
@ -262,7 +262,7 @@ module VX_afu_wrap #(
|
|||
.m_axi_awready (m_axi_mem_awready_a),
|
||||
.m_axi_awaddr (m_axi_mem_awaddr_w),
|
||||
.m_axi_awid (m_axi_mem_awid_a),
|
||||
`UNUSED_PIN (m_axi_awlen),
|
||||
.m_axi_awlen (m_axi_mem_awlen_a),
|
||||
`UNUSED_PIN (m_axi_awsize),
|
||||
`UNUSED_PIN (m_axi_awburst),
|
||||
`UNUSED_PIN (m_axi_awlock),
|
||||
|
|
190
hw/rtl/cache/VX_cache_cluster_top.sv
vendored
190
hw/rtl/cache/VX_cache_cluster_top.sv
vendored
|
@ -1,190 +0,0 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_cluster_top import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
|
||||
parameter NUM_UNITS = 2,
|
||||
parameter NUM_INPUTS = 4,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
// Number of associative ways
|
||||
parameter NUM_WAYS = 4,
|
||||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = 4,
|
||||
|
||||
// Core Response Queue Size
|
||||
parameter CRSQ_SIZE = 2,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 16,
|
||||
// Memory Response Queue Size
|
||||
parameter MRSQ_SIZE = 0,
|
||||
// Memory Request Queue Size
|
||||
parameter MREQ_SIZE = 4,
|
||||
|
||||
// Enable cache writeable
|
||||
parameter WRITE_ENABLE = 1,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 16,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 1,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 2,
|
||||
|
||||
// Memory request output register
|
||||
parameter MEM_OUT_REG = 2,
|
||||
|
||||
parameter NUM_CACHES = `UP(NUM_UNITS),
|
||||
parameter PASSTHRU = (NUM_UNITS == 0),
|
||||
parameter ARB_TAG_WIDTH = TAG_WIDTH + `ARB_SEL_BITS(NUM_INPUTS, NUM_CACHES),
|
||||
parameter MEM_TAG_WIDTH = PASSTHRU ? (NC_ENABLE ? `CACHE_NC_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_BYPASS_TAG_WIDTH(NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH)) :
|
||||
(NC_ENABLE ? `CACHE_NC_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS, NUM_REQS, LINE_SIZE, WORD_SIZE, ARB_TAG_WIDTH) :
|
||||
`CACHE_MEM_TAG_WIDTH(MSHR_SIZE, NUM_BANKS)),
|
||||
parameter MEM_TAG_X_WIDTH = MEM_TAG_WIDTH + `ARB_SEL_BITS(NUM_CACHES, 1)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
output cache_perf_t cache_perf,
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_valid,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_rw,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_ADDR_WIDTH-1:0] core_req_addr,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_req_ready,
|
||||
|
||||
// Core response
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_valid,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_rsp_data,
|
||||
output wire [NUM_INPUTS-1:0][NUM_REQS-1:0][TAG_WIDTH-1:0] core_rsp_tag,
|
||||
input wire [NUM_INPUTS-1:0][NUM_REQS-1:0] core_rsp_ready,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid,
|
||||
output wire mem_req_rw,
|
||||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_X_WIDTH-1:0] mem_req_tag,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid,
|
||||
input wire [`CS_LINE_WIDTH-1:0] mem_rsp_data,
|
||||
input wire [MEM_TAG_X_WIDTH-1:0] mem_rsp_tag,
|
||||
output wire mem_rsp_ready
|
||||
);
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (WORD_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH)
|
||||
) core_bus_if[NUM_INPUTS * NUM_REQS]();
|
||||
|
||||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (LINE_SIZE),
|
||||
.TAG_WIDTH (MEM_TAG_X_WIDTH)
|
||||
) mem_bus_if();
|
||||
|
||||
// Core request
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
||||
assign core_bus_if[i * NUM_REQS + r].req_valid = core_req_valid[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.rw = core_req_rw[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.byteen = core_req_byteen[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.addr = core_req_addr[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.data = core_req_data[i][r];
|
||||
assign core_bus_if[i * NUM_REQS + r].req_data.tag = core_req_tag[i][r];
|
||||
assign core_req_ready[i][r] = core_bus_if[i * NUM_REQS + r].req_ready;
|
||||
end
|
||||
end
|
||||
|
||||
// Core response
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin
|
||||
assign core_rsp_valid[i][r] = core_bus_if[i * NUM_REQS + r].rsp_valid;
|
||||
assign core_rsp_data[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.data;
|
||||
assign core_rsp_tag[i][r] = core_bus_if[i * NUM_REQS + r].rsp_data.tag;
|
||||
assign core_bus_if[i * NUM_REQS + r].rsp_ready = core_rsp_ready[i][r];
|
||||
end
|
||||
end
|
||||
|
||||
// Memory request
|
||||
assign mem_req_valid = mem_bus_if.req_valid;
|
||||
assign mem_req_rw = mem_bus_if.req_data.rw;
|
||||
assign mem_req_byteen = mem_bus_if.req_data.byteen;
|
||||
assign mem_req_addr = mem_bus_if.req_data.addr;
|
||||
assign mem_req_data = mem_bus_if.req_data.data;
|
||||
assign mem_req_tag = mem_bus_if.req_data.tag;
|
||||
assign mem_bus_if.req_ready = mem_req_ready;
|
||||
|
||||
// Memory response
|
||||
assign mem_bus_if.rsp_valid = mem_rsp_valid;
|
||||
assign mem_bus_if.rsp_data.data = mem_rsp_data;
|
||||
assign mem_bus_if.rsp_data.tag = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_bus_if.rsp_ready;
|
||||
|
||||
VX_cache_cluster #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.NUM_UNITS (NUM_UNITS),
|
||||
.NUM_INPUTS (NUM_INPUTS),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.CACHE_SIZE (CACHE_SIZE),
|
||||
.LINE_SIZE (LINE_SIZE),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.NUM_WAYS (NUM_WAYS),
|
||||
.WORD_SIZE (WORD_SIZE),
|
||||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.NC_ENABLE (NC_ENABLE),
|
||||
.CORE_OUT_REG (CORE_OUT_REG),
|
||||
.MEM_OUT_REG (MEM_OUT_REG)
|
||||
) cache (
|
||||
`ifdef PERF_ENABLE
|
||||
.cache_perf (cache_perf),
|
||||
`endif
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.core_bus_if (core_bus_if),
|
||||
.mem_bus_if (mem_bus_if)
|
||||
);
|
||||
|
||||
endmodule
|
2
hw/rtl/cache/VX_cache_data.sv
vendored
2
hw/rtl/cache/VX_cache_data.sv
vendored
|
@ -93,7 +93,7 @@ module VX_cache_data #(
|
|||
assign wren = fill;
|
||||
end
|
||||
|
||||
wire [`CLOG2(NUM_WAYS)-1:0] way_idx;
|
||||
wire [`LOG2UP(NUM_WAYS)-1:0] way_idx;
|
||||
|
||||
VX_onehot_encoder #(
|
||||
.N (NUM_WAYS)
|
||||
|
|
2
hw/rtl/cache/VX_cache_top.sv
vendored
2
hw/rtl/cache/VX_cache_top.sv
vendored
|
@ -22,7 +22,7 @@ module VX_cache_top #(
|
|||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 16384,
|
||||
// Size of line inside a bank in bytes
|
||||
parameter LINE_SIZE = 16,
|
||||
parameter LINE_SIZE = 64,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 4,
|
||||
// Number of associative ways
|
||||
|
|
|
@ -267,7 +267,7 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)-1:0] perf_dcache_rsp_per_cycle;
|
||||
|
||||
wire perf_icache_pending_read_cycle;
|
||||
wire [1:0] perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(DCACHE_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
|
|
|
@ -220,8 +220,13 @@ module VX_muldiv_unit #(
|
|||
wire [NUM_LANES-1:0][`XLEN-1:0] div_in2;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef XLEN_64
|
||||
assign div_in1[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs1_data[i][31]}}, execute_if.data.rs1_data[i][31:0]}: execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = is_alu_w ? {{(`XLEN-32){is_signed_op && execute_if.data.rs2_data[i][31]}}, execute_if.data.rs2_data[i][31:0]}: execute_if.data.rs2_data[i];
|
||||
`else
|
||||
assign div_in1[i] = execute_if.data.rs1_data[i];
|
||||
assign div_in2[i] = execute_if.data.rs2_data[i];
|
||||
`endif
|
||||
end
|
||||
|
||||
`ifdef IDIV_DPI
|
||||
|
|
|
@ -38,9 +38,12 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
reg [`NR_BITS-1:0] gpr_rd_rid, gpr_rd_rid_n;
|
||||
reg [ISSUE_WIS_W-1:0] gpr_rd_wis, gpr_rd_wis_n;
|
||||
|
||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0][`XLEN-1:0] cache_data, cache_data_n;
|
||||
reg [ISSUE_RATIO-1:0][`NR_BITS-1:0] cache_reg, cache_reg_n;
|
||||
reg [ISSUE_RATIO-1:0][`NUM_THREADS-1:0] cache_tmask, cache_tmask_n;
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] cache_data_n [ISSUE_RATIO-1:0];
|
||||
reg [`NR_BITS-1:0] cache_reg [ISSUE_RATIO-1:0];
|
||||
reg [`NR_BITS-1:0] cache_reg_n [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0] cache_tmask [ISSUE_RATIO-1:0];
|
||||
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
|
||||
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
|
||||
|
||||
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
|
||||
|
@ -160,11 +163,8 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
end
|
||||
cache_reg_n[writeback_if[i].data.wis] = writeback_if[i].data.rd;
|
||||
cache_eop_n[writeback_if[i].data.wis] = writeback_if[i].data.eop;
|
||||
if (writeback_if[i].data.sop) begin
|
||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.tmask;
|
||||
end else begin
|
||||
cache_tmask_n[writeback_if[i].data.wis] |= writeback_if[i].data.tmask;
|
||||
end
|
||||
cache_tmask_n[writeback_if[i].data.wis] = writeback_if[i].data.sop ? writeback_if[i].data.tmask :
|
||||
(cache_tmask_n[writeback_if[i].data.wis] | writeback_if[i].data.tmask);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -175,7 +175,6 @@ module VX_operands import VX_gpu_pkg::*; #(
|
|||
gpr_rd_rid <= '0;
|
||||
gpr_rd_wis <= '0;
|
||||
cache_eop <= {ISSUE_RATIO{1'b1}};
|
||||
cache_reg <= '0;
|
||||
data_ready <= 0;
|
||||
end else begin
|
||||
state <= state_n;
|
||||
|
|
|
@ -107,6 +107,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
.ready_out (scoreboard_if[i].ready)
|
||||
);
|
||||
|
||||
`ifdef SIMULATION
|
||||
reg [31:0] timeout_ctr;
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -134,6 +135,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
`RUNTIME_ASSERT(~writeback_fire || inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, wis_to_wid(writeback_if[i].data.wis, i), writeback_if[i].data.PC, writeback_if[i].data.tmask, writeback_if[i].data.rd, writeback_if[i].data.uuid));
|
||||
`endif
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -170,7 +170,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
.NUM_INPUTS (RSP_ARB_SIZE),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.ARBITER ("R"),
|
||||
.OUT_REG (1)
|
||||
.OUT_REG (3)
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
@ -186,7 +186,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
VX_gather_unit #(
|
||||
.BLOCK_SIZE (BLOCK_SIZE),
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.OUT_REG (3)
|
||||
.OUT_REG (1)
|
||||
) gather_unit (
|
||||
.clk (clk),
|
||||
.reset (commit_reset),
|
||||
|
|
|
@ -52,30 +52,24 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
|
||||
localparam MAN_BITS = 23;
|
||||
localparam EXP_BITS = 8;
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
|
||||
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
// Use 32-bit integer
|
||||
localparam MAX_INT_WIDTH = 32;
|
||||
localparam INT_WIDTH = 32;
|
||||
|
||||
// The internal mantissa includes normal bit or an entire integer
|
||||
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, MAX_INT_WIDTH);
|
||||
localparam INT_MAN_WIDTH = `MAX(MAN_BITS + 1, INT_WIDTH);
|
||||
|
||||
// The lower 2p+3 bits of the internal FMA result will be needed for leading-zero detection
|
||||
localparam LZC_RESULT_WIDTH = `CLOG2(INT_MAN_WIDTH);
|
||||
|
||||
// The internal exponent must be able to represent the smallest denormal input value as signed
|
||||
// or the number of bits in an integer
|
||||
localparam INT_EXP_WIDTH = `MAX(`CLOG2(MAX_INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
||||
|
||||
// shift amount for denormalization
|
||||
localparam SHAMT_BITS = `CLOG2(INT_MAN_WIDTH+1);
|
||||
localparam INT_EXP_WIDTH = `MAX(`CLOG2(INT_WIDTH), `MAX(EXP_BITS, `CLOG2(EXP_BIAS + MAN_BITS))) + 1;
|
||||
|
||||
localparam FMT_SHIFT_COMPENSATION = INT_MAN_WIDTH - 1 - MAN_BITS;
|
||||
localparam NUM_FP_STICKY = 2 * INT_MAN_WIDTH - MAN_BITS - 1; // removed mantissa, 1. and R
|
||||
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - MAX_INT_WIDTH; // removed int and R
|
||||
localparam NUM_INT_STICKY = 2 * INT_MAN_WIDTH - INT_WIDTH; // removed int and R
|
||||
|
||||
// Input processing
|
||||
|
||||
|
@ -86,8 +80,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
.EXP_BITS (EXP_BITS),
|
||||
.MAN_BITS (MAN_BITS)
|
||||
) fp_class (
|
||||
.exp_i (dataa[i][30:23]),
|
||||
.man_i (dataa[i][22:0]),
|
||||
.exp_i (dataa[i][INT_WIDTH-2:MAN_BITS]),
|
||||
.man_i (dataa[i][MAN_BITS-1:0]),
|
||||
.clss_o (fclass[i])
|
||||
);
|
||||
end
|
||||
|
@ -97,27 +91,25 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0] input_sign;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [INT_MAN_WIDTH-1:0] int_mantissa;
|
||||
wire [INT_MAN_WIDTH-1:0] fmt_mantissa;
|
||||
wire fmt_sign = dataa[i][31];
|
||||
wire int_sign = dataa[i][31] && is_signed;
|
||||
assign int_mantissa = int_sign ? (-dataa[i]) : dataa[i];
|
||||
assign fmt_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||
wire i2f_sign = dataa[i][INT_WIDTH-1];
|
||||
wire f2i_sign = dataa[i][INT_WIDTH-1] && is_signed;
|
||||
wire [INT_MAN_WIDTH-1:0] f2i_mantissa = f2i_sign ? (-dataa[i]) : dataa[i];
|
||||
wire [INT_MAN_WIDTH-1:0] i2f_mantissa = INT_MAN_WIDTH'({fclass[i].is_normal, dataa[i][MAN_BITS-1:0]});
|
||||
assign input_exp[i] = {1'b0, dataa[i][MAN_BITS +: EXP_BITS]} + INT_EXP_WIDTH'({1'b0, fclass[i].is_subnormal});
|
||||
assign input_mant[i] = is_itof ? int_mantissa : fmt_mantissa;
|
||||
assign input_sign[i] = is_itof ? int_sign : fmt_sign;
|
||||
assign input_mant[i] = is_itof ? f2i_mantissa : i2f_mantissa;
|
||||
assign input_sign[i] = is_itof ? f2i_sign : i2f_sign;
|
||||
end
|
||||
|
||||
// Pipeline stage0
|
||||
|
||||
wire valid_in_s0;
|
||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire is_itof_s0;
|
||||
wire unsigned_s0;
|
||||
wire [2:0] rnd_mode_s0;
|
||||
wire valid_in_s0;
|
||||
wire [NUM_LANES-1:0] lane_mask_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire is_itof_s0;
|
||||
wire is_signed_s0;
|
||||
wire [2:0] rnd_mode_s0;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s0;
|
||||
wire [NUM_LANES-1:0] input_sign_s0;
|
||||
wire [NUM_LANES-1:0] input_sign_s0;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] fmt_exponent_s0;
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] encoded_mant_s0;
|
||||
|
||||
|
@ -130,8 +122,8 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in, lane_mask, tag_in, is_itof, !is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||
.data_in ({valid_in, lane_mask, tag_in, is_itof, is_signed, frm, fclass, input_sign, input_exp, input_mant}),
|
||||
.data_out ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, fmt_exponent_s0, encoded_mant_s0})
|
||||
);
|
||||
|
||||
// Normalization
|
||||
|
@ -159,22 +151,22 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
assign input_mant_n_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
||||
|
||||
// Unbias exponent and compensate for shift
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
assign input_exp_n_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
|
||||
wire [INT_EXP_WIDTH-1:0] i2f_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
wire [INT_EXP_WIDTH-1:0] f2i_input_exp_s0 = INT_EXP_WIDTH'(INT_MAN_WIDTH-1) - INT_EXP_WIDTH'({1'b0, renorm_shamt_s0[i]});
|
||||
assign input_exp_n_s0[i] = is_itof_s0 ? f2i_input_exp_s0 : i2f_input_exp_s0;
|
||||
end
|
||||
|
||||
// Pipeline stage1
|
||||
|
||||
wire valid_in_s1;
|
||||
wire [NUM_LANES-1:0] lane_mask_s1;
|
||||
wire [TAGW-1:0] tag_in_s1;
|
||||
wire is_itof_s1;
|
||||
wire unsigned_s1;
|
||||
wire [2:0] rnd_mode_s1;
|
||||
wire valid_in_s1;
|
||||
wire [NUM_LANES-1:0] lane_mask_s1;
|
||||
wire [TAGW-1:0] tag_in_s1;
|
||||
wire is_itof_s1;
|
||||
wire is_signed_s1;
|
||||
wire [2:0] rnd_mode_s1;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s1;
|
||||
wire [NUM_LANES-1:0] input_sign_s1;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
||||
wire [NUM_LANES-1:0] input_sign_s1;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s1;
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s1;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] input_exp_s1;
|
||||
|
||||
|
@ -185,76 +177,49 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, unsigned_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
||||
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
||||
.data_in ({valid_in_s0, lane_mask_s0, tag_in_s0, is_itof_s0, is_signed_s0, rnd_mode_s0, fclass_s0, input_sign_s0, mant_is_zero_s0, input_mant_n_s0, input_exp_n_s0}),
|
||||
.data_out ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, input_sign_s1, mant_is_zero_s1, input_mant_s1, input_exp_s1})
|
||||
);
|
||||
|
||||
// Perform adjustments to mantissa and exponent
|
||||
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s1;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s1;
|
||||
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
|
||||
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
|
||||
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
|
||||
reg of_before_round_tmp_s1;
|
||||
|
||||
wire [INT_EXP_WIDTH-1:0] denorm_shamt = INT_EXP_WIDTH'(INT_WIDTH-1) - input_exp_s1[i];
|
||||
wire overflow = ($signed(denorm_shamt) <= -$signed(INT_EXP_WIDTH'(!is_signed_s1)));
|
||||
wire underflow = ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1)));
|
||||
reg [INT_EXP_WIDTH-1:0] denorm_shamt_q;
|
||||
always @(*) begin
|
||||
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
|
||||
preshift_mant_s1 = {input_mant_s1[i], 33'b0};
|
||||
denorm_shamt_s1 = '0;
|
||||
of_before_round_tmp_s1 = 1'b0;
|
||||
|
||||
if (is_itof_s1) begin
|
||||
if ($signed(input_exp_s1[i]) >= INT_EXP_WIDTH'($signed(2**EXP_BITS-1-EXP_BIAS))) begin
|
||||
// Overflow or infinities (for proper rounding)
|
||||
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
|
||||
preshift_mant_s1 = ~0; // largest normal value and RS bits set
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-MAN_BITS-EXP_BIAS))) begin
|
||||
// Limit the shift to retain sticky bits
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(1-EXP_BIAS))) begin
|
||||
// Denormalize underflowing values
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS) - SHAMT_BITS'(input_exp_s1[i]); // adjust right shifting
|
||||
end
|
||||
if (overflow) begin
|
||||
denorm_shamt_q = '0;
|
||||
end else if (underflow) begin
|
||||
denorm_shamt_q = INT_WIDTH+1;
|
||||
end else begin
|
||||
if ($signed(input_exp_s1[i]) >= $signed(INT_EXP_WIDTH'(MAX_INT_WIDTH-1) + INT_EXP_WIDTH'(unsigned_s1))) begin
|
||||
// overflow: when converting to unsigned the range is larger by one
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < INT_EXP_WIDTH'($signed(-1))) begin
|
||||
// underflow
|
||||
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
|
||||
end else begin
|
||||
// By default right shift mantissa to be an integer
|
||||
denorm_shamt_s1 = SHAMT_BITS'(MAX_INT_WIDTH-1) - SHAMT_BITS'(input_exp_s1[i]);
|
||||
end
|
||||
denorm_shamt_q = denorm_shamt;
|
||||
end
|
||||
end
|
||||
|
||||
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
|
||||
assign final_exp_s1[i] = final_exp_tmp_s1;
|
||||
assign of_before_round_s1[i] = of_before_round_tmp_s1;
|
||||
assign destination_mant_s1[i] = is_itof_s1 ? {input_mant_s1[i], 33'b0} : ({input_mant_s1[i], 33'b0} >> denorm_shamt_q);
|
||||
assign final_exp_s1[i] = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS);
|
||||
assign of_before_round_s1[i] = overflow;
|
||||
end
|
||||
|
||||
// Pipeline stage2
|
||||
|
||||
wire valid_in_s2;
|
||||
wire [NUM_LANES-1:0] lane_mask_s2;
|
||||
wire [TAGW-1:0] tag_in_s2;
|
||||
wire is_itof_s2;
|
||||
wire unsigned_s2;
|
||||
wire [2:0] rnd_mode_s2;
|
||||
wire valid_in_s2;
|
||||
wire [NUM_LANES-1:0] lane_mask_s2;
|
||||
wire [TAGW-1:0] tag_in_s2;
|
||||
wire is_itof_s2;
|
||||
wire is_signed_s2;
|
||||
wire [2:0] rnd_mode_s2;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s2;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
||||
wire [NUM_LANES-1:0] input_sign_s2;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s2;
|
||||
wire [NUM_LANES-1:0] input_sign_s2;
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
|
||||
|
@ -263,37 +228,37 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, unsigned_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
.data_in ({valid_in_s1, lane_mask_s1, tag_in_s1, is_itof_s1, is_signed_s1, rnd_mode_s1, fclass_s1, mant_is_zero_s1, input_sign_s1, destination_mant_s1, final_exp_s1, of_before_round_s1}),
|
||||
.data_out ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s2; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0] f2i_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] i2f_round_has_sticky_s2;
|
||||
|
||||
// Rouding and classification
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits_s2;
|
||||
wire [31:0] fmt_pre_round_abs_s2;
|
||||
wire [31:0] pre_round_abs_s2;
|
||||
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits_s2;
|
||||
wire [INT_WIDTH-1:0] fmt_pre_round_abs_s2;
|
||||
wire [INT_WIDTH-1:0] pre_round_abs_s2;
|
||||
wire [1:0] f2i_round_sticky_bits_s2, i2f_round_sticky_bits_s2;
|
||||
|
||||
// Extract final mantissa and round bit, discard the normal bit (for FP)
|
||||
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
|
||||
assign {final_mant_s2, i2f_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int_s2, f2i_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (INT_WIDTH+1) + 1];
|
||||
|
||||
// Collapse sticky bits
|
||||
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
|
||||
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
|
||||
assign i2f_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign f2i_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign i2f_round_has_sticky_s2[i] = (| i2f_round_sticky_bits_s2);
|
||||
assign f2i_round_has_sticky_s2[i] = (| f2i_round_sticky_bits_s2);
|
||||
|
||||
// select RS bits for destination operation
|
||||
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
|
||||
assign round_sticky_bits_s2 = is_itof_s2 ? i2f_round_sticky_bits_s2 : f2i_round_sticky_bits_s2;
|
||||
|
||||
// Pack exponent and mantissa into proper rounding form
|
||||
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
|
||||
|
@ -322,15 +287,15 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0] lane_mask_s3;
|
||||
wire [TAGW-1:0] tag_in_s3;
|
||||
wire is_itof_s3;
|
||||
wire unsigned_s3;
|
||||
wire is_signed_s3;
|
||||
fclass_t [NUM_LANES-1:0] fclass_s3;
|
||||
wire [NUM_LANES-1:0] mant_is_zero_s3;
|
||||
wire [NUM_LANES-1:0] input_sign_s3;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_abs_s3;
|
||||
wire [NUM_LANES-1:0] of_before_round_s3;
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] f2i_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] i2f_round_has_sticky_s3;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_LANES + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
|
||||
|
@ -339,105 +304,68 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
|
||||
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
|
||||
.data_in ({valid_in_s2, lane_mask_s2, tag_in_s2, is_itof_s2, is_signed_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, f2i_round_has_sticky_s2, i2f_round_has_sticky_s2}),
|
||||
.data_out ({valid_in_s3, lane_mask_s3, tag_in_s3, is_itof_s3, is_signed_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, f2i_round_has_sticky_s3, i2f_round_has_sticky_s3})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] of_after_round_s3;
|
||||
wire [NUM_LANES-1:0] uf_after_round_s3;
|
||||
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] fmt_result_s3;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] rounded_int_res_s3; // after possible inversion
|
||||
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
||||
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
// Classification after rounding select by destination format
|
||||
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
|
||||
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
|
||||
assign fmt_result_s3[i] = mant_is_zero_s3[i] ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
// Negative integer result needs to be brought into two's complement
|
||||
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
||||
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
|
||||
end
|
||||
|
||||
// FP Special case handling
|
||||
// F2I Special case handling
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
|
||||
wire [NUM_LANES-1:0] fp_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Detect special case from source format, I2F casts don't produce a special result
|
||||
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
|
||||
|
||||
// Signalling input NaNs raise invalid flag, otherwise no flags set
|
||||
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
|
||||
|
||||
// Assemble result according to destination format
|
||||
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
|
||||
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
|
||||
end
|
||||
|
||||
// INT Special case handling
|
||||
|
||||
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] int_special_status_s3;
|
||||
wire [NUM_LANES-1:0] int_result_is_special_s3;
|
||||
reg [NUM_LANES-1:0][INT_WIDTH-1:0] f2i_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] f2i_special_status_s3;
|
||||
wire [NUM_LANES-1:0] f2i_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble result according to destination format
|
||||
always @(*) begin
|
||||
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
|
||||
int_special_result_s3[i][30:0] = '0; // alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = ~unsigned_s3; // for unsigned casts yields 2**31
|
||||
f2i_special_result_s3[i][INT_WIDTH-2:0] = '0; // alone yields 2**(31)-1
|
||||
f2i_special_result_s3[i][INT_WIDTH-1] = is_signed_s3; // for unsigned casts yields 2**31
|
||||
end else begin
|
||||
int_special_result_s3[i][30:0] = 2**(31) - 1; // alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = unsigned_s3; // for unsigned casts yields 2**31
|
||||
f2i_special_result_s3[i][INT_WIDTH-2:0] = 2**(INT_WIDTH-1) - 1; // alone yields 2**(31)-1
|
||||
f2i_special_result_s3[i][INT_WIDTH-1] = ~is_signed_s3; // for unsigned casts yields 2**31
|
||||
end
|
||||
end
|
||||
|
||||
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
||||
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||
assign f2i_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||
| fclass_s3[i].is_inf
|
||||
| of_before_round_s3[i]
|
||||
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
|
||||
| (input_sign_s3[i] & ~is_signed_s3 & ~rounded_int_res_zero_s3[i]);
|
||||
|
||||
// All integer special cases are invalid
|
||||
assign int_special_status_s3[i] = {1'b1, 4'h0};
|
||||
assign f2i_special_status_s3[i] = {1'b1, 4'h0};
|
||||
end
|
||||
|
||||
// Result selection and Output handshake
|
||||
|
||||
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
|
||||
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
|
||||
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t fp_regular_status_s3, int_regular_status_s3;
|
||||
fflags_t fp_status_s3, int_status_s3;
|
||||
wire [31:0] fp_result_s3, int_result_s3;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
|
||||
fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
|
||||
|
||||
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
|
||||
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
|
||||
|
||||
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
|
||||
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
|
||||
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
|
||||
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
|
||||
assign fp_regular_status_s3.NX = inexact_s3;
|
||||
fflags_t i2f_status_s3 = i2f_regular_status_s3;
|
||||
fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
|
||||
|
||||
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
|
||||
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
|
||||
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
|
||||
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
|
||||
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
|
||||
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
|
||||
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
|
||||
|
||||
// Select output depending on special case detection
|
||||
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
|
||||
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
|
||||
assign tmp_result_s3[i] = is_itof_s3 ? i2f_result_s3 : f2i_result_s3;
|
||||
assign tmp_fflags_s3[i] = is_itof_s3 ? i2f_status_s3 : f2i_status_s3;
|
||||
end
|
||||
|
||||
assign stall = ~ready_out && valid_out;
|
||||
|
@ -457,7 +385,6 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
assign has_fflags = 1'b1;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -54,7 +54,6 @@ module VX_fpu_rounding #(
|
|||
2'b01: round_up = 1'b0; // < ulp/2 away, round down
|
||||
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
|
||||
2'b11: round_up = 1'b1; // > ulp/2 away, round up
|
||||
default: round_up = 1'bx;
|
||||
endcase
|
||||
`INST_FRM_RTZ: round_up = 1'b0; // always round down
|
||||
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
||||
|
|
4
hw/syn/altera/quartus/cache/Makefile
vendored
4
hw/syn/altera/quartus/cache/Makefile
vendored
|
@ -1,6 +1,6 @@
|
|||
PROJECT = VX_cache_cluster_top
|
||||
PROJECT = VX_cache_top
|
||||
TOP_LEVEL_ENTITY = $(PROJECT)
|
||||
SRC_FILE = VX_cache_cluster.sv
|
||||
SRC_FILE = $(PROJECT).sv
|
||||
|
||||
include ../../common.mk
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
PROJECT = VX_core_top
|
||||
TOP_LEVEL_ENTITY = $(PROJECT)
|
||||
SRC_FILE = VX_core.sv
|
||||
SRC_FILE = $(PROJECT).sv
|
||||
|
||||
include ../../common.mk
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
XLEN ?= 32
|
||||
TOOLDIR ?= /opt
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
|
|
|
@ -56,7 +56,6 @@ PROJECT = top_modules
|
|||
all: build
|
||||
|
||||
build: $(SRCS)
|
||||
verilator --build $(VL_FLAGS) --cc VX_cache_cluster_top --top-module VX_cache_cluster_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||
verilator --build $(VL_FLAGS) --cc VX_cache_top --top-module VX_cache_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||
verilator --build $(VL_FLAGS) --cc VX_core_top --top-module VX_core_top $^ -CFLAGS '$(CXXFLAGS)'
|
||||
|
||||
|
|
|
@ -1,17 +1,18 @@
|
|||
XLEN ?= 32
|
||||
TOOLDIR ?= /opt
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||
|
||||
LLVM_VORTEX ?= /opt/llvm-vortex
|
||||
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
|
||||
|
||||
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
|
||||
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
|
||||
|
|
|
@ -51,9 +51,8 @@ inline char is_log2(int x) {
|
|||
return ((x & (x-1)) == 0);
|
||||
}
|
||||
|
||||
inline int fast_log2(int x) {
|
||||
float f = x;
|
||||
return (*(int*)(&f)>>23) - 127;
|
||||
inline int log2_fast(int x) {
|
||||
return 31 - __builtin_clz (x);
|
||||
}
|
||||
|
||||
static void __attribute__ ((noinline)) spawn_tasks_all_stub() {
|
||||
|
@ -286,8 +285,8 @@ void vx_spawn_kernel(context_t * ctx, vx_spawn_kernel_cb callback, void * arg) {
|
|||
|
||||
// fast path handling
|
||||
char isXYpow2 = is_log2(XY);
|
||||
char log2XY = fast_log2(XY);
|
||||
char log2X = fast_log2(X);
|
||||
char log2XY = log2_fast(XY);
|
||||
char log2X = log2_fast(X);
|
||||
|
||||
wspawn_kernel_args_t wspawn_args = {
|
||||
ctx, callback, arg, core_id * tasks_per_core, fW, rW, isXYpow2, log2XY, log2X
|
||||
|
|
49
tests/kernel/common.mk
Normal file
49
tests/kernel/common.mk
Normal file
|
@ -0,0 +1,49 @@
|
|||
XLEN ?= 32
|
||||
TOOLDIR ?= /opt
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
|
||||
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
|
||||
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
|
||||
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
|
||||
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
|
||||
|
||||
SIM_DIR = ../../../sim
|
||||
|
||||
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
|
||||
|
||||
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
|
||||
|
||||
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
$(PROJECT).bin: $(PROJECT).elf
|
||||
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
|
||||
|
||||
$(PROJECT).elf: $(SRCS)
|
||||
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
|
||||
|
||||
run-rtlsim: $(PROJECT).bin
|
||||
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
|
||||
|
||||
run-simx: $(PROJECT).bin
|
||||
$(SIM_DIR)/simx/simx $(PROJECT).bin
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CC) $(CFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.elf *.bin *.dump .depend
|
|
@ -1,52 +1,5 @@
|
|||
XLEN ?= 32
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
|
||||
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
|
||||
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
|
||||
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
|
||||
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
|
||||
|
||||
SIM_DIR = ../../../sim
|
||||
|
||||
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
|
||||
|
||||
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
|
||||
|
||||
PROJECT = conform
|
||||
|
||||
SRCS = main.cpp tests.cpp
|
||||
|
||||
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
$(PROJECT).bin: $(PROJECT).elf
|
||||
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
|
||||
|
||||
$(PROJECT).elf: $(SRCS)
|
||||
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
|
||||
|
||||
run-rtlsim: $(PROJECT).bin
|
||||
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
|
||||
|
||||
run-simx: $(PROJECT).bin
|
||||
$(SIM_DIR)/simx/simx $(PROJECT).bin
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CC) $(CFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.elf *.bin *.dump .depend
|
||||
include ../common.mk
|
||||
|
|
|
@ -1,52 +1,5 @@
|
|||
XLEN ?= 32
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
|
||||
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
|
||||
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
|
||||
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
|
||||
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
|
||||
|
||||
SIM_DIR = ../../../sim
|
||||
|
||||
CFLAGS += -O3 -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
|
||||
|
||||
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
|
||||
|
||||
PROJECT = fibonacci
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
$(PROJECT).bin: $(PROJECT).elf
|
||||
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
|
||||
|
||||
$(PROJECT).elf: $(SRCS)
|
||||
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
|
||||
|
||||
run-rtlsim: $(PROJECT).bin
|
||||
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
|
||||
|
||||
run-simx: $(PROJECT).bin
|
||||
$(SIM_DIR)/simx/simx $(PROJECT).bin
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CC) $(CFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.elf *.bin *.dump .depend
|
||||
include ../common.mk
|
||||
|
|
|
@ -1,52 +1,5 @@
|
|||
XLEN ?= 32
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
endif
|
||||
|
||||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
|
||||
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||
|
||||
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc
|
||||
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-gcc-ar
|
||||
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objdump
|
||||
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)-objcopy
|
||||
|
||||
SIM_DIR = ../../../sim
|
||||
|
||||
CFLAGS += -O3 -v -mcmodel=medany -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||
CFLAGS += -I$(VORTEX_KN_PATH)/include -I$(VORTEX_KN_PATH)/../hw
|
||||
|
||||
LDFLAGS += -lm -Wl,-Bstatic,--gc-sections,-T,$(VORTEX_KN_PATH)/linker/vx_link$(XLEN).ld,--defsym=STARTUP_ADDR=0x80000000 $(VORTEX_KN_PATH)/libvortexrt.a
|
||||
|
||||
PROJECT = hello
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
|
||||
|
||||
$(PROJECT).dump: $(PROJECT).elf
|
||||
$(DP) -D $(PROJECT).elf > $(PROJECT).dump
|
||||
|
||||
$(PROJECT).bin: $(PROJECT).elf
|
||||
$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
|
||||
|
||||
$(PROJECT).elf: $(SRCS)
|
||||
$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
|
||||
|
||||
run-rtlsim: $(PROJECT).bin
|
||||
$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
|
||||
|
||||
run-simx: $(PROJECT).bin
|
||||
$(SIM_DIR)/simx/simx $(PROJECT).bin
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CC) $(CFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf *.elf *.bin *.dump .depend
|
||||
include ../common.mk
|
||||
|
|
|
@ -18,6 +18,7 @@ all:
|
|||
$(MAKE) -C oclprintf
|
||||
$(MAKE) -C blackscholes
|
||||
$(MAKE) -C matmul
|
||||
$(MAKE) -C convolution
|
||||
|
||||
run-simx:
|
||||
$(MAKE) -C vecadd run-simx
|
||||
|
@ -30,15 +31,16 @@ run-simx:
|
|||
$(MAKE) -C dotproduct run-simx
|
||||
$(MAKE) -C kmeans run-simx
|
||||
$(MAKE) -C spmv run-simx
|
||||
$(MAKE) -C cutcp run-simx
|
||||
$(MAKE) -C stencil run-simx
|
||||
$(MAKE) -C lbm run-simx
|
||||
$(MAKE) -C oclprintf run-simx
|
||||
$(MAKE) -C blackscholes run-simx
|
||||
$(MAKE) -C matmul run-simx
|
||||
$(MAKE) -C transpose run-simx
|
||||
# $(MAKE) -C vectorhypot run-simx
|
||||
# $(MAKE) -C mri-q run-simx
|
||||
$(MAKE) -C convolution run-simx
|
||||
$(MAKE) -C cutcp run-simx
|
||||
$(MAKE) -C matmul run-simx
|
||||
$(MAKE) -C vectorhypot run-simx
|
||||
$(MAKE) -C mri-q run-simx
|
||||
|
||||
run-rtlsim:
|
||||
$(MAKE) -C vecadd run-rtlsim
|
||||
|
@ -52,12 +54,13 @@ run-rtlsim:
|
|||
$(MAKE) -C kmeans run-rtlsim
|
||||
$(MAKE) -C spmv run-rtlsim
|
||||
$(MAKE) -C transpose run-rtlsim
|
||||
$(MAKE) -C cutcp run-rtlsim
|
||||
$(MAKE) -C stencil run-rtlsim
|
||||
$(MAKE) -C lbm run-rtlsim
|
||||
$(MAKE) -C oclprintf run-rtlsim
|
||||
$(MAKE) -C blackscholes run-rtlsim
|
||||
$(MAKE) -C matmul run-rtlsim
|
||||
$(MAKE) -C convolution run-rtlsim
|
||||
# $(MAKE) -C cutcp run-rtlsim
|
||||
# $(MAKE) -C matmul run-rtlsim
|
||||
# $(MAKE) -C vectorhypot run-rtlsim
|
||||
# $(MAKE) -C mri-q run-rtlsim
|
||||
|
||||
|
@ -73,12 +76,13 @@ run-opae:
|
|||
$(MAKE) -C kmeans run-opae
|
||||
$(MAKE) -C spmv run-opae
|
||||
$(MAKE) -C transpose run-opae
|
||||
$(MAKE) -C cutcp run-opae
|
||||
$(MAKE) -C stencil run-opae
|
||||
$(MAKE) -C lbm run-opae
|
||||
$(MAKE) -C oclprintf run-opae
|
||||
$(MAKE) -C blackscholes run-opae
|
||||
$(MAKE) -C matmul run-opae
|
||||
$(MAKE) -C convolution run-opae
|
||||
# $(MAKE) -C cutcp run-opae
|
||||
# $(MAKE) -C matmul run-opae
|
||||
# $(MAKE) -C vectorhypot run-opae
|
||||
# $(MAKE) -C mri-q run-opae
|
||||
|
||||
|
@ -102,6 +106,7 @@ clean:
|
|||
$(MAKE) -C oclprintf clean
|
||||
$(MAKE) -C blackscholes clean
|
||||
$(MAKE) -C matmul clean
|
||||
$(MAKE) -C convolution clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C vecadd clean-all
|
||||
|
@ -124,3 +129,4 @@ clean-all:
|
|||
$(MAKE) -C oclprintf clean-all
|
||||
$(MAKE) -C blackscholes clean-all
|
||||
$(MAKE) -C matmul clean-all
|
||||
$(MAKE) -C convolution clean-all
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
XLEN ?= 32
|
||||
TOOLDIR ?= /opt
|
||||
|
||||
TARGET ?= opaesim
|
||||
|
||||
|
@ -6,12 +7,12 @@ XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
|
|||
XRT_DEVICE_INDEX ?= 0
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
K_CFLAGS += -march=rv64imafd -mabi=ilp64d
|
||||
STARTUP_ADDR ?= 0x180000000
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
K_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
STARTUP_ADDR ?= 0x80000000
|
||||
|
@ -20,16 +21,16 @@ endif
|
|||
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf
|
||||
RISCV_SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/$(RISCV_PREFIX)
|
||||
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
POCL_CC_PATH ?= $(TOOLDIR)/pocl/compiler
|
||||
POCL_RT_PATH ?= $(TOOLDIR)/pocl/runtime
|
||||
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
||||
|
||||
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
|
||||
|
||||
LLVM_VORTEX ?= /opt/llvm-vortex
|
||||
LLVM_POCL ?= /opt/llvm-vortex
|
||||
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
|
||||
LLVM_POCL ?= $(TOOLDIR)/llvm-vortex
|
||||
|
||||
K_CFLAGS += -v -O3 --sysroot=$(RISCV_SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -Xclang -target-feature -Xclang +vortex
|
||||
K_CFLAGS += -fno-rtti -fno-exceptions -nostartfiles -fdata-sections -ffunction-sections
|
||||
|
@ -40,13 +41,12 @@ CXXFLAGS += -std=c++11 -Wall -Wextra -Wfatal-errors
|
|||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter -Wno-narrowing
|
||||
CXXFLAGS += -pthread
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_RT_PATH)/stub -lvortex
|
||||
|
||||
ifdef HOSTGPU
|
||||
CXXFLAGS += -DHOSTGPU
|
||||
LDFLAGS += -lOpenCL
|
||||
else
|
||||
LDFLAGS += $(POCL_RT_PATH)/lib/libOpenCL.so
|
||||
LDFLAGS += -L$(VORTEX_RT_PATH)/stub -lvortex $(POCL_RT_PATH)/lib/libOpenCL.so
|
||||
endif
|
||||
|
||||
# Debugigng
|
||||
|
|
7
tests/opencl/convolution/Makefile
Normal file
7
tests/opencl/convolution/Makefile
Normal file
|
@ -0,0 +1,7 @@
|
|||
PROJECT = convolution
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
32
tests/opencl/convolution/kernel.cl
Normal file
32
tests/opencl/convolution/kernel.cl
Normal file
|
@ -0,0 +1,32 @@
|
|||
__kernel void conv3x3(__global float* output,
|
||||
__global float* input,
|
||||
__global float* weights,
|
||||
const int width,
|
||||
const int height)
|
||||
{
|
||||
int x = get_global_id(0);
|
||||
int y = get_global_id(1);
|
||||
|
||||
// Adjust for padded borders
|
||||
int paddedWidth = width + 2;
|
||||
int paddedX = x + 1;
|
||||
int paddedY = y + 1;
|
||||
|
||||
// Compute the convolution sum
|
||||
float sum = 0.0f;
|
||||
|
||||
sum += input[(paddedY - 1) * paddedWidth + (paddedX - 1)] * weights[0]; // Top-left
|
||||
sum += input[(paddedY - 1) * paddedWidth + paddedX] * weights[1]; // Top-center
|
||||
sum += input[(paddedY - 1) * paddedWidth + (paddedX + 1)] * weights[2]; // Top-right
|
||||
|
||||
sum += input[paddedY * paddedWidth + (paddedX - 1)] * weights[3]; // Middle-left
|
||||
sum += input[paddedY * paddedWidth + paddedX] * weights[4]; // Center
|
||||
sum += input[paddedY * paddedWidth + (paddedX + 1)] * weights[5]; // Middle-right
|
||||
|
||||
sum += input[(paddedY + 1) * paddedWidth + (paddedX - 1)] * weights[6]; // Bottom-left
|
||||
sum += input[(paddedY + 1) * paddedWidth + paddedX] * weights[7]; // Bottom-center
|
||||
sum += input[(paddedY + 1) * paddedWidth + (paddedX + 1)] * weights[8]; // Bottom-right
|
||||
|
||||
// Store the result in the output array
|
||||
output[y * width + x] = sum;
|
||||
}
|
258
tests/opencl/convolution/main.cc
Normal file
258
tests/opencl/convolution/main.cc
Normal file
|
@ -0,0 +1,258 @@
|
|||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <assert.h>
|
||||
#include <CL/opencl.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <chrono>
|
||||
#include <vector>
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "conv3x3"
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
if (_err == CL_SUCCESS) \
|
||||
break; \
|
||||
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (0)
|
||||
|
||||
#define CL_CHECK2(_expr) \
|
||||
({ \
|
||||
cl_int _err = CL_INVALID_VALUE; \
|
||||
decltype(_expr) _ret = _expr; \
|
||||
if (_err != CL_SUCCESS) { \
|
||||
printf("OpenCL Error: '%s' returned %d!\n", #_expr, (int)_err); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} \
|
||||
_ret; \
|
||||
})
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return -1;
|
||||
|
||||
FILE* fp = fopen(filename, "r");
|
||||
if (NULL == fp) {
|
||||
fprintf(stderr, "Failed to load kernel.");
|
||||
return -1;
|
||||
}
|
||||
|
||||
fseek(fp , 0 , SEEK_END);
|
||||
long fsize = ftell(fp);
|
||||
rewind(fp);
|
||||
|
||||
*data = (uint8_t*)malloc(fsize);
|
||||
*size = fread(*data, 1, fsize, fp);
|
||||
|
||||
fclose(fp);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool compare_equal(float a, float b) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
return d <= FLOAT_ULP;
|
||||
}
|
||||
|
||||
static void convolution_cpu(float *O, float *I, float *W, int32_t width, int32_t height) {
|
||||
int paddedWidth = width + 2;
|
||||
for (int32_t y = 0; y < height; ++y) {
|
||||
for (int32_t x = 0; x < width; ++x) {
|
||||
int paddedY = y + 1;
|
||||
int paddedX = x + 1;
|
||||
float sum = 0.0f;
|
||||
for (int32_t ky = -1; ky <= 1; ++ky) {
|
||||
for (int32_t kx = -1; kx <= 1; ++kx) {
|
||||
int32_t iy = paddedY + ky;
|
||||
int32_t ix = paddedX + kx;
|
||||
float value = I[iy * paddedWidth + ix];
|
||||
float weight = W[(ky + 1) * 3 + (kx + 1)];
|
||||
sum += value * weight;
|
||||
}
|
||||
}
|
||||
O[y * width + x] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
cl_device_id device_id = NULL;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue commandQueue = NULL;
|
||||
cl_program program = NULL;
|
||||
cl_kernel kernel = NULL;
|
||||
cl_mem i_memobj = NULL;
|
||||
cl_mem w_memobj = NULL;
|
||||
cl_mem o_memobj = NULL;
|
||||
uint8_t* kernel_bin = NULL;
|
||||
|
||||
static void cleanup() {
|
||||
if (commandQueue) clReleaseCommandQueue(commandQueue);
|
||||
if (kernel) clReleaseKernel(kernel);
|
||||
if (program) clReleaseProgram(program);
|
||||
if (i_memobj) clReleaseMemObject(i_memobj);
|
||||
if (w_memobj) clReleaseMemObject(w_memobj);
|
||||
if (o_memobj) clReleaseMemObject(o_memobj);
|
||||
if (context) clReleaseContext(context);
|
||||
if (device_id) clReleaseDevice(device_id);
|
||||
if (kernel_bin) free(kernel_bin);
|
||||
}
|
||||
|
||||
int size = 32;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main (int argc, char **argv) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
|
||||
uint32_t o_points = size * size;
|
||||
uint32_t i_points = (size+2) * (size+2);
|
||||
uint32_t w_points = 3 * 3;
|
||||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
|
||||
printf("Create context\n");
|
||||
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
|
||||
|
||||
char device_string[1024];
|
||||
clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_string), &device_string, NULL);
|
||||
printf("Using device: %s\n", device_string);
|
||||
|
||||
printf("Allocate device buffers\n");
|
||||
size_t i_nbytes = i_points * sizeof(float);
|
||||
size_t w_nbytes = w_points * sizeof(float);
|
||||
size_t o_nbytes = o_points * sizeof(float);
|
||||
i_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, i_nbytes, NULL, &_err));
|
||||
w_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, w_nbytes, NULL, &_err));
|
||||
o_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, o_nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
||||
// Create kernel
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t global_size[2] = {size, size};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&o_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&i_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&w_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(uint32_t), &size));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(uint32_t), &size));
|
||||
|
||||
// Allocate memories for input arrays and output arrays.
|
||||
std::vector<float> h_i(i_points);
|
||||
std::vector<float> h_w(w_points);
|
||||
std::vector<float> h_o(o_points, 0.0f);
|
||||
|
||||
// Generate input values
|
||||
for (int32_t y = -1; y < size+1; ++y) {
|
||||
for (int32_t x = -1; x < size+1; ++x) {
|
||||
if (x >= 0 && x < size && y >= 0 && y < size) {
|
||||
h_i[(y+1) * (size+2) + (x+1)] = static_cast<float>(rand()) / RAND_MAX;
|
||||
} else {
|
||||
h_i[(y+1) * (size+2) + (x+1)] = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
for (uint32_t i = 0; i < w_points; ++i) {
|
||||
h_w[i] = static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
commandQueue = CL_CHECK2(clCreateCommandQueue(context, device_id, 0, &_err));
|
||||
|
||||
printf("Upload source buffers\n");
|
||||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, i_memobj, CL_TRUE, 0, i_nbytes, h_i.data(), 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueWriteBuffer(commandQueue, w_memobj, CL_TRUE, 0, w_nbytes, h_w.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Execute the kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, global_size, NULL, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
printf("Download destination buffer\n");
|
||||
CL_CHECK(clEnqueueReadBuffer(commandQueue, o_memobj, CL_TRUE, 0, o_nbytes, h_o.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Verify result\n");
|
||||
std::vector<float> ref_vec(o_points);
|
||||
convolution_cpu(ref_vec.data(), h_i.data(), h_w.data(), size, size);
|
||||
int errors = 0;
|
||||
for (uint32_t i = 0; i < o_points; ++i) {
|
||||
if (!compare_equal(h_o[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_o[i]);
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
printf("FAILED! - %d errors\n", errors);
|
||||
} else {
|
||||
printf("PASSED!\n");
|
||||
}
|
||||
|
||||
// Clean up
|
||||
cleanup();
|
||||
|
||||
return errors;
|
||||
}
|
|
@ -173,14 +173,10 @@ void MAIN_initialize(const MAIN_Param *param, const OpenCL_Param *prm) {
|
|||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||
|
||||
printf("OK+\n");
|
||||
|
||||
// Setup DEVICE datastructures
|
||||
OpenCL_LBM_allocateGrid(prm, &OpenCL_srcGrid);
|
||||
OpenCL_LBM_allocateGrid(prm, &OpenCL_dstGrid);
|
||||
|
||||
printf("OK-\n");
|
||||
|
||||
// Initialize DEVICE datastructures
|
||||
OpenCL_LBM_initializeGrid(prm, OpenCL_srcGrid, TEMP_srcGrid);
|
||||
OpenCL_LBM_initializeGrid(prm, OpenCL_dstGrid, TEMP_dstGrid);
|
||||
|
|
|
@ -2,6 +2,6 @@ PROJECT = matmul
|
|||
|
||||
SRCS = main.cc
|
||||
|
||||
OPTS ?= -n16
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
||||
|
|
|
@ -5,10 +5,10 @@ __kernel void matmul(__global float *A,
|
|||
__local float *localA,
|
||||
__local float *localB)
|
||||
{
|
||||
int row = get_global_id(1);
|
||||
int col = get_global_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
int globalRow = get_global_id(1);
|
||||
int globalCol = get_global_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
int localSize = get_local_size(0); // assuming square local size
|
||||
|
||||
float sum = 0.0f;
|
||||
|
@ -16,10 +16,10 @@ __kernel void matmul(__global float *A,
|
|||
// Loop over all blocks of both matrices
|
||||
for (int k = 0; k < N; k += localSize) {
|
||||
// Load block of matrix A to local memory
|
||||
localA[localRow * localSize + localCol] = A[row * N + k + localCol];
|
||||
localA[localRow * localSize + localCol] = A[globalRow * N + k + localCol];
|
||||
|
||||
// Load block of matrix B to local memory, adjusting for column-major access
|
||||
localB[localRow * localSize + localCol] = B[(k + localRow) * N + col];
|
||||
localB[localRow * localSize + localCol] = B[(k + localRow) * N + globalCol];
|
||||
|
||||
// Synchronize to make sure the tiles are loaded
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
@ -28,20 +28,20 @@ __kernel void matmul(__global float *A,
|
|||
for (int j = 0; j < localSize; j++) {
|
||||
sum += localA[localRow * localSize + j] * localB[j * localSize + localCol];
|
||||
}
|
||||
|
||||
// Synchronize before loading the next block
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
C[row * N + col] = sum;
|
||||
C[globalRow * N + globalCol] = sum;
|
||||
}
|
||||
|
||||
/*__kernel void matmul(__global float *A, __global float *B, __global float *C, const unsigned int N)
|
||||
/*__kernel void matmul(__global float *A,
|
||||
__global float *B,
|
||||
__global float *C,
|
||||
const unsigned int N)
|
||||
{
|
||||
int globalRow = get_global_id(1);
|
||||
int globalCol = get_global_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
int localRow = get_local_id(1);
|
||||
int localCol = get_local_id(0);
|
||||
|
||||
// Static local memory declaration
|
||||
__local float localA[16][16];
|
||||
|
@ -64,9 +64,6 @@ __kernel void matmul(__global float *A,
|
|||
for (int j = 0; j < 16; j++) {
|
||||
sum += localA[localRow][j] * localB[j][localCol];
|
||||
}
|
||||
|
||||
// Wait until all threads have computed before loading the next block
|
||||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
}
|
||||
|
||||
C[globalRow * N + globalCol] = sum;
|
||||
|
|
|
@ -10,6 +10,8 @@
|
|||
|
||||
#define LOCAL_SIZE 16
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "matmul"
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
|
@ -56,15 +58,16 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static bool compare_equal(float a, float b, int ulp = 21) {
|
||||
union fi_t { int i; float f; };
|
||||
static bool compare_equal(float a, float b) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
return std::abs(fa.i - fb.i) <= ulp;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
return d <= FLOAT_ULP;
|
||||
}
|
||||
|
||||
static void matrix_multiply_cpu(float *A, float *B, float *C, int N) {
|
||||
static void matmul_cpu(float *C, float *A, float *B, int N) {
|
||||
for (int i = 0; i < N; i++) {
|
||||
for (int j = 0; j < N; j++) {
|
||||
float sum = 0.0f;
|
||||
|
@ -98,7 +101,7 @@ static void cleanup() {
|
|||
if (kernel_bin) free(kernel_bin);
|
||||
}
|
||||
|
||||
int size = 64;
|
||||
int size = 32;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
|
@ -106,7 +109,7 @@ static void show_usage() {
|
|||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "fn:h?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
|
@ -127,6 +130,8 @@ int main (int argc, char **argv) {
|
|||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
|
||||
|
@ -148,7 +153,7 @@ int main (int argc, char **argv) {
|
|||
printf("Using device: %s\n", device_string);
|
||||
|
||||
printf("Allocate device buffers\n");
|
||||
size_t nbytes = size * size * sizeof(float);
|
||||
size_t nbytes = num_points * sizeof(float);
|
||||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
@ -176,8 +181,8 @@ int main (int argc, char **argv) {
|
|||
// Create kernel
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
size_t global_size[2] = {size, size};
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
|
@ -188,20 +193,14 @@ int main (int argc, char **argv) {
|
|||
CL_CHECK(clSetKernelArg(kernel, 5, local_size[0]*local_size[1]*sizeof(float), NULL));
|
||||
|
||||
// Allocate memories for input arrays and output arrays.
|
||||
std::vector<float> h_a(size * size);
|
||||
std::vector<float> h_b(size * size);
|
||||
std::vector<float> h_c(size * size);
|
||||
std::vector<float> h_a(num_points);
|
||||
std::vector<float> h_b(num_points);
|
||||
std::vector<float> h_c(num_points);
|
||||
|
||||
// Initialize values for array members.
|
||||
for (int i = 0; i < (size * size); ++i) {
|
||||
#ifdef USE_FLOAT
|
||||
h_a[i] = (float)rand() / (float)RAND_MAX;
|
||||
h_b[i] = (float)rand() / (float)RAND_MAX;
|
||||
#else
|
||||
h_a[i] = rand();
|
||||
h_b[i] = rand();
|
||||
#endif
|
||||
h_c[i] = 0xdeadbeef;
|
||||
// Generate input values
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_a[i] = static_cast<float>(rand()) / RAND_MAX;
|
||||
h_b[i] = static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
|
@ -223,10 +222,10 @@ int main (int argc, char **argv) {
|
|||
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c.data(), 0, NULL, NULL));
|
||||
|
||||
printf("Verify result\n");
|
||||
std::vector<float> ref_vec(size * size);
|
||||
matrix_multiply_cpu(h_a.data(), h_b.data(), ref_vec.data(), size);
|
||||
std::vector<float> ref_vec(num_points);
|
||||
matmul_cpu(ref_vec.data(), h_a.data(), h_b.data(), size);
|
||||
int errors = 0;
|
||||
for (int i = 0; i < (size * size); i++) {
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
if (!compare_equal(h_c[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
|
||||
|
|
|
@ -106,11 +106,6 @@ int main (int argc, char **argv) {
|
|||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status;
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
|
@ -124,12 +119,17 @@ int main (int argc, char **argv) {
|
|||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
}
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
@ -143,7 +143,7 @@ int main (int argc, char **argv) {
|
|||
// Allocate memories for input arrays and output arrays.
|
||||
h_a = (int*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
// Generate input values
|
||||
for (int i = 0; i < size; ++i) {
|
||||
h_a[i] = -1 + i;
|
||||
}
|
||||
|
|
|
@ -115,11 +115,6 @@ int main (int argc, char **argv) {
|
|||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status;
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
|
@ -134,12 +129,17 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
}
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
@ -155,17 +155,16 @@ int main (int argc, char **argv) {
|
|||
h_a = (int*)malloc(nbytes);
|
||||
h_c = (int*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
// Generate input values
|
||||
for (int i = 0; i < size; ++i) {
|
||||
h_c[i] = 0xdeadbeef;
|
||||
if (float_enable) {
|
||||
float value = sinf(i)*sinf(i);
|
||||
h_a[i] = *(int*)&value;
|
||||
printf("*** [%d]: h_a=%f\n", i, value);
|
||||
((float*)h_a)[i] = value;
|
||||
printf("*** [%d]: %f\n", i, value);
|
||||
} else {
|
||||
int value = size*sinf(i);
|
||||
h_a[i] = value;
|
||||
printf("*** [%d]: h_a=%d\n", i, value);
|
||||
printf("*** [%d]: %d\n", i, value);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -189,38 +188,44 @@ int main (int argc, char **argv) {
|
|||
CL_CHECK(clEnqueueReadBuffer(commandQueue, c_memobj, CL_TRUE, 0, nbytes, h_c, 0, NULL, NULL));
|
||||
|
||||
printf("Verify result\n");
|
||||
for (int i = 0; i < size; ++i) {
|
||||
int value = h_c[i];
|
||||
for (int i = 0; i < size; ++i) {
|
||||
if (float_enable) {
|
||||
printf("*** [%d]: h_a=%f\n", i, *(float*)&value);
|
||||
float value = ((float*)h_c)[i];
|
||||
printf("*** [%d]: %f\n", i, value);
|
||||
} else {
|
||||
printf("*** [%d]: h_a=%d\n", i, value);
|
||||
int value = h_c[i];
|
||||
printf("*** [%d]: %d\n", i, value);
|
||||
}
|
||||
}
|
||||
int errors = 0;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
int ref = h_a[i];
|
||||
float ref_f = *(float*)&ref;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
int pos = 0;
|
||||
for (int j = 0; j < size; ++j) {
|
||||
int cur = h_a[j];
|
||||
if (float_enable) {
|
||||
float cur_f = *(float*)&cur;
|
||||
pos += (cur_f < ref_f) || (cur_f == ref_f && j < i);
|
||||
} else {
|
||||
if (float_enable) {
|
||||
float ref = ((float*)h_a)[i];
|
||||
for (int j = 0; j < size; ++j) {
|
||||
float cur = ((float*)h_a)[j];
|
||||
pos += (cur < ref) || (cur == ref && j < i);
|
||||
}
|
||||
float value = ((float*)h_c)[pos];
|
||||
if (value != ref) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref, value);
|
||||
}
|
||||
++errors;
|
||||
}
|
||||
} else {
|
||||
int ref = h_a[i];
|
||||
for (int j = 0; j < size; ++j) {
|
||||
int cur = h_a[j];
|
||||
pos += (cur < ref) || (cur == ref && j < i);
|
||||
}
|
||||
}
|
||||
int value = h_c[pos];
|
||||
if (value != ref) {
|
||||
if (errors < 100) {
|
||||
if (float_enable) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", pos, ref_f, *(float*)&value);
|
||||
} else {
|
||||
int value = h_c[pos];
|
||||
if (value != ref) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", pos, ref, value);
|
||||
}
|
||||
++errors;
|
||||
}
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (0 == errors) {
|
||||
|
|
|
@ -151,16 +151,12 @@ int main(int argc, char **argv) {
|
|||
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
cl_program program;
|
||||
cl_mem input_buffer;
|
||||
cl_mem output_buffer;
|
||||
size_t kernel_size;
|
||||
cl_context context;
|
||||
cl_command_queue queue;
|
||||
cl_int binary_status = 0;
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
|
@ -172,19 +168,18 @@ int main(int argc, char **argv) {
|
|||
cl_kernel kernel = 0;
|
||||
cl_mem memObjects[2] = {0, 0};
|
||||
|
||||
// Create OpenCL program - first attempt to load cached binary.
|
||||
// If that is not available, then create the program from source
|
||||
// and store the binary for future use.
|
||||
printf("create program from binary...\n");
|
||||
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(device_id, context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
printf("Read program from binary.\n");
|
||||
}
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -149,14 +149,10 @@ int main(int argc, char **argv) {
|
|||
|
||||
cl_platform_id platform_id;
|
||||
cl_device_id device_id;
|
||||
cl_program program;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status = 0;
|
||||
uint8_t *kernel_bin = NULL;
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, NULL));
|
||||
|
@ -170,19 +166,18 @@ int main(int argc, char **argv) {
|
|||
cl_kernel kernel = 0;
|
||||
cl_mem memObjects[2] = {0, 0};
|
||||
|
||||
// Create OpenCL program - first attempt to load cached binary.
|
||||
// If that is not available, then create the program from source
|
||||
// and store the binary for future use.
|
||||
printf("create program from binary...\n");
|
||||
cl_program program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
std::cerr << "Failed to write program binary" << std::endl;
|
||||
Cleanup(kernel_bin, device_id, context, queue, program, kernel, memObjects);
|
||||
return 1;
|
||||
} else {
|
||||
printf("Read program from binary.");
|
||||
}
|
||||
printf("Create program from kernel source\n");
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK_ERR(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
#ifndef COMMON_H
|
||||
#define COMMON_H
|
||||
|
||||
#define USE_FLOAT
|
||||
|
||||
#ifdef USE_FLOAT
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
#else
|
||||
#define TYPE int
|
||||
#endif
|
||||
|
||||
#endif // COMMON_H
|
|
@ -11,6 +11,8 @@
|
|||
|
||||
#define KERNEL_NAME "sgemm"
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define CL_CHECK(_expr) \
|
||||
do { \
|
||||
cl_int _err = _expr; \
|
||||
|
@ -33,6 +35,66 @@
|
|||
_ret; \
|
||||
})
|
||||
|
||||
template <typename Type>
|
||||
class Comparator {};
|
||||
|
||||
template <>
|
||||
class Comparator<int> {
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Comparator<float> {
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
if (d > FLOAT_ULP) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
/*static void sgemm_cpu(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
|
||||
for (int m = 0; m < M; ++m) {
|
||||
for (int n = 0; n < N; ++n) {
|
||||
TYPE acc = 0;
|
||||
for (int k = 0; k < K; ++k) {
|
||||
acc += A[k * M + m] * B[n * K + k];
|
||||
}
|
||||
C[n * M + m] = acc;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
static int read_kernel_file(const char* filename, uint8_t** data, size_t* size) {
|
||||
if (nullptr == filename || nullptr == data || 0 == size)
|
||||
return -1;
|
||||
|
@ -54,32 +116,6 @@ static int read_kernel_file(const char* filename, uint8_t** data, size_t* size)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*static void matmul(TYPE *C, const TYPE* A, const TYPE *B, int M, int N, int K) {
|
||||
for (int m = 0; m < M; ++m) {
|
||||
for (int n = 0; n < N; ++n) {
|
||||
TYPE acc = 0;
|
||||
for (int k = 0; k < K; ++k) {
|
||||
acc += A[k * M + m] * B[n * K + k];
|
||||
}
|
||||
C[n * M + m] = acc;
|
||||
}
|
||||
}
|
||||
}*/
|
||||
|
||||
#ifdef USE_FLOAT
|
||||
static bool compare_equal(float a, float b, int ulp = 21) {
|
||||
union fi_t { int i; float f; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
return std::abs(fa.i - fb.i) <= ulp;
|
||||
}
|
||||
#else
|
||||
static bool compare_equal(int a, int b, int ulp = 21) {
|
||||
return (a == b);
|
||||
}
|
||||
#endif
|
||||
|
||||
cl_device_id device_id = NULL;
|
||||
cl_context context = NULL;
|
||||
cl_command_queue commandQueue = NULL;
|
||||
|
@ -145,15 +181,12 @@ int main (int argc, char **argv) {
|
|||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status;
|
||||
|
||||
srand(50);
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
|
@ -163,18 +196,23 @@ int main (int argc, char **argv) {
|
|||
context = CL_CHECK2(clCreateContext(NULL, 1, &device_id, NULL, NULL, &_err));
|
||||
|
||||
// Allocate device buffers
|
||||
size_t nbytes = size * size * sizeof(TYPE);
|
||||
size_t nbytes = num_points * sizeof(TYPE);
|
||||
a_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
b_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_READ_ONLY, nbytes, NULL, &_err));
|
||||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err));
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
}
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
@ -194,23 +232,17 @@ int main (int argc, char **argv) {
|
|||
h_b = (TYPE*)malloc(nbytes);
|
||||
h_c = (TYPE*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
for (int i = 0; i < (size * size); ++i) {
|
||||
#ifdef USE_FLOAT
|
||||
h_a[i] = (float)rand() / (float)RAND_MAX;
|
||||
h_b[i] = (float)rand() / (float)RAND_MAX;
|
||||
#else
|
||||
h_a[i] = rand();
|
||||
h_b[i] = rand();
|
||||
#endif
|
||||
h_c[i] = 0xdeadbeef;
|
||||
// Generate input values
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
h_a[i] = Comparator<TYPE>::generate();
|
||||
h_b[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
size_t global_offset[2] = {0, 0};
|
||||
size_t global_work_size[2] = {size, size};
|
||||
size_t local_work_size[2] = {1, 1};
|
||||
|
||||
std::vector<float> ref_vec(size * size);
|
||||
std::vector<float> ref_vec(num_points);
|
||||
|
||||
// reference generation
|
||||
size_t num_groups_y = global_work_size[1] / local_work_size[1];
|
||||
|
@ -228,12 +260,7 @@ int main (int argc, char **argv) {
|
|||
TYPE acc = 0;
|
||||
for (int k = 0; k < width; k++) {
|
||||
acc += h_a[k * width + r] * h_b[c * width + k];
|
||||
}
|
||||
/*#ifdef USE_FLOAT
|
||||
printf("*** r=%d, c=%d, v=%f\n", r, c, acc);
|
||||
#else
|
||||
printf("*** r=%d, c=%d, v=%d\n", r, c, acc);
|
||||
#endif*/
|
||||
}
|
||||
ref_vec[c * width + r] = acc;
|
||||
}
|
||||
}
|
||||
|
@ -260,14 +287,8 @@ int main (int argc, char **argv) {
|
|||
|
||||
printf("Verify result\n");
|
||||
int errors = 0;
|
||||
for (int i = 0; i < (size * size); i++) {
|
||||
if (!compare_equal(h_c[i], ref_vec[i])) {
|
||||
if (errors < 100)
|
||||
#ifdef USE_FLOAT
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", i, ref_vec[i], h_c[i]);
|
||||
#else
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", i, ref_vec[i], h_c[i]);
|
||||
#endif
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
if (!Comparator<TYPE>::compare(h_c[i], ref_vec[i], i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -91,15 +91,11 @@ int coo_to_jds(char *mtx_filename, int pad_rows, int warp_size, int pack_size,
|
|||
if ((f = fopen(mtx_filename, "r")) == NULL)
|
||||
exit(1);
|
||||
|
||||
printf("OK**\n");
|
||||
|
||||
if (mm_read_banner(f, &matcode) != 0) {
|
||||
printf("Could not process Matrix Market banner.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
printf("OK**\n");
|
||||
|
||||
/* This is how one can screen matrix types if their application */
|
||||
/* only supports a subset of the Matrix Market data types. */
|
||||
|
||||
|
|
|
@ -148,7 +148,6 @@ int main(int argc, char **argv) {
|
|||
// &h_data, &h_indices, &h_ptr,
|
||||
// &h_perm, &h_nzcnt);
|
||||
int col_count;
|
||||
printf("OK--\n");
|
||||
coo_to_jds(parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx
|
||||
1, // row padding
|
||||
pad, // warp size
|
||||
|
@ -159,8 +158,6 @@ int main(int argc, char **argv) {
|
|||
&h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim,
|
||||
&len, &nzcnt_len, &depth);
|
||||
|
||||
printf("OK++\n");
|
||||
|
||||
// pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
h_Ax_vector = (float *)malloc(sizeof(float) * dim);
|
||||
h_x_vector = (float *)malloc(sizeof(float) * dim);
|
||||
|
|
|
@ -157,9 +157,7 @@ int main(int argc, char** argv) {
|
|||
CHECK_ERROR("clBuildProgram")
|
||||
|
||||
cl_kernel clKernel = clCreateKernel(clProgram,"naive_kernel",&clStatus);
|
||||
CHECK_ERROR("clCreateKernel")
|
||||
|
||||
printf("OK+\n");
|
||||
CHECK_ERROR("clCreateKernel")
|
||||
|
||||
//host data
|
||||
float *h_A0;
|
||||
|
@ -177,15 +175,11 @@ int main(int argc, char** argv) {
|
|||
h_Anext=(float*)malloc(sizeof(float)*size);
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
//FILE *fp = fopen(parameters->inpFiles[0], "rb");
|
||||
printf("OK+\n");
|
||||
read_data(h_A0, nx,ny,nz,NULL);
|
||||
printf("OK+\n");
|
||||
//fclose(fp);
|
||||
memcpy (h_Anext,h_A0,sizeof(float)*size);
|
||||
//fclose(fp);
|
||||
memcpy (h_Anext,h_A0,sizeof(float)*size);
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||
|
||||
printf("OK+\n");
|
||||
|
||||
//memory allocation
|
||||
d_A0 = clCreateBuffer(clContext,CL_MEM_READ_WRITE,size*sizeof(float),NULL,&clStatus);
|
||||
|
@ -201,18 +195,16 @@ int main(int argc, char** argv) {
|
|||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE);
|
||||
|
||||
printf("OK+\n");
|
||||
|
||||
//only use 1D thread block
|
||||
int tx = 128;
|
||||
int tx = 128;
|
||||
size_t block[3] = {tx,1,1};
|
||||
size_t grid[3] = {(nx-2+tx-1)/tx*tx,ny-2,nz-2};
|
||||
//size_t grid[3] = {nx-2,ny-2,nz-2};
|
||||
size_t offset[3] = {1,1,1};
|
||||
printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
|
||||
//size_t grid[3] = {nx-2,ny-2,nz-2};
|
||||
size_t offset[3] = {1,1,1};
|
||||
printf("grid size in x/y/z = %d %d %d\n",grid[0],grid[1],grid[2]);
|
||||
printf("block size in x/y/z = %d %d %d\n",block[0],block[1],block[2]);
|
||||
|
||||
printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
|
||||
printf ("blocks = %d\n", (grid[0]/block[0])*(grid[1]/block[1])*(grid[2]*block[2]));
|
||||
|
||||
clStatus = clSetKernelArg(clKernel,0,sizeof(float),(void*)&c0);
|
||||
clStatus = clSetKernelArg(clKernel,1,sizeof(float),(void*)&c1);
|
||||
|
@ -226,14 +218,10 @@ int main(int argc, char** argv) {
|
|||
//main execution
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_KERNEL);
|
||||
|
||||
printf("OK+0\n");
|
||||
|
||||
int t;
|
||||
for(t=0;t<iteration;t++)
|
||||
{
|
||||
clStatus = clEnqueueNDRangeKernel(clCommandQueue,clKernel,3,NULL,grid,block,0,NULL,NULL);
|
||||
printf("OK+0\n");
|
||||
|
||||
//printf("iteration %d\n",t)
|
||||
CHECK_ERROR("clEnqueueNDRangeKernel")
|
||||
|
||||
|
@ -244,11 +232,9 @@ int main(int argc, char** argv) {
|
|||
clStatus = clSetKernelArg(clKernel,3,sizeof(cl_mem),(void*)&d_Anext);
|
||||
}
|
||||
|
||||
printf("OK+1\n");
|
||||
|
||||
cl_mem d_temp = d_A0;
|
||||
d_A0 = d_Anext;
|
||||
d_Anext = d_temp;
|
||||
cl_mem d_temp = d_A0;
|
||||
d_A0 = d_Anext;
|
||||
d_Anext = d_temp;
|
||||
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_COPY);
|
||||
clStatus = clEnqueueReadBuffer(clCommandQueue,d_Anext,CL_TRUE,0,size*sizeof(float),h_Anext,0,NULL,NULL);
|
||||
|
@ -260,8 +246,6 @@ int main(int argc, char** argv) {
|
|||
clStatus = clReleaseCommandQueue(clCommandQueue);
|
||||
clStatus = clReleaseContext(clContext);
|
||||
CHECK_ERROR("clReleaseContext")
|
||||
|
||||
printf("OK+2\n");
|
||||
|
||||
if (parameters->outFile) {
|
||||
pb_SwitchToTimer(&timers, pb_TimerID_IO);
|
||||
|
|
|
@ -122,11 +122,6 @@ int main (int argc, char **argv) {
|
|||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
cl_int binary_status;
|
||||
|
||||
// read kernel binary from file
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
|
||||
// Getting platform and device information
|
||||
CL_CHECK(clGetPlatformIDs(1, &platform_id, NULL));
|
||||
|
@ -142,13 +137,17 @@ int main (int argc, char **argv) {
|
|||
c_memobj = CL_CHECK2(clCreateBuffer(context, CL_MEM_WRITE_ONLY, nbytes, NULL, &_err));
|
||||
|
||||
printf("Create program from kernel source\n");
|
||||
cl_int _err;
|
||||
program = clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, &binary_status, &_err);
|
||||
if (program == NULL) {
|
||||
cleanup();
|
||||
#ifdef HOSTGPU
|
||||
if (0 != read_kernel_file("kernel.cl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
}
|
||||
program = CL_CHECK2(clCreateProgramWithSource(
|
||||
context, 1, (const char**)&kernel_bin, &kernel_size, &_err));
|
||||
#else
|
||||
if (0 != read_kernel_file("kernel.pocl", &kernel_bin, &kernel_size))
|
||||
return -1;
|
||||
program = CL_CHECK2(clCreateProgramWithBinary(
|
||||
context, 1, &device_id, &kernel_size, (const uint8_t**)&kernel_bin, NULL, &_err));
|
||||
#endif
|
||||
|
||||
// Build program
|
||||
CL_CHECK(clBuildProgram(program, 1, &device_id, NULL, NULL, NULL));
|
||||
|
@ -166,12 +165,10 @@ int main (int argc, char **argv) {
|
|||
h_b = (float*)malloc(nbytes);
|
||||
h_c = (float*)malloc(nbytes);
|
||||
|
||||
// Initialize values for array members.
|
||||
// Generate input values
|
||||
for (int i = 0; i < size; ++i) {
|
||||
h_a[i] = sinf(i)*sinf(i);
|
||||
h_b[i] = cosf(i)*cosf(i);
|
||||
h_c[i] = 0xdeadbeef;
|
||||
//printf("*** [%d]: h_a=%f, h_b=%f\n", i, h_a[i], h_b[i]);
|
||||
}
|
||||
|
||||
// Creating command queue
|
||||
|
|
|
@ -10,6 +10,7 @@ all:
|
|||
$(MAKE) -C fence
|
||||
$(MAKE) -C no_mf_ext
|
||||
$(MAKE) -C no_smem
|
||||
$(MAKE) -C tensor
|
||||
|
||||
run-simx:
|
||||
$(MAKE) -C basic run-simx
|
||||
|
@ -23,6 +24,7 @@ run-simx:
|
|||
$(MAKE) -C fence run-simx
|
||||
$(MAKE) -C no_mf_ext run-simx
|
||||
$(MAKE) -C no_smem run-simx
|
||||
$(MAKE) -C tensor run-simx
|
||||
|
||||
run-rtlsim:
|
||||
$(MAKE) -C basic run-rtlsim
|
||||
|
@ -36,6 +38,7 @@ run-rtlsim:
|
|||
$(MAKE) -C fence run-rtlsim
|
||||
$(MAKE) -C no_mf_ext run-rtlsim
|
||||
$(MAKE) -C no_smem run-rtlsim
|
||||
$(MAKE) -C tensor run-rtlsim
|
||||
|
||||
run-opae:
|
||||
$(MAKE) -C basic run-opae
|
||||
|
@ -49,6 +52,7 @@ run-opae:
|
|||
$(MAKE) -C fence run-opae
|
||||
$(MAKE) -C no_mf_ext run-opae
|
||||
$(MAKE) -C no_smem run-opae
|
||||
$(MAKE) -C tensor run-opae
|
||||
|
||||
clean:
|
||||
$(MAKE) -C basic clean
|
||||
|
@ -62,6 +66,7 @@ clean:
|
|||
$(MAKE) -C fence clean
|
||||
$(MAKE) -C no_mf_ext clean
|
||||
$(MAKE) -C no_smem clean
|
||||
$(MAKE) -C tensor clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C basic clean-all
|
||||
|
@ -75,3 +80,4 @@ clean-all:
|
|||
$(MAKE) -C fence clean-all
|
||||
$(MAKE) -C no_mf_ext clean-all
|
||||
$(MAKE) -C no_smem clean-all
|
||||
$(MAKE) -C tensor clean-all
|
||||
|
|
|
@ -262,11 +262,8 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (void*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
std::cout << "run kernel test" << std::endl;
|
||||
RT_CHECK(run_kernel_test(kernel_arg, buf_size, num_points));
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
XLEN ?= 32
|
||||
|
||||
TOOLDIR ?= /opt
|
||||
|
||||
TARGET ?= opaesim
|
||||
|
||||
XRT_SYN_DIR ?= ../../../hw/syn/xilinx/xrt
|
||||
XRT_DEVICE_INDEX ?= 0
|
||||
|
||||
ifeq ($(XLEN),64)
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv64-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv64-gnu-toolchain
|
||||
VX_CFLAGS += -march=rv64imafd -mabi=lp64d
|
||||
STARTUP_ADDR ?= 0x180000000
|
||||
else
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
RISCV_TOOLCHAIN_PATH ?= $(TOOLDIR)/riscv-gnu-toolchain
|
||||
VX_CFLAGS += -march=rv32imaf -mabi=ilp32f
|
||||
STARTUP_ADDR ?= 0x80000000
|
||||
endif
|
||||
|
@ -23,7 +25,7 @@ VORTEX_KN_PATH ?= $(realpath ../../../kernel)
|
|||
|
||||
FPGA_BIN_DIR ?= $(VORTEX_RT_PATH)/opae
|
||||
|
||||
LLVM_VORTEX ?= /opt/llvm-vortex
|
||||
LLVM_VORTEX ?= $(TOOLDIR)/llvm-vortex
|
||||
|
||||
LLVM_CFLAGS += --sysroot=$(RISCV_SYSROOT)
|
||||
LLVM_CFLAGS += --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH)
|
||||
|
|
|
@ -3,6 +3,10 @@
|
|||
|
||||
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
|
||||
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_tasks;
|
||||
uint32_t task_size;
|
||||
|
@ -11,4 +15,4 @@ typedef struct {
|
|||
uint64_t dst_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -4,11 +4,11 @@
|
|||
#include "common.h"
|
||||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t count = arg->task_size;
|
||||
int32_t* src0_ptr = (int32_t*)arg->src0_addr;
|
||||
int32_t* src1_ptr = (int32_t*)arg->src1_addr;
|
||||
int32_t* dst_ptr = (int32_t*)arg->dst_addr;
|
||||
auto src0_ptr = reinterpret_cast<TYPE*>(arg->src0_addr);
|
||||
auto src1_ptr = reinterpret_cast<TYPE*>(arg->src1_addr);
|
||||
auto dst_ptr = reinterpret_cast<TYPE*>(arg->dst_addr);
|
||||
|
||||
uint32_t count = arg->task_size;
|
||||
uint32_t offset = task_id * count;
|
||||
|
||||
for (uint32_t i = 0; i < count; ++i) {
|
||||
|
|
|
@ -5,6 +5,8 @@
|
|||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
|
@ -17,10 +19,61 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Type>
|
||||
class Comparator {};
|
||||
|
||||
template <>
|
||||
class Comparator<int> {
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Comparator<float> {
|
||||
private:
|
||||
union Float_t { float f; int i; };
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
if (d > FLOAT_ULP) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t count = 0;
|
||||
uint32_t count = 16;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<TYPE> source_data;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
|
@ -79,13 +132,11 @@ int run_test(const kernel_arg_t& kernel_arg,
|
|||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
int ref = i + i;
|
||||
int cur = buf_ptr[i];
|
||||
if (cur != ref) {
|
||||
std::cout << "error at result #" << std::dec << i
|
||||
<< std::hex << ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
|
||||
auto ref = source_data[2 * i + 0] + source_data[2 * i + 1];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
|
@ -103,9 +154,7 @@ int main(int argc, char *argv[]) {
|
|||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
if (count == 0) {
|
||||
count = 1;
|
||||
}
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
|
@ -118,8 +167,9 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
uint32_t num_tasks = num_cores * num_warps * num_threads;
|
||||
uint32_t num_points = count * num_tasks;
|
||||
uint32_t buf_size = num_points * sizeof(int32_t);
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "number of points: " << num_points << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
|
@ -147,18 +197,21 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// generate source data
|
||||
source_data.resize(2 * num_points);
|
||||
for (uint32_t i = 0; i < source_data.size(); ++i) {
|
||||
source_data[i] = Comparator<TYPE>::generate();
|
||||
}
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i-1;
|
||||
buf_ptr[i] = source_data[2 * i + 0];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src0_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
@ -166,23 +219,18 @@ int main(int argc, char *argv[]) {
|
|||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = i+1;
|
||||
buf_ptr[i] = source_data[2 * i + 1];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.src1_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
{
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
auto buf_ptr = (int32_t*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.dst_addr, staging_buf.data(), buf_size));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, buf_size, num_points));
|
||||
|
|
|
@ -233,11 +233,8 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer
|
||||
{
|
||||
|
|
|
@ -147,11 +147,8 @@ int main(int argc, char *argv[]) {
|
|||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
|
|
|
@ -190,13 +190,10 @@ int main(int argc, char *argv[]) {
|
|||
staging_buf.resize(staging_buf_size);
|
||||
|
||||
// upload kernel argument
|
||||
{
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload test address data
|
||||
{
|
||||
std::cout << "upload test address data" << std::endl;
|
||||
|
|
|
@ -236,13 +236,10 @@ int main(int argc, char *argv[]) {
|
|||
staging_buf.resize(staging_buf_size);
|
||||
|
||||
// upload kernel argument
|
||||
{
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload address buffer" << std::endl;
|
||||
|
|
|
@ -136,13 +136,10 @@ int main(int argc, char *argv[]) {
|
|||
staging_buf.resize(alloc_size);
|
||||
|
||||
// upload kernel argument
|
||||
{
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
|
|
|
@ -135,13 +135,10 @@ int main(int argc, char *argv[]) {
|
|||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
staging_buf.resize(alloc_size);
|
||||
|
||||
// upload kernel argument
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
|
|
|
@ -110,13 +110,10 @@ int main(int argc, char *argv[]) {
|
|||
staging_buf.resize(alloc_size);
|
||||
|
||||
// upload kernel argument
|
||||
{
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
auto buf_ptr = (void*)staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
|
|
|
@ -3,11 +3,7 @@
|
|||
|
||||
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
|
||||
|
||||
#define FP_ENABLE
|
||||
|
||||
#ifdef FP_ENABLE
|
||||
#define TYPE float
|
||||
#else
|
||||
#ifndef TYPE
|
||||
#define TYPE int
|
||||
#endif
|
||||
|
||||
|
@ -17,4 +13,4 @@ typedef struct {
|
|||
uint64_t dst_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -5,14 +5,14 @@
|
|||
|
||||
void kernel_body(int task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
uint32_t num_points = arg->num_points;
|
||||
TYPE* src_ptr = (TYPE*)arg->src_addr;
|
||||
TYPE* dst_ptr = (TYPE*)arg->dst_addr;
|
||||
auto src_ptr = (TYPE*)arg->src_addr;
|
||||
auto dst_ptr = (TYPE*)arg->dst_addr;
|
||||
|
||||
TYPE ref_value = src_ptr[task_id];
|
||||
auto ref_value = src_ptr[task_id];
|
||||
|
||||
uint32_t pos = 0;
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
TYPE cur_value = src_ptr[i];
|
||||
auto cur_value = src_ptr[i];
|
||||
pos += (cur_value < ref_value) || ((cur_value == ref_value) && (i < task_id));
|
||||
}
|
||||
dst_ptr[pos] = ref_value;
|
||||
|
|
|
@ -66,8 +66,8 @@ void gen_input_data(uint32_t num_points) {
|
|||
src_data.resize(num_points);
|
||||
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
float r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
TYPE value = r * num_points;
|
||||
auto r = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
auto value = static_cast<TYPE>(r * num_points);
|
||||
src_data[i] = value;
|
||||
std::cout << std::dec << i << ": value=" << value << std::endl;
|
||||
}
|
||||
|
@ -172,19 +172,16 @@ int main(int argc, char *argv[]) {
|
|||
{
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t staging_buf_size = std::max<uint32_t>(src_buf_size,
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
sizeof(kernel_arg_t)));
|
||||
std::max<uint32_t>(dst_buf_size,
|
||||
sizeof(kernel_arg_t)));
|
||||
staging_buf.resize(staging_buf_size);
|
||||
}
|
||||
|
||||
// upload kernel argument
|
||||
{
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
auto buf_ptr = staging_buf.data();
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
}
|
||||
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// upload source buffer
|
||||
{
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
|
|
9
tests/regression/tensor/Makefile
Normal file
9
tests/regression/tensor/Makefile
Normal file
|
@ -0,0 +1,9 @@
|
|||
PROJECT = tensor
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
VX_SRCS = kernel.cpp
|
||||
|
||||
OPTS ?= -n32
|
||||
|
||||
include ../common.mk
|
18
tests/regression/tensor/common.h
Normal file
18
tests/regression/tensor/common.h
Normal file
|
@ -0,0 +1,18 @@
|
|||
#ifndef _COMMON_H_
|
||||
#define _COMMON_H_
|
||||
|
||||
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
|
||||
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
uint32_t num_tasks;
|
||||
uint32_t size;
|
||||
uint64_t A_addr;
|
||||
uint64_t B_addr;
|
||||
uint64_t C_addr;
|
||||
} kernel_arg_t;
|
||||
|
||||
#endif
|
41
tests/regression/tensor/kernel.cpp
Normal file
41
tests/regression/tensor/kernel.cpp
Normal file
|
@ -0,0 +1,41 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include <vx_spawn.h>
|
||||
#include "common.h"
|
||||
|
||||
inline char is_log2(uint32_t x) {
|
||||
return ((x & (x-1)) == 0);
|
||||
}
|
||||
|
||||
inline uint32_t log2_fast(uint32_t x) {
|
||||
return 31 - __builtin_clz (x);
|
||||
}
|
||||
|
||||
void kernel_body(uint32_t task_id, kernel_arg_t* __UNIFORM__ arg) {
|
||||
auto A = reinterpret_cast<TYPE*>(arg->A_addr);
|
||||
auto B = reinterpret_cast<TYPE*>(arg->B_addr);
|
||||
auto C = reinterpret_cast<TYPE*>(arg->C_addr);
|
||||
auto size = arg->size;
|
||||
|
||||
uint32_t row, col;
|
||||
if (is_log2(size)) {
|
||||
uint32_t log_size = log2_fast(size);
|
||||
row = task_id >> log_size;
|
||||
col = task_id & (size-1);
|
||||
} else {
|
||||
row = task_id / size;
|
||||
col = task_id % size;
|
||||
}
|
||||
|
||||
TYPE sum (0);
|
||||
for (int e = 0; e < size; ++e) {
|
||||
sum += A[row * size + e] * B[e * size + col];
|
||||
}
|
||||
C[row * size + col] = sum;
|
||||
}
|
||||
|
||||
int main() {
|
||||
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
vx_spawn_tasks(arg->num_tasks, (vx_spawn_tasks_cb)kernel_body, arg);
|
||||
return 0;
|
||||
}
|
254
tests/regression/tensor/main.cpp
Normal file
254
tests/regression/tensor/main.cpp
Normal file
|
@ -0,0 +1,254 @@
|
|||
#include <iostream>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <vector>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Type>
|
||||
class Comparator {};
|
||||
|
||||
template <>
|
||||
class Comparator<int> {
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "integer";
|
||||
}
|
||||
static int generate() {
|
||||
return rand();
|
||||
}
|
||||
static bool compare(int a, int b, int index, int errors) {
|
||||
if (a != b) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%d, actual=%d\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
class Comparator<float> {
|
||||
public:
|
||||
static const char* type_str() {
|
||||
return "float";
|
||||
}
|
||||
static int generate() {
|
||||
return static_cast<float>(rand()) / RAND_MAX;
|
||||
}
|
||||
static bool compare(float a, float b, int index, int errors) {
|
||||
union fi_t { float f; int32_t i; };
|
||||
fi_t fa, fb;
|
||||
fa.f = a;
|
||||
fb.f = b;
|
||||
auto d = std::abs(fa.i - fb.i);
|
||||
if (d > FLOAT_ULP) {
|
||||
if (errors < 100) {
|
||||
printf("*** error: [%d] expected=%f, actual=%f\n", index, a, b);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
static void matmul_cpu(TYPE* out, const TYPE* A, const TYPE* B, uint32_t width, uint32_t height) {
|
||||
for (uint32_t row = 0; row < height; ++row) {
|
||||
for (uint32_t col = 0; col < width; ++col) {
|
||||
TYPE sum(0);
|
||||
for (uint32_t e = 0; e < width; ++e) {
|
||||
sum += A[row * width + e] * B[e * width + col];
|
||||
}
|
||||
out[row * width + col] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
uint32_t size = 32;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
std::vector<uint8_t> staging_buf;
|
||||
kernel_arg_t kernel_arg = {};
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Test." << std::endl;
|
||||
std::cout << "Usage: [-k: kernel] [-n size] [-h: help]" << std::endl;
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:k:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (device) {
|
||||
vx_mem_free(device, kernel_arg.A_addr);
|
||||
vx_mem_free(device, kernel_arg.B_addr);
|
||||
vx_mem_free(device, kernel_arg.C_addr);
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
const std::vector<TYPE>& refs) {
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, VX_MAX_TIMEOUT));
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(device, staging_buf.data(), kernel_arg.C_addr, buf_size));
|
||||
|
||||
// verify result
|
||||
std::cout << "verify result" << std::endl;
|
||||
{
|
||||
int errors = 0;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < refs.size(); ++i) {
|
||||
auto ref = refs[i];
|
||||
auto cur = buf_ptr[i];
|
||||
if (!Comparator<TYPE>::compare(cur, ref, i, errors)) {
|
||||
++errors;
|
||||
}
|
||||
}
|
||||
if (errors != 0) {
|
||||
std::cout << "Found " << std::dec << errors << " errors!" << std::endl;
|
||||
std::cout << "FAILED!" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::srand(50);
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
uint32_t num_points = size * size;
|
||||
uint32_t buf_size = num_points * sizeof(TYPE);
|
||||
|
||||
std::cout << "data type: " << Comparator<TYPE>::type_str() << std::endl;
|
||||
std::cout << "matrix size: " << size << "x" << size << std::endl;
|
||||
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.A_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.B_addr));
|
||||
RT_CHECK(vx_mem_alloc(device, buf_size, VX_MEM_TYPE_GLOBAL, &kernel_arg.C_addr));
|
||||
|
||||
kernel_arg.num_tasks = num_points;
|
||||
kernel_arg.size = size;
|
||||
|
||||
std::cout << "dev_src0=0x" << std::hex << kernel_arg.A_addr << std::endl;
|
||||
std::cout << "dev_src1=0x" << std::hex << kernel_arg.B_addr << std::endl;
|
||||
std::cout << "dev_dst=0x" << std::hex << kernel_arg.C_addr << std::endl;
|
||||
|
||||
// allocate staging buffer
|
||||
std::cout << "allocate staging buffer" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(buf_size, sizeof(kernel_arg_t));
|
||||
staging_buf.resize(alloc_size);
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
memcpy(staging_buf.data(), &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(device, KERNEL_ARG_DEV_MEM_ADDR, staging_buf.data(), sizeof(kernel_arg_t)));
|
||||
|
||||
// generate source data
|
||||
std::vector<TYPE> src_A(num_points);
|
||||
std::vector<TYPE> src_B(num_points);
|
||||
std::vector<TYPE> refs(num_points);
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
auto a = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
auto b = static_cast<float>(std::rand()) / RAND_MAX;
|
||||
src_A[i] = static_cast<TYPE>(a * size);
|
||||
src_B[i] = static_cast<TYPE>(b * size);
|
||||
}
|
||||
matmul_cpu(refs.data(), src_A.data(), src_B.data(), size, size);
|
||||
|
||||
// upload source buffer0
|
||||
{
|
||||
std::cout << "upload source buffer0" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = src_A[i];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.A_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// upload source buffer1
|
||||
{
|
||||
std::cout << "upload source buffer1" << std::endl;
|
||||
auto buf_ptr = (TYPE*)staging_buf.data();
|
||||
for (uint32_t i = 0; i < num_points; ++i) {
|
||||
buf_ptr[i] = src_B[i];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.B_addr, staging_buf.data(), buf_size));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
memset(staging_buf.data(), 0, num_points * sizeof(TYPE));
|
||||
RT_CHECK(vx_copy_to_dev(device, kernel_arg.C_addr, staging_buf.data(), buf_size));
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, buf_size, refs));
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
30
tests/unittest/common.mk
Normal file
30
tests/unittest/common.mk
Normal file
|
@ -0,0 +1,30 @@
|
|||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I$(VORTEX_RT_PATH)/common
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run:
|
||||
./$(PROJECT)
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.elf *.bin *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
|
@ -1,34 +1,5 @@
|
|||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
CXXFLAGS += -std=c++11 -Wall -Wextra -pedantic -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -I$(VORTEX_RT_PATH)/common
|
||||
|
||||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
||||
PROJECT = vx_malloc
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: $(PROJECT)
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run:
|
||||
./$(PROJECT)
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.elf *.bin *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
include ../common.mk
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue