mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
fixed lpm_mult parameters, ram init filepath
This commit is contained in:
parent
dccea80b68
commit
42e3b6c45d
36 changed files with 738 additions and 495 deletions
|
@ -5,12 +5,12 @@
|
|||
#include <limits>
|
||||
|
||||
union Float_t {
|
||||
float f;
|
||||
int32_t i;
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
uint32_t mantissa : 23;
|
||||
uint32_t exponent : 8;
|
||||
uint32_t sign : 1;
|
||||
uint32_t man : 23;
|
||||
uint32_t exp : 8;
|
||||
uint32_t sign : 1;
|
||||
} parts;
|
||||
};
|
||||
|
||||
|
|
|
@ -4,19 +4,22 @@ FPGA_BUILD_DIR=build_fpga
|
|||
|
||||
all: ase-1c
|
||||
|
||||
sources.txt:
|
||||
sources.txt:
|
||||
./gen_sources.sh > sources.txt
|
||||
|
||||
gen_sources: sources.txt
|
||||
|
||||
ase-1c: setup-ase-1c gen_sources
|
||||
make -C $(ASE_BUILD_DIR)_1c
|
||||
cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_1c/work
|
||||
|
||||
ase-2c: setup-ase-2c gen_sources
|
||||
make -C $(ASE_BUILD_DIR)_2c
|
||||
cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_2c/work
|
||||
|
||||
ase-4c: setup-ase-4c gen_sources
|
||||
make -C $(ASE_BUILD_DIR)_4c
|
||||
cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_3c/work
|
||||
|
||||
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
|
||||
|
||||
|
@ -35,12 +38,15 @@ $(ASE_BUILD_DIR)_4c/Makefile: sources.txt
|
|||
|
||||
fpga-1c: setup-fpga-1c gen_sources
|
||||
cd $(FPGA_BUILD_DIR)_1c && qsub-synth
|
||||
cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_1c
|
||||
|
||||
fpga-2c: setup-fpga-2c gen_sources
|
||||
cd $(FPGA_BUILD_DIR)_2c && qsub-synth
|
||||
cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_2c
|
||||
|
||||
fpga-4c: setup-fpga-4c gen_sources
|
||||
cd $(FPGA_BUILD_DIR)_4c && qsub-synth
|
||||
cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_4c
|
||||
|
||||
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf
|
||||
|
||||
|
|
|
@ -62,7 +62,7 @@ make ase
|
|||
# tests
|
||||
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256
|
||||
./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16
|
||||
./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n 1 -s4 -e4
|
||||
./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4
|
||||
./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd
|
||||
|
||||
# modify "vsim_run.tcl" to dump VCD trace
|
||||
|
|
|
@ -13,6 +13,7 @@
|
|||
#+define+DBG_PRINT_DRAM
|
||||
#+define+DBG_PRINT_PIPELINE
|
||||
#+define+DBG_PRINT_OPAE
|
||||
#+define+DBG_CORE_REQ_INFO
|
||||
#+define+DBG_PRINT_SCOPE
|
||||
|
||||
vortex_afu.json
|
||||
|
|
|
@ -62,7 +62,7 @@ module VX_commit #(
|
|||
fflags_r <= fflags;
|
||||
has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
|
||||
wid_r <= fpu_commit_if.wid;
|
||||
num_commits_r <= num_commits;
|
||||
num_commits_r <= (num_commits << $clog2(`NUM_THREADS));
|
||||
end
|
||||
|
||||
assign cmt_to_csr_if.valid = csr_update_r;
|
||||
|
|
|
@ -59,6 +59,8 @@
|
|||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
`define FPU_FAST
|
||||
|
||||
// Device identification
|
||||
`define VENDOR_ID 0
|
||||
`define ARCHITECTURE_ID 0
|
||||
|
@ -74,12 +76,12 @@
|
|||
`define LATENCY_FNONCOMP 1
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FMADD
|
||||
`define LATENCY_FMADD 1
|
||||
`ifndef LATENCY_FADDMUL
|
||||
`define LATENCY_FADDMUL 3
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FNMADD
|
||||
`define LATENCY_FNMADD 2
|
||||
`ifndef LATENCY_FMADD
|
||||
`define LATENCY_FMADD 4
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FDIV
|
||||
|
@ -98,16 +100,12 @@
|
|||
`define LATENCY_FTOI 3
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FADDMUL
|
||||
`define LATENCY_FADDMUL 2
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FDIVSQRT
|
||||
`define LATENCY_FDIVSQRT 2
|
||||
`define LATENCY_FDIVSQRT 10
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FCONV
|
||||
`define LATENCY_FCONV 2
|
||||
`define LATENCY_FCONV 3
|
||||
`endif
|
||||
|
||||
// CSR Addresses //////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -385,7 +385,7 @@
|
|||
`define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH
|
||||
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
|
||||
|
||||
`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)}
|
||||
`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)}
|
||||
|
||||
`include "VX_types.vh"
|
||||
|
||||
|
|
|
@ -51,32 +51,33 @@ module VX_mul_unit #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_result;
|
||||
wire is_mulw = (alu_op == `MUL_MUL);
|
||||
wire is_mulw_out;
|
||||
wire is_mul_in = (alu_op == `MUL_MUL);
|
||||
wire is_mul_out;
|
||||
wire stall_mul;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
|
||||
wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU) & alu_in1[i][31], alu_in1[i]};
|
||||
wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]};
|
||||
wire [63:0] mul_result_tmp;
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
wire [65:0] mul_result_tmp;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
VX_multiplier #(
|
||||
.WIDTHA(33),
|
||||
.WIDTHB(33),
|
||||
.WIDTHP(64),
|
||||
.WIDTHP(66),
|
||||
.SIGNED(1),
|
||||
.PIPELINE(`LATENCY_IMUL)
|
||||
.LATENCY(`LATENCY_IMUL)
|
||||
) multiplier (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.clk_en(~stall_mul),
|
||||
.enable(~stall_mul),
|
||||
.dataa(mul_in1),
|
||||
.datab(mul_in2),
|
||||
.result(mul_result_tmp)
|
||||
);
|
||||
|
||||
assign mul_result[i] = is_mulw_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
|
||||
assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
|
||||
end
|
||||
|
||||
wire [MULQ_BITS-1:0] mul_tag;
|
||||
|
@ -91,17 +92,17 @@ module VX_mul_unit #(
|
|||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall_mul),
|
||||
.in({mul_fire, tag_in, is_mulw}),
|
||||
.out({mul_valid_out, mul_tag, is_mulw_out})
|
||||
.in({mul_fire, tag_in, is_mul_in}),
|
||||
.out({mul_valid_out, mul_tag, is_mul_out})
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
|
||||
|
||||
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
|
||||
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
|
||||
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
|
||||
wire div_valid_in = mul_req_if.valid && is_div_op;
|
||||
wire div_valid_in = mul_req_if.valid && is_div_op;
|
||||
wire div_ready_in;
|
||||
wire div_ready_out;
|
||||
wire div_valid_out;
|
||||
|
|
6
hw/rtl/cache/VX_cache_config.vh
vendored
6
hw/rtl/cache/VX_cache_config.vh
vendored
|
@ -4,6 +4,10 @@
|
|||
`include "VX_platform.vh"
|
||||
`include "VX_scope.vh"
|
||||
|
||||
`ifdef DBG_CORE_REQ_INFO
|
||||
`include "VX_define.vh"
|
||||
`endif
|
||||
|
||||
`define REQ_TAG_WIDTH `MAX(CORE_TAG_WIDTH, SNP_REQ_TAG_WIDTH)
|
||||
|
||||
`define REQS_BITS `LOG2UP(NUM_REQUESTS)
|
||||
|
@ -77,4 +81,6 @@
|
|||
|
||||
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}
|
||||
|
||||
`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)}
|
||||
|
||||
`endif
|
||||
|
|
187
hw/rtl/fp_cores/VX_fp_addmul.v
Normal file
187
hw/rtl/fp_cores/VX_fp_addmul.v
Normal file
|
@ -0,0 +1,187 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
// VX_fp_addmul: multi-lane floating-point add / subtract / multiply unit.
// For every lane it computes an add, a subtract and a multiply result from
// the 32-bit operands dataa[i] and datab[i], and returns one of them selected
// by the do_sub / do_mul flags that travelled with the request.
// Under QUARTUS the three results come from twentynm_fp_mac primitives;
// otherwise they come from DPI calls made on each rising clock edge.
// A shift register of depth `LATENCY_FADDMUL carries tag, valid and the
// operation flags alongside the data.
// Parameters:
//   TAGW  - width in bits of the request tag passed from tag_in to tag_out
//   LANES - number of independent 32-bit lanes processed per request
module VX_fp_addmul #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
) (
|
||||
input wire clk,
|
||||
// reset is only connected to the tag/valid shift register below
input wire reset,
|
||||
|
||||
// input handshake: ready_in is driven from the internal enable
output wire ready_in,
|
||||
input wire valid_in,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
// operation select: do_mul takes priority over do_sub; neither set means add
input wire do_sub,
|
||||
input wire do_mul,
|
||||
|
||||
input wire [LANES-1:0][31:0] dataa,
|
||||
input wire [LANES-1:0][31:0] datab,
|
||||
output wire [LANES-1:0][31:0] result,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
// output handshake
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
|
||||
// stall when a valid result is presented but the consumer is not ready
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
// operation flags as they emerge from shift_reg; used to pick the lane result
reg do_sub_r, do_mul_r;
|
||||
|
||||
// per lane: all three results are produced, selection happens at the output
for (genvar i = 0; i < LANES; i++) begin
|
||||
|
||||
wire [31:0] result_add;
|
||||
wire [31:0] result_sub;
|
||||
wire [31:0] result_mul;
|
||||
|
||||
// hardware path: one twentynm_fp_mac instance per operation
`ifdef QUARTUS
|
||||
// adder ("sp_add", adder_subtract = "false") on ax / ay
twentynm_fp_mac mac_fp_add (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(dataa[i]),
|
||||
.ay(datab[i]),
|
||||
.az(),
|
||||
// only bit 0 of clk carries the clock; bit 0 of ena follows `enable`
// NOTE(review): presumably this freezes the MAC registers during a stall - confirm
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_add),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_add.operation_mode = "sp_add";
|
||||
defparam mac_fp_add.use_chainin = "false";
|
||||
defparam mac_fp_add.adder_subtract = "false";
|
||||
defparam mac_fp_add.ax_clock = "0";
|
||||
defparam mac_fp_add.ay_clock = "0";
|
||||
defparam mac_fp_add.az_clock = "none";
|
||||
defparam mac_fp_add.output_clock = "0";
|
||||
defparam mac_fp_add.accumulate_clock = "none";
|
||||
defparam mac_fp_add.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_add.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_add.mult_pipeline_clock = "none";
|
||||
defparam mac_fp_add.adder_input_clock = "0";
|
||||
defparam mac_fp_add.accum_adder_clock = "none";
|
||||
|
||||
// subtractor: same wiring as mac_fp_add but adder_subtract = "true"
twentynm_fp_mac mac_fp_sub (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(dataa[i]),
|
||||
.ay(datab[i]),
|
||||
.az(),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_sub),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_sub.operation_mode = "sp_add";
|
||||
defparam mac_fp_sub.use_chainin = "false";
|
||||
defparam mac_fp_sub.adder_subtract = "true";
|
||||
defparam mac_fp_sub.ax_clock = "0";
|
||||
defparam mac_fp_sub.ay_clock = "0";
|
||||
defparam mac_fp_sub.az_clock = "none";
|
||||
defparam mac_fp_sub.output_clock = "0";
|
||||
defparam mac_fp_sub.accumulate_clock = "none";
|
||||
defparam mac_fp_sub.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_sub.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_sub.mult_pipeline_clock = "none";
|
||||
defparam mac_fp_sub.adder_input_clock = "0";
|
||||
defparam mac_fp_sub.accum_adder_clock = "none";
|
||||
|
||||
// multiplier ("sp_mult"): operands enter on ay / az, ax is left unconnected
twentynm_fp_mac mac_fp_mul (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_mul),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_mul.operation_mode = "sp_mult";
|
||||
defparam mac_fp_mul.use_chainin = "false";
|
||||
defparam mac_fp_mul.adder_subtract = "false";
|
||||
defparam mac_fp_mul.ax_clock = "none";
|
||||
defparam mac_fp_mul.ay_clock = "0";
|
||||
defparam mac_fp_mul.az_clock = "0";
|
||||
defparam mac_fp_mul.output_clock = "0";
|
||||
defparam mac_fp_mul.accumulate_clock = "none";
|
||||
defparam mac_fp_mul.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_mul.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_mul.mult_pipeline_clock = "0";
|
||||
defparam mac_fp_mul.adder_input_clock = "none";
|
||||
defparam mac_fp_mul.accum_adder_clock = "none";
|
||||
// non-QUARTUS path: DPI calls on every rising clock edge; the first argument
// is a distinct index per operation and lane (op * LANES + i)
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_fadd(0*LANES+i, enable, dataa[i], datab[i], result_add);
|
||||
dpi_fsub(1*LANES+i, enable, dataa[i], datab[i], result_sub);
|
||||
dpi_fmul(2*LANES+i, enable, dataa[i], datab[i], result_mul);
|
||||
end
|
||||
`endif
|
||||
|
||||
// select by the delayed flags: multiply first, then subtract, else add
assign result[i] = do_mul_r ? result_mul : (do_sub_r ? result_sub : result_add);
|
||||
end
|
||||
|
||||
// delays {tag, valid, do_sub, do_mul} by `LATENCY_FADDMUL stages, advancing
// only while `enable` is high
// NOTE(review): depth presumably matches the datapath latency above - confirm
VX_shift_register #(
|
||||
.DATAW(TAGW + 1 + 1 + 1),
|
||||
.DEPTH(`LATENCY_FADDMUL)
|
||||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.in({tag_in, valid_in, do_sub, do_mul}),
|
||||
.out({tag_out, valid_out, do_sub_r, do_mul_r})
|
||||
);
|
||||
|
||||
// a new request is accepted whenever the pipeline is not stalled
assign ready_in = enable;
|
||||
|
||||
endmodule
|
|
@ -26,20 +26,21 @@ module VX_fp_div #(
|
|||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
`ifdef QUARTUS
|
||||
acl_fp_div fdiv (
|
||||
acl_fdiv fdiv (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (~stall),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.b (datab[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_fdiv(8*LANES+i, ~stall, valid_in, dataa[i], datab[i], result[i]);
|
||||
dpi_fdiv(8*LANES+i, enable, dataa[i], datab[i], result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
@ -50,11 +51,11 @@ module VX_fp_div #(
|
|||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall),
|
||||
.enable(enable),
|
||||
.in ({tag_in, valid_in}),
|
||||
.out({tag_out, valid_out})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -27,7 +27,7 @@ module VX_fp_fpga #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam NUM_FPC = 7;
|
||||
localparam NUM_FPC = 8;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
|
@ -40,29 +40,28 @@ module VX_fp_fpga #(
|
|||
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
|
||||
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
reg do_add, do_sub, do_mul;
|
||||
reg do_sub, do_mul;
|
||||
reg is_signed;
|
||||
|
||||
always @(*) begin
|
||||
core_select = 'x;
|
||||
do_add = 'x;
|
||||
do_sub = 'x;
|
||||
do_mul = 'x;
|
||||
is_signed = 'x;
|
||||
case (op_type)
|
||||
`FPU_ADD: begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end
|
||||
`FPU_SUB: begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end
|
||||
`FPU_MUL: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end
|
||||
`FPU_MADD: begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end
|
||||
`FPU_MSUB: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end
|
||||
`FPU_NMSUB: begin core_select = 2; do_sub = 1; end
|
||||
`FPU_NMADD: begin core_select = 2; do_sub = 0; end
|
||||
`FPU_DIV: begin core_select = 3; end
|
||||
`FPU_SQRT: begin core_select = 4; end
|
||||
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
|
||||
`FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end
|
||||
`FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end
|
||||
`FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end
|
||||
`FPU_MADD: begin core_select = 2; do_sub = 0; end
|
||||
`FPU_MSUB: begin core_select = 2; do_sub = 1; end
|
||||
`FPU_NMADD: begin core_select = 3; do_sub = 0; end
|
||||
`FPU_NMSUB: begin core_select = 3; do_sub = 1; end
|
||||
`FPU_DIV: begin core_select = 4; end
|
||||
`FPU_SQRT: begin core_select = 5; end
|
||||
`FPU_CVTWS: begin core_select = 6; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = 7; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
|
||||
default: begin core_select = 0; end
|
||||
endcase
|
||||
end
|
||||
|
@ -88,25 +87,42 @@ module VX_fp_fpga #(
|
|||
.valid_out (per_core_valid_out[0])
|
||||
);
|
||||
|
||||
VX_fp_addmul #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_addmul (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 1)),
|
||||
.ready_in (per_core_ready_in[1]),
|
||||
.tag_in (tag_in),
|
||||
.do_sub (do_sub),
|
||||
.do_mul (do_mul),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (per_core_result[1]),
|
||||
.tag_out (per_core_tag_out[1]),
|
||||
.ready_out (per_core_ready_out[1]),
|
||||
.valid_out (per_core_valid_out[1])
|
||||
);
|
||||
|
||||
VX_fp_madd #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_madd (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 1)),
|
||||
.ready_in (per_core_ready_in[1]),
|
||||
.tag_in (tag_in),
|
||||
.do_add (do_add),
|
||||
.valid_in (valid_in && (core_select == 2)),
|
||||
.ready_in (per_core_ready_in[2]),
|
||||
.tag_in (tag_in),
|
||||
.do_sub (do_sub),
|
||||
.do_mul (do_mul),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[1]),
|
||||
.tag_out (per_core_tag_out[1]),
|
||||
.ready_out (per_core_ready_out[1]),
|
||||
.valid_out (per_core_valid_out[1])
|
||||
.result (per_core_result[2]),
|
||||
.tag_out (per_core_tag_out[2]),
|
||||
.ready_out (per_core_ready_out[2]),
|
||||
.valid_out (per_core_valid_out[2])
|
||||
);
|
||||
|
||||
VX_fp_nmadd #(
|
||||
|
@ -115,17 +131,17 @@ module VX_fp_fpga #(
|
|||
) fp_nmadd (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 2)),
|
||||
.ready_in (per_core_ready_in[2]),
|
||||
.valid_in (valid_in && (core_select == 3)),
|
||||
.ready_in (per_core_ready_in[3]),
|
||||
.tag_in (tag_in),
|
||||
.do_sub (do_sub),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[2]),
|
||||
.tag_out (per_core_tag_out[2]),
|
||||
.ready_out (per_core_ready_out[2]),
|
||||
.valid_out (per_core_valid_out[2])
|
||||
.result (per_core_result[3]),
|
||||
.tag_out (per_core_tag_out[3]),
|
||||
.ready_out (per_core_ready_out[3]),
|
||||
.valid_out (per_core_valid_out[3])
|
||||
);
|
||||
|
||||
VX_fp_div #(
|
||||
|
@ -134,15 +150,15 @@ module VX_fp_fpga #(
|
|||
) fp_div (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 3)),
|
||||
.ready_in (per_core_ready_in[3]),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (per_core_result[3]),
|
||||
.tag_out (per_core_tag_out[3]),
|
||||
.ready_out (per_core_ready_out[3]),
|
||||
.valid_out (per_core_valid_out[3])
|
||||
.result (per_core_result[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
);
|
||||
|
||||
VX_fp_sqrt #(
|
||||
|
@ -151,14 +167,14 @@ module VX_fp_fpga #(
|
|||
) fp_sqrt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.valid_in (valid_in && (core_select == 5)),
|
||||
.ready_in (per_core_ready_in[5]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
.result (per_core_result[5]),
|
||||
.tag_out (per_core_tag_out[5]),
|
||||
.ready_out (per_core_ready_out[5]),
|
||||
.valid_out (per_core_valid_out[5])
|
||||
);
|
||||
|
||||
VX_fp_ftoi #(
|
||||
|
@ -167,15 +183,15 @@ module VX_fp_fpga #(
|
|||
) fp_ftoi (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 5)),
|
||||
.ready_in (per_core_ready_in[5]),
|
||||
.valid_in (valid_in && (core_select == 6)),
|
||||
.ready_in (per_core_ready_in[6]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[5]),
|
||||
.tag_out (per_core_tag_out[5]),
|
||||
.ready_out (per_core_ready_out[5]),
|
||||
.valid_out (per_core_valid_out[5])
|
||||
.result (per_core_result[6]),
|
||||
.tag_out (per_core_tag_out[6]),
|
||||
.ready_out (per_core_ready_out[6]),
|
||||
.valid_out (per_core_valid_out[6])
|
||||
);
|
||||
|
||||
VX_fp_itof #(
|
||||
|
@ -184,15 +200,15 @@ module VX_fp_fpga #(
|
|||
) fp_itof (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 6)),
|
||||
.ready_in (per_core_ready_in[6]),
|
||||
.valid_in (valid_in && (core_select == 7)),
|
||||
.ready_in (per_core_ready_in[7]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[6]),
|
||||
.tag_out (per_core_tag_out[6]),
|
||||
.ready_out (per_core_ready_out[6]),
|
||||
.valid_out (per_core_valid_out[6])
|
||||
.result (per_core_result[7]),
|
||||
.tag_out (per_core_tag_out[7]),
|
||||
.ready_out (per_core_ready_out[7]),
|
||||
.valid_out (per_core_valid_out[7])
|
||||
);
|
||||
|
||||
reg valid_out_n;
|
||||
|
|
|
@ -27,6 +27,7 @@ module VX_fp_ftoi #(
|
|||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
reg is_signed_r;
|
||||
|
||||
|
@ -36,25 +37,25 @@ module VX_fp_ftoi #(
|
|||
wire [31:0] result_u;
|
||||
|
||||
`ifdef QUARTUS
|
||||
acl_fp_ftoi ftoi (
|
||||
acl_ftoi ftoi (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (~stall),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_s)
|
||||
);
|
||||
|
||||
acl_fp_ftou ftou (
|
||||
acl_ftou ftou (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (~stall),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_u)
|
||||
);
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_ftoi(10*LANES+i, ~stall, valid_in, dataa[i], result_s);
|
||||
dpi_ftou(11*LANES+i, ~stall, valid_in, dataa[i], result_u);
|
||||
dpi_ftoi(10*LANES+i, enable, dataa[i], result_s);
|
||||
dpi_ftou(11*LANES+i, enable, dataa[i], result_u);
|
||||
end
|
||||
`endif
|
||||
|
||||
|
@ -67,11 +68,11 @@ module VX_fp_ftoi #(
|
|||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall),
|
||||
.enable(enable),
|
||||
.in ({tag_in, valid_in, is_signed}),
|
||||
.out({tag_out, valid_out, is_signed_r})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -27,6 +27,7 @@ module VX_fp_itof #(
|
|||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
reg is_signed_r;
|
||||
|
||||
|
@ -36,25 +37,25 @@ module VX_fp_itof #(
|
|||
wire [31:0] result_u;
|
||||
|
||||
`ifdef QUARTUS
|
||||
acl_fp_itof itof (
|
||||
acl_itof itof (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (~stall),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_s)
|
||||
);
|
||||
|
||||
acl_fp_utof utof (
|
||||
acl_utof utof (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (~stall),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result_u)
|
||||
);
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_itof(12*LANES+i, ~stall, valid_in, dataa[i], result_s);
|
||||
dpi_utof(13*LANES+i, ~stall, valid_in, dataa[i], result_u);
|
||||
dpi_itof(12*LANES+i, enable, dataa[i], result_s);
|
||||
dpi_utof(13*LANES+i, enable, dataa[i], result_u);
|
||||
end
|
||||
`endif
|
||||
|
||||
|
@ -67,11 +68,11 @@ module VX_fp_itof #(
|
|||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall),
|
||||
.enable(enable),
|
||||
.in ({tag_in, valid_in, is_signed}),
|
||||
.out({tag_out, valid_out, is_signed_r})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -16,9 +16,7 @@ module VX_fp_madd #(
|
|||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire do_add,
|
||||
input wire do_sub,
|
||||
input wire do_mul,
|
||||
input wire do_sub,
|
||||
|
||||
input wire [LANES-1:0][31:0] dataa,
|
||||
input wire [LANES-1:0][31:0] datab,
|
||||
|
@ -32,138 +30,16 @@ module VX_fp_madd #(
|
|||
);
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
reg do_add_r, do_sub_r, do_mul_r;
|
||||
reg do_sub_r;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
|
||||
wire [31:0] result_add;
|
||||
wire [31:0] result_sub;
|
||||
wire [31:0] result_mul;
|
||||
wire [31:0] result_madd;
|
||||
wire [31:0] result_msub;
|
||||
|
||||
`ifdef QUARTUS
|
||||
twentynm_fp_mac mac_fp_add (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(dataa[i]),
|
||||
.ay(datab[i]),
|
||||
.az(),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,~stall}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_add),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_add.operation_mode = "sp_add";
|
||||
defparam mac_fp_add.use_chainin = "false";
|
||||
defparam mac_fp_add.adder_subtract = "false";
|
||||
defparam mac_fp_add.ax_clock = "0";
|
||||
defparam mac_fp_add.ay_clock = "0";
|
||||
defparam mac_fp_add.az_clock = "none";
|
||||
defparam mac_fp_add.output_clock = "0";
|
||||
defparam mac_fp_add.accumulate_clock = "none";
|
||||
defparam mac_fp_add.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_add.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_add.mult_pipeline_clock = "none";
|
||||
defparam mac_fp_add.adder_input_clock = "0";
|
||||
defparam mac_fp_add.accum_adder_clock = "none";
|
||||
|
||||
twentynm_fp_mac mac_fp_sub (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(dataa[i]),
|
||||
.ay(datab[i]),
|
||||
.az(),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,~stall}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_sub),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_sub.operation_mode = "sp_add";
|
||||
defparam mac_fp_sub.use_chainin = "false";
|
||||
defparam mac_fp_sub.adder_subtract = "true";
|
||||
defparam mac_fp_sub.ax_clock = "0";
|
||||
defparam mac_fp_sub.ay_clock = "0";
|
||||
defparam mac_fp_sub.az_clock = "none";
|
||||
defparam mac_fp_sub.output_clock = "0";
|
||||
defparam mac_fp_sub.accumulate_clock = "none";
|
||||
defparam mac_fp_sub.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_sub.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_sub.mult_pipeline_clock = "none";
|
||||
defparam mac_fp_sub.adder_input_clock = "0";
|
||||
defparam mac_fp_sub.accum_adder_clock = "none";
|
||||
|
||||
twentynm_fp_mac mac_fp_mul (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
.chainin_overflow(),
|
||||
.chainin_invalid(),
|
||||
.chainin_underflow(),
|
||||
.chainin_inexact(),
|
||||
.ax(),
|
||||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,~stall}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
.overflow(),
|
||||
.invalid(),
|
||||
.underflow(),
|
||||
.inexact(),
|
||||
.chainout_overflow(),
|
||||
.chainout_invalid(),
|
||||
.chainout_underflow(),
|
||||
.chainout_inexact(),
|
||||
.resulta(result_mul),
|
||||
.chainout()
|
||||
);
|
||||
defparam mac_fp_mul.operation_mode = "sp_mult";
|
||||
defparam mac_fp_mul.use_chainin = "false";
|
||||
defparam mac_fp_mul.adder_subtract = "false";
|
||||
defparam mac_fp_mul.ax_clock = "none";
|
||||
defparam mac_fp_mul.ay_clock = "0";
|
||||
defparam mac_fp_mul.az_clock = "0";
|
||||
defparam mac_fp_mul.output_clock = "0";
|
||||
defparam mac_fp_mul.accumulate_clock = "none";
|
||||
defparam mac_fp_mul.ax_chainin_pl_clock = "none";
|
||||
defparam mac_fp_mul.accum_pipeline_clock = "none";
|
||||
defparam mac_fp_mul.mult_pipeline_clock = "0";
|
||||
defparam mac_fp_mul.adder_input_clock = "none";
|
||||
defparam mac_fp_mul.accum_adder_clock = "none";
|
||||
|
||||
twentynm_fp_mac mac_fp_madd (
|
||||
// inputs
|
||||
.accumulate(),
|
||||
|
@ -175,7 +51,7 @@ module VX_fp_madd #(
|
|||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,~stall}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
|
@ -215,7 +91,7 @@ module VX_fp_madd #(
|
|||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,~stall}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
|
@ -245,47 +121,25 @@ module VX_fp_madd #(
|
|||
defparam mac_fp_msub.accum_adder_clock = "none";
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_fadd(0*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_add);
|
||||
dpi_fsub(1*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_sub);
|
||||
dpi_fmul(2*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_mul);
|
||||
dpi_fmadd(3*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd);
|
||||
dpi_fmsub(4*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub);
|
||||
dpi_fmadd(3*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
|
||||
dpi_fmsub(4*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
|
||||
end
|
||||
`endif
|
||||
|
||||
reg [31:0] result_r;
|
||||
|
||||
always @(*) begin
|
||||
result_r = 'x;
|
||||
if (do_mul_r) begin
|
||||
if (do_add_r)
|
||||
result_r = result_madd;
|
||||
else if (do_sub_r)
|
||||
result_r = result_msub;
|
||||
else
|
||||
result_r = result_mul;
|
||||
end else begin
|
||||
if (do_add_r)
|
||||
result_r = result_add;
|
||||
else if (do_sub_r)
|
||||
result_r = result_sub;
|
||||
end
|
||||
end
|
||||
|
||||
assign result[i] = result_r;
|
||||
assign result[i] = do_sub_r ? result_msub : result_madd;
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(TAGW + 1 + 1 + 1 + 1),
|
||||
.DATAW(TAGW + 1 + 1),
|
||||
.DEPTH(`LATENCY_FMADD)
|
||||
) shift_reg1 (
|
||||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall),
|
||||
.in({tag_in, valid_in, do_add, do_sub, do_mul}),
|
||||
.out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r})
|
||||
.enable(enable),
|
||||
.in({tag_in, valid_in, do_sub}),
|
||||
.out({tag_out, valid_out, do_sub_r})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -30,13 +30,14 @@ module VX_fp_nmadd #(
|
|||
);
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
reg do_sub_r;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
|
||||
wire [31:0] result_madd;
|
||||
wire [31:0] result_msub;
|
||||
wire [31:0] result_msub;
|
||||
|
||||
wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;
|
||||
|
||||
|
@ -52,7 +53,7 @@ module VX_fp_nmadd #(
|
|||
.ay(datab[i]),
|
||||
.az(dataa[i]),
|
||||
.clk({2'b00,clk}),
|
||||
.ena({2'b11,~stall}),
|
||||
.ena({2'b11,enable}),
|
||||
.aclr(2'b00),
|
||||
.chainin(),
|
||||
// outputs
|
||||
|
@ -161,33 +162,36 @@ module VX_fp_nmadd #(
|
|||
defparam mac_fp_neg.adder_input_clock = "0";
|
||||
defparam mac_fp_neg.accum_adder_clock = "none";
|
||||
`else
|
||||
reg valid_in_st0;
|
||||
always @(posedge clk) begin
|
||||
valid_in_st0 <= reset ? 0 : valid_in;
|
||||
dpi_fmadd(5*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd);
|
||||
dpi_fmsub(6*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub);
|
||||
dpi_fsub(7*LANES+i, ~stall, valid_in_st0, 32'b0, result_st0, result[i]);
|
||||
dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
|
||||
dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
|
||||
dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (~stall) begin
|
||||
do_sub_r <= do_sub;
|
||||
end
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(1),
|
||||
.DEPTH(`LATENCY_FMADD)
|
||||
) shift_reg0 (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.in({do_sub}),
|
||||
.out({do_sub_r})
|
||||
);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW(TAGW + 1),
|
||||
.DEPTH(`LATENCY_FNMADD)
|
||||
.DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
|
||||
) shift_reg1 (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall),
|
||||
.enable(enable),
|
||||
.in({tag_in, valid_in}),
|
||||
.out({tag_out, valid_out})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
endmodule
|
|
@ -25,19 +25,20 @@ module VX_fp_sqrt #(
|
|||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
`ifdef QUARTUS
|
||||
acl_fp_sqrt fsqrt (
|
||||
acl_fsqrt fsqrt (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (~stall),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_fsqrt(9*LANES+i, ~stall, valid_in, dataa[i], result[i]);
|
||||
dpi_fsqrt(9*LANES+i, enable, dataa[i], result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
@ -48,11 +49,11 @@ module VX_fp_sqrt #(
|
|||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(~stall),
|
||||
.enable(enable),
|
||||
.in ({tag_in, valid_in}),
|
||||
.out({tag_out, valid_out})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -15,12 +15,12 @@
|
|||
// applicable agreement for further details.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SystemVerilog created from acl_fp_div
|
||||
// SystemVerilog created on Mon Aug 31 06:15:17 2020
|
||||
// SystemVerilog created from acl_fdiv
|
||||
// SystemVerilog created on Wed Sep 2 07:11:09 2020
|
||||
|
||||
|
||||
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
|
||||
module acl_fp_div (
|
||||
module acl_fdiv (
|
||||
input wire [31:0] a,
|
||||
input wire [31:0] b,
|
||||
input wire [0:0] en,
|
||||
|
@ -623,7 +623,7 @@ module acl_fp_div (
|
|||
.outdata_aclr_a("CLEAR0"),
|
||||
.clock_enable_input_a("NORMAL"),
|
||||
.power_up_uninitialized("FALSE"),
|
||||
.init_file("acl_fp_div_memoryC2_uid118_invTables_lutmem.hex"),
|
||||
.init_file("acl_fdiv_memoryC2_uid118_invTables_lutmem.hex"),
|
||||
.init_file_layout("PORT_A"),
|
||||
.intended_device_family("Arria 10")
|
||||
) memoryC2_uid118_invTables_lutmem_dmem (
|
||||
|
@ -755,7 +755,7 @@ module acl_fp_div (
|
|||
.outdata_aclr_a("CLEAR0"),
|
||||
.clock_enable_input_a("NORMAL"),
|
||||
.power_up_uninitialized("FALSE"),
|
||||
.init_file("acl_fp_div_memoryC1_uid115_invTables_lutmem.hex"),
|
||||
.init_file("acl_fdiv_memoryC1_uid115_invTables_lutmem.hex"),
|
||||
.init_file_layout("PORT_A"),
|
||||
.intended_device_family("Arria 10")
|
||||
) memoryC1_uid115_invTables_lutmem_dmem (
|
||||
|
@ -1060,7 +1060,7 @@ module acl_fp_div (
|
|||
.outdata_aclr_a("CLEAR0"),
|
||||
.clock_enable_input_a("NORMAL"),
|
||||
.power_up_uninitialized("FALSE"),
|
||||
.init_file("acl_fp_div_memoryC0_uid112_invTables_lutmem.hex"),
|
||||
.init_file("acl_fdiv_memoryC0_uid112_invTables_lutmem.hex"),
|
||||
.init_file_layout("PORT_A"),
|
||||
.intended_device_family("Arria 10")
|
||||
) memoryC0_uid112_invTables_lutmem_dmem (
|
|
@ -15,12 +15,12 @@
|
|||
// applicable agreement for further details.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SystemVerilog created from acl_fp_sqrt
|
||||
// SystemVerilog created on Mon Aug 31 06:15:18 2020
|
||||
// SystemVerilog created from acl_fsqrt
|
||||
// SystemVerilog created on Wed Sep 2 07:11:09 2020
|
||||
|
||||
|
||||
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
|
||||
module acl_fp_sqrt (
|
||||
module acl_fsqrt (
|
||||
input wire [31:0] a,
|
||||
input wire [0:0] en,
|
||||
output wire [31:0] q,
|
||||
|
@ -279,7 +279,7 @@ module acl_fp_sqrt (
|
|||
.outdata_aclr_a("CLEAR0"),
|
||||
.clock_enable_input_a("NORMAL"),
|
||||
.power_up_uninitialized("FALSE"),
|
||||
.init_file("acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex"),
|
||||
.init_file("acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex"),
|
||||
.init_file_layout("PORT_A"),
|
||||
.intended_device_family("Arria 10")
|
||||
) memoryC2_uid68_sqrtTables_lutmem_dmem (
|
||||
|
@ -412,7 +412,7 @@ module acl_fp_sqrt (
|
|||
.outdata_aclr_a("CLEAR0"),
|
||||
.clock_enable_input_a("NORMAL"),
|
||||
.power_up_uninitialized("FALSE"),
|
||||
.init_file("acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex"),
|
||||
.init_file("acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex"),
|
||||
.init_file_layout("PORT_A"),
|
||||
.intended_device_family("Arria 10")
|
||||
) memoryC1_uid65_sqrtTables_lutmem_dmem (
|
||||
|
@ -723,7 +723,7 @@ module acl_fp_sqrt (
|
|||
.outdata_aclr_a("CLEAR0"),
|
||||
.clock_enable_input_a("NORMAL"),
|
||||
.power_up_uninitialized("FALSE"),
|
||||
.init_file("acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex"),
|
||||
.init_file("acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex"),
|
||||
.init_file_layout("PORT_A"),
|
||||
.intended_device_family("Arria 10")
|
||||
) memoryC0_uid62_sqrtTables_lutmem_dmem (
|
|
@ -15,12 +15,12 @@
|
|||
// applicable agreement for further details.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SystemVerilog created from acl_fp_ftoi
|
||||
// SystemVerilog created on Mon Aug 31 06:15:18 2020
|
||||
// SystemVerilog created from acl_ftoi
|
||||
// SystemVerilog created on Wed Sep 2 07:11:09 2020
|
||||
|
||||
|
||||
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
|
||||
module acl_fp_ftoi (
|
||||
module acl_ftoi (
|
||||
input wire [31:0] a,
|
||||
input wire [0:0] en,
|
||||
output wire [31:0] q,
|
|
@ -15,12 +15,12 @@
|
|||
// applicable agreement for further details.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SystemVerilog created from acl_fp_ftou
|
||||
// SystemVerilog created on Mon Aug 31 06:15:18 2020
|
||||
// SystemVerilog created from acl_ftou
|
||||
// SystemVerilog created on Wed Sep 2 07:11:09 2020
|
||||
|
||||
|
||||
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
|
||||
module acl_fp_ftou (
|
||||
module acl_ftou (
|
||||
input wire [31:0] a,
|
||||
input wire [0:0] en,
|
||||
output wire [31:0] q,
|
169
hw/rtl/fp_cores/altera/acl_gen.log
Normal file
169
hw/rtl/fp_cores/altera/acl_gen.log
Normal file
|
@ -0,0 +1,169 @@
|
|||
starting execution ...
|
||||
build model options ...
|
||||
argc=21
|
||||
Generation context:
|
||||
HardFP is enabled enabling set to true
|
||||
Faithful rounding constraint detected
|
||||
Will not generate valid and channel signals
|
||||
The new component name is acl_fdiv
|
||||
Frequency 250MHz
|
||||
Deployment FPGA Arria10
|
||||
Estimated resources LUTs 539, DSPs 5, RAMBits 32768, RAMBlocks 3
|
||||
The pipeline depth of the block is 15 cycle(s)
|
||||
@@start
|
||||
@name FPDiv@
|
||||
@latency 15@
|
||||
@LUT 539@
|
||||
@DSP 5@
|
||||
@RAMBits 32768@
|
||||
@RAMBlockUsage 3@
|
||||
@enable 1@
|
||||
@subnormals 0@
|
||||
@error 1.00@
|
||||
@rounding NA@
|
||||
@method polynomial approximation@
|
||||
@inPort 0 fpieee 8 23@
|
||||
@inPort 1 fpieee 8 23@
|
||||
@outPort 0 fpieee 8 23@
|
||||
@nochanvalid 1@
|
||||
@@end
|
||||
starting execution ...
|
||||
build model options ...
|
||||
argc=20
|
||||
Generation context:
|
||||
HardFP is enabled enabling set to true
|
||||
Faithful rounding constraint detected
|
||||
Will not generate valid and channel signals
|
||||
The new component name is acl_fsqrt
|
||||
Frequency 250MHz
|
||||
Deployment FPGA Arria10
|
||||
Estimated resources LUTs 271, DSPs 3, RAMBits 15872, RAMBlocks 3
|
||||
The pipeline depth of the block is 10 cycle(s)
|
||||
@@start
|
||||
@name FPSqrt@
|
||||
@latency 10@
|
||||
@LUT 271@
|
||||
@DSP 3@
|
||||
@RAMBits 15872@
|
||||
@RAMBlockUsage 3@
|
||||
@enable 1@
|
||||
@subnormals 0@
|
||||
@error 1.00@
|
||||
@rounding NA@
|
||||
@method polynomial approximation@
|
||||
@inPort 0 fpieee 8 23@
|
||||
@outPort 0 fpieee 8 23@
|
||||
@nochanvalid 1@
|
||||
@@end
|
||||
starting execution ...
|
||||
build model options ...
|
||||
argc=23
|
||||
Generation context:
|
||||
HardFP is enabled enabling set to true
|
||||
Faithful rounding constraint detected
|
||||
Will not generate valid and channel signals
|
||||
The new component name is acl_ftoi
|
||||
Frequency 250MHz
|
||||
Deployment FPGA Arria10
|
||||
Estimated resources LUTs 327, DSPs 0, RAMBits 0, RAMBlocks 0
|
||||
The pipeline depth of the block is 3 cycle(s)
|
||||
@@start
|
||||
@name FPToFXP@
|
||||
@latency 3@
|
||||
@LUT 327@
|
||||
@DSP 0@
|
||||
@RAMBits 0@
|
||||
@RAMBlockUsage 0@
|
||||
@enable 1@
|
||||
@subnormals 0@
|
||||
@error 1.00@
|
||||
@rounding NA@
|
||||
@method default@
|
||||
@inPort 0 fpieee 8 23@
|
||||
@outPort 0 fxp 32 0 1@
|
||||
@nochanvalid 1@
|
||||
@@end
|
||||
starting execution ...
|
||||
build model options ...
|
||||
argc=23
|
||||
Generation context:
|
||||
HardFP is enabled enabling set to true
|
||||
Faithful rounding constraint detected
|
||||
Will not generate valid and channel signals
|
||||
The new component name is acl_ftou
|
||||
Frequency 250MHz
|
||||
Deployment FPGA Arria10
|
||||
Estimated resources LUTs 287, DSPs 0, RAMBits 0, RAMBlocks 0
|
||||
The pipeline depth of the block is 3 cycle(s)
|
||||
@@start
|
||||
@name FPToFXP@
|
||||
@latency 3@
|
||||
@LUT 287@
|
||||
@DSP 0@
|
||||
@RAMBits 0@
|
||||
@RAMBlockUsage 0@
|
||||
@enable 1@
|
||||
@subnormals 0@
|
||||
@error 1.00@
|
||||
@rounding NA@
|
||||
@method default@
|
||||
@inPort 0 fpieee 8 23@
|
||||
@outPort 0 fxp 32 0 0@
|
||||
@nochanvalid 1@
|
||||
@@end
|
||||
starting execution ...
|
||||
build model options ...
|
||||
argc=23
|
||||
Generation context:
|
||||
HardFP is enabled enabling set to true
|
||||
Faithful rounding constraint detected
|
||||
Will not generate valid and channel signals
|
||||
The new component name is acl_itof
|
||||
Frequency 250MHz
|
||||
Deployment FPGA Arria10
|
||||
Estimated resources LUTs 397, DSPs 0, RAMBits 0, RAMBlocks 0
|
||||
The pipeline depth of the block is 7 cycle(s)
|
||||
@@start
|
||||
@name FXPToFP@
|
||||
@latency 7@
|
||||
@LUT 397@
|
||||
@DSP 0@
|
||||
@RAMBits 0@
|
||||
@RAMBlockUsage 0@
|
||||
@enable 1@
|
||||
@subnormals 0@
|
||||
@error 1.00@
|
||||
@rounding NA@
|
||||
@method default@
|
||||
@inPort 0 fxp 32 0 1@
|
||||
@outPort 0 fpieee 8 23@
|
||||
@nochanvalid 1@
|
||||
@@end
|
||||
starting execution ...
|
||||
build model options ...
|
||||
argc=23
|
||||
Generation context:
|
||||
HardFP is enabled enabling set to true
|
||||
Faithful rounding constraint detected
|
||||
Will not generate valid and channel signals
|
||||
The new component name is acl_utof
|
||||
Frequency 300MHz
|
||||
Deployment FPGA Arria10
|
||||
Estimated resources LUTs 363, DSPs 0, RAMBits 0, RAMBlocks 0
|
||||
The pipeline depth of the block is 7 cycle(s)
|
||||
@@start
|
||||
@name FXPToFP@
|
||||
@latency 7@
|
||||
@LUT 363@
|
||||
@DSP 0@
|
||||
@RAMBits 0@
|
||||
@RAMBlockUsage 0@
|
||||
@enable 1@
|
||||
@subnormals 0@
|
||||
@error 1.00@
|
||||
@rounding NA@
|
||||
@method default@
|
||||
@inPort 0 fxp 32 0 0@
|
||||
@outPort 0 fpieee 8 23@
|
||||
@nochanvalid 1@
|
||||
@@end
|
25
hw/rtl/fp_cores/altera/acl_gen.sh
Executable file
25
hw/rtl/fp_cores/altera/acl_gen.sh
Executable file
|
@ -0,0 +1,25 @@
|
|||
#!/bin/bash
|
||||
|
||||
CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64
|
||||
|
||||
OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2"
|
||||
|
||||
export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH
|
||||
|
||||
CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS"
|
||||
|
||||
EXP_BITS=8
|
||||
MAN_BITS=23
|
||||
FBITS="f$(($EXP_BITS + $MAN_BITS + 1))"
|
||||
|
||||
echo Generating IP cores for $FBITS
|
||||
{
|
||||
$CMD -name acl_fdiv -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0
|
||||
$CMD -name acl_fsqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS
|
||||
$CMD -name acl_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1
|
||||
$CMD -name acl_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0
|
||||
$CMD -name acl_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS
|
||||
$CMD -name acl_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS
|
||||
} > acl_gen.log 2>&1
|
||||
|
||||
#cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv .
|
|
@ -15,12 +15,12 @@
|
|||
// applicable agreement for further details.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SystemVerilog created from acl_fp_itof
|
||||
// SystemVerilog created on Mon Aug 31 06:15:18 2020
|
||||
// SystemVerilog created from acl_itof
|
||||
// SystemVerilog created on Wed Sep 2 07:11:09 2020
|
||||
|
||||
|
||||
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
|
||||
module acl_fp_itof (
|
||||
module acl_itof (
|
||||
input wire [31:0] a,
|
||||
input wire [0:0] en,
|
||||
output wire [31:0] q,
|
|
@ -15,12 +15,12 @@
|
|||
// applicable agreement for further details.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// SystemVerilog created from acl_fp_utof
|
||||
// SystemVerilog created on Mon Aug 31 06:15:18 2020
|
||||
// SystemVerilog created from acl_utof
|
||||
// SystemVerilog created on Wed Sep 2 07:11:09 2020
|
||||
|
||||
|
||||
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
|
||||
module acl_fp_utof (
|
||||
module acl_utof (
|
||||
input wire [31:0] a,
|
||||
input wire [0:0] en,
|
||||
output wire [31:0] q,
|
|
@ -1,25 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64
|
||||
|
||||
OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2"
|
||||
|
||||
export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH
|
||||
|
||||
CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS"
|
||||
|
||||
EXP_BITS=8
|
||||
MAN_BITS=23
|
||||
FBITS="f$(($EXP_BITS + $MAN_BITS + 1))"
|
||||
|
||||
echo Generating IP cores for $FBITS
|
||||
{
|
||||
$CMD -name acl_fp_div -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0
|
||||
$CMD -name acl_fp_sqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS
|
||||
$CMD -name acl_fp_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1
|
||||
$CMD -name acl_fp_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0
|
||||
$CMD -name acl_fp_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS
|
||||
$CMD -name acl_fp_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS
|
||||
} > log.txt 2>&1
|
||||
|
||||
cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv .
|
|
@ -8,21 +8,19 @@
|
|||
#include "VX_config.h"
|
||||
|
||||
extern "C" {
|
||||
void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result);
|
||||
void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result);
|
||||
void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result);
|
||||
void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result);
|
||||
void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result);
|
||||
void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result);
|
||||
void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result);
|
||||
void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result);
|
||||
void dpi_ftou(int inst, bool enable, bool valid, int a, int* result);
|
||||
void dpi_itof(int inst, bool enable, bool valid, int a, int* result);
|
||||
void dpi_utof(int inst, bool enable, bool valid, int a, int* result);
|
||||
void dpi_fadd(int inst, bool enable, int a, int b, int* result);
|
||||
void dpi_fsub(int inst, bool enable, int a, int b, int* result);
|
||||
void dpi_fmul(int inst, bool enable, int a, int b, int* result);
|
||||
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result);
|
||||
void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result);
|
||||
void dpi_fdiv(int inst, bool enable, int a, int b, int* result);
|
||||
void dpi_fsqrt(int inst, bool enable, int a, int* result);
|
||||
void dpi_ftoi(int inst, bool enable, int a, int* result);
|
||||
void dpi_ftou(int inst, bool enable, int a, int* result);
|
||||
void dpi_itof(int inst, bool enable, int a, int* result);
|
||||
void dpi_utof(int inst, bool enable, int a, int* result);
|
||||
}
|
||||
|
||||
extern double sc_time_stamp();
|
||||
|
||||
class ShiftRegister {
|
||||
public:
|
||||
ShiftRegister() : init_(false), depth_(0) {}
|
||||
|
@ -35,37 +33,36 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
void push(int value, bool enable, bool valid) {
|
||||
void push(int value, bool enable) {
|
||||
if (!enable)
|
||||
return;
|
||||
for (unsigned i = 0; i < depth_-1; ++i) {
|
||||
buffer_[i] = buffer_[i+1];
|
||||
}
|
||||
buffer_[depth_-1].value = value;
|
||||
buffer_[depth_-1].valid = valid;
|
||||
buffer_[depth_-1] = value;
|
||||
}
|
||||
|
||||
int top() const {
|
||||
return buffer_[0].value;
|
||||
}
|
||||
|
||||
bool valid() const {
|
||||
return buffer_[0].valid;
|
||||
return buffer_[0];
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
struct entry_t {
|
||||
int value;
|
||||
bool valid;
|
||||
};
|
||||
|
||||
std::vector<entry_t> buffer_;
|
||||
int top_;
|
||||
unsigned depth_;
|
||||
std::vector<int> buffer_;
|
||||
bool init_;
|
||||
unsigned depth_;
|
||||
};
|
||||
|
||||
union Float_t {
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
uint32_t man : 23;
|
||||
uint32_t exp : 8;
|
||||
uint32_t sign : 1;
|
||||
} parts;
|
||||
};
|
||||
|
||||
class Instances {
|
||||
public:
|
||||
ShiftRegister& get(int inst) {
|
||||
|
@ -82,130 +79,152 @@ private:
|
|||
|
||||
Instances instances;
|
||||
|
||||
void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result) {
|
||||
void dpi_fadd(int inst, bool enable, int a, int b, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa + fb;
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
sr.ensure_init(LATENCY_FMADD);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f + fb.f;
|
||||
|
||||
sr.ensure_init(LATENCY_FADDMUL);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result) {
|
||||
void dpi_fsub(int inst, bool enable, int a, int b, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa - fb;
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
sr.ensure_init(LATENCY_FMADD);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f - fb.f;
|
||||
|
||||
sr.ensure_init(LATENCY_FADDMUL);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result) {
|
||||
void dpi_fmul(int inst, bool enable, int a, int b, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa * fb;
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
sr.ensure_init(LATENCY_FMADD);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f * fb.f;
|
||||
|
||||
sr.ensure_init(LATENCY_FADDMUL);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result) {
|
||||
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fc = *(float*)&c;
|
||||
float fr = fa * fb + fc;
|
||||
Float_t fa, fb, fc, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fc.i = c;
|
||||
fr.f = fa.f * fb.f + fc.f;
|
||||
|
||||
sr.ensure_init(LATENCY_FMADD);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result) {
|
||||
void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fc = *(float*)&c;
|
||||
float fr = fa * fb - fc;
|
||||
Float_t fa, fb, fc, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fc.i = c;
|
||||
fr.f = fa.f * fb.f - fc.f;
|
||||
|
||||
sr.ensure_init(LATENCY_FMADD);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result) {
|
||||
void dpi_fdiv(int inst, bool enable, int a, int b, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa / fb;
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f / fb.f;
|
||||
|
||||
sr.ensure_init(LATENCY_FDIV);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result) {
|
||||
void dpi_fsqrt(int inst, bool enable, int a, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fr = sqrtf(fa);
|
||||
Float_t fa, fr;
|
||||
|
||||
fa.i = a;
|
||||
fr.f = sqrtf(fa.f);
|
||||
|
||||
sr.ensure_init(LATENCY_FSQRT);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result) {
|
||||
void dpi_ftoi(int inst, bool enable, int a, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
int ir = int(fa);
|
||||
Float_t fa, fr;
|
||||
|
||||
fa.i = a;
|
||||
fr.i = int(fa.f);
|
||||
|
||||
sr.ensure_init(LATENCY_FTOI);
|
||||
sr.push(ir, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_ftou(int inst, bool enable, bool valid, int a, int* result) {
|
||||
void dpi_ftou(int inst, bool enable, int a, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
unsigned ir = unsigned(fa);
|
||||
Float_t fa, fr;
|
||||
|
||||
fa.i = a;
|
||||
fr.i = unsigned(fa.f);
|
||||
|
||||
sr.ensure_init(LATENCY_FTOI);
|
||||
sr.push(ir, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_itof(int inst, bool enable, bool valid, int a, int* result) {
|
||||
void dpi_itof(int inst, bool enable, int a, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
float fr = (float)a;
|
||||
Float_t fa, fr;
|
||||
|
||||
fr.f = (float)a;
|
||||
|
||||
sr.ensure_init(LATENCY_ITOF);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_utof(int inst, bool enable, bool valid, int a, int* result) {
|
||||
void dpi_utof(int inst, bool enable, int a, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
unsigned ua = *(unsigned*)&a;
|
||||
float fr = (float)ua;
|
||||
Float_t fa, fr;
|
||||
|
||||
unsigned ua = a;
|
||||
fr.f = (float)ua;
|
||||
|
||||
sr.ensure_init(LATENCY_ITOF);
|
||||
sr.push(*(int*)&fr, enable, valid);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
|
@ -1,16 +1,16 @@
|
|||
`ifndef FLOAT_DPI
|
||||
`define FLOAT_DPI
|
||||
|
||||
import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input logic valid, input int a, input int b, input int c, output int result);
|
||||
import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input logic valid, input int a, input int b, input int c, output int result);
|
||||
import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input logic valid, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input logic valid, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input logic valid, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input logic valid, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_itof(int inst, input logic enable, input logic valid, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_utof(int inst, input logic enable, input logic valid, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, output int result);
|
||||
import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, output int result);
|
||||
import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, output int result);
|
||||
|
||||
`endif
|
|
@ -1,21 +1,18 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
module VX_divide #(
|
||||
parameter WIDTHN = 1,
|
||||
parameter WIDTHD = 1,
|
||||
parameter WIDTHQ = 1,
|
||||
parameter WIDTHR = 1,
|
||||
parameter WIDTHN = 1,
|
||||
parameter WIDTHD = 1,
|
||||
parameter WIDTHQ = 1,
|
||||
parameter WIDTHR = 1,
|
||||
parameter NSIGNED = 0,
|
||||
parameter DSIGNED = 0,
|
||||
parameter PIPELINE = 0
|
||||
parameter LATENCY = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire clk_en,
|
||||
input wire enable,
|
||||
input wire [WIDTHN-1:0] numer,
|
||||
input wire [WIDTHD-1:0] denom,
|
||||
|
||||
output wire [WIDTHQ-1:0] quotient,
|
||||
output wire [WIDTHR-1:0] remainder
|
||||
);
|
||||
|
@ -27,11 +24,11 @@ module VX_divide #(
|
|||
|
||||
lpm_divide divide (
|
||||
.clock (clk),
|
||||
.clken (enable),
|
||||
.numer (numer),
|
||||
.denom (denom),
|
||||
.quotient (quotient_unqual),
|
||||
.remain (remainder_unqual),
|
||||
.clken (clk_en)
|
||||
.remain (remainder_unqual)
|
||||
);
|
||||
|
||||
defparam
|
||||
|
@ -41,7 +38,7 @@ module VX_divide #(
|
|||
divide.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
|
||||
divide.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
|
||||
divide.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE",
|
||||
divide.lpm_pipeline = PIPELINE;
|
||||
divide.lpm_pipeline = LATENCY;
|
||||
|
||||
assign quotient = quotient_unqual [WIDTHQ-1:0];
|
||||
assign remainder = remainder_unqual [WIDTHR-1:0];
|
||||
|
@ -72,34 +69,24 @@ module VX_divide #(
|
|||
end
|
||||
end
|
||||
|
||||
if (PIPELINE == 0) begin
|
||||
if (LATENCY == 0) begin
|
||||
assign quotient = quotient_unqual [WIDTHQ-1:0];
|
||||
assign remainder = remainder_unqual [WIDTHR-1:0];
|
||||
end else begin
|
||||
reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1];
|
||||
reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1];
|
||||
reg [WIDTHN-1:0] quotient_pipe [0:LATENCY-1];
|
||||
reg [WIDTHD-1:0] remainder_pipe [0:LATENCY-1];
|
||||
|
||||
for (genvar i = 0; i < PIPELINE; i++) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
quotient_pipe[i] <= 0;
|
||||
remainder_pipe[i] <= 0;
|
||||
end else begin
|
||||
if (clk_en) begin
|
||||
if (i == 0) begin
|
||||
quotient_pipe[i] <= quotient_unqual;
|
||||
remainder_pipe[i] <= remainder_unqual;
|
||||
end else begin
|
||||
quotient_pipe[i] <= quotient_pipe[i-1];
|
||||
remainder_pipe[i] <= remainder_pipe[i-1];
|
||||
end
|
||||
end
|
||||
for (genvar i = 0; i < LATENCY; i++) begin
|
||||
always @(posedge clk) begin
|
||||
if (enable) begin
|
||||
quotient_pipe[i] <= (0 == i) ? quotient_unqual : quotient_pipe[i-1];
|
||||
remainder_pipe[i] <= (0 == i) ? remainder_unqual : remainder_pipe[i-1];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign quotient = quotient_pipe[PIPELINE-1][WIDTHQ-1:0];
|
||||
assign remainder = remainder_pipe[PIPELINE-1][WIDTHR-1:0];
|
||||
assign quotient = quotient_pipe[LATENCY-1][WIDTHQ-1:0];
|
||||
assign remainder = remainder_pipe[LATENCY-1][WIDTHR-1:0];
|
||||
end
|
||||
|
||||
`endif
|
||||
|
|
|
@ -1,16 +1,14 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
module VX_multiplier #(
|
||||
parameter WIDTHA = 1,
|
||||
parameter WIDTHB = 1,
|
||||
parameter WIDTHP = 1,
|
||||
parameter SIGNED = 0,
|
||||
parameter PIPELINE = 0
|
||||
parameter WIDTHA = 1,
|
||||
parameter WIDTHB = 1,
|
||||
parameter WIDTHP = 1,
|
||||
parameter SIGNED = 0,
|
||||
parameter LATENCY = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire clk_en,
|
||||
input wire clk,
|
||||
input wire enable,
|
||||
input wire [WIDTHA-1:0] dataa,
|
||||
input wire [WIDTHB-1:0] datab,
|
||||
output wire [WIDTHP-1:0] result
|
||||
|
@ -20,20 +18,22 @@ module VX_multiplier #(
|
|||
|
||||
lpm_mult mult (
|
||||
.clock (clk),
|
||||
.clken (enable),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (result),
|
||||
.clken (clk_en),
|
||||
.result (result),
|
||||
.aclr (1'b0),
|
||||
.sclr (1'b0),
|
||||
.sum (1'b0)
|
||||
);
|
||||
|
||||
defparam mult.lpm_type = "LPM_MULT",
|
||||
defparam mult.lpm_type = "LPM_MULT",
|
||||
mult.lpm_widtha = WIDTHA,
|
||||
mult.lpm_widthb = WIDTHB,
|
||||
mult.lpm_widthp = WIDTHP,
|
||||
mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
|
||||
mult.lpm_pipeline = PIPELINE,
|
||||
mult.lpm_hint = "MAXIMIZE_SPEED=9,DEDICATED_MULTIPLIER_CIRCUITRY=YES";
|
||||
mult.lpm_pipeline = LATENCY,
|
||||
mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9";
|
||||
`else
|
||||
|
||||
wire [WIDTHP-1:0] result_unqual;
|
||||
|
@ -44,29 +44,20 @@ module VX_multiplier #(
|
|||
assign result_unqual = dataa * datab;
|
||||
end
|
||||
|
||||
if (PIPELINE == 0) begin
|
||||
if (LATENCY == 0) begin
|
||||
assign result = result_unqual;
|
||||
end else begin
|
||||
|
||||
reg [WIDTHP-1:0] result_pipe [0:PIPELINE-1];
|
||||
end else begin
|
||||
reg [WIDTHP-1:0] result_pipe [0:LATENCY-1];
|
||||
|
||||
for (genvar i = 0; i < PIPELINE; i++) begin
|
||||
for (genvar i = 0; i < LATENCY; i++) begin
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
result_pipe[i] <= 0;
|
||||
end else begin
|
||||
if (clk_en) begin
|
||||
if (i == 0) begin
|
||||
result_pipe[i] <= result_unqual;
|
||||
end else begin
|
||||
result_pipe[i] <= result_pipe[i-1];
|
||||
end
|
||||
end
|
||||
if (enable) begin
|
||||
result_pipe[i] <= (0 == i) ? result_unqual : result_pipe[i-1];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign result = result_pipe[PIPELINE-1];
|
||||
end
|
||||
|
||||
assign result = result_pipe[LATENCY-1];
|
||||
end
|
||||
|
||||
`endif
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue