fixed lpm_mult parameters, RAM init file path

This commit is contained in:
Blaise Tine 2020-09-04 07:51:46 -07:00
parent dccea80b68
commit 42e3b6c45d
36 changed files with 738 additions and 495 deletions

View file

@ -5,12 +5,12 @@
#include <limits>
union Float_t {
float f;
int32_t i;
float f;
int i;
struct {
uint32_t mantissa : 23;
uint32_t exponent : 8;
uint32_t sign : 1;
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};

View file

@ -4,19 +4,22 @@ FPGA_BUILD_DIR=build_fpga
all: ase-1c
sources.txt:
sources.txt:
./gen_sources.sh > sources.txt
gen_sources: sources.txt
# Build the 1-core ASE simulation, then copy the Altera FP-core .hex files
# into that build's work directory.
# $(MAKE) is used instead of a literal `make` so that -j, -n and the
# jobserver are propagated to the sub-make.
ase-1c: setup-ase-1c gen_sources
	$(MAKE) -C $(ASE_BUILD_DIR)_1c
	cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_1c/work
# Build the 2-core ASE simulation, then copy the Altera FP-core .hex files
# into that build's work directory.
# $(MAKE) is used instead of a literal `make` so that -j, -n and the
# jobserver are propagated to the sub-make.
ase-2c: setup-ase-2c gen_sources
	$(MAKE) -C $(ASE_BUILD_DIR)_2c
	cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_2c/work
# Build the 4-core ASE simulation, then copy the Altera FP-core .hex files
# into that build's work directory.
# The copy destination must be the _4c build (it previously pointed at a
# non-existent _3c directory).
# $(MAKE) is used instead of a literal `make` so that -j, -n and the
# jobserver are propagated to the sub-make.
ase-4c: setup-ase-4c gen_sources
	$(MAKE) -C $(ASE_BUILD_DIR)_4c
	cp ../rtl/fp_cores/altera/*.hex $(ASE_BUILD_DIR)_4c/work
setup-ase-1c: $(ASE_BUILD_DIR)_1c/Makefile
@ -35,12 +38,15 @@ $(ASE_BUILD_DIR)_4c/Makefile: sources.txt
# 1-core FPGA build: run qsub-synth from inside the build directory, then copy
# the Altera FP-core .hex files into that directory.
# NOTE(review): qsub-synth presumably submits the synthesis job; the .hex copy
# happens after it is invoked -- confirm the files are in place before
# synthesis reads them.
fpga-1c: setup-fpga-1c gen_sources
cd $(FPGA_BUILD_DIR)_1c && qsub-synth
cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_1c
# 2-core FPGA build: run qsub-synth from inside the build directory, then copy
# the Altera FP-core .hex files into that directory.
# NOTE(review): qsub-synth presumably submits the synthesis job; the .hex copy
# happens after it is invoked -- confirm the files are in place before
# synthesis reads them.
fpga-2c: setup-fpga-2c gen_sources
cd $(FPGA_BUILD_DIR)_2c && qsub-synth
cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_2c
# 4-core FPGA build: run qsub-synth from inside the build directory, then copy
# the Altera FP-core .hex files into that directory.
# NOTE(review): qsub-synth presumably submits the synthesis job; the .hex copy
# happens after it is invoked -- confirm the files are in place before
# synthesis reads them.
fpga-4c: setup-fpga-4c gen_sources
cd $(FPGA_BUILD_DIR)_4c && qsub-synth
cp ../rtl/fp_cores/altera/*.hex $(FPGA_BUILD_DIR)_4c
setup-fpga-1c: $(FPGA_BUILD_DIR)_1c/build/dcp.qpf

View file

@ -62,7 +62,7 @@ make ase
# tests
./run_ase.sh build_ase_1c ../../driver/tests/basic/basic -n 256
./run_ase.sh build_ase_1c ../../driver/tests/demo/demo -n 16
./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n 1 -s4 -e4
./run_ase.sh build_ase_1c ../../driver/tests/dogfood/dogfood -n1 -s4 -e4
./run_ase.sh build_ase_1c ../../benchmarks/opencl/vecadd/vecadd
# modify "vsim_run.tcl" to dump VCD trace

View file

@ -13,6 +13,7 @@
#+define+DBG_PRINT_DRAM
#+define+DBG_PRINT_PIPELINE
#+define+DBG_PRINT_OPAE
#+define+DBG_CORE_REQ_INFO
#+define+DBG_PRINT_SCOPE
vortex_afu.json

View file

@ -62,7 +62,7 @@ module VX_commit #(
fflags_r <= fflags;
has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
wid_r <= fpu_commit_if.wid;
num_commits_r <= num_commits;
num_commits_r <= (num_commits << $clog2(`NUM_THREADS));
end
assign cmt_to_csr_if.valid = csr_update_r;

View file

@ -59,6 +59,8 @@
`define EXT_F_ENABLE
`endif
`define FPU_FAST
// Device identification
`define VENDOR_ID 0
`define ARCHITECTURE_ID 0
@ -74,12 +76,12 @@
`define LATENCY_FNONCOMP 1
`endif
`ifndef LATENCY_FMADD
`define LATENCY_FMADD 1
`ifndef LATENCY_FADDMUL
`define LATENCY_FADDMUL 3
`endif
`ifndef LATENCY_FNMADD
`define LATENCY_FNMADD 2
`ifndef LATENCY_FMADD
`define LATENCY_FMADD 4
`endif
`ifndef LATENCY_FDIV
@ -98,16 +100,12 @@
`define LATENCY_FTOI 3
`endif
`ifndef LATENCY_FADDMUL
`define LATENCY_FADDMUL 2
`endif
`ifndef LATENCY_FDIVSQRT
`define LATENCY_FDIVSQRT 2
`define LATENCY_FDIVSQRT 10
`endif
`ifndef LATENCY_FCONV
`define LATENCY_FCONV 2
`define LATENCY_FCONV 3
`endif
// CSR Addresses //////////////////////////////////////////////////////////////

View file

@ -385,7 +385,7 @@
`define VX_CORE_TAG_WIDTH `L3CORE_TAG_WIDTH
`define VX_CSR_ID_WIDTH `LOG2UP(`NUM_CLUSTERS * `NUM_CORES)
`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)}
`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)}
`include "VX_types.vh"

View file

@ -51,32 +51,33 @@ module VX_mul_unit #(
///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] mul_result;
wire is_mulw = (alu_op == `MUL_MUL);
wire is_mulw_out;
wire is_mul_in = (alu_op == `MUL_MUL);
wire is_mul_out;
wire stall_mul;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] mul_in1 = {(alu_op != `MUL_MULHU) & alu_in1[i][31], alu_in1[i]};
wire [32:0] mul_in2 = {(alu_op != `MUL_MULHU && alu_op != `MUL_MULHSU) & alu_in2[i][31], alu_in2[i]};
wire [63:0] mul_result_tmp;
`IGNORE_WARNINGS_BEGIN
wire [65:0] mul_result_tmp;
`IGNORE_WARNINGS_END
VX_multiplier #(
.WIDTHA(33),
.WIDTHB(33),
.WIDTHP(64),
.WIDTHP(66),
.SIGNED(1),
.PIPELINE(`LATENCY_IMUL)
.LATENCY(`LATENCY_IMUL)
) multiplier (
.clk(clk),
.reset(reset),
.clk_en(~stall_mul),
.enable(~stall_mul),
.dataa(mul_in1),
.datab(mul_in2),
.result(mul_result_tmp)
);
assign mul_result[i] = is_mulw_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
assign mul_result[i] = is_mul_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
end
wire [MULQ_BITS-1:0] mul_tag;
@ -91,17 +92,17 @@ module VX_mul_unit #(
.clk(clk),
.reset(reset),
.enable(~stall_mul),
.in({mul_fire, tag_in, is_mulw}),
.out({mul_valid_out, mul_tag, is_mulw_out})
.in({mul_fire, tag_in, is_mul_in}),
.out({mul_valid_out, mul_tag, is_mul_out})
);
///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
wire is_div_only = (alu_op == `MUL_DIV) || (alu_op == `MUL_DIVU);
wire is_signed_div = (alu_op == `MUL_DIV) || (alu_op == `MUL_REM);
wire div_valid_in = mul_req_if.valid && is_div_op;
wire div_valid_in = mul_req_if.valid && is_div_op;
wire div_ready_in;
wire div_ready_out;
wire div_valid_out;

View file

@ -4,6 +4,10 @@
`include "VX_platform.vh"
`include "VX_scope.vh"
`ifdef DBG_CORE_REQ_INFO
`include "VX_define.vh"
`endif
`define REQ_TAG_WIDTH `MAX(CORE_TAG_WIDTH, SNP_REQ_TAG_WIDTH)
`define REQS_BITS `LOG2UP(NUM_REQUESTS)
@ -77,4 +81,6 @@
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}
`define DRAM_TO_BYTE_ADDR(x) {x, (32-$bits(x))'(0)}
`endif

View file

@ -0,0 +1,187 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// VX_fp_addmul: pipelined add / subtract / multiply unit for 32-bit operands.
//
// For every lane the module computes all three results (add, sub, mul) from
// dataa[i] and datab[i] and selects one of them at the output using delayed
// copies of do_mul / do_sub. The request tag, valid bit and the two operation
// selects travel through a shift register of depth `LATENCY_FADDMUL.
//
// Parameters:
//   TAGW  - width in bits of tag_in / tag_out.
//   LANES - number of 32-bit operand lanes processed per request.
//
// Handshake:
//   ready_in is deasserted only while a valid output is not being accepted
//   (valid_out && ~ready_out); in that case the whole pipeline is frozen.
module VX_fp_addmul #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [TAGW-1:0] tag_in,
// operation select: do_mul has priority over do_sub; neither set selects add
input wire do_sub,
input wire do_mul,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
// Back-pressure: stall when the output is valid but the consumer is not ready.
wire stall = ~ready_out && valid_out;
// Single enable shared by the arithmetic cores and the control shift register.
wire enable = ~stall;
// Operation selects as they emerge from the control shift register.
reg do_sub_r, do_mul_r;
for (genvar i = 0; i < LANES; i++) begin
wire [31:0] result_add;
wire [31:0] result_sub;
wire [31:0] result_mul;
`ifdef QUARTUS
// Adder: twentynm_fp_mac in "sp_add" mode with adder_subtract = "false".
// NOTE(review): the number of registered stages configured below is presumably
// meant to equal `LATENCY_FADDMUL so data lines up with the control shift
// register -- confirm against the primitive's documentation.
twentynm_fp_mac mac_fp_add (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(dataa[i]),
.ay(datab[i]),
.az(),
// only bit 0 of the clock / enable vectors carries clk / enable
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_add),
.chainout()
);
defparam mac_fp_add.operation_mode = "sp_add";
defparam mac_fp_add.use_chainin = "false";
defparam mac_fp_add.adder_subtract = "false";
defparam mac_fp_add.ax_clock = "0";
defparam mac_fp_add.ay_clock = "0";
defparam mac_fp_add.az_clock = "none";
defparam mac_fp_add.output_clock = "0";
defparam mac_fp_add.accumulate_clock = "none";
defparam mac_fp_add.ax_chainin_pl_clock = "none";
defparam mac_fp_add.accum_pipeline_clock = "none";
defparam mac_fp_add.mult_pipeline_clock = "none";
defparam mac_fp_add.adder_input_clock = "0";
defparam mac_fp_add.accum_adder_clock = "none";
// Subtractor: same configuration as the adder but adder_subtract = "true".
// NOTE(review): operands are wired ax=dataa, ay=datab exactly as for the adder;
// confirm the primitive's subtract operand order yields dataa - datab.
twentynm_fp_mac mac_fp_sub (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_sub),
.chainout()
);
defparam mac_fp_sub.operation_mode = "sp_add";
defparam mac_fp_sub.use_chainin = "false";
defparam mac_fp_sub.adder_subtract = "true";
defparam mac_fp_sub.ax_clock = "0";
defparam mac_fp_sub.ay_clock = "0";
defparam mac_fp_sub.az_clock = "none";
defparam mac_fp_sub.output_clock = "0";
defparam mac_fp_sub.accumulate_clock = "none";
defparam mac_fp_sub.ax_chainin_pl_clock = "none";
defparam mac_fp_sub.accum_pipeline_clock = "none";
defparam mac_fp_sub.mult_pipeline_clock = "none";
defparam mac_fp_sub.adder_input_clock = "0";
defparam mac_fp_sub.accum_adder_clock = "none";
// Multiplier: "sp_mult" mode; operands enter on ay / az, ax is left unconnected.
twentynm_fp_mac mac_fp_mul (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_mul),
.chainout()
);
defparam mac_fp_mul.operation_mode = "sp_mult";
defparam mac_fp_mul.use_chainin = "false";
defparam mac_fp_mul.adder_subtract = "false";
defparam mac_fp_mul.ax_clock = "none";
defparam mac_fp_mul.ay_clock = "0";
defparam mac_fp_mul.az_clock = "0";
defparam mac_fp_mul.output_clock = "0";
defparam mac_fp_mul.accumulate_clock = "none";
defparam mac_fp_mul.ax_chainin_pl_clock = "none";
defparam mac_fp_mul.accum_pipeline_clock = "none";
defparam mac_fp_mul.mult_pipeline_clock = "0";
defparam mac_fp_mul.adder_input_clock = "none";
defparam mac_fp_mul.accum_adder_clock = "none";
`else
// Non-QUARTUS build: DPI functions stand in for the hard FP blocks.
// The first argument differs per operation and per lane (op_index*LANES + i);
// presumably it identifies the per-instance state of the DPI model -- TODO confirm.
always @(posedge clk) begin
dpi_fadd(0*LANES+i, enable, dataa[i], datab[i], result_add);
dpi_fsub(1*LANES+i, enable, dataa[i], datab[i], result_sub);
dpi_fmul(2*LANES+i, enable, dataa[i], datab[i], result_mul);
end
`endif
// Output select uses the delayed operation bits; multiply wins over subtract.
assign result[i] = do_mul_r ? result_mul : (do_sub_r ? result_sub : result_add);
end
// Control pipeline: carries tag, valid and the operation selects for
// `LATENCY_FADDMUL enabled cycles.
VX_shift_register #(
.DATAW(TAGW + 1 + 1 + 1),
.DEPTH(`LATENCY_FADDMUL)
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.in({tag_in, valid_in, do_sub, do_mul}),
.out({tag_out, valid_out, do_sub_r, do_mul_r})
);
// A new request can be accepted whenever the pipeline is not stalled.
assign ready_in = enable;
endmodule

View file

@ -26,20 +26,21 @@ module VX_fp_div #(
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
for (genvar i = 0; i < LANES; i++) begin
`ifdef QUARTUS
acl_fp_div fdiv (
acl_fdiv fdiv (
.clk (clk),
.areset (1'b0),
.en (~stall),
.en (enable),
.a (dataa[i]),
.b (datab[i]),
.q (result[i])
);
`else
always @(posedge clk) begin
dpi_fdiv(8*LANES+i, ~stall, valid_in, dataa[i], datab[i], result[i]);
dpi_fdiv(8*LANES+i, enable, dataa[i], datab[i], result[i]);
end
`endif
end
@ -50,11 +51,11 @@ module VX_fp_div #(
) shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall),
.enable(enable),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
assign ready_in = ~stall;
assign ready_in = enable;
endmodule

View file

@ -27,7 +27,7 @@ module VX_fp_fpga #(
input wire ready_out,
output wire valid_out
);
localparam NUM_FPC = 7;
localparam NUM_FPC = 8;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
wire [NUM_FPC-1:0] per_core_ready_in;
@ -40,29 +40,28 @@ module VX_fp_fpga #(
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
reg [FPC_BITS-1:0] core_select;
reg do_add, do_sub, do_mul;
reg do_sub, do_mul;
reg is_signed;
always @(*) begin
core_select = 'x;
do_add = 'x;
do_sub = 'x;
do_mul = 'x;
is_signed = 'x;
case (op_type)
`FPU_ADD: begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end
`FPU_SUB: begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end
`FPU_MUL: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end
`FPU_MADD: begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end
`FPU_MSUB: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end
`FPU_NMSUB: begin core_select = 2; do_sub = 1; end
`FPU_NMADD: begin core_select = 2; do_sub = 0; end
`FPU_DIV: begin core_select = 3; end
`FPU_SQRT: begin core_select = 4; end
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
`FPU_ADD: begin core_select = 1; do_mul = 0; do_sub = 0; end
`FPU_SUB: begin core_select = 1; do_mul = 0; do_sub = 1; end
`FPU_MUL: begin core_select = 1; do_mul = 1; do_sub = 0; end
`FPU_MADD: begin core_select = 2; do_sub = 0; end
`FPU_MSUB: begin core_select = 2; do_sub = 1; end
`FPU_NMADD: begin core_select = 3; do_sub = 0; end
`FPU_NMSUB: begin core_select = 3; do_sub = 1; end
`FPU_DIV: begin core_select = 4; end
`FPU_SQRT: begin core_select = 5; end
`FPU_CVTWS: begin core_select = 6; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 6; is_signed = 0; end
`FPU_CVTSW: begin core_select = 7; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 7; is_signed = 0; end
default: begin core_select = 0; end
endcase
end
@ -88,25 +87,42 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[0])
);
VX_fp_addmul #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_addmul (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 1)),
.ready_in (per_core_ready_in[1]),
.tag_in (tag_in),
.do_sub (do_sub),
.do_mul (do_mul),
.dataa (dataa),
.datab (datab),
.result (per_core_result[1]),
.tag_out (per_core_tag_out[1]),
.ready_out (per_core_ready_out[1]),
.valid_out (per_core_valid_out[1])
);
VX_fp_madd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_madd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 1)),
.ready_in (per_core_ready_in[1]),
.tag_in (tag_in),
.do_add (do_add),
.valid_in (valid_in && (core_select == 2)),
.ready_in (per_core_ready_in[2]),
.tag_in (tag_in),
.do_sub (do_sub),
.do_mul (do_mul),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[1]),
.tag_out (per_core_tag_out[1]),
.ready_out (per_core_ready_out[1]),
.valid_out (per_core_valid_out[1])
.result (per_core_result[2]),
.tag_out (per_core_tag_out[2]),
.ready_out (per_core_ready_out[2]),
.valid_out (per_core_valid_out[2])
);
VX_fp_nmadd #(
@ -115,17 +131,17 @@ module VX_fp_fpga #(
) fp_nmadd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 2)),
.ready_in (per_core_ready_in[2]),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.tag_in (tag_in),
.do_sub (do_sub),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[2]),
.tag_out (per_core_tag_out[2]),
.ready_out (per_core_ready_out[2]),
.valid_out (per_core_valid_out[2])
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
);
VX_fp_div #(
@ -134,15 +150,15 @@ module VX_fp_fpga #(
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.tag_in (tag_in),
.dataa (dataa),
.datab (datab),
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
);
VX_fp_sqrt #(
@ -151,14 +167,14 @@ module VX_fp_fpga #(
) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
);
VX_fp_ftoi #(
@ -167,15 +183,15 @@ module VX_fp_fpga #(
) fp_ftoi (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
);
VX_fp_itof #(
@ -184,15 +200,15 @@ module VX_fp_fpga #(
) fp_itof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.valid_in (valid_in && (core_select == 7)),
.ready_in (per_core_ready_in[7]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
.result (per_core_result[7]),
.tag_out (per_core_tag_out[7]),
.ready_out (per_core_ready_out[7]),
.valid_out (per_core_valid_out[7])
);
reg valid_out_n;

View file

@ -27,6 +27,7 @@ module VX_fp_ftoi #(
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
reg is_signed_r;
@ -36,25 +37,25 @@ module VX_fp_ftoi #(
wire [31:0] result_u;
`ifdef QUARTUS
acl_fp_ftoi ftoi (
acl_ftoi ftoi (
.clk (clk),
.areset (1'b0),
.en (~stall),
.en (enable),
.a (dataa[i]),
.q (result_s)
);
acl_fp_ftou ftou (
acl_ftou ftou (
.clk (clk),
.areset (1'b0),
.en (~stall),
.en (enable),
.a (dataa[i]),
.q (result_u)
);
`else
always @(posedge clk) begin
dpi_ftoi(10*LANES+i, ~stall, valid_in, dataa[i], result_s);
dpi_ftou(11*LANES+i, ~stall, valid_in, dataa[i], result_u);
dpi_ftoi(10*LANES+i, enable, dataa[i], result_s);
dpi_ftou(11*LANES+i, enable, dataa[i], result_u);
end
`endif
@ -67,11 +68,11 @@ module VX_fp_ftoi #(
) shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall),
.enable(enable),
.in ({tag_in, valid_in, is_signed}),
.out({tag_out, valid_out, is_signed_r})
);
assign ready_in = ~stall;
assign ready_in = enable;
endmodule

View file

@ -27,6 +27,7 @@ module VX_fp_itof #(
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
reg is_signed_r;
@ -36,25 +37,25 @@ module VX_fp_itof #(
wire [31:0] result_u;
`ifdef QUARTUS
acl_fp_itof itof (
acl_itof itof (
.clk (clk),
.areset (1'b0),
.en (~stall),
.en (enable),
.a (dataa[i]),
.q (result_s)
);
acl_fp_utof utof (
acl_utof utof (
.clk (clk),
.areset (1'b0),
.en (~stall),
.en (enable),
.a (dataa[i]),
.q (result_u)
);
`else
always @(posedge clk) begin
dpi_itof(12*LANES+i, ~stall, valid_in, dataa[i], result_s);
dpi_utof(13*LANES+i, ~stall, valid_in, dataa[i], result_u);
dpi_itof(12*LANES+i, enable, dataa[i], result_s);
dpi_utof(13*LANES+i, enable, dataa[i], result_u);
end
`endif
@ -67,11 +68,11 @@ module VX_fp_itof #(
) shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall),
.enable(enable),
.in ({tag_in, valid_in, is_signed}),
.out({tag_out, valid_out, is_signed_r})
);
assign ready_in = ~stall;
assign ready_in = enable;
endmodule

View file

@ -16,9 +16,7 @@ module VX_fp_madd #(
input wire [TAGW-1:0] tag_in,
input wire do_add,
input wire do_sub,
input wire do_mul,
input wire do_sub,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
@ -32,138 +30,16 @@ module VX_fp_madd #(
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
reg do_add_r, do_sub_r, do_mul_r;
reg do_sub_r;
for (genvar i = 0; i < LANES; i++) begin
wire [31:0] result_add;
wire [31:0] result_sub;
wire [31:0] result_mul;
wire [31:0] result_madd;
wire [31:0] result_msub;
`ifdef QUARTUS
twentynm_fp_mac mac_fp_add (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,~stall}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_add),
.chainout()
);
defparam mac_fp_add.operation_mode = "sp_add";
defparam mac_fp_add.use_chainin = "false";
defparam mac_fp_add.adder_subtract = "false";
defparam mac_fp_add.ax_clock = "0";
defparam mac_fp_add.ay_clock = "0";
defparam mac_fp_add.az_clock = "none";
defparam mac_fp_add.output_clock = "0";
defparam mac_fp_add.accumulate_clock = "none";
defparam mac_fp_add.ax_chainin_pl_clock = "none";
defparam mac_fp_add.accum_pipeline_clock = "none";
defparam mac_fp_add.mult_pipeline_clock = "none";
defparam mac_fp_add.adder_input_clock = "0";
defparam mac_fp_add.accum_adder_clock = "none";
twentynm_fp_mac mac_fp_sub (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,~stall}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_sub),
.chainout()
);
defparam mac_fp_sub.operation_mode = "sp_add";
defparam mac_fp_sub.use_chainin = "false";
defparam mac_fp_sub.adder_subtract = "true";
defparam mac_fp_sub.ax_clock = "0";
defparam mac_fp_sub.ay_clock = "0";
defparam mac_fp_sub.az_clock = "none";
defparam mac_fp_sub.output_clock = "0";
defparam mac_fp_sub.accumulate_clock = "none";
defparam mac_fp_sub.ax_chainin_pl_clock = "none";
defparam mac_fp_sub.accum_pipeline_clock = "none";
defparam mac_fp_sub.mult_pipeline_clock = "none";
defparam mac_fp_sub.adder_input_clock = "0";
defparam mac_fp_sub.accum_adder_clock = "none";
twentynm_fp_mac mac_fp_mul (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,~stall}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_mul),
.chainout()
);
defparam mac_fp_mul.operation_mode = "sp_mult";
defparam mac_fp_mul.use_chainin = "false";
defparam mac_fp_mul.adder_subtract = "false";
defparam mac_fp_mul.ax_clock = "none";
defparam mac_fp_mul.ay_clock = "0";
defparam mac_fp_mul.az_clock = "0";
defparam mac_fp_mul.output_clock = "0";
defparam mac_fp_mul.accumulate_clock = "none";
defparam mac_fp_mul.ax_chainin_pl_clock = "none";
defparam mac_fp_mul.accum_pipeline_clock = "none";
defparam mac_fp_mul.mult_pipeline_clock = "0";
defparam mac_fp_mul.adder_input_clock = "none";
defparam mac_fp_mul.accum_adder_clock = "none";
twentynm_fp_mac mac_fp_madd (
// inputs
.accumulate(),
@ -175,7 +51,7 @@ module VX_fp_madd #(
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,~stall}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
@ -215,7 +91,7 @@ module VX_fp_madd #(
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,~stall}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
@ -245,47 +121,25 @@ module VX_fp_madd #(
defparam mac_fp_msub.accum_adder_clock = "none";
`else
always @(posedge clk) begin
dpi_fadd(0*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_add);
dpi_fsub(1*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_sub);
dpi_fmul(2*LANES+i, ~stall, valid_in, dataa[i], datab[i], result_mul);
dpi_fmadd(3*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(4*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub);
dpi_fmadd(3*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(4*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
end
`endif
reg [31:0] result_r;
always @(*) begin
result_r = 'x;
if (do_mul_r) begin
if (do_add_r)
result_r = result_madd;
else if (do_sub_r)
result_r = result_msub;
else
result_r = result_mul;
end else begin
if (do_add_r)
result_r = result_add;
else if (do_sub_r)
result_r = result_sub;
end
end
assign result[i] = result_r;
assign result[i] = do_sub_r ? result_msub : result_madd;
end
VX_shift_register #(
.DATAW(TAGW + 1 + 1 + 1 + 1),
.DATAW(TAGW + 1 + 1),
.DEPTH(`LATENCY_FMADD)
) shift_reg1 (
) shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall),
.in({tag_in, valid_in, do_add, do_sub, do_mul}),
.out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r})
.enable(enable),
.in({tag_in, valid_in, do_sub}),
.out({tag_out, valid_out, do_sub_r})
);
assign ready_in = ~stall;
assign ready_in = enable;
endmodule

View file

@ -30,13 +30,14 @@ module VX_fp_nmadd #(
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
reg do_sub_r;
for (genvar i = 0; i < LANES; i++) begin
wire [31:0] result_madd;
wire [31:0] result_msub;
wire [31:0] result_msub;
wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;
@ -52,7 +53,7 @@ module VX_fp_nmadd #(
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,~stall}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
@ -161,33 +162,36 @@ module VX_fp_nmadd #(
defparam mac_fp_neg.adder_input_clock = "0";
defparam mac_fp_neg.accum_adder_clock = "none";
`else
reg valid_in_st0;
always @(posedge clk) begin
valid_in_st0 <= reset ? 0 : valid_in;
dpi_fmadd(5*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(6*LANES+i, ~stall, valid_in, dataa[i], datab[i], datac[i], result_msub);
dpi_fsub(7*LANES+i, ~stall, valid_in_st0, 32'b0, result_st0, result[i]);
dpi_fmadd(5*LANES+i, enable, dataa[i], datab[i], datac[i], result_madd);
dpi_fmsub(6*LANES+i, enable, dataa[i], datab[i], datac[i], result_msub);
dpi_fsub(7*LANES+i, enable, 32'b0, result_st0, result[i]);
end
`endif
end
always @(posedge clk) begin
if (~stall) begin
do_sub_r <= do_sub;
end
end
VX_shift_register #(
.DATAW(1),
.DEPTH(`LATENCY_FMADD)
) shift_reg0 (
.clk(clk),
.reset(reset),
.enable(enable),
.in({do_sub}),
.out({do_sub_r})
);
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_FNMADD)
.DEPTH(`LATENCY_FMADD + `LATENCY_FADDMUL)
) shift_reg1 (
.clk(clk),
.reset(reset),
.enable(~stall),
.enable(enable),
.in({tag_in, valid_in}),
.out({tag_out, valid_out})
);
assign ready_in = ~stall;
assign ready_in = enable;
endmodule

View file

@ -25,19 +25,20 @@ module VX_fp_sqrt #(
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
for (genvar i = 0; i < LANES; i++) begin
`ifdef QUARTUS
acl_fp_sqrt fsqrt (
acl_fsqrt fsqrt (
.clk (clk),
.areset (1'b0),
.en (~stall),
.en (enable),
.a (dataa[i]),
.q (result[i])
);
`else
always @(posedge clk) begin
dpi_fsqrt(9*LANES+i, ~stall, valid_in, dataa[i], result[i]);
dpi_fsqrt(9*LANES+i, enable, dataa[i], result[i]);
end
`endif
end
@ -48,11 +49,11 @@ module VX_fp_sqrt #(
) shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall),
.enable(enable),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
assign ready_in = ~stall;
assign ready_in = enable;
endmodule

View file

@ -15,12 +15,12 @@
// applicable agreement for further details.
// ---------------------------------------------------------------------------
// SystemVerilog created from acl_fp_div
// SystemVerilog created on Mon Aug 31 06:15:17 2020
// SystemVerilog created from acl_fdiv
// SystemVerilog created on Wed Sep 2 07:11:09 2020
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
module acl_fp_div (
module acl_fdiv (
input wire [31:0] a,
input wire [31:0] b,
input wire [0:0] en,
@ -623,7 +623,7 @@ module acl_fp_div (
.outdata_aclr_a("CLEAR0"),
.clock_enable_input_a("NORMAL"),
.power_up_uninitialized("FALSE"),
.init_file("acl_fp_div_memoryC2_uid118_invTables_lutmem.hex"),
.init_file("acl_fdiv_memoryC2_uid118_invTables_lutmem.hex"),
.init_file_layout("PORT_A"),
.intended_device_family("Arria 10")
) memoryC2_uid118_invTables_lutmem_dmem (
@ -755,7 +755,7 @@ module acl_fp_div (
.outdata_aclr_a("CLEAR0"),
.clock_enable_input_a("NORMAL"),
.power_up_uninitialized("FALSE"),
.init_file("acl_fp_div_memoryC1_uid115_invTables_lutmem.hex"),
.init_file("acl_fdiv_memoryC1_uid115_invTables_lutmem.hex"),
.init_file_layout("PORT_A"),
.intended_device_family("Arria 10")
) memoryC1_uid115_invTables_lutmem_dmem (
@ -1060,7 +1060,7 @@ module acl_fp_div (
.outdata_aclr_a("CLEAR0"),
.clock_enable_input_a("NORMAL"),
.power_up_uninitialized("FALSE"),
.init_file("acl_fp_div_memoryC0_uid112_invTables_lutmem.hex"),
.init_file("acl_fdiv_memoryC0_uid112_invTables_lutmem.hex"),
.init_file_layout("PORT_A"),
.intended_device_family("Arria 10")
) memoryC0_uid112_invTables_lutmem_dmem (

View file

@ -15,12 +15,12 @@
// applicable agreement for further details.
// ---------------------------------------------------------------------------
// SystemVerilog created from acl_fp_sqrt
// SystemVerilog created on Mon Aug 31 06:15:18 2020
// SystemVerilog created from acl_fsqrt
// SystemVerilog created on Wed Sep 2 07:11:09 2020
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
module acl_fp_sqrt (
module acl_fsqrt (
input wire [31:0] a,
input wire [0:0] en,
output wire [31:0] q,
@ -279,7 +279,7 @@ module acl_fp_sqrt (
.outdata_aclr_a("CLEAR0"),
.clock_enable_input_a("NORMAL"),
.power_up_uninitialized("FALSE"),
.init_file("acl_fp_sqrt_memoryC2_uid68_sqrtTables_lutmem.hex"),
.init_file("acl_fsqrt_memoryC2_uid68_sqrtTables_lutmem.hex"),
.init_file_layout("PORT_A"),
.intended_device_family("Arria 10")
) memoryC2_uid68_sqrtTables_lutmem_dmem (
@ -412,7 +412,7 @@ module acl_fp_sqrt (
.outdata_aclr_a("CLEAR0"),
.clock_enable_input_a("NORMAL"),
.power_up_uninitialized("FALSE"),
.init_file("acl_fp_sqrt_memoryC1_uid65_sqrtTables_lutmem.hex"),
.init_file("acl_fsqrt_memoryC1_uid65_sqrtTables_lutmem.hex"),
.init_file_layout("PORT_A"),
.intended_device_family("Arria 10")
) memoryC1_uid65_sqrtTables_lutmem_dmem (
@ -723,7 +723,7 @@ module acl_fp_sqrt (
.outdata_aclr_a("CLEAR0"),
.clock_enable_input_a("NORMAL"),
.power_up_uninitialized("FALSE"),
.init_file("acl_fp_sqrt_memoryC0_uid62_sqrtTables_lutmem.hex"),
.init_file("acl_fsqrt_memoryC0_uid62_sqrtTables_lutmem.hex"),
.init_file_layout("PORT_A"),
.intended_device_family("Arria 10")
) memoryC0_uid62_sqrtTables_lutmem_dmem (

View file

@ -15,12 +15,12 @@
// applicable agreement for further details.
// ---------------------------------------------------------------------------
// SystemVerilog created from acl_fp_ftoi
// SystemVerilog created on Mon Aug 31 06:15:18 2020
// SystemVerilog created from acl_ftoi
// SystemVerilog created on Wed Sep 2 07:11:09 2020
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
module acl_fp_ftoi (
module acl_ftoi (
input wire [31:0] a,
input wire [0:0] en,
output wire [31:0] q,

View file

@ -15,12 +15,12 @@
// applicable agreement for further details.
// ---------------------------------------------------------------------------
// SystemVerilog created from acl_fp_ftou
// SystemVerilog created on Mon Aug 31 06:15:18 2020
// SystemVerilog created from acl_ftou
// SystemVerilog created on Wed Sep 2 07:11:09 2020
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
module acl_fp_ftou (
module acl_ftou (
input wire [31:0] a,
input wire [0:0] en,
output wire [31:0] q,

View file

@ -0,0 +1,169 @@
starting execution ...
build model options ...
argc=21
Generation context:
HardFP is enabled enabling set to true
Faithful rounding constraint detected
Will not generate valid and channel signals
The new component name is acl_fdiv
Frequency 250MHz
Deployment FPGA Arria10
Estimated resources LUTs 539, DSPs 5, RAMBits 32768, RAMBlocks 3
The pipeline depth of the block is 15 cycle(s)
@@start
@name FPDiv@
@latency 15@
@LUT 539@
@DSP 5@
@RAMBits 32768@
@RAMBlockUsage 3@
@enable 1@
@subnormals 0@
@error 1.00@
@rounding NA@
@method polynomial approximation@
@inPort 0 fpieee 8 23@
@inPort 1 fpieee 8 23@
@outPort 0 fpieee 8 23@
@nochanvalid 1@
@@end
starting execution ...
build model options ...
argc=20
Generation context:
HardFP is enabled enabling set to true
Faithful rounding constraint detected
Will not generate valid and channel signals
The new component name is acl_fsqrt
Frequency 250MHz
Deployment FPGA Arria10
Estimated resources LUTs 271, DSPs 3, RAMBits 15872, RAMBlocks 3
The pipeline depth of the block is 10 cycle(s)
@@start
@name FPSqrt@
@latency 10@
@LUT 271@
@DSP 3@
@RAMBits 15872@
@RAMBlockUsage 3@
@enable 1@
@subnormals 0@
@error 1.00@
@rounding NA@
@method polynomial approximation@
@inPort 0 fpieee 8 23@
@outPort 0 fpieee 8 23@
@nochanvalid 1@
@@end
starting execution ...
build model options ...
argc=23
Generation context:
HardFP is enabled enabling set to true
Faithful rounding constraint detected
Will not generate valid and channel signals
The new component name is acl_ftoi
Frequency 250MHz
Deployment FPGA Arria10
Estimated resources LUTs 327, DSPs 0, RAMBits 0, RAMBlocks 0
The pipeline depth of the block is 3 cycle(s)
@@start
@name FPToFXP@
@latency 3@
@LUT 327@
@DSP 0@
@RAMBits 0@
@RAMBlockUsage 0@
@enable 1@
@subnormals 0@
@error 1.00@
@rounding NA@
@method default@
@inPort 0 fpieee 8 23@
@outPort 0 fxp 32 0 1@
@nochanvalid 1@
@@end
starting execution ...
build model options ...
argc=23
Generation context:
HardFP is enabled enabling set to true
Faithful rounding constraint detected
Will not generate valid and channel signals
The new component name is acl_ftou
Frequency 250MHz
Deployment FPGA Arria10
Estimated resources LUTs 287, DSPs 0, RAMBits 0, RAMBlocks 0
The pipeline depth of the block is 3 cycle(s)
@@start
@name FPToFXP@
@latency 3@
@LUT 287@
@DSP 0@
@RAMBits 0@
@RAMBlockUsage 0@
@enable 1@
@subnormals 0@
@error 1.00@
@rounding NA@
@method default@
@inPort 0 fpieee 8 23@
@outPort 0 fxp 32 0 0@
@nochanvalid 1@
@@end
starting execution ...
build model options ...
argc=23
Generation context:
HardFP is enabled enabling set to true
Faithful rounding constraint detected
Will not generate valid and channel signals
The new component name is acl_itof
Frequency 250MHz
Deployment FPGA Arria10
Estimated resources LUTs 397, DSPs 0, RAMBits 0, RAMBlocks 0
The pipeline depth of the block is 7 cycle(s)
@@start
@name FXPToFP@
@latency 7@
@LUT 397@
@DSP 0@
@RAMBits 0@
@RAMBlockUsage 0@
@enable 1@
@subnormals 0@
@error 1.00@
@rounding NA@
@method default@
@inPort 0 fxp 32 0 1@
@outPort 0 fpieee 8 23@
@nochanvalid 1@
@@end
starting execution ...
build model options ...
argc=23
Generation context:
HardFP is enabled enabling set to true
Faithful rounding constraint detected
Will not generate valid and channel signals
The new component name is acl_utof
Frequency 300MHz
Deployment FPGA Arria10
Estimated resources LUTs 363, DSPs 0, RAMBits 0, RAMBlocks 0
The pipeline depth of the block is 7 cycle(s)
@@start
@name FXPToFP@
@latency 7@
@LUT 363@
@DSP 0@
@RAMBits 0@
@RAMBlockUsage 0@
@enable 1@
@subnormals 0@
@error 1.00@
@rounding NA@
@method default@
@inPort 0 fxp 32 0 0@
@outPort 0 fpieee 8 23@
@nochanvalid 1@
@@end

View file

@ -0,0 +1,25 @@
#!/bin/bash
# Generates the floating-point IP cores (acl_fdiv, acl_fsqrt, acl_ftoi,
# acl_ftou, acl_itof, acl_utof) for Arria 10 by running cmdPolyEval from the
# Quartus dspba backend once per core. Everything the tool prints (stdout and
# stderr) is captured in acl_gen.log.
#
# Environment:
#   QUARTUS_HOME  root of the Quartus installation (required)
#
# Exit status: 0 when every core was generated, 1 when the environment is
# unusable or at least one cmdPolyEval invocation failed.

# Without this check an unset QUARTUS_HOME silently turns the tool path into
# /dspba/backend/linux64.
if [ -z "${QUARTUS_HOME:-}" ]; then
    echo "error: QUARTUS_HOME is not set" >&2
    exit 1
fi

CMD_POLY_EVAL_PATH="$QUARTUS_HOME/dspba/backend/linux64"
if [ ! -x "$CMD_POLY_EVAL_PATH/cmdPolyEval" ]; then
    echo "error: cmdPolyEval not found in $CMD_POLY_EVAL_PATH" >&2
    exit 1
fi

# Options shared by every core. An array keeps each word a separate argument
# without relying on unquoted word splitting.
OPTIONS=(-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable
         -faithfulRounding -noChanValid -enable -speedgrade 2)

# Put the tool directory on the loader path. The ':' separator is only added
# when LD_LIBRARY_PATH already has a value, so no empty entry (which the loader
# treats as the current directory) is created.
export LD_LIBRARY_PATH="$CMD_POLY_EVAL_PATH${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

EXP_BITS=8
MAN_BITS=23
FBITS="f$((EXP_BITS + MAN_BITS + 1))"

echo "Generating IP cores for $FBITS"

# gen_core NAME FREQ_MHZ OPERATION [OPERANDS...]
# Runs cmdPolyEval with the shared options followed by the per-core arguments.
gen_core() {
    local name=$1 freq=$2
    shift 2
    "$CMD_POLY_EVAL_PATH/cmdPolyEval" "${OPTIONS[@]}" -name "$name" -frequency "$freq" "$@"
}

# Every core is attempted even if an earlier one fails; failures are remembered
# so the script can report them through its exit status.
status=0
{
    gen_core acl_fdiv  250 FPDiv   "$EXP_BITS" "$MAN_BITS" 0      || status=1
    gen_core acl_fsqrt 250 FPSqrt  "$EXP_BITS" "$MAN_BITS"        || status=1
    # The trailing "32 0 1" / "32 0 0" describe the fixed-point port; they
    # presumably mean width, fraction bits and signedness — TODO confirm.
    gen_core acl_ftoi  250 FPToFXP "$EXP_BITS" "$MAN_BITS" 32 0 1 || status=1
    gen_core acl_ftou  250 FPToFXP "$EXP_BITS" "$MAN_BITS" 32 0 0 || status=1
    gen_core acl_itof  250 FXPToFP 32 0 1 "$EXP_BITS" "$MAN_BITS" || status=1
    # NOTE(review): acl_utof targets 300 MHz while every other core uses 250 —
    # confirm this is intentional.
    gen_core acl_utof  300 FXPToFP 32 0 0 "$EXP_BITS" "$MAN_BITS" || status=1
} > acl_gen.log 2>&1

#cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv .

if [ "$status" -ne 0 ]; then
    echo "error: IP core generation failed, see acl_gen.log" >&2
fi
exit "$status"

View file

@ -15,12 +15,12 @@
// applicable agreement for further details.
// ---------------------------------------------------------------------------
// SystemVerilog created from acl_fp_itof
// SystemVerilog created on Mon Aug 31 06:15:18 2020
// SystemVerilog created from acl_itof
// SystemVerilog created on Wed Sep 2 07:11:09 2020
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
module acl_fp_itof (
module acl_itof (
input wire [31:0] a,
input wire [0:0] en,
output wire [31:0] q,

View file

@ -15,12 +15,12 @@
// applicable agreement for further details.
// ---------------------------------------------------------------------------
// SystemVerilog created from acl_fp_utof
// SystemVerilog created on Mon Aug 31 06:15:18 2020
// SystemVerilog created from acl_utof
// SystemVerilog created on Wed Sep 2 07:11:09 2020
(* altera_attribute = "-name AUTO_SHIFT_REGISTER_RECOGNITION OFF; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 10037; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 15400; -name MESSAGE_DISABLE 14130; -name MESSAGE_DISABLE 10036; -name MESSAGE_DISABLE 12020; -name MESSAGE_DISABLE 12030; -name MESSAGE_DISABLE 12010; -name MESSAGE_DISABLE 12110; -name MESSAGE_DISABLE 14320; -name MESSAGE_DISABLE 13410; -name MESSAGE_DISABLE 113007; -name MESSAGE_DISABLE 10958" *)
module acl_fp_utof (
module acl_utof (
input wire [31:0] a,
input wire [0:0] en,
output wire [31:0] q,

View file

@ -1,25 +0,0 @@
#!/bin/bash
# Generates the acl_fp_* floating-point IP cores for Arria 10 by invoking
# cmdPolyEval once per core, then copies dspba_library_ver.sv into the
# current directory.
# NOTE(review): QUARTUS_HOME is read from the environment and is not checked
# here; an unset value yields paths rooted at "/dspba/...".
CMD_POLY_EVAL_PATH=$QUARTUS_HOME/dspba/backend/linux64
# Options passed to every cmdPolyEval invocation below.
OPTIONS="-target Arria10 -lang verilog -enableHardFP 1 -printMachineReadable -faithfulRounding -noChanValid -enable -speedgrade 2"
# Prepend the tool directory to the shared-library search path.
export LD_LIBRARY_PATH=$CMD_POLY_EVAL_PATH:$LD_LIBRARY_PATH
# CMD is expanded unquoted below so that the options split into separate words.
CMD="$CMD_POLY_EVAL_PATH/cmdPolyEval $OPTIONS"
EXP_BITS=8
MAN_BITS=23
# Format label used only in the progress message: "f" + total bit count (f32 here).
FBITS="f$(($EXP_BITS + $MAN_BITS + 1))"
echo Generating IP cores for $FBITS
# stdout and stderr of all six invocations are collected in log.txt.
{
$CMD -name acl_fp_div -frequency 250 FPDiv $EXP_BITS $MAN_BITS 0
$CMD -name acl_fp_sqrt -frequency 250 FPSqrt $EXP_BITS $MAN_BITS
$CMD -name acl_fp_ftoi -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 1
$CMD -name acl_fp_ftou -frequency 250 FPToFXP $EXP_BITS $MAN_BITS 32 0 0
$CMD -name acl_fp_itof -frequency 250 FXPToFP 32 0 1 $EXP_BITS $MAN_BITS
# NOTE(review): this core alone is generated at 300 MHz — confirm intentional.
$CMD -name acl_fp_utof -frequency 300 FXPToFP 32 0 0 $EXP_BITS $MAN_BITS
} > log.txt 2>&1
cp $QUARTUS_HOME/dspba/backend/Libraries/sv/base/dspba_library_ver.sv .

View file

@ -8,21 +8,19 @@
#include "VX_config.h"
// C-linkage prototypes of the floating-point DPI models defined further down
// in this file.
//   inst    key used to look up the per-instance ShiftRegister
//   enable  when false, no new value is pushed into the delay line
//   a,b,c   operands; float operands are passed as raw 32-bit patterns
//   result  receives the value at the head of the delay line
// Each function is declared exactly once: C-linkage functions cannot be
// overloaded, so the older prototypes that carried an extra `valid` argument
// must not be declared alongside these.
extern "C" {
    void dpi_fadd(int inst, bool enable, int a, int b, int* result);
    void dpi_fsub(int inst, bool enable, int a, int b, int* result);
    void dpi_fmul(int inst, bool enable, int a, int b, int* result);
    void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result);
    void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result);
    void dpi_fdiv(int inst, bool enable, int a, int b, int* result);
    void dpi_fsqrt(int inst, bool enable, int a, int* result);
    void dpi_ftoi(int inst, bool enable, int a, int* result);
    void dpi_ftou(int inst, bool enable, int a, int* result);
    void dpi_itof(int inst, bool enable, int a, int* result);
    void dpi_utof(int inst, bool enable, int a, int* result);
}
extern double sc_time_stamp();
class ShiftRegister {
public:
ShiftRegister() : init_(false), depth_(0) {}
@ -35,37 +33,36 @@ public:
}
}
void push(int value, bool enable, bool valid) {
void push(int value, bool enable) {
if (!enable)
return;
for (unsigned i = 0; i < depth_-1; ++i) {
buffer_[i] = buffer_[i+1];
}
buffer_[depth_-1].value = value;
buffer_[depth_-1].valid = valid;
buffer_[depth_-1] = value;
}
// Returns the entry at index 0, i.e. the oldest value still held in buffer_
// (push() moves entries towards index 0 and appends at the end).
// Precondition: buffer_ is non-empty.
int top() const {
    return buffer_[0];
}
private:
struct entry_t {
int value;
bool valid;
};
std::vector<entry_t> buffer_;
int top_;
unsigned depth_;
std::vector<int> buffer_;
bool init_;
unsigned depth_;
};
// Overlays a float, an int and a 23/8/1-bit field view on the same storage.
// The dpi_* functions below store their int argument in `i` and read it back
// through `f` (and vice versa) to reinterpret the bit pattern.
// NOTE(review): the `parts` view only lines up with mantissa/exponent/sign if
// the compiler allocates bit-fields starting at the least-significant bit;
// bit-field layout is implementation-defined — confirm for the target compiler.
union Float_t {
float f; // the value viewed as a float
int i; // the same storage viewed as an int
struct {
uint32_t man : 23; // 23-bit field (mantissa, by its name)
uint32_t exp : 8; // 8-bit field (exponent, by its name)
uint32_t sign : 1; // 1-bit field
} parts;
};
class Instances {
public:
ShiftRegister& get(int inst) {
@ -82,130 +79,152 @@ private:
Instances instances;
void dpi_fadd(int inst, bool enable, bool valid, int a, int b, int* result) {
void dpi_fadd(int inst, bool enable, int a, int b, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa + fb;
Float_t fa, fb, fr;
sr.ensure_init(LATENCY_FMADD);
sr.push(*(int*)&fr, enable, valid);
fa.i = a;
fb.i = b;
fr.f = fa.f + fb.f;
sr.ensure_init(LATENCY_FADDMUL);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fsub(int inst, bool enable, bool valid, int a, int b, int* result) {
void dpi_fsub(int inst, bool enable, int a, int b, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa - fb;
Float_t fa, fb, fr;
sr.ensure_init(LATENCY_FMADD);
sr.push(*(int*)&fr, enable, valid);
fa.i = a;
fb.i = b;
fr.f = fa.f - fb.f;
sr.ensure_init(LATENCY_FADDMUL);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fmul(int inst, bool enable, bool valid, int a, int b, int* result) {
void dpi_fmul(int inst, bool enable, int a, int b, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa * fb;
Float_t fa, fb, fr;
sr.ensure_init(LATENCY_FMADD);
sr.push(*(int*)&fr, enable, valid);
fa.i = a;
fb.i = b;
fr.f = fa.f * fb.f;
sr.ensure_init(LATENCY_FADDMUL);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fmadd(int inst, bool enable, bool valid, int a, int b, int c, int* result) {
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fc = *(float*)&c;
float fr = fa * fb + fc;
Float_t fa, fb, fc, fr;
fa.i = a;
fb.i = b;
fc.i = c;
fr.f = fa.f * fb.f + fc.f;
sr.ensure_init(LATENCY_FMADD);
sr.push(*(int*)&fr, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fmsub(int inst, bool enable, bool valid, int a, int b, int c, int* result) {
void dpi_fmsub(int inst, bool enable, int a, int b, int c, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fc = *(float*)&c;
float fr = fa * fb - fc;
Float_t fa, fb, fc, fr;
fa.i = a;
fb.i = b;
fc.i = c;
fr.f = fa.f * fb.f - fc.f;
sr.ensure_init(LATENCY_FMADD);
sr.push(*(int*)&fr, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fdiv(int inst, bool enable, bool valid, int a, int b, int* result) {
void dpi_fdiv(int inst, bool enable, int a, int b, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa / fb;
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f / fb.f;
sr.ensure_init(LATENCY_FDIV);
sr.push(*(int*)&fr, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fsqrt(int inst, bool enable, bool valid, int a, int* result) {
void dpi_fsqrt(int inst, bool enable, int a, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
float fr = sqrtf(fa);
Float_t fa, fr;
fa.i = a;
fr.f = sqrtf(fa.f);
sr.ensure_init(LATENCY_FSQRT);
sr.push(*(int*)&fr, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_ftoi(int inst, bool enable, bool valid, int a, int* result) {
void dpi_ftoi(int inst, bool enable, int a, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
int ir = int(fa);
Float_t fa, fr;
fa.i = a;
fr.i = int(fa.f);
sr.ensure_init(LATENCY_FTOI);
sr.push(ir, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_ftou(int inst, bool enable, bool valid, int a, int* result) {
void dpi_ftou(int inst, bool enable, int a, int* result) {
ShiftRegister& sr = instances.get(inst);
float fa = *(float*)&a;
unsigned ir = unsigned(fa);
Float_t fa, fr;
fa.i = a;
fr.i = unsigned(fa.f);
sr.ensure_init(LATENCY_FTOI);
sr.push(ir, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_itof(int inst, bool enable, bool valid, int a, int* result) {
void dpi_itof(int inst, bool enable, int a, int* result) {
ShiftRegister& sr = instances.get(inst);
float fr = (float)a;
Float_t fa, fr;
fr.f = (float)a;
sr.ensure_init(LATENCY_ITOF);
sr.push(*(int*)&fr, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_utof(int inst, bool enable, bool valid, int a, int* result) {
void dpi_utof(int inst, bool enable, int a, int* result) {
ShiftRegister& sr = instances.get(inst);
unsigned ua = *(unsigned*)&a;
float fr = (float)ua;
Float_t fa, fr;
unsigned ua = a;
fr.f = (float)ua;
sr.ensure_init(LATENCY_ITOF);
sr.push(*(int*)&fr, enable, valid);
sr.push(fr.i, enable);
*result = sr.top();
}

View file

@ -1,16 +1,16 @@
`ifndef FLOAT_DPI
`define FLOAT_DPI

// DPI-C imports of the floating-point simulation models.
//   inst    identifies the calling instance
//   enable  gates the update of the model's internal state
//   a,b,c   operands (32-bit values)
//   result  value returned by the model
// Each function is imported exactly once; the prototypes must stay in step
// with the C++ definitions (no `valid` argument).
// NOTE(review): `logic` arguments map to svLogic on the C side while the C++
// prototypes take `bool` — confirm the simulator in use accepts this pairing.

import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, output int result);
import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, output int result);
import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, output int result);

`endif

View file

@ -1,21 +1,18 @@
`include "VX_platform.vh"
module VX_divide #(
parameter WIDTHN = 1,
parameter WIDTHD = 1,
parameter WIDTHQ = 1,
parameter WIDTHR = 1,
parameter WIDTHN = 1,
parameter WIDTHD = 1,
parameter WIDTHQ = 1,
parameter WIDTHR = 1,
parameter NSIGNED = 0,
parameter DSIGNED = 0,
parameter PIPELINE = 0
parameter LATENCY = 0
) (
input wire clk,
input wire reset,
input wire clk_en,
input wire enable,
input wire [WIDTHN-1:0] numer,
input wire [WIDTHD-1:0] denom,
output wire [WIDTHQ-1:0] quotient,
output wire [WIDTHR-1:0] remainder
);
@ -27,11 +24,11 @@ module VX_divide #(
lpm_divide divide (
.clock (clk),
.clken (enable),
.numer (numer),
.denom (denom),
.quotient (quotient_unqual),
.remain (remainder_unqual),
.clken (clk_en)
.remain (remainder_unqual)
);
defparam
@ -41,7 +38,7 @@ module VX_divide #(
divide.lpm_nrepresentation = NSIGNED ? "SIGNED" : "UNSIGNED",
divide.lpm_drepresentation = DSIGNED ? "SIGNED" : "UNSIGNED",
divide.lpm_hint = "MAXIMIZE_SPEED=6,LPM_REMAINDERPOSITIVE=FALSE",
divide.lpm_pipeline = PIPELINE;
divide.lpm_pipeline = LATENCY;
assign quotient = quotient_unqual [WIDTHQ-1:0];
assign remainder = remainder_unqual [WIDTHR-1:0];
@ -72,34 +69,24 @@ module VX_divide #(
end
end
if (PIPELINE == 0) begin
if (LATENCY == 0) begin
assign quotient = quotient_unqual [WIDTHQ-1:0];
assign remainder = remainder_unqual [WIDTHR-1:0];
end else begin
reg [WIDTHN-1:0] quotient_pipe [0:PIPELINE-1];
reg [WIDTHD-1:0] remainder_pipe [0:PIPELINE-1];
reg [WIDTHN-1:0] quotient_pipe [0:LATENCY-1];
reg [WIDTHD-1:0] remainder_pipe [0:LATENCY-1];
for (genvar i = 0; i < PIPELINE; i++) begin
always @(posedge clk) begin
if (reset) begin
quotient_pipe[i] <= 0;
remainder_pipe[i] <= 0;
end else begin
if (clk_en) begin
if (i == 0) begin
quotient_pipe[i] <= quotient_unqual;
remainder_pipe[i] <= remainder_unqual;
end else begin
quotient_pipe[i] <= quotient_pipe[i-1];
remainder_pipe[i] <= remainder_pipe[i-1];
end
end
for (genvar i = 0; i < LATENCY; i++) begin
always @(posedge clk) begin
if (enable) begin
quotient_pipe[i] <= (0 == i) ? quotient_unqual : quotient_pipe[i-1];
remainder_pipe[i] <= (0 == i) ? remainder_unqual : remainder_pipe[i-1];
end
end
end
assign quotient = quotient_pipe[PIPELINE-1][WIDTHQ-1:0];
assign remainder = remainder_pipe[PIPELINE-1][WIDTHR-1:0];
assign quotient = quotient_pipe[LATENCY-1][WIDTHQ-1:0];
assign remainder = remainder_pipe[LATENCY-1][WIDTHR-1:0];
end
`endif

View file

@ -1,16 +1,14 @@
`include "VX_platform.vh"
module VX_multiplier #(
parameter WIDTHA = 1,
parameter WIDTHB = 1,
parameter WIDTHP = 1,
parameter SIGNED = 0,
parameter PIPELINE = 0
parameter WIDTHA = 1,
parameter WIDTHB = 1,
parameter WIDTHP = 1,
parameter SIGNED = 0,
parameter LATENCY = 0
) (
input wire clk,
input wire reset,
input wire clk_en,
input wire clk,
input wire enable,
input wire [WIDTHA-1:0] dataa,
input wire [WIDTHB-1:0] datab,
output wire [WIDTHP-1:0] result
@ -20,20 +18,22 @@ module VX_multiplier #(
lpm_mult mult (
.clock (clk),
.clken (enable),
.dataa (dataa),
.datab (datab),
.result (result),
.clken (clk_en),
.result (result),
.aclr (1'b0),
.sclr (1'b0),
.sum (1'b0)
);
defparam mult.lpm_type = "LPM_MULT",
defparam mult.lpm_type = "LPM_MULT",
mult.lpm_widtha = WIDTHA,
mult.lpm_widthb = WIDTHB,
mult.lpm_widthp = WIDTHP,
mult.lpm_representation = SIGNED ? "SIGNED" : "UNSIGNED",
mult.lpm_pipeline = PIPELINE,
mult.lpm_hint = "MAXIMIZE_SPEED=9,DEDICATED_MULTIPLIER_CIRCUITRY=YES";
mult.lpm_pipeline = LATENCY,
mult.lpm_hint = "DEDICATED_MULTIPLIER_CIRCUITRY=YES,MAXIMIZE_SPEED=9";
`else
wire [WIDTHP-1:0] result_unqual;
@ -44,29 +44,20 @@ module VX_multiplier #(
assign result_unqual = dataa * datab;
end
if (PIPELINE == 0) begin
if (LATENCY == 0) begin
assign result = result_unqual;
end else begin
reg [WIDTHP-1:0] result_pipe [0:PIPELINE-1];
end else begin
reg [WIDTHP-1:0] result_pipe [0:LATENCY-1];
for (genvar i = 0; i < PIPELINE; i++) begin
for (genvar i = 0; i < LATENCY; i++) begin
always @(posedge clk) begin
if (reset) begin
result_pipe[i] <= 0;
end else begin
if (clk_en) begin
if (i == 0) begin
result_pipe[i] <= result_unqual;
end else begin
result_pipe[i] <= result_pipe[i-1];
end
end
if (enable) begin
result_pipe[i] <= (0 == i) ? result_unqual : result_pipe[i-1];
end
end
end
assign result = result_pipe[PIPELINE-1];
end
assign result = result_pipe[LATENCY-1];
end
`endif