FPU DPI fallback

This commit is contained in:
Blaise Tine 2020-08-31 09:19:55 -04:00
parent 0a0b28aac0
commit df711986bc
29 changed files with 1147 additions and 1329 deletions

View file

@ -34,9 +34,10 @@ LDFLAGS += -shared -pthread
TOP = Vortex
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
SRCS += ../../hw/rtl/fp_cores/svdpi/float_dpi.cpp
FPU_INCLUDE = -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src
RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/fp_cores $(FPU_INCLUDE)
FPU_INCLUDE = -I../../hw/rtl/fp_cores -I../../hw/rtl/fp_cores/svdpi -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src
RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache $(FPU_INCLUDE)
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
VL_FLAGS += -Wno-DECLFILENAME

View file

@ -51,15 +51,55 @@
`define L3_ENABLE (`NUM_CLUSTERS > 1)
`endif
`ifndef EXT_M_DISABLE
`define EXT_M_ENABLE
`endif
`ifndef EXT_F_DISABLE
`define EXT_F_ENABLE
`endif
`define FPNEW_ENABLE
// Device identification
`define VENDOR_ID 0
`define ARCHITECTURE_ID 0
`define IMPLEMENTATION_ID 0
///////////////////////////////////////////////////////////////////////////////
`ifndef LATENCY_IMUL
`define LATENCY_IMUL 3
`endif
`ifndef LATENCY_FNONCOMP
`define LATENCY_FNONCOMP 1
`endif
`ifndef LATENCY_FMADD
`define LATENCY_FMADD 1
`endif
`ifndef LATENCY_FNMADD
`define LATENCY_FNMADD 2
`endif
`ifndef LATENCY_FDIV
`define LATENCY_FDIV 15
`endif
`ifndef LATENCY_FSQRT
`define LATENCY_FSQRT 9
`endif
`ifndef LATENCY_ITOF
`define LATENCY_ITOF 7
`endif
`ifndef LATENCY_FTOI
`define LATENCY_FTOI 3
`endif
// CSR Addresses //////////////////////////////////////////////////////////////
`define CSR_FFLAGS 12'h001

View file

@ -35,22 +35,6 @@
///////////////////////////////////////////////////////////////////////////////
`define LATENCY_IMUL 3
`define LATENCY_FDIV 16
`define LATENCY_FSQRT 10
`define LATENCY_FTOI 5
`define LATENCY_FTOU 4
`define LATENCY_ITOF 8
`define LATENCY_UTOF 7
`define LATENCY_FMULADD 2
`define LATENCY_FDIVSQRT 2
`define LATENCY_FCONV 2
`define LATENCY_FNONCOMP 1
///////////////////////////////////////////////////////////////////////////////
`define INST_LUI 7'b0110111
`define INST_AUIPC 7'b0010111
`define INST_JAL 7'b1101111

View file

@ -56,7 +56,7 @@ module VX_fpu_unit #(
// can accept new request?
assign fpu_req_if.ready = ready_in && ~fpuq_full;
`ifdef SYNTHESIS
`ifndef FPNEW_ENABLE
VX_fp_fpga #(
.TAGW (FPUQ_BITS)

View file

@ -1,5 +1,9 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
module VX_fp_div #(
parameter TAGW = 1,
parameter LANES = 1
@ -21,19 +25,23 @@ module VX_fp_div #(
input wire ready_out,
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
assign ready_in = enable;
wire stall = ~ready_out && valid_out;
for (genvar i = 0; i < LANES; i++) begin
`ifdef QUARTUS
acl_fp_div fdiv (
.clk (clk),
.areset (1'b0),
.en (enable),
.en (~stall),
.a (dataa[i]),
.b (datab[i]),
.q (result[i])
);
`else
always @(posedge clk) begin
dpi_fdiv(clk, ~stall, dataa[i], datab[i], result[i]);
end
`endif
end
VX_shift_register #(
@ -42,9 +50,11 @@ module VX_fp_div #(
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.enable(~stall),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
assign ready_in = ~stall;
endmodule

View file

@ -1,5 +1,4 @@
`include "VX_define.vh"
`include "dspba_library_ver.sv"
module VX_fp_fpga #(
parameter TAGW = 1
@ -28,7 +27,7 @@ module VX_fp_fpga #(
input wire ready_out,
output wire valid_out
);
localparam NUM_FPC = 12;
localparam NUM_FPC = 7;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
wire [NUM_FPC-1:0] per_core_ready_in;
@ -41,26 +40,30 @@ module VX_fp_fpga #(
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
reg [FPC_BITS-1:0] core_select;
reg fmadd_negate;
reg do_add, do_sub, do_mul;
reg is_signed;
always @(*) begin
core_select = 0;
fmadd_negate = 0;
core_select = 'x;
do_add = 'x;
do_sub = 'x;
do_mul = 'x;
is_signed = 'x;
case (op_type)
`FPU_ADD: core_select = 1;
`FPU_SUB: core_select = 2;
`FPU_MUL: core_select = 3;
`FPU_MADD: core_select = 4;
`FPU_MSUB: core_select = 5;
`FPU_NMSUB: begin core_select = 4; fmadd_negate = 1; end
`FPU_NMADD: begin core_select = 5; fmadd_negate = 1; end
`FPU_DIV: core_select = 6;
`FPU_SQRT: core_select = 7;
`FPU_CVTWS: core_select = 8;
`FPU_CVTWUS: core_select = 9;
`FPU_CVTSW: core_select = 10;
`FPU_CVTSWU: core_select = 11;
default:;
`FPU_ADD: begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end
`FPU_SUB: begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end
`FPU_MUL: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end
`FPU_MADD: begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end
`FPU_MSUB: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end
`FPU_NMSUB: begin core_select = 2; do_sub = 1; end
`FPU_NMADD: begin core_select = 2; do_sub = 0; end
`FPU_DIV: begin core_select = 3; end
`FPU_SQRT: begin core_select = 4; end
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
default: begin core_select = 0; end
endcase
end
@ -76,7 +79,7 @@ module VX_fp_fpga #(
.op_type (op_type),
.frm (frm),
.dataa (dataa),
.datab (datab),
.datab (datab),
.result (per_core_result[0]),
.has_fflags (fpnew_has_fflags),
.fflags (fpnew_fflags),
@ -85,44 +88,50 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[0])
);
VX_fp_add #(
VX_fp_madd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_add (
) fp_madd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 1)),
.ready_in (per_core_ready_in[1]),
.tag_in (tag_in),
.do_add (do_add),
.do_sub (do_sub),
.do_mul (do_mul),
.dataa (dataa),
.datab (datab),
.datab (datab),
.datac (datac),
.result (per_core_result[1]),
.tag_out (per_core_tag_out[1]),
.ready_out (per_core_ready_out[1]),
.valid_out (per_core_valid_out[1])
);
VX_fp_sub #(
VX_fp_nmadd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_sub (
) fp_nmadd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 2)),
.ready_in (per_core_ready_in[2]),
.tag_in (tag_in),
.tag_in (tag_in),
.do_sub (do_sub),
.dataa (dataa),
.datab (datab),
.datab (datab),
.datac (datac),
.result (per_core_result[2]),
.tag_out (per_core_tag_out[2]),
.ready_out (per_core_ready_out[2]),
.valid_out (per_core_valid_out[2])
);
VX_fp_mul #(
VX_fp_div #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_mul (
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
@ -136,75 +145,20 @@ module VX_fp_fpga #(
.valid_out (per_core_valid_out[3])
);
VX_fp_madd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_madd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.tag_in (tag_in),
.negate (fmadd_negate),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
);
VX_fp_msub #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_msub (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.tag_in (tag_in),
.negate (fmadd_negate),
.dataa (dataa),
.datab (datab),
.datac (datac),
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
);
VX_fp_div #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.tag_in (tag_in),
.dataa (dataa),
.datab (datab),
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
);
VX_fp_sqrt #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 7)),
.ready_in (per_core_ready_in[7]),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[7]),
.tag_out (per_core_tag_out[7]),
.ready_out (per_core_ready_out[7]),
.valid_out (per_core_valid_out[7])
.result (per_core_result[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
);
VX_fp_ftoi #(
@ -213,30 +167,15 @@ module VX_fp_fpga #(
) fp_ftoi (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 8)),
.ready_in (per_core_ready_in[8]),
.tag_in (tag_in),
.valid_in (valid_in && (core_select == 5)),
.ready_in (per_core_ready_in[5]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[8]),
.tag_out (per_core_tag_out[8]),
.ready_out (per_core_ready_out[8]),
.valid_out (per_core_valid_out[8])
);
VX_fp_ftou #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_ftou (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 9)),
.ready_in (per_core_ready_in[9]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[9]),
.tag_out (per_core_tag_out[9]),
.ready_out (per_core_ready_out[9]),
.valid_out (per_core_valid_out[9])
.result (per_core_result[5]),
.tag_out (per_core_tag_out[5]),
.ready_out (per_core_ready_out[5]),
.valid_out (per_core_valid_out[5])
);
VX_fp_itof #(
@ -245,60 +184,45 @@ module VX_fp_fpga #(
) fp_itof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 10)),
.ready_in (per_core_ready_in[10]),
.tag_in (tag_in),
.valid_in (valid_in && (core_select == 6)),
.ready_in (per_core_ready_in[6]),
.tag_in (tag_in),
.is_signed (is_signed),
.dataa (dataa),
.result (per_core_result[10]),
.tag_out (per_core_tag_out[10]),
.ready_out (per_core_ready_out[10]),
.valid_out (per_core_valid_out[10])
.result (per_core_result[6]),
.tag_out (per_core_tag_out[6]),
.ready_out (per_core_ready_out[6]),
.valid_out (per_core_valid_out[6])
);
VX_fp_utof #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_utof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 11)),
.ready_in (per_core_ready_in[11]),
.tag_in (tag_in),
.dataa (dataa),
.result (per_core_result[11]),
.tag_out (per_core_tag_out[11]),
.ready_out (per_core_ready_out[11]),
.valid_out (per_core_valid_out[11])
);
reg valid_out_r;
reg has_fflags_r;
reg [`NUM_THREADS-1:0][31:0] result_r;
reg [TAGW-1:0] tag_out_r;
reg valid_out_n;
reg has_fflags_n;
reg [`NUM_THREADS-1:0][31:0] result_n;
reg [TAGW-1:0] tag_out_n;
always @(*) begin
per_core_ready_out = 0;
valid_out_r = 0;
has_fflags_r = 'x;
result_r = 'x;
tag_out_r = 'x;
valid_out_n = 0;
has_fflags_n = 'x;
result_n = 'x;
tag_out_n = 'x;
for (integer i = 0; i < NUM_FPC; i++) begin
if (per_core_valid_out[i]) begin
per_core_ready_out[i] = ready_out;
valid_out_r = 1;
has_fflags_r = fpnew_has_fflags && (i == 0);
result_r = per_core_result[i];
tag_out_r = per_core_tag_out[i];
valid_out_n = 1;
has_fflags_n = fpnew_has_fflags && (i == 0);
result_n = per_core_result[i];
tag_out_n = per_core_tag_out[i];
break;
end
end
end
assign ready_in = (& per_core_ready_in);
assign valid_out = valid_out_r;
assign has_fflags = has_fflags_r;
assign tag_out = tag_out_r;
assign result = result_r;
assign valid_out = valid_out_n;
assign has_fflags = has_fflags_n;
assign tag_out = tag_out_n;
assign result = result_n;
assign fflags = fpnew_fflags;
endmodule

View file

@ -0,0 +1,77 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// Per-lane conversion stage that runs a signed and an unsigned converter in
// parallel on dataa and picks one of the two results per request.
//
// The selector (is_signed) travels through the same VX_shift_register as the
// tag and valid bit, so the choice is made with the value that accompanied the
// request when it entered the pipeline.
//
// Under QUARTUS the converters are the acl_fp_ftoi / acl_fp_ftou cores;
// otherwise the dpi_ftoi / dpi_ftou DPI routines are called on every rising
// clock edge.
module VX_fp_ftoi #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire is_signed,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline holds whenever a valid output is not being accepted.
    wire hold    = valid_out && ~ready_out;
    wire advance = ~hold;

    // Delayed copy of is_signed, driven by the shift register output below.
    reg signed_sel;

    // New requests are accepted whenever the pipeline can advance.
    assign ready_in = advance;

    VX_shift_register #(
        .DATAW (TAGW + 1 + 1),
        .DEPTH (`LATENCY_FTOI)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in,  is_signed}),
        .out    ({tag_out, valid_out, signed_sel})
    );

    for (genvar lane = 0; lane < LANES; lane++) begin

        wire [31:0] conv_signed;
        wire [31:0] conv_unsigned;

    `ifdef QUARTUS
        acl_fp_ftoi ftoi (
            .clk    (clk),
            .areset (1'b0),
            .en     (advance),
            .a      (dataa[lane]),
            .q      (conv_signed)
        );

        acl_fp_ftou ftou (
            .clk    (clk),
            .areset (1'b0),
            .en     (advance),
            .a      (dataa[lane]),
            .q      (conv_unsigned)
        );
    `else
        always @(posedge clk) begin
            dpi_ftoi(clk, advance, dataa[lane], conv_signed);
            dpi_ftou(clk, advance, dataa[lane], conv_unsigned);
        end
    `endif

        assign result[lane] = signed_sel ? conv_signed : conv_unsigned;
    end

endmodule

View file

@ -0,0 +1,77 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// Per-lane conversion stage that runs a signed (itof) and an unsigned (utof)
// converter in parallel on dataa and selects one result per request.
//
// Parameters:
//   TAGW  - width of the request tag carried alongside the data
//   LANES - number of 32-bit lanes converted in parallel
//
// Handshake:
//   ready_in is deasserted only while a valid output is not being accepted
//   (stall). The same ~stall signal enables the converters and the sideband
//   shift register, so data and sideband advance together.
//
// Under QUARTUS the converters are the acl_fp_itof / acl_fp_utof cores;
// otherwise the dpi_itof / dpi_utof DPI routines are called on every rising
// clock edge.
module VX_fp_itof #(
    parameter TAGW = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire is_signed,

    input wire [LANES-1:0][31:0] dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire ready_out,
    output wire valid_out
);
    // Hold the pipeline while the consumer is not taking a valid result.
    wire stall = ~ready_out && valid_out;

    // is_signed as it emerges from the sideband shift register.
    reg is_signed_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_s;
        wire [31:0] result_u;

    `ifdef QUARTUS
        acl_fp_itof itof (
            .clk    (clk),
            .areset (1'b0),
            .en     (~stall),
            .a      (dataa[i]),
            .q      (result_s)
        );

        acl_fp_utof utof (
            .clk    (clk),
            .areset (1'b0),
            .en     (~stall),
            .a      (dataa[i]),
            .q      (result_u)
        );
    `else
        always @(posedge clk) begin
            dpi_itof(clk, ~stall, dataa[i], result_s);
            dpi_utof(clk, ~stall, dataa[i], result_u);
        end
    `endif

        // Both conversions are always computed; pick the one that was requested.
        assign result[i] = is_signed_r ? result_s : result_u;
    end

    // Sideband pipeline for {tag, valid, is_signed}. The depth must be the
    // int-to-float latency (`LATENCY_ITOF); using `LATENCY_FTOI here would
    // raise valid_out and tag_out at the float-to-int latency instead of the
    // latency configured for this unit.
    VX_shift_register #(
        .DATAW(TAGW + 1 + 1),
        .DEPTH(`LATENCY_ITOF)
    ) shift_reg (
        .clk(clk),
        .reset(reset),
        .enable(~stall),
        .in ({tag_in, valid_in, is_signed}),
        .out({tag_out, valid_out, is_signed_r})
    );

    assign ready_in = ~stall;

endmodule

View file

@ -0,0 +1,291 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// Combined add / sub / mul / madd / msub unit.
//
// For every lane all five operations are computed in parallel and the
// requested one is selected with the do_add / do_sub / do_mul flags, which are
// carried through the same shift register as the tag and valid bit.
//
// Parameters:
//   TAGW  - width of the request tag carried alongside the data
//   LANES - number of 32-bit lanes processed in parallel
//
// Operation select (values after the sideband delay):
//   do_mul & do_add -> result_madd
//   do_mul & do_sub -> result_msub
//   do_mul only     -> result_mul
//   do_add only     -> result_add
//   do_sub only     -> result_sub
//   none set        -> 'x
//
// Under QUARTUS each operation is a twentynm_fp_mac primitive; otherwise the
// dpi_f* DPI routines are called on every rising clock edge.
module VX_fp_madd #(
    parameter TAGW = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire do_add,
    input wire do_sub,
    input wire do_mul,

    input wire [LANES-1:0][31:0] dataa,
    input wire [LANES-1:0][31:0] datab,
    input wire [LANES-1:0][31:0] datac,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire ready_out,
    output wire valid_out
);
    // Hold the pipeline while the consumer is not taking a valid result.
    wire stall = ~ready_out && valid_out;

    // Operation flags as they emerge from the sideband shift register.
    reg do_add_r, do_sub_r, do_mul_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_add;
        wire [31:0] result_sub;
        wire [31:0] result_mul;
        wire [31:0] result_madd;
        wire [31:0] result_msub;

    `ifdef QUARTUS

        // Plain addition. In "sp_add" mode the adder operands are on ax/ay,
        // so both must be driven; az is not used by this mode.
        twentynm_fp_mac mac_fp_add (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datab[i]),
            .ay(dataa[i]),
            .az(),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_add),
            .chainout()
        );
        defparam mac_fp_add.operation_mode = "sp_add";
        defparam mac_fp_add.use_chainin = "false";
        defparam mac_fp_add.adder_subtract = "false";
        defparam mac_fp_add.ax_clock = "0";
        defparam mac_fp_add.ay_clock = "0";
        defparam mac_fp_add.az_clock = "0";
        defparam mac_fp_add.output_clock = "0";
        defparam mac_fp_add.accumulate_clock = "none";
        defparam mac_fp_add.ax_chainin_pl_clock = "0";
        defparam mac_fp_add.accum_pipeline_clock = "none";
        defparam mac_fp_add.mult_pipeline_clock = "0";
        defparam mac_fp_add.adder_input_clock = "0";
        defparam mac_fp_add.accum_adder_clock = "none";

        // Plain subtraction (dataa - datab), matching dpi_fsub(dataa, datab).
        // NOTE(review): assumes the subtract mode computes ay - ax, as
        // described in Intel's floating-point DSP documentation - confirm.
        twentynm_fp_mac mac_fp_sub (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datab[i]),
            .ay(dataa[i]),
            .az(),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_sub),
            .chainout()
        );
        defparam mac_fp_sub.operation_mode = "sp_add";
        defparam mac_fp_sub.use_chainin = "false";
        defparam mac_fp_sub.adder_subtract = "true";
        defparam mac_fp_sub.ax_clock = "0";
        defparam mac_fp_sub.ay_clock = "0";
        defparam mac_fp_sub.az_clock = "none";
        defparam mac_fp_sub.output_clock = "0";
        defparam mac_fp_sub.accumulate_clock = "none";
        defparam mac_fp_sub.ax_chainin_pl_clock = "none";
        defparam mac_fp_sub.accum_pipeline_clock = "none";
        defparam mac_fp_sub.mult_pipeline_clock = "none";
        defparam mac_fp_sub.adder_input_clock = "0";
        defparam mac_fp_sub.accum_adder_clock = "none";

        // Plain multiplication: operands on ay/az, ax unused.
        twentynm_fp_mac mac_fp_mul (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_mul),
            .chainout()
        );
        defparam mac_fp_mul.operation_mode = "sp_mult";
        defparam mac_fp_mul.use_chainin = "false";
        defparam mac_fp_mul.adder_subtract = "false";
        defparam mac_fp_mul.ax_clock = "none";
        defparam mac_fp_mul.ay_clock = "0";
        defparam mac_fp_mul.az_clock = "0";
        defparam mac_fp_mul.output_clock = "0";
        defparam mac_fp_mul.accumulate_clock = "none";
        defparam mac_fp_mul.ax_chainin_pl_clock = "none";
        defparam mac_fp_mul.accum_pipeline_clock = "none";
        defparam mac_fp_mul.mult_pipeline_clock = "0";
        defparam mac_fp_mul.adder_input_clock = "none";
        defparam mac_fp_mul.accum_adder_clock = "none";

        // Multiply-add: product operands on ay/az, addend on ax.
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_madd),
            .chainout()
        );
        defparam mac_fp_madd.operation_mode = "sp_mult_add";
        defparam mac_fp_madd.use_chainin = "false";
        defparam mac_fp_madd.adder_subtract = "false";
        defparam mac_fp_madd.ax_clock = "0";
        defparam mac_fp_madd.ay_clock = "0";
        defparam mac_fp_madd.az_clock = "0";
        defparam mac_fp_madd.output_clock = "0";
        defparam mac_fp_madd.accumulate_clock = "none";
        defparam mac_fp_madd.ax_chainin_pl_clock = "0";
        defparam mac_fp_madd.accum_pipeline_clock = "none";
        defparam mac_fp_madd.mult_pipeline_clock = "0";
        defparam mac_fp_madd.adder_input_clock = "0";
        defparam mac_fp_madd.accum_adder_clock = "none";

        // Multiply-subtract: same wiring as multiply-add with adder_subtract set.
        twentynm_fp_mac mac_fp_msub (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_msub),
            .chainout()
        );
        defparam mac_fp_msub.operation_mode = "sp_mult_add";
        defparam mac_fp_msub.use_chainin = "false";
        defparam mac_fp_msub.adder_subtract = "true";
        defparam mac_fp_msub.ax_clock = "0";
        defparam mac_fp_msub.ay_clock = "0";
        defparam mac_fp_msub.az_clock = "0";
        defparam mac_fp_msub.output_clock = "0";
        defparam mac_fp_msub.accumulate_clock = "none";
        defparam mac_fp_msub.ax_chainin_pl_clock = "0";
        defparam mac_fp_msub.accum_pipeline_clock = "none";
        defparam mac_fp_msub.mult_pipeline_clock = "0";
        defparam mac_fp_msub.adder_input_clock = "0";
        defparam mac_fp_msub.accum_adder_clock = "none";
    `else
        // Simulation fallback: evaluate every operation through DPI each clock.
        always @(posedge clk) begin
            dpi_fadd(clk, ~stall, dataa[i], datab[i], result_add);
            dpi_fsub(clk, ~stall, dataa[i], datab[i], result_sub);
            dpi_fmul(clk, ~stall, dataa[i], datab[i], result_mul);
            dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd);
            dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub);
        end
    `endif

        // Select the result matching the delayed operation flags; the default
        // stays 'x when no flag is set.
        reg [31:0] result_r;

        always @(*) begin
            result_r = 'x;
            if (do_mul_r) begin
                if (do_add_r)
                    result_r = result_madd;
                else if (do_sub_r)
                    result_r = result_msub;
                else
                    result_r = result_mul;
            end else begin
                if (do_add_r)
                    result_r = result_add;
                else if (do_sub_r)
                    result_r = result_sub;
            end
        end

        assign result[i] = result_r;
    end

    // Sideband pipeline for {tag, valid, do_add, do_sub, do_mul}.
    VX_shift_register #(
        .DATAW(TAGW + 1 + 1 + 1 + 1),
        .DEPTH(`LATENCY_FMADD)
    ) shift_reg1 (
        .clk(clk),
        .reset(reset),
        .enable(~stall),
        .in({tag_in, valid_in, do_add, do_sub, do_mul}),
        .out({tag_out, valid_out, do_add_r, do_sub_r, do_mul_r})
    );

    assign ready_in = ~stall;

endmodule

View file

@ -0,0 +1,191 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
// Negated multiply-add / multiply-subtract unit.
//
// Stage 0 computes both the multiply-add and the multiply-subtract of
// (dataa, datab, datac); do_sub_r selects one of them (result_st0).
// Stage 1 feeds result_st0 through a subtract-from-zero operation to produce
// the final result.
//
// Parameters:
//   TAGW  - width of the request tag carried alongside the data
//   LANES - number of 32-bit lanes processed in parallel
//
// Under QUARTUS the stages are twentynm_fp_mac primitives; otherwise the
// dpi_f* DPI routines are called on every rising clock edge.
module VX_fp_nmadd #(
    parameter TAGW = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire do_sub,

    input wire [LANES-1:0][31:0] dataa,
    input wire [LANES-1:0][31:0] datab,
    input wire [LANES-1:0][31:0] datac,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire ready_out,
    output wire valid_out
);
    // Hold the pipeline while the consumer is not taking a valid result.
    // Every stage below is enabled by ~stall so they all advance together.
    wire stall = ~ready_out && valid_out;

    // do_sub delayed by one enabled clock, used to pick the stage-0 result.
    // NOTE(review): a single register stage presumes stage 0 takes exactly
    // one cycle - confirm if LATENCY_FMADD is ever overridden.
    reg do_sub_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_madd;
        wire [31:0] result_msub;

        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;

    `ifdef QUARTUS
        // Stage 0a: multiply-add.
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_madd),
            .chainout()
        );
        defparam mac_fp_madd.operation_mode = "sp_mult_add";
        defparam mac_fp_madd.use_chainin = "false";
        defparam mac_fp_madd.adder_subtract = "false";
        defparam mac_fp_madd.ax_clock = "0";
        defparam mac_fp_madd.ay_clock = "0";
        defparam mac_fp_madd.az_clock = "0";
        defparam mac_fp_madd.output_clock = "0";
        defparam mac_fp_madd.accumulate_clock = "none";
        defparam mac_fp_madd.ax_chainin_pl_clock = "0";
        defparam mac_fp_madd.accum_pipeline_clock = "none";
        defparam mac_fp_madd.mult_pipeline_clock = "0";
        defparam mac_fp_madd.adder_input_clock = "0";
        defparam mac_fp_madd.accum_adder_clock = "none";

        // Stage 0b: multiply-subtract.
        twentynm_fp_mac mac_fp_msub (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_msub),
            .chainout()
        );
        defparam mac_fp_msub.operation_mode = "sp_mult_add";
        defparam mac_fp_msub.use_chainin = "false";
        defparam mac_fp_msub.adder_subtract = "true";
        defparam mac_fp_msub.ax_clock = "0";
        defparam mac_fp_msub.ay_clock = "0";
        defparam mac_fp_msub.az_clock = "0";
        defparam mac_fp_msub.output_clock = "0";
        defparam mac_fp_msub.accumulate_clock = "none";
        defparam mac_fp_msub.ax_chainin_pl_clock = "0";
        defparam mac_fp_msub.accum_pipeline_clock = "none";
        defparam mac_fp_msub.mult_pipeline_clock = "0";
        defparam mac_fp_msub.adder_input_clock = "0";
        defparam mac_fp_msub.accum_adder_clock = "none";

        // Stage 1: subtract involving zero and the selected stage-0 result.
        // NOTE(review): the DPI fallback computes fsub(0, result_st0). Confirm
        // against the Intel DSP documentation whether this primitive computes
        // ax - ay or ay - ax; if it is ay - ax the two operands here need to be
        // swapped for the hardware path to negate.
        twentynm_fp_mac mac_fp_neg (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(32'h0),
            .ay(result_st0),
            .az(),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result[i]),
            .chainout()
        );
        defparam mac_fp_neg.operation_mode = "sp_add";
        defparam mac_fp_neg.use_chainin = "false";
        defparam mac_fp_neg.adder_subtract = "true";
        defparam mac_fp_neg.ax_clock = "0";
        defparam mac_fp_neg.ay_clock = "0";
        defparam mac_fp_neg.az_clock = "none";
        defparam mac_fp_neg.output_clock = "0";
        defparam mac_fp_neg.accumulate_clock = "none";
        defparam mac_fp_neg.ax_chainin_pl_clock = "none";
        defparam mac_fp_neg.accum_pipeline_clock = "none";
        defparam mac_fp_neg.mult_pipeline_clock = "none";
        defparam mac_fp_neg.adder_input_clock = "0";
        defparam mac_fp_neg.accum_adder_clock = "none";
    `else
        // Simulation fallback: both stage-0 candidates plus the final
        // subtract-from-zero are evaluated through DPI each clock.
        always @(posedge clk) begin
            dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd);
            dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub);
            dpi_fsub(clk, ~stall, 32'b0, result_st0, result[i]);
        end
    `endif
    end

    // Capture do_sub only when the pipeline advances.
    always @(posedge clk) begin
        if (~stall) begin
            do_sub_r <= do_sub;
        end
    end

    // Sideband pipeline for {tag, valid}.
    VX_shift_register #(
        .DATAW(TAGW + 1),
        .DEPTH(`LATENCY_FNMADD)
    ) shift_reg1 (
        .clk(clk),
        .reset(reset),
        .enable(~stall),
        .in({tag_in, valid_in}),
        .out({tag_out, valid_out})
    );

    assign ready_in = ~stall;

endmodule

View file

@ -45,8 +45,8 @@ module VX_fp_noncomp #(
reg [LANES-1:0][31:0] datab_r;
reg [LANES-1:0] a_sign, b_sign;
reg [LANES-1:0][7:0] a_exponent, b_exponent;
reg [LANES-1:0][22:0] a_mantissa, b_mantissa;
reg [LANES-1:0][7:0] a_exponent;
reg [LANES-1:0][22:0] a_mantissa;
fp_type_t [LANES-1:0] a_type, b_type;
reg [LANES-1:0] a_smaller, ab_equal;
@ -60,12 +60,12 @@ module VX_fp_noncomp #(
// Setup
for (genvar i = 0; i < LANES; i++) begin
wire tmp_a_sign = dataa[i][31];
wire [7:0] tmp_a_exponent = dataa[i][30:23];
wire tmp_a_sign = dataa[i][31];
wire [7:0] tmp_a_exponent = dataa[i][30:23];
wire [22:0] tmp_a_mantissa = dataa[i][22:0];
wire tmp_b_sign = datab[i][31];
wire [7:0] tmp_b_exponent = datab[i][30:23];
wire tmp_b_sign = datab[i][31];
wire [7:0] tmp_b_exponent = datab[i][30:23];
wire [22:0] tmp_b_mantissa = datab[i][22:0];
fp_type_t tmp_a_type, tmp_b_type;
@ -86,14 +86,14 @@ module VX_fp_noncomp #(
wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]);
VX_generic_register #(
.N(1 + 1 + 8 + 8 + 23 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1)
.N(1 + 1 + 8 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1)
) fnc1_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (1'b0),
.in ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_b_exponent, tmp_a_mantissa, tmp_b_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
.out ({a_sign[i], b_sign[i], a_exponent[i], b_exponent[i], a_mantissa[i], b_mantissa[i], a_type[i], b_type[i], a_smaller[i], ab_equal[i]})
.in ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
.out ({a_sign[i], b_sign[i], a_exponent[i], a_mantissa[i], a_type[i], b_type[i], a_smaller[i], ab_equal[i]})
);
end
@ -213,8 +213,6 @@ module VX_fp_noncomp #(
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
tmp_result[i] = 32'hdeadbeaf;
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
case (op_type_r)
`FPU_CLASS: begin
tmp_result[i] = fclass_mask[i];
@ -224,7 +222,8 @@ module VX_fp_noncomp #(
tmp_result[i] = fcmp_res[i];
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = fcmp_excp[i];
end
`FPU_MISC: begin
//`FPU_MISC:
default: begin
case (frm)
0,1,2: begin
tmp_result[i] = fsgnj_res[i];
@ -234,7 +233,8 @@ module VX_fp_noncomp #(
tmp_result[i] = fminmax_res[i];
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = {a_type[i][0] | b_type[i][0], 4'h0};
end
5,6: begin
//5,6,7:
default: begin
tmp_result[i] = dataa[i];
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
end

View file

@ -1,5 +1,9 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
module VX_fp_sqrt #(
parameter TAGW = 1,
parameter LANES = 1
@ -20,18 +24,22 @@ module VX_fp_sqrt #(
input wire ready_out,
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
assign ready_in = enable;
wire stall = ~ready_out && valid_out;
for (genvar i = 0; i < LANES; i++) begin
`ifdef QUARTUS
acl_fp_sqrt fsqrt (
.clk (clk),
.areset (1'b0),
.en (enable),
.en (~stall),
.a (dataa[i]),
.q (result[i])
);
`else
always @(posedge clk) begin
dpi_fsqrt(clk, ~stall, dataa[i], result[i]);
end
`endif
end
VX_shift_register #(
@ -40,9 +48,11 @@ module VX_fp_sqrt #(
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.enable(~stall),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
assign ready_in = ~stall;
endmodule

View file

@ -8,12 +8,20 @@ module VX_fp_type (
// outputs
output fp_type_t o_type
);
assign o_type.is_normal = (exponent != 8'd0) && (exponent != 8'hff);
assign o_type.is_zero = (exponent == 8'd0) && (mantissa == 23'd0);
assign o_type.is_subnormal = (exponent == 8'd0) && !o_type.is_zero;
assign o_type.is_inf = ((exponent == 8'hff) && (mantissa == 23'd0));
assign o_type.is_nan = ((exponent == 8'hff) && (mantissa != 23'd0));
assign o_type.is_signaling = o_type.is_nan && (mantissa[22] == 1'b0);
assign o_type.is_quiet = o_type.is_nan && !o_type.is_signaling;
wire is_normal = (exponent != 8'd0) && (exponent != 8'hff);
wire is_zero = (exponent == 8'd0) && (mantissa == 23'd0);
wire is_subnormal = (exponent == 8'd0) && !is_zero;
wire is_inf = (exponent == 8'hff) && (mantissa == 23'd0);
wire is_nan = (exponent == 8'hff) && (mantissa != 23'd0);
wire is_signaling = is_nan && (mantissa[22] == 1'b0);
wire is_quiet = is_nan && !is_signaling;
assign o_type.is_normal = is_normal;
assign o_type.is_zero = is_zero;
assign o_type.is_subnormal = is_subnormal;
assign o_type.is_inf = is_inf;
assign o_type.is_nan = is_nan;
assign o_type.is_signaling = is_signaling;
assign o_type.is_quiet = is_quiet;
endmodule

View file

@ -53,10 +53,10 @@ module VX_fpnew #(
};
localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL
'{default: `LATENCY_FDIVSQRT}, // DIVSQRT
'{default: `LATENCY_FNONCOMP}, // NONCOMP
'{default: `LATENCY_FCONV}}, // CONV
PipeRegs:'{'{`LATENCY_FMADD, 0, 0, 0, 0}, // ADDMUL
'{default: `LATENCY_FDIV}, // DIVSQRT
'{default: `LATENCY_FNONCOMP}, // NONCOMP
'{default: `LATENCY_ITOF}}, // CONV
UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL
'{default: UNIT_FDIVSQRT}, // DIVSQRT
'{default: UNIT_FNONCOMP}, // NONCOMP

View file

@ -1,81 +0,0 @@
`include "VX_define.vh"
// Per-lane adder built on the twentynm_fp_mac primitive in "sp_add" mode.
// The {tag, valid} sideband goes through a one-deep VX_shift_register that
// shares the same enable as the primitive.
//   TAGW  - width of the request tag
//   LANES - number of 32-bit lanes
module VX_fp_add #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [TAGW-1:0] tag_in,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
// Stall while a valid output is not being accepted; everything else runs on enable.
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
assign ready_in = enable;
// One primitive instance per lane.
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
// the two operands are supplied on ax and ay; az is left unconnected
.ax(dataa[i]),
.ay(datab[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result[i]),
.chainout()
);
// Primitive configuration: add mode, no subtraction, no chain input.
defparam mac_fp_wys.operation_mode = "sp_add";
defparam mac_fp_wys.use_chainin = "false";
defparam mac_fp_wys.adder_subtract = "false";
defparam mac_fp_wys.ax_clock = "0";
defparam mac_fp_wys.ay_clock = "0";
defparam mac_fp_wys.az_clock = "none";
defparam mac_fp_wys.output_clock = "0";
defparam mac_fp_wys.accumulate_clock = "none";
defparam mac_fp_wys.ax_chainin_pl_clock = "none";
defparam mac_fp_wys.accum_pipeline_clock = "none";
defparam mac_fp_wys.mult_pipeline_clock = "none";
defparam mac_fp_wys.adder_input_clock = "0";
defparam mac_fp_wys.accum_adder_clock = "none";
end
// Sideband pipeline for {tag, valid}, depth 1.
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
endmodule

View file

@ -1,48 +0,0 @@
`include "VX_define.vh"
// Float to signed-integer conversion unit.
// One acl_fp_ftoi core is instantiated per lane; the request tag and valid bit
// ride along in a VX_shift_register of depth `LATENCY_FTOI that is gated by the
// same enable as the cores.
module VX_fp_ftoi #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input  wire clk,
    input  wire reset,

    output wire ready_in,
    input  wire valid_in,

    input  wire [TAGW-1:0] tag_in,

    input  wire [LANES-1:0][31:0] dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input  wire ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid result is waiting on a consumer that
    // is not ready to take it.
    wire pipe_advance = ready_out || ~valid_out;

    assign ready_in = pipe_advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_ftoi ftoi (
            .clk    (clk),
            .areset (1'b0),
            .en     (pipe_advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_FTOI)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (pipe_advance),
        .in     ({tag_in, valid_in}),
        .out    ({tag_out, valid_out})
    );
endmodule

View file

@ -1,48 +0,0 @@
`include "VX_define.vh"
// Float to unsigned-integer conversion unit.
// One acl_fp_ftou core is instantiated per lane; the request tag and valid bit
// ride along in a VX_shift_register of depth `LATENCY_FTOU that is gated by the
// same enable as the cores.
module VX_fp_ftou #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input  wire clk,
    input  wire reset,

    output wire ready_in,
    input  wire valid_in,

    input  wire [TAGW-1:0] tag_in,

    input  wire [LANES-1:0][31:0] dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input  wire ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid result is waiting on a consumer that
    // is not ready to take it.
    wire pipe_advance = ready_out || ~valid_out;

    assign ready_in = pipe_advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_ftou ftou (
            .clk    (clk),
            .areset (1'b0),
            .en     (pipe_advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_FTOU)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (pipe_advance),
        .in     ({tag_in, valid_in}),
        .out    ({tag_out, valid_out})
    );
endmodule

View file

@ -1,48 +0,0 @@
`include "VX_define.vh"
// Signed-integer to float conversion unit.
// One acl_fp_itof core is instantiated per lane; the request tag and valid bit
// ride along in a VX_shift_register of depth `LATENCY_ITOF that is gated by the
// same enable as the cores.
module VX_fp_itof #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input  wire clk,
    input  wire reset,

    output wire ready_in,
    input  wire valid_in,

    input  wire [TAGW-1:0] tag_in,

    input  wire [LANES-1:0][31:0] dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input  wire ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid result is waiting on a consumer that
    // is not ready to take it.
    wire pipe_advance = ready_out || ~valid_out;

    assign ready_in = pipe_advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_itof itof (
            .clk    (clk),
            .areset (1'b0),
            .en     (pipe_advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_ITOF)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (pipe_advance),
        .in     ({tag_in, valid_in}),
        .out    ({tag_out, valid_out})
    );
endmodule

View file

@ -1,146 +0,0 @@
`include "VX_define.vh"
// Per-lane single-precision multiply-add with an optional negate stage.
// Stage 0: twentynm_fp_mac in "sp_mult_add" mode (adder_subtract = "false")
//          fed with ax = datac, ay = datab, az = dataa.
// Stage 1: twentynm_fp_mac in "sp_add" mode with adder_subtract = "true",
//          ax = 0 and ay = stage-0 result; only requests issued with
//          negate = 1 are marked valid at this stage.
//          NOTE(review): presumably this computes the negated stage-0 result -
//          confirm operand order against the primitive documentation.
// Non-negated requests complete after stage 0; negated ones after stage 1.
// The output mux gives priority to stage 0 when both stages hold a valid result.
//   TAGW  - width of the tag carried alongside each request
//   LANES - number of parallel 32-bit lanes
module VX_fp_madd #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [TAGW-1:0] tag_in,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
output wire [LANES-1:0][31:0] result,
input wire negate,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
// enable0 gates stage 0, enable1 gates stage 1 (both assigned at the bottom).
wire enable0, enable1;
// New requests are accepted only when both stages can advance.
assign ready_in = enable0 && enable1;
wire [LANES-1:0][31:0] result_st0, result_st1;
wire [TAGW-1:0] out_tag_st0, out_tag_st1;
// out_valid_st0: stage-0 result is final (request had negate = 0).
// in_valid_st0 : stage-0 result must continue into stage 1 (negate = 1).
// out_valid_st1: stage-1 result is final.
wire in_valid_st0, out_valid_st0, out_valid_st1;
for (genvar i = 0; i < LANES; i++) begin
// Stage 0: multiply-add.
twentynm_fp_mac mac_fp_wys0 (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable0}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_st0[i]),
.chainout()
);
defparam mac_fp_wys0.operation_mode = "sp_mult_add";
defparam mac_fp_wys0.use_chainin = "false";
defparam mac_fp_wys0.adder_subtract = "false";
defparam mac_fp_wys0.ax_clock = "0";
defparam mac_fp_wys0.ay_clock = "0";
defparam mac_fp_wys0.az_clock = "0";
defparam mac_fp_wys0.output_clock = "0";
defparam mac_fp_wys0.accumulate_clock = "none";
defparam mac_fp_wys0.ax_chainin_pl_clock = "0";
defparam mac_fp_wys0.accum_pipeline_clock = "none";
defparam mac_fp_wys0.mult_pipeline_clock = "0";
defparam mac_fp_wys0.adder_input_clock = "0";
defparam mac_fp_wys0.accum_adder_clock = "none";
// Stage 1: add/subtract of constant zero and the stage-0 result.
twentynm_fp_mac mac_fp_wys1 (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(32'h0),
.ay(result_st0[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable1}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_st1[i]),
.chainout()
);
defparam mac_fp_wys1.operation_mode = "sp_add";
defparam mac_fp_wys1.use_chainin = "false";
defparam mac_fp_wys1.adder_subtract = "true";
defparam mac_fp_wys1.ax_clock = "0";
defparam mac_fp_wys1.ay_clock = "0";
defparam mac_fp_wys1.az_clock = "none";
defparam mac_fp_wys1.output_clock = "0";
defparam mac_fp_wys1.accumulate_clock = "none";
defparam mac_fp_wys1.ax_chainin_pl_clock = "none";
defparam mac_fp_wys1.accum_pipeline_clock = "none";
defparam mac_fp_wys1.mult_pipeline_clock = "none";
defparam mac_fp_wys1.adder_input_clock = "0";
defparam mac_fp_wys1.accum_adder_clock = "none";
end
// Stage-0 side pipeline: the tag plus two mutually exclusive valid bits that
// steer the request to the output (negate = 0) or into stage 1 (negate = 1).
VX_shift_register #(
.DATAW(TAGW + 1 + 1),
.DEPTH(1)
) shift_reg0 (
.clk(clk),
.reset(reset),
.enable(enable0),
.in ({tag_in, (valid_in && ~negate), (valid_in && negate)}),
.out({out_tag_st0, out_valid_st0, in_valid_st0})
);
// Stage-1 side pipeline: carries the tag of requests that needed the negate stage.
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg1 (
.clk(clk),
.reset(reset),
.enable(enable1),
.in({out_tag_st0, in_valid_st0}),
.out({out_tag_st1, out_valid_st1})
);
wire out_stall = ~ready_out && valid_out;
assign enable0 = ~out_stall;
// NOTE(review): while both stages hold a valid result, ready_in drops but
// enable0 stays high, so shift_reg0 still samples valid_in - confirm the
// producer never presents valid_in while ready_in is low.
assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs
// Stage 0 wins the output port when both stages are valid.
assign result = out_valid_st0 ? result_st0 : result_st1;
assign tag_out = out_valid_st0 ? out_tag_st0 : out_tag_st1;
assign valid_out = out_valid_st0 || out_valid_st1;
endmodule

View file

@ -1,146 +0,0 @@
`include "VX_define.vh"
// Per-lane single-precision multiply-subtract with an optional negate stage.
// Stage 0: twentynm_fp_mac in "sp_mult_add" mode with adder_subtract = "true",
//          fed with ax = datac, ay = datab, az = dataa.
// Stage 1: twentynm_fp_mac in "sp_add" mode with adder_subtract = "true",
//          ax = 0 and ay = stage-0 result; only requests issued with
//          negate = 1 are marked valid at this stage.
//          NOTE(review): presumably this computes the negated stage-0 result -
//          confirm operand order against the primitive documentation.
// Non-negated requests complete after stage 0; negated ones after stage 1.
// The output mux gives priority to stage 0 when both stages hold a valid result.
//   TAGW  - width of the tag carried alongside each request
//   LANES - number of parallel 32-bit lanes
module VX_fp_msub #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [TAGW-1:0] tag_in,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
output wire [LANES-1:0][31:0] result,
input wire negate,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
// enable0 gates stage 0, enable1 gates stage 1 (both assigned at the bottom).
wire enable0, enable1;
// New requests are accepted only when both stages can advance.
assign ready_in = enable0 && enable1;
wire [LANES-1:0][31:0] result_st0, result_st1;
wire [TAGW-1:0] out_tag_st0, out_tag_st1;
// out_valid_st0: stage-0 result is final (request had negate = 0).
// in_valid_st0 : stage-0 result must continue into stage 1 (negate = 1).
// out_valid_st1: stage-1 result is final.
wire in_valid_st0, out_valid_st0, out_valid_st1;
for (genvar i = 0; i < LANES; i++) begin
// Stage 0: multiply-subtract.
twentynm_fp_mac mac_fp_wys0 (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(datac[i]),
.ay(datab[i]),
.az(dataa[i]),
.clk({2'b00,clk}),
.ena({2'b11,enable0}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_st0[i]),
.chainout()
);
defparam mac_fp_wys0.operation_mode = "sp_mult_add";
defparam mac_fp_wys0.use_chainin = "false";
defparam mac_fp_wys0.adder_subtract = "true";
defparam mac_fp_wys0.ax_clock = "0";
defparam mac_fp_wys0.ay_clock = "0";
defparam mac_fp_wys0.az_clock = "0";
defparam mac_fp_wys0.output_clock = "0";
defparam mac_fp_wys0.accumulate_clock = "none";
defparam mac_fp_wys0.ax_chainin_pl_clock = "0";
defparam mac_fp_wys0.accum_pipeline_clock = "none";
defparam mac_fp_wys0.mult_pipeline_clock = "0";
defparam mac_fp_wys0.adder_input_clock = "0";
defparam mac_fp_wys0.accum_adder_clock = "none";
// Stage 1: add/subtract of constant zero and the stage-0 result.
twentynm_fp_mac mac_fp_wys1 (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(32'h0),
.ay(result_st0[i]),
.az(),
.clk({2'b00,clk}),
.ena({2'b11,enable1}),
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result_st1[i]),
.chainout()
);
defparam mac_fp_wys1.operation_mode = "sp_add";
defparam mac_fp_wys1.use_chainin = "false";
defparam mac_fp_wys1.adder_subtract = "true";
defparam mac_fp_wys1.ax_clock = "0";
defparam mac_fp_wys1.ay_clock = "0";
defparam mac_fp_wys1.az_clock = "none";
defparam mac_fp_wys1.output_clock = "0";
defparam mac_fp_wys1.accumulate_clock = "none";
defparam mac_fp_wys1.ax_chainin_pl_clock = "none";
defparam mac_fp_wys1.accum_pipeline_clock = "none";
defparam mac_fp_wys1.mult_pipeline_clock = "none";
defparam mac_fp_wys1.adder_input_clock = "0";
defparam mac_fp_wys1.accum_adder_clock = "none";
end
// Stage-0 side pipeline: the tag plus two mutually exclusive valid bits that
// steer the request to the output (negate = 0) or into stage 1 (negate = 1).
VX_shift_register #(
.DATAW(TAGW + 1 + 1),
.DEPTH(1)
) shift_reg0 (
.clk(clk),
.reset(reset),
.enable(enable0),
.in ({tag_in, (valid_in && ~negate), (valid_in && negate)}),
.out({out_tag_st0, out_valid_st0, in_valid_st0})
);
// Stage-1 side pipeline: carries the tag of requests that needed the negate stage.
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg1 (
.clk(clk),
.reset(reset),
.enable(enable1),
.in({out_tag_st0, in_valid_st0}),
.out({out_tag_st1, out_valid_st1})
);
wire out_stall = ~ready_out && valid_out;
assign enable0 = ~out_stall;
// NOTE(review): while both stages hold a valid result, ready_in drops but
// enable0 stays high, so shift_reg0 still samples valid_in - confirm the
// producer never presents valid_in while ready_in is low.
assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1); // stall the negate stage if dual outputs
// Stage 0 wins the output port when both stages are valid.
assign result = out_valid_st0 ? result_st0 : result_st1;
assign tag_out = out_valid_st0 ? out_tag_st0 : out_tag_st1;
assign valid_out = out_valid_st0 || out_valid_st1;
endmodule

View file

@ -1,81 +0,0 @@
`include "VX_define.vh"
// Per-lane single-precision multiplier.
// Each lane instantiates one twentynm_fp_mac primitive configured in "sp_mult"
// mode, fed from datab (ay) and dataa (az); the ax input is left unconnected.
// The request tag and valid bit are delayed in a 1-deep VX_shift_register that
// shares the same enable as the primitives.
//   TAGW  - width of the tag carried alongside each request
//   LANES - number of parallel 32-bit lanes
module VX_fp_mul #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [TAGW-1:0] tag_in,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
// Stall only when a valid result is pending and the consumer is not ready.
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
// Back-pressure is passed straight through to the producer.
assign ready_in = enable;
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(),
.ay(datab[i]),
.az(dataa[i]),
// Only bit 0 of clk/ena carries a real clock/enable; the upper bits are tied off.
.clk({2'b00,clk}),
.ena({2'b11,enable}),
// The module-level reset is not routed to the primitive.
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result[i]),
.chainout()
);
// Primitive configuration: registers marked "0" use clock 0, "none" are unused.
defparam mac_fp_wys.operation_mode = "sp_mult";
defparam mac_fp_wys.use_chainin = "false";
defparam mac_fp_wys.adder_subtract = "false";
defparam mac_fp_wys.ax_clock = "none";
defparam mac_fp_wys.ay_clock = "0";
defparam mac_fp_wys.az_clock = "0";
defparam mac_fp_wys.output_clock = "0";
defparam mac_fp_wys.accumulate_clock = "none";
defparam mac_fp_wys.ax_chainin_pl_clock = "none";
defparam mac_fp_wys.accum_pipeline_clock = "none";
defparam mac_fp_wys.mult_pipeline_clock = "0";
defparam mac_fp_wys.adder_input_clock = "none";
defparam mac_fp_wys.accum_adder_clock = "none";
end
// Tag/valid side pipeline.
// NOTE(review): DEPTH(1) presumably has to equal the primitive's latency for
// the clocking chosen above - confirm against the twentynm_fp_mac documentation.
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
endmodule

View file

@ -1,81 +0,0 @@
`include "VX_define.vh"
// Per-lane single-precision subtractor.
// Each lane instantiates one twentynm_fp_mac primitive configured in "sp_add"
// mode with adder_subtract = "true", fed from dataa (ax) and datab (ay).
// The request tag and valid bit are delayed in a 1-deep VX_shift_register that
// shares the same enable as the primitives.
//   TAGW  - width of the tag carried alongside each request
//   LANES - number of parallel 32-bit lanes
module VX_fp_sub #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [TAGW-1:0] tag_in,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
// Stall only when a valid result is pending and the consumer is not ready.
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
// Back-pressure is passed straight through to the producer.
assign ready_in = enable;
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys (
// inputs
.accumulate(),
.chainin_overflow(),
.chainin_invalid(),
.chainin_underflow(),
.chainin_inexact(),
.ax(dataa[i]),
.ay(datab[i]),
.az(),
// Only bit 0 of clk/ena carries a real clock/enable; the upper bits are tied off.
.clk({2'b00,clk}),
.ena({2'b11,enable}),
// The module-level reset is not routed to the primitive.
.aclr(2'b00),
.chainin(),
// outputs
.overflow(),
.invalid(),
.underflow(),
.inexact(),
.chainout_overflow(),
.chainout_invalid(),
.chainout_underflow(),
.chainout_inexact(),
.resulta(result[i]),
.chainout()
);
// Primitive configuration: registers marked "0" use clock 0, "none" are unused.
defparam mac_fp_wys.operation_mode = "sp_add";
defparam mac_fp_wys.use_chainin = "false";
defparam mac_fp_wys.adder_subtract = "true";
defparam mac_fp_wys.ax_clock = "0";
defparam mac_fp_wys.ay_clock = "0";
defparam mac_fp_wys.az_clock = "none";
defparam mac_fp_wys.output_clock = "0";
defparam mac_fp_wys.accumulate_clock = "none";
defparam mac_fp_wys.ax_chainin_pl_clock = "none";
defparam mac_fp_wys.accum_pipeline_clock = "none";
defparam mac_fp_wys.mult_pipeline_clock = "none";
defparam mac_fp_wys.adder_input_clock = "0";
defparam mac_fp_wys.accum_adder_clock = "none";
end
// Tag/valid side pipeline.
// NOTE(review): DEPTH(1) presumably has to equal the primitive's latency for
// the clocking chosen above - confirm against the twentynm_fp_mac documentation.
VX_shift_register #(
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg (
.clk(clk),
.reset(reset),
.enable(enable),
.in ({tag_in, valid_in}),
.out({tag_out, valid_out})
);
endmodule

View file

@ -1,48 +0,0 @@
`include "VX_define.vh"
// Unsigned-integer to float conversion unit.
// One acl_fp_utof core is instantiated per lane; the request tag and valid bit
// ride along in a VX_shift_register of depth `LATENCY_UTOF that is gated by the
// same enable as the cores.
module VX_fp_utof #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input  wire clk,
    input  wire reset,

    output wire ready_in,
    input  wire valid_in,

    input  wire [TAGW-1:0] tag_in,

    input  wire [LANES-1:0][31:0] dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input  wire ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid result is waiting on a consumer that
    // is not ready to take it.
    wire pipe_advance = ready_out || ~valid_out;

    assign ready_in = pipe_advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_utof utof (
            .clk    (clk),
            .areset (1'b0),
            .en     (pipe_advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_UTOF)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (pipe_advance),
        .in     ({tag_in, valid_in}),
        .out    ({tag_out, valid_out})
    );
endmodule

View file

@ -0,0 +1,95 @@
// Legal Notice: Copyright 2017 Intel Corporation. All rights reserved.
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing device programming or simulation files), and
// any associated documentation or information are expressly subject to the
// terms and conditions of the Intel FPGA Software License Agreement,
// Intel MegaCore Function License Agreement, or other applicable license
// agreement, including, without limitation, that your use is for the sole
// purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.
// Parameterised delay line: xout is xin delayed by `depth` enabled clock
// cycles, or xin itself when depth is 0.
//   width      - bit width of xin/xout
//   depth      - number of register stages
//   reset_high - XORed with aclr to form the internal reset, which is tested
//                active-low below (with the default 1'b1, aclr = 1 clears)
//   reset_kind - "ASYNC", "SYNC" or "NONE"; selects which always block is generated
module dspba_delay_ver
#(
parameter width = 8,
parameter depth = 1,
parameter reset_high = 1'b1,
parameter reset_kind = "ASYNC"
) (
input clk,
input aclr,
input ena,
input [width-1:0] xin,
output [width-1:0] xout
);
wire reset;
// One register per stage; stage 0 samples xin, stage i samples stage i-1.
reg [width-1:0] delays [depth-1:0];
// Internal reset is active-low after the polarity XOR.
assign reset = aclr ^ reset_high;
generate
if (depth > 0)
begin
genvar i;
for (i = 0; i < depth; ++i)
begin : delay_block
// NOTE(review): the block labels below look swapped - the "ASYNC" branch is
// labelled sync_reset and the "SYNC" branch async_reset. They are left as-is
// because renaming changes hierarchical paths.
if (reset_kind == "ASYNC")
begin : sync_reset
// Asynchronous clear: the reset edge is in the sensitivity list.
always @ (posedge clk or negedge reset)
begin: a
if (!reset) begin
delays[i] <= 0;
end else begin
if (ena) begin
if (i > 0) begin
delays[i] <= delays[i - 1];
end else begin
delays[i] <= xin;
end
end
end
end
end
if (reset_kind == "SYNC")
begin : async_reset
// Synchronous clear: reset is sampled on the clock edge only.
always @ (posedge clk)
begin: a
if (!reset) begin
delays[i] <= 0;
end else begin
if (ena) begin
if (i > 0) begin
delays[i] <= delays[i - 1];
end else begin
delays[i] <= xin;
end
end
end
end
end
if (reset_kind == "NONE")
begin : no_reset
// No reset: the stage only ever shifts when ena is high.
always @ (posedge clk)
begin: a
if (ena) begin
if (i > 0) begin
delays[i] <= delays[i - 1];
end else begin
delays[i] <= xin;
end
end
end
end
end
// Output is the last stage of the chain.
assign xout = delays[depth - 1];
end else begin
// depth == 0: plain wire.
assign xout = xin;
end
endgenerate
endmodule

View file

@ -1,392 +0,0 @@
// Legal Notice: Copyright 2017 Intel Corporation. All rights reserved.
// Your use of Intel Corporation's design tools, logic functions and other
// software and tools, and its AMPP partner logic functions, and any output
// files any of the foregoing device programming or simulation files), and
// any associated documentation or information are expressly subject to the
// terms and conditions of the Intel FPGA Software License Agreement,
// Intel MegaCore Function License Agreement, or other applicable license
// agreement, including, without limitation, that your use is for the sole
// purpose of programming logic devices manufactured by Intel and sold by
// Intel or its authorized distributors. Please refer to the applicable
// agreement for further details.
// Parameterised delay line: xout is xin delayed by `depth` enabled clock
// cycles, or xin itself when depth is 0.
//   width      - bit width of xin/xout
//   depth      - number of register stages
//   reset_high - XORed with aclr to form the internal reset, which is tested
//                active-low below (with the default 1'b1, aclr = 1 clears)
//   reset_kind - "ASYNC", "SYNC" or "NONE"; selects which always block is generated
module dspba_delay_ver
#(
parameter width = 8,
parameter depth = 1,
parameter reset_high = 1'b1,
parameter reset_kind = "ASYNC"
) (
input clk,
input aclr,
input ena,
input [width-1:0] xin,
output [width-1:0] xout
);
wire reset;
// One register per stage; stage 0 samples xin, stage i samples stage i-1.
reg [width-1:0] delays [depth-1:0];
// Internal reset is active-low after the polarity XOR.
assign reset = aclr ^ reset_high;
generate
if (depth > 0)
begin
genvar i;
for (i = 0; i < depth; ++i)
begin : delay_block
// NOTE(review): the block labels below look swapped - the "ASYNC" branch is
// labelled sync_reset and the "SYNC" branch async_reset. They are left as-is
// because renaming changes hierarchical paths.
if (reset_kind == "ASYNC")
begin : sync_reset
// Asynchronous clear: the reset edge is in the sensitivity list.
always @ (posedge clk or negedge reset)
begin: a
if (!reset) begin
delays[i] <= 0;
end else begin
if (ena) begin
if (i > 0) begin
delays[i] <= delays[i - 1];
end else begin
delays[i] <= xin;
end
end
end
end
end
if (reset_kind == "SYNC")
begin : async_reset
// Synchronous clear: reset is sampled on the clock edge only.
always @ (posedge clk)
begin: a
if (!reset) begin
delays[i] <= 0;
end else begin
if (ena) begin
if (i > 0) begin
delays[i] <= delays[i - 1];
end else begin
delays[i] <= xin;
end
end
end
end
end
if (reset_kind == "NONE")
begin : no_reset
// No reset: the stage only ever shifts when ena is high.
always @ (posedge clk)
begin: a
if (ena) begin
if (i > 0) begin
delays[i] <= delays[i - 1];
end else begin
delays[i] <= xin;
end
end
end
end
end
// Output is the last stage of the chain.
assign xout = delays[depth - 1];
end else begin
// depth == 0: plain wire.
assign xout = xin;
end
endgenerate
endmodule
//------------------------------------------------------------------------------
// Transfers a data word from the clk1 domain to the clk2 domain.
// xin is captured into iclk_data on clk1 when ena[0] is high; a (possibly
// stretched) enable pulse is passed through a `depth`-stage register chain
// clocked by clk2, and its last stage gates the copy of iclk_data into
// oclk_data.
//   width1 / width2  - widths of the clk1-side and clk2-side data
//   depth            - number of synchroniser stages on clk2
//   pulse_multiplier - when > 1, the enable is held high while an internal
//                      counter runs from 1 to pulse_multiplier - 1
//   counter_width    - width of that counter
//   init_value       - value loaded into both data registers on reset
//   reset1_high / reset2_high - XORed with aclr1 / aclr2; the resulting
//                      internal resets are tested active-low
//   reset_kind       - "ASYNC", "SYNC" or "NONE"
// Outputs: xout mirrors iclk_data (clk1 domain), sxout mirrors oclk_data (clk2 domain).
module dspba_sync_reg_ver
#(
parameter width1 = 8,
parameter width2 = 8,
parameter depth = 2,
parameter pulse_multiplier = 1,
parameter counter_width = 8,
parameter init_value = 0,
parameter reset1_high = 1'b1,
parameter reset2_high = 1'b1,
parameter reset_kind = "ASYNC"
) (
input clk1,
input aclr1,
input [0 : 0] ena,
input [width1-1 : 0] xin,
output [width1-1 : 0] xout,
input clk2,
input aclr2,
output [width2-1 : 0] sxout
);
wire [width1-1 : 0] init_value_internal;
wire reset1;
wire reset2;
reg iclk_enable;
reg [width1-1 : 0] iclk_data;
reg [width2-1 : 0] oclk_data;
// For Synthesis this means: preserve this registers and do not merge any other flip-flops with synchronizer flip-flops
// For TimeQuest this means: identify these flip-flops as synchronizer to enable automatic MTBF analysis
(* altera_attribute = {"-name ADV_NETLIST_OPT_ALLOWED NEVER_ALLOW; -name SYNCHRONIZER_IDENTIFICATION FORCED; -name DONT_MERGE_REGISTER ON; -name PRESERVE_REGISTER ON"} *) reg [depth-1 : 0] sync_regs;
wire oclk_enable;
wire ena_internal;
reg [counter_width-1 : 0] counter;
assign init_value_internal = init_value;
// Internal resets are active-low after the polarity XOR.
assign reset1 = aclr1 ^ reset1_high;
assign reset2 = aclr2 ^ reset2_high;
// pulse_multiplier == 1: the enable is used unmodified.
generate
if (pulse_multiplier == 1)
begin: no_multiplication
assign ena_internal = ena[0];
end
endgenerate
// pulse_multiplier > 1: a counter started by ena[0] keeps ena_internal high
// until it wraps back to 0 after reaching pulse_multiplier - 1.
// NOTE(review): ena_internal has no driver when pulse_multiplier < 1.
generate
if (pulse_multiplier > 1)
begin: multiplu_ena_pulse
if (reset_kind == "ASYNC")
begin: async_reset
always @ (posedge clk1 or negedge reset1)
begin
if (reset1 == 1'b0) begin
counter <= 0;
end else begin
if (counter > 0) begin
if (counter == pulse_multiplier - 1) begin
counter <= 0;
end else begin
counter <= counter + 2'd1;
end
end else begin
if (ena[0] == 1'b1) begin
counter <= 1;
end
end
end
end
end
if (reset_kind == "SYNC")
begin: sync_reset
always @ (posedge clk1)
begin
if (reset1 == 1'b0) begin
counter <= 0;
end else begin
if (counter > 0) begin
if (counter == pulse_multiplier - 1) begin
counter <= 0;
end else begin
counter <= counter + 2'd1;
end
end else begin
if (ena[0] == 1'b1) begin
counter <= 1;
end
end
end
end
end
if (reset_kind == "NONE")
begin: no_reset
always @ (posedge clk1)
begin
if (counter > 0) begin
if (counter == pulse_multiplier - 1) begin
counter <= 0;
end else begin
counter <= counter + 2'd1;
end
end else begin
if (ena[0] == 1'b1) begin
counter <= 1;
end
end
end
end
assign ena_internal = counter > 0 ? 1'b1 : ena[0];
end
endgenerate
// The last synchroniser stage gates the clk2-side capture.
assign oclk_enable = sync_regs[depth - 1];
// clk1 domain: register the (stretched) enable and capture xin on ena[0].
generate
if (reset_kind == "ASYNC")
begin: iclk_async_reset
always @ (posedge clk1 or negedge reset1)
begin
if (reset1 == 1'b0) begin
iclk_data <= init_value_internal;
iclk_enable <= 1'b0;
end else begin
iclk_enable <= ena_internal;
if (ena[0] == 1'b1) begin
iclk_data <= xin;
end
end
end
end
if (reset_kind == "SYNC")
begin: iclk_sync_reset
always @ (posedge clk1)
begin
if (reset1 == 1'b0) begin
iclk_data <= init_value_internal;
iclk_enable <= 1'b0;
end else begin
iclk_enable <= ena_internal;
if (ena[0] == 1'b1) begin
iclk_data <= xin;
end
end
end
end
if (reset_kind == "NONE")
begin: iclk_no_reset
always @ (posedge clk1)
begin
iclk_enable <= ena_internal;
if (ena[0] == 1'b1) begin
iclk_data <= xin;
end
end
end
endgenerate
// clk2 domain: synchroniser chain; stage 0 samples iclk_enable, stage i samples stage i-1.
generate
genvar i;
for (i = 0; i < depth; ++i)
begin: sync_regs_block
if (reset_kind == "ASYNC")
begin: sync_reg_async_reset
always @ (posedge clk2 or negedge reset2) begin
if (reset2 == 1'b0) begin
sync_regs[i] <= 1'b0;
end else begin
if (i > 0) begin
sync_regs[i] <= sync_regs[i - 1];
end else begin
sync_regs[i] <= iclk_enable;
end
end
end
end
if (reset_kind == "SYNC")
begin: sync_reg_sync_reset
always @ (posedge clk2) begin
if (reset2 == 1'b0) begin
sync_regs[i] <= 1'b0;
end else begin
if (i > 0) begin
sync_regs[i] <= sync_regs[i - 1];
end else begin
sync_regs[i] <= iclk_enable;
end
end
end
end
if (reset_kind == "NONE")
begin: sync_reg_no_reset
always @ (posedge clk2) begin
if (i > 0) begin
sync_regs[i] <= sync_regs[i - 1];
end else begin
sync_regs[i] <= iclk_enable;
end
end
end
end
endgenerate
// clk2 domain: copy the low width2 bits of iclk_data once the synchronised enable arrives.
generate
if (reset_kind == "ASYNC")
begin: oclk_async_reset
always @ (posedge clk2 or negedge reset2)
begin
if (reset2 == 1'b0) begin
oclk_data <= init_value_internal[width2-1 : 0];
end else begin
if (oclk_enable == 1'b1) begin
oclk_data <= iclk_data[width2-1 : 0];
end
end
end
end
if (reset_kind == "SYNC")
begin: oclk_sync_reset
always @ (posedge clk2)
begin
if (reset2 == 1'b0) begin
oclk_data <= init_value_internal[width2-1 : 0];
end else begin
if (oclk_enable == 1'b1) begin
oclk_data <= iclk_data[width2-1 : 0];
end
end
end
end
if (reset_kind == "NONE")
begin: oclk_no_reset
always @ (posedge clk2)
begin
if (oclk_enable == 1'b1) begin
oclk_data <= iclk_data[width2-1 : 0];
end
end
end
endgenerate
assign xout = iclk_data;
assign sxout = oclk_data;
endmodule
//------------------------------------------------------------------------------
// Free-running pipeline with no enable and no reset: q is d delayed by
// num_stages clock cycles, or d itself when num_stages is 0.
//   num_bits   - width of d/q
//   num_stages - number of register stages
//   init_value - single-bit value replicated across every stage by the
//                initial block (default 1'bx)
module dspba_pipe
#(
parameter num_bits = 8,
parameter num_stages = 0,
parameter init_value = 1'bx
) (
input clk,
input [num_bits-1:0] d,
output [num_bits-1:0] q
);
// init_value replicated to the full data width.
logic [num_bits-1:0] init_stage = { num_bits { init_value } };
generate
if (num_stages > 0)
begin
reg [num_bits-1:0] stage_array[num_stages-1:0];
genvar i;
for (i = 0; i < num_stages; ++i)
begin : g_pipe
// Stage 0 samples d; stage i samples stage i-1.
always @ (posedge clk) begin
if (i>0) begin
stage_array[i] <= stage_array[i-1];
end else begin
stage_array[i] <= d;
end
end
end
// Start-up contents of every stage.
initial begin
stage_array = '{ num_stages { init_stage } };
end
assign q = stage_array[num_stages-1];
end else begin
// num_stages == 0: plain wire.
assign q = d;
end
endgenerate
endmodule

View file

@ -0,0 +1,210 @@
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "svdpi.h"
#include "VX_config.h"
// C-linkage prototypes for the floating-point DPI entry points defined below.
// Common convention for every function:
//   clk, enable - forwarded to the per-scope ShiftRegister; a new value is
//                 only shifted in when clk is false and enable is true
//   a, b, c     - 32-bit operands (raw IEEE-754 single bit patterns for the
//                 floating-point inputs, plain integers for itof/utof)
//   result      - receives the oldest entry of the per-scope delay line
extern "C" {
void dpi_fadd(bool clk, bool enable, int a, int b, int* result);
void dpi_fsub(bool clk, bool enable, int a, int b, int* result);
void dpi_fmul(bool clk, bool enable, int a, int b, int* result);
void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result);
void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result);
void dpi_fdiv(bool clk, bool enable, int a, int b, int* result);
void dpi_fsqrt(bool clk, bool enable, int a, int* result);
void dpi_ftoi(bool clk, bool enable, int a, int* result);
void dpi_ftou(bool clk, bool enable, int a, int* result);
void dpi_itof(bool clk, bool enable, int a, int* result);
void dpi_utof(bool clk, bool enable, int a, int* result);
}
// Software model of a fixed-depth delay line of 32-bit values.
// The depth is fixed by the first call to ensure_init(); entries start at 0.
// push() appends at the tail and drops the head; top() reads the head.
class ShiftRegister {
public:
  // Members are initialised in declaration order (buffer_, depth_, init_).
  ShiftRegister() : depth_(0), init_(false) {}

  // Sizes the delay line on the first call; later calls are ignored.
  // A non-positive depth is clamped to 1 so that push()/top() never index an
  // empty buffer.
  void ensure_init(int depth) {
    if (init_)
      return;
    depth_ = (depth > 0) ? static_cast<unsigned>(depth) : 1u;
    buffer_.assign(depth_, 0);
    init_ = true;
  }

  // Shifts `value` in at the tail. Ignored while clk is high or enable is
  // low, and also when the register has not been initialised yet.
  void push(int value, bool clk, bool enable) {
    if (clk || !enable || buffer_.empty())
      return;
    // `i + 1 < depth_` avoids the unsigned wrap-around of `depth_ - 1`.
    for (unsigned i = 0; i + 1 < depth_; ++i) {
      buffer_[i] = buffer_[i + 1];
    }
    buffer_[depth_ - 1] = value;
  }

  // Returns the oldest entry, or 0 if the register was never initialised.
  int top() const {
    return buffer_.empty() ? 0 : buffer_[0];
  }

private:
  std::vector<int> buffer_;
  unsigned depth_;
  bool init_;
};
// Registry mapping each DPI caller scope to its own ShiftRegister, so every
// call site of a dpi_* function gets an independent delay line.
class Instances {
public:
  // Returns the register for `scope`, default-constructing it on first use.
  // Only the map lookup/insert is serialised; the returned register itself is
  // not protected by the mutex. References to unordered_map elements stay
  // valid when other keys are inserted later.
  ShiftRegister& get(svScope scope) {
    // lock_guard releases the mutex even if the insertion throws.
    std::lock_guard<std::mutex> guard(mutex_);
    return instances_[scope];
  }

private:
  std::unordered_map<svScope, ShiftRegister> instances_;
  std::mutex mutex_;
};

// Process-wide registry used by all dpi_* entry points.
Instances instances;
void dpi_fadd(bool clk, bool enable, int a, int b, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa + fb;
inst.ensure_init(LATENCY_FMADD);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_fsub(bool clk, bool enable, int a, int b, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa - fb;
inst.ensure_init(LATENCY_FMADD);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_fmul(bool clk, bool enable, int a, int b, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa * fb;
inst.ensure_init(LATENCY_FMADD);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fc = *(float*)&c;
float fr = fa * fb + fc;
inst.ensure_init(LATENCY_FMADD);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fc = *(float*)&c;
float fr = fa * fb - fc;
inst.ensure_init(LATENCY_FMADD);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_fdiv(bool clk, bool enable, int a, int b, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fb = *(float*)&b;
float fr = fa / fb;
inst.ensure_init(LATENCY_FDIV);
inst.push(*(int*)&fr, clk, enable);
*result = inst.
top();
}
void dpi_fsqrt(bool clk, bool enable, int a, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
float fr = sqrt(fa);
inst.ensure_init(LATENCY_FSQRT);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_ftoi(bool clk, bool enable, int a, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
int ir = int(fa);
inst.ensure_init(LATENCY_FTOI);
inst.push(ir, clk, enable);
*result = inst.top();
}
void dpi_ftou(bool clk, bool enable, int a, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fa = *(float*)&a;
unsigned ir = unsigned(fa);
inst.ensure_init(LATENCY_FTOI);
inst.push(ir, clk, enable);
*result = inst.top();
}
void dpi_itof(bool clk, bool enable, int a, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
float fr = float(a);
inst.ensure_init(LATENCY_ITOF);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}
void dpi_utof(bool clk, bool enable, int a, int* result) {
auto scope = svGetScope();
ShiftRegister& inst = instances.get(scope);
unsigned ua = *(unsigned*)&a;
float fr = float(ua);
inst.ensure_init(LATENCY_ITOF);
inst.push(*(int*)&fr, clk, enable);
*result = inst.top();
}

View file

@ -0,0 +1,16 @@
// DPI-C imports for the floating-point fallback functions.
// Every function takes the clock and enable levels plus 32-bit operands and
// returns a 32-bit result through the `result` output argument.
// All are declared `context`, so the C side can query the calling scope.
`ifndef FLOAT_DPI
`define FLOAT_DPI
// Two-operand arithmetic.
import "DPI-C" context function void dpi_fadd(input logic clk, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsub(input logic clk, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fmul(input logic clk, input logic enable, input int a, input int b, output int result);
// Three-operand multiply-add / multiply-subtract.
import "DPI-C" context function void dpi_fmadd(input logic clk, input logic enable, input int a, input int b, input int c, output int result);
import "DPI-C" context function void dpi_fmsub(input logic clk, input logic enable, input int a, input int b, input int c, output int result);
// Divide and square root.
import "DPI-C" context function void dpi_fdiv(input logic clk, input logic enable, input int a, input int b, output int result);
import "DPI-C" context function void dpi_fsqrt(input logic clk, input logic enable, input int a, output int result);
// Conversions between float and signed/unsigned integer.
import "DPI-C" context function void dpi_ftoi(input logic clk, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_ftou(input logic clk, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_itof(input logic clk, input logic enable, input int a, output int result);
import "DPI-C" context function void dpi_utof(input logic clk, input logic enable, input int a, output int result);
`endif

View file

@ -94,6 +94,7 @@ if args.outc != 'none':
// Translated from VX_config.vh:
'''[1:].format(date=datetime.now()), file=f)
with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r:
lineno = 0
for line in r:
if in_expansion:
f.write(post_process_line(line))
@ -107,7 +108,8 @@ if args.outc != 'none':
f.write(post_process_line(pat.sub(repl, line)))
break
else:
raise ValueError('failed to find rule for: ' + line)
raise ValueError('failed to find rule for: "' + line + '" (' + str(lineno) + ')')
lineno = lineno + 1
print('''
// Misc

View file

@ -17,10 +17,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
DBG_FLAGS += $(DBG_PRINT_FLAGS)
DBG_FLAGS += -DDBG_CORE_REQ_INFO
FPU_INCLUDE = -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/fp_cores -I../rtl/simulate $(FPU_INCLUDE)
FPU_INCLUDE = -I../rtl/fp_cores -I../rtl/fp_cores/svdpi -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/simulate $(FPU_INCLUDE)
SRCS = simulator.cpp testbench.cpp
SRCS += ../rtl/fp_cores/svdpi/float_dpi.cpp
all: build-s