mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
FPU DPI fallback
This commit is contained in:
parent
0a0b28aac0
commit
df711986bc
29 changed files with 1147 additions and 1329 deletions
|
@ -34,9 +34,10 @@ LDFLAGS += -shared -pthread
|
|||
TOP = Vortex
|
||||
|
||||
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
|
||||
SRCS += ../../hw/rtl/fp_cores/svdpi/float_dpi.cpp
|
||||
|
||||
FPU_INCLUDE = -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache -I../../hw/rtl/fp_cores $(FPU_INCLUDE)
|
||||
FPU_INCLUDE = -I../../hw/rtl/fp_cores -I../../hw/rtl/fp_cores/svdpi -I../../hw/rtl/fp_cores/fpnew/src/common_cells/include -I../../hw/rtl/fp_cores/fpnew/src/common_cells/src -I../../hw/rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../../hw/rtl/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I../../hw/rtl -I../../hw/rtl/libs -I../../hw/rtl/interfaces -I../../hw/rtl/cache $(FPU_INCLUDE)
|
||||
|
||||
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
|
||||
VL_FLAGS += -Wno-DECLFILENAME
|
||||
|
|
|
@ -51,15 +51,55 @@
|
|||
`define L3_ENABLE (`NUM_CLUSTERS > 1)
|
||||
`endif
|
||||
|
||||
`ifndef EXT_M_DISABLE
|
||||
`define EXT_M_ENABLE
|
||||
`endif
|
||||
|
||||
`ifndef EXT_F_DISABLE
|
||||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
`define FPNEW_ENABLE
|
||||
|
||||
// Device identification
|
||||
`define VENDOR_ID 0
|
||||
`define ARCHITECTURE_ID 0
|
||||
`define IMPLEMENTATION_ID 0
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifndef LATENCY_IMUL
|
||||
`define LATENCY_IMUL 3
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FNONCOMP
|
||||
`define LATENCY_FNONCOMP 1
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FMADD
|
||||
`define LATENCY_FMADD 1
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FNMADD
|
||||
`define LATENCY_FNMADD 2
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FDIV
|
||||
`define LATENCY_FDIV 15
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FSQRT
|
||||
`define LATENCY_FSQRT 9
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_ITOF
|
||||
`define LATENCY_ITOF 7
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FTOI
|
||||
`define LATENCY_FTOI 3
|
||||
`endif
|
||||
|
||||
// CSR Addresses //////////////////////////////////////////////////////////////
|
||||
|
||||
`define CSR_FFLAGS 12'h001
|
||||
|
|
|
@ -35,22 +35,6 @@
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define LATENCY_IMUL 3
|
||||
|
||||
`define LATENCY_FDIV 16
|
||||
`define LATENCY_FSQRT 10
|
||||
`define LATENCY_FTOI 5
|
||||
`define LATENCY_FTOU 4
|
||||
`define LATENCY_ITOF 8
|
||||
`define LATENCY_UTOF 7
|
||||
|
||||
`define LATENCY_FMULADD 2
|
||||
`define LATENCY_FDIVSQRT 2
|
||||
`define LATENCY_FCONV 2
|
||||
`define LATENCY_FNONCOMP 1
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
`define INST_AUIPC 7'b0010111
|
||||
`define INST_JAL 7'b1101111
|
||||
|
|
|
@ -56,7 +56,7 @@ module VX_fpu_unit #(
|
|||
// can accept new request?
|
||||
assign fpu_req_if.ready = ready_in && ~fpuq_full;
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
`ifndef FPNEW_ENABLE
|
||||
|
||||
VX_fp_fpga #(
|
||||
.TAGW (FPUQ_BITS)
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
module VX_fp_div #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
|
@ -21,19 +25,23 @@ module VX_fp_div #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
`ifdef QUARTUS
|
||||
acl_fp_div fdiv (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (enable),
|
||||
.en (~stall),
|
||||
.a (dataa[i]),
|
||||
.b (datab[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_fdiv(clk, ~stall, dataa[i], datab[i], result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
|
@ -42,9 +50,11 @@ module VX_fp_div #(
|
|||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.enable(~stall),
|
||||
.in ({tag_in, valid_in}),
|
||||
.out({tag_out, valid_out})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
endmodule
|
|
@ -1,5 +1,4 @@
|
|||
`include "VX_define.vh"
|
||||
`include "dspba_library_ver.sv"
|
||||
|
||||
module VX_fp_fpga #(
|
||||
parameter TAGW = 1
|
||||
|
@ -28,7 +27,7 @@ module VX_fp_fpga #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam NUM_FPC = 12;
|
||||
localparam NUM_FPC = 7;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
|
@ -41,26 +40,30 @@ module VX_fp_fpga #(
|
|||
fflags_t [`NUM_THREADS-1:0] fpnew_fflags;
|
||||
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
reg fmadd_negate;
|
||||
reg do_add, do_sub, do_mul;
|
||||
reg is_signed;
|
||||
|
||||
always @(*) begin
|
||||
core_select = 0;
|
||||
fmadd_negate = 0;
|
||||
core_select = 'x;
|
||||
do_add = 'x;
|
||||
do_sub = 'x;
|
||||
do_mul = 'x;
|
||||
is_signed = 'x;
|
||||
case (op_type)
|
||||
`FPU_ADD: core_select = 1;
|
||||
`FPU_SUB: core_select = 2;
|
||||
`FPU_MUL: core_select = 3;
|
||||
`FPU_MADD: core_select = 4;
|
||||
`FPU_MSUB: core_select = 5;
|
||||
`FPU_NMSUB: begin core_select = 4; fmadd_negate = 1; end
|
||||
`FPU_NMADD: begin core_select = 5; fmadd_negate = 1; end
|
||||
`FPU_DIV: core_select = 6;
|
||||
`FPU_SQRT: core_select = 7;
|
||||
`FPU_CVTWS: core_select = 8;
|
||||
`FPU_CVTWUS: core_select = 9;
|
||||
`FPU_CVTSW: core_select = 10;
|
||||
`FPU_CVTSWU: core_select = 11;
|
||||
default:;
|
||||
`FPU_ADD: begin core_select = 1; do_mul = 0; do_add = 1; do_sub = 0; end
|
||||
`FPU_SUB: begin core_select = 1; do_mul = 0; do_add = 0; do_sub = 1; end
|
||||
`FPU_MUL: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 0; end
|
||||
`FPU_MADD: begin core_select = 1; do_mul = 1; do_add = 1; do_sub = 0; end
|
||||
`FPU_MSUB: begin core_select = 1; do_mul = 1; do_add = 0; do_sub = 1; end
|
||||
`FPU_NMSUB: begin core_select = 2; do_sub = 1; end
|
||||
`FPU_NMADD: begin core_select = 2; do_sub = 0; end
|
||||
`FPU_DIV: begin core_select = 3; end
|
||||
`FPU_SQRT: begin core_select = 4; end
|
||||
`FPU_CVTWS: begin core_select = 5; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = 5; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = 6; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = 6; is_signed = 0; end
|
||||
default: begin core_select = 0; end
|
||||
endcase
|
||||
end
|
||||
|
||||
|
@ -76,7 +79,7 @@ module VX_fp_fpga #(
|
|||
.op_type (op_type),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datab (datab),
|
||||
.result (per_core_result[0]),
|
||||
.has_fflags (fpnew_has_fflags),
|
||||
.fflags (fpnew_fflags),
|
||||
|
@ -85,44 +88,50 @@ module VX_fp_fpga #(
|
|||
.valid_out (per_core_valid_out[0])
|
||||
);
|
||||
|
||||
VX_fp_add #(
|
||||
VX_fp_madd #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_add (
|
||||
) fp_madd (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 1)),
|
||||
.ready_in (per_core_ready_in[1]),
|
||||
.tag_in (tag_in),
|
||||
.do_add (do_add),
|
||||
.do_sub (do_sub),
|
||||
.do_mul (do_mul),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[1]),
|
||||
.tag_out (per_core_tag_out[1]),
|
||||
.ready_out (per_core_ready_out[1]),
|
||||
.valid_out (per_core_valid_out[1])
|
||||
);
|
||||
|
||||
VX_fp_sub #(
|
||||
VX_fp_nmadd #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_sub (
|
||||
) fp_nmadd (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 2)),
|
||||
.ready_in (per_core_ready_in[2]),
|
||||
.tag_in (tag_in),
|
||||
.tag_in (tag_in),
|
||||
.do_sub (do_sub),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[2]),
|
||||
.tag_out (per_core_tag_out[2]),
|
||||
.ready_out (per_core_ready_out[2]),
|
||||
.valid_out (per_core_valid_out[2])
|
||||
);
|
||||
|
||||
VX_fp_mul #(
|
||||
VX_fp_div #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_mul (
|
||||
) fp_div (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 3)),
|
||||
|
@ -136,75 +145,20 @@ module VX_fp_fpga #(
|
|||
.valid_out (per_core_valid_out[3])
|
||||
);
|
||||
|
||||
VX_fp_madd #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_madd (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.tag_in (tag_in),
|
||||
.negate (fmadd_negate),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
);
|
||||
|
||||
VX_fp_msub #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_msub (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 5)),
|
||||
.ready_in (per_core_ready_in[5]),
|
||||
.tag_in (tag_in),
|
||||
.negate (fmadd_negate),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.result (per_core_result[5]),
|
||||
.tag_out (per_core_tag_out[5]),
|
||||
.ready_out (per_core_ready_out[5]),
|
||||
.valid_out (per_core_valid_out[5])
|
||||
);
|
||||
|
||||
VX_fp_div #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_div (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 6)),
|
||||
.ready_in (per_core_ready_in[6]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (per_core_result[6]),
|
||||
.tag_out (per_core_tag_out[6]),
|
||||
.ready_out (per_core_ready_out[6]),
|
||||
.valid_out (per_core_valid_out[6])
|
||||
);
|
||||
|
||||
VX_fp_sqrt #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_sqrt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 7)),
|
||||
.ready_in (per_core_ready_in[7]),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[7]),
|
||||
.tag_out (per_core_tag_out[7]),
|
||||
.ready_out (per_core_ready_out[7]),
|
||||
.valid_out (per_core_valid_out[7])
|
||||
.result (per_core_result[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
);
|
||||
|
||||
VX_fp_ftoi #(
|
||||
|
@ -213,30 +167,15 @@ module VX_fp_fpga #(
|
|||
) fp_ftoi (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 8)),
|
||||
.ready_in (per_core_ready_in[8]),
|
||||
.tag_in (tag_in),
|
||||
.valid_in (valid_in && (core_select == 5)),
|
||||
.ready_in (per_core_ready_in[5]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[8]),
|
||||
.tag_out (per_core_tag_out[8]),
|
||||
.ready_out (per_core_ready_out[8]),
|
||||
.valid_out (per_core_valid_out[8])
|
||||
);
|
||||
|
||||
VX_fp_ftou #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_ftou (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 9)),
|
||||
.ready_in (per_core_ready_in[9]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[9]),
|
||||
.tag_out (per_core_tag_out[9]),
|
||||
.ready_out (per_core_ready_out[9]),
|
||||
.valid_out (per_core_valid_out[9])
|
||||
.result (per_core_result[5]),
|
||||
.tag_out (per_core_tag_out[5]),
|
||||
.ready_out (per_core_ready_out[5]),
|
||||
.valid_out (per_core_valid_out[5])
|
||||
);
|
||||
|
||||
VX_fp_itof #(
|
||||
|
@ -245,60 +184,45 @@ module VX_fp_fpga #(
|
|||
) fp_itof (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 10)),
|
||||
.ready_in (per_core_ready_in[10]),
|
||||
.tag_in (tag_in),
|
||||
.valid_in (valid_in && (core_select == 6)),
|
||||
.ready_in (per_core_ready_in[6]),
|
||||
.tag_in (tag_in),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[10]),
|
||||
.tag_out (per_core_tag_out[10]),
|
||||
.ready_out (per_core_ready_out[10]),
|
||||
.valid_out (per_core_valid_out[10])
|
||||
.result (per_core_result[6]),
|
||||
.tag_out (per_core_tag_out[6]),
|
||||
.ready_out (per_core_ready_out[6]),
|
||||
.valid_out (per_core_valid_out[6])
|
||||
);
|
||||
|
||||
VX_fp_utof #(
|
||||
.TAGW (TAGW),
|
||||
.LANES(`NUM_THREADS)
|
||||
) fp_utof (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 11)),
|
||||
.ready_in (per_core_ready_in[11]),
|
||||
.tag_in (tag_in),
|
||||
.dataa (dataa),
|
||||
.result (per_core_result[11]),
|
||||
.tag_out (per_core_tag_out[11]),
|
||||
.ready_out (per_core_ready_out[11]),
|
||||
.valid_out (per_core_valid_out[11])
|
||||
);
|
||||
|
||||
reg valid_out_r;
|
||||
reg has_fflags_r;
|
||||
reg [`NUM_THREADS-1:0][31:0] result_r;
|
||||
reg [TAGW-1:0] tag_out_r;
|
||||
reg valid_out_n;
|
||||
reg has_fflags_n;
|
||||
reg [`NUM_THREADS-1:0][31:0] result_n;
|
||||
reg [TAGW-1:0] tag_out_n;
|
||||
|
||||
always @(*) begin
|
||||
per_core_ready_out = 0;
|
||||
valid_out_r = 0;
|
||||
has_fflags_r = 'x;
|
||||
result_r = 'x;
|
||||
tag_out_r = 'x;
|
||||
valid_out_n = 0;
|
||||
has_fflags_n = 'x;
|
||||
result_n = 'x;
|
||||
tag_out_n = 'x;
|
||||
for (integer i = 0; i < NUM_FPC; i++) begin
|
||||
if (per_core_valid_out[i]) begin
|
||||
per_core_ready_out[i] = ready_out;
|
||||
valid_out_r = 1;
|
||||
has_fflags_r = fpnew_has_fflags && (i == 0);
|
||||
result_r = per_core_result[i];
|
||||
tag_out_r = per_core_tag_out[i];
|
||||
valid_out_n = 1;
|
||||
has_fflags_n = fpnew_has_fflags && (i == 0);
|
||||
result_n = per_core_result[i];
|
||||
tag_out_n = per_core_tag_out[i];
|
||||
break;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign ready_in = (& per_core_ready_in);
|
||||
assign valid_out = valid_out_r;
|
||||
assign has_fflags = has_fflags_r;
|
||||
assign tag_out = tag_out_r;
|
||||
assign result = result_r;
|
||||
assign valid_out = valid_out_n;
|
||||
assign has_fflags = has_fflags_n;
|
||||
assign tag_out = tag_out_n;
|
||||
assign result = result_n;
|
||||
assign fflags = fpnew_fflags;
|
||||
|
||||
endmodule
|
77
hw/rtl/fp_cores/VX_fp_ftoi.v
Normal file
77
hw/rtl/fp_cores/VX_fp_ftoi.v
Normal file
|
@ -0,0 +1,77 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
// Multi-lane float-to-integer conversion stage.
//
// Every lane produces both a signed and an unsigned conversion of dataa; the
// is_signed flag that entered with the request is carried alongside the tag
// and valid bit through a `LATENCY_FTOI-deep shift register and selects which
// of the two conversions is presented on result.
//
// Handshake: the stage holds (stall) while a valid output is not accepted by
// the consumer, and ready_in is simply the inverse of that stall.
module VX_fp_ftoi #(
    parameter TAGW  = 1,    // width of the request tag
    parameter LANES = 1     // number of parallel 32-bit lanes
) (
    input  wire                   clk,
    input  wire                   reset,

    // request side
    output wire                   ready_in,
    input  wire                   valid_in,
    input  wire [TAGW-1:0]        tag_in,
    input  wire                   is_signed,   // 1: use signed result, 0: use unsigned result
    input  wire [LANES-1:0][31:0] dataa,

    // response side
    output wire [LANES-1:0][31:0] result,
    output wire [TAGW-1:0]        tag_out,
    input  wire                   ready_out,
    output wire                   valid_out
);
    // Back-pressure: a valid response is pending and the consumer is not ready.
    wire stall = valid_out && ~ready_out;

    // is_signed as it leaves the pipeline (driven by the shift register below).
    reg signed_sel;

    for (genvar lane = 0; lane < LANES; lane++) begin

        wire [31:0] conv_s;     // signed conversion of this lane
        wire [31:0] conv_u;     // unsigned conversion of this lane

    `ifdef QUARTUS
        // Vendor cores; asynchronous reset is tied off, enable follows the stall.
        acl_fp_ftoi ftoi (
            .clk    (clk),
            .areset (1'b0),
            .en     (~stall),
            .a      (dataa[lane]),
            .q      (conv_s)
        );

        acl_fp_ftou ftou (
            .clk    (clk),
            .areset (1'b0),
            .en     (~stall),
            .a      (dataa[lane]),
            .q      (conv_u)
        );
    `else
        // Without QUARTUS the conversions come from the dpi_* routines
        // (presumably the software models declared in float_dpi.vh).
        always @(posedge clk) begin
            dpi_ftoi(clk, ~stall, dataa[lane], conv_s);
            dpi_ftou(clk, ~stall, dataa[lane], conv_u);
        end
    `endif

        assign result[lane] = signed_sel ? conv_s : conv_u;
    end

    // Tag, valid bit and signedness travel together with the data.
    VX_shift_register #(
        .DATAW (TAGW + 2),
        .DEPTH (`LATENCY_FTOI)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (~stall),
        .in     ({tag_in,  valid_in,  is_signed}),
        .out    ({tag_out, valid_out, signed_sel})
    );

    assign ready_in = ~stall;

endmodule
|
77
hw/rtl/fp_cores/VX_fp_itof.v
Normal file
77
hw/rtl/fp_cores/VX_fp_itof.v
Normal file
|
@ -0,0 +1,77 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
// Multi-lane integer-to-float conversion stage.
//
// Each lane computes both the signed and the unsigned interpretation of dataa;
// is_signed is pipelined with the tag and valid bit and selects which of the
// two conversions drives result when the request leaves the pipeline.
//
// Parameters:
//   TAGW  - width of the request tag carried through the pipeline
//   LANES - number of parallel 32-bit lanes
//
// Handshake: the pipeline freezes (stall) while valid_out is asserted and
// ready_out is low; ready_in is the inverse of stall.
module VX_fp_itof #(
    parameter TAGW = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire is_signed,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    wire stall = ~ready_out && valid_out;

    // is_signed delayed to line up with the conversion results
    reg is_signed_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_s;   // signed int -> float
        wire [31:0] result_u;   // unsigned int -> float

    `ifdef QUARTUS
        acl_fp_itof itof (
            .clk    (clk),
            .areset (1'b0),
            .en     (~stall),
            .a      (dataa[i]),
            .q      (result_s)
        );

        acl_fp_utof utof (
            .clk    (clk),
            .areset (1'b0),
            .en     (~stall),
            .a      (dataa[i]),
            .q      (result_u)
        );
    `else
        // Non-QUARTUS builds obtain the results from the dpi_* routines
        // (presumably the software models declared in float_dpi.vh).
        always @(posedge clk) begin
            dpi_itof(clk, ~stall, dataa[i], result_s);
            dpi_utof(clk, ~stall, dataa[i], result_u);
        end
    `endif

        assign result[i] = is_signed_r ? result_s : result_u;
    end

    // The control pipeline must use this unit's own latency macro. The
    // original used `LATENCY_FTOI here (copied from VX_fp_ftoi), which makes
    // tag/valid/is_signed leave the pipeline at the float-to-int latency
    // instead of the int-to-float one.
    VX_shift_register #(
        .DATAW(TAGW + 1 + 1),
        .DEPTH(`LATENCY_ITOF)
    ) shift_reg (
        .clk(clk),
        .reset(reset),
        .enable(~stall),
        .in ({tag_in,  valid_in,  is_signed}),
        .out({tag_out, valid_out, is_signed_r})
    );

    assign ready_in = ~stall;

endmodule
|
291
hw/rtl/fp_cores/VX_fp_madd.v
Normal file
291
hw/rtl/fp_cores/VX_fp_madd.v
Normal file
|
@ -0,0 +1,291 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
// Multi-lane add / sub / mul / fused multiply-add / fused multiply-sub stage.
//
// All five candidate results are produced for every lane; the do_add / do_sub /
// do_mul request flags are pipelined with the tag and valid bit through a
// `LATENCY_FMADD-deep shift register and pick the result that is returned:
//
//   do_mul  do_add  do_sub   result
//     1       1       -      madd
//     1       0       1      msub
//     1       0       0      mul
//     0       1       -      add
//     0       0       1      sub
//     0       0       0      'x (no operation selected)
//
// Handshake: the stage freezes while a valid response is not accepted
// (stall); ready_in is the inverse of stall.
module VX_fp_madd #(
    parameter TAGW  = 1,    // width of the request tag
    parameter LANES = 1     // number of parallel 32-bit lanes
) (
    input  wire                   clk,
    input  wire                   reset,

    // request side
    output wire                   ready_in,
    input  wire                   valid_in,
    input  wire [TAGW-1:0]        tag_in,
    input  wire                   do_add,
    input  wire                   do_sub,
    input  wire                   do_mul,
    input  wire [LANES-1:0][31:0] dataa,
    input  wire [LANES-1:0][31:0] datab,
    input  wire [LANES-1:0][31:0] datac,

    // response side
    output wire [LANES-1:0][31:0] result,
    output wire [TAGW-1:0]        tag_out,
    input  wire                   ready_out,
    output wire                   valid_out
);

    // Back-pressure: a valid response is pending and the consumer is not ready.
    wire stall = valid_out && ~ready_out;

    // Operation flags as they leave the control pipeline.
    reg add_sel, sub_sel, mul_sel;

    for (genvar lane = 0; lane < LANES; lane++) begin

        wire [31:0] add_q;
        wire [31:0] sub_q;
        wire [31:0] mul_q;
        wire [31:0] madd_q;
        wire [31:0] msub_q;

    `ifdef QUARTUS
        // NOTE(review): the add and sub instances below leave ax unconnected
        // and feed the operands on ay/az. Confirm against the twentynm_fp_mac
        // documentation which ports the "sp_add" mode actually reads.

        // ---- add ----------------------------------------------------------
        twentynm_fp_mac mac_fp_add (
            // inputs
            .clk                ({2'b00, clk}),
            .ena                ({2'b11, ~stall}),
            .aclr               (2'b00),
            .ax                 (),
            .ay                 (datab[lane]),
            .az                 (dataa[lane]),
            .accumulate         (),
            .chainin            (),
            .chainin_overflow   (),
            .chainin_invalid    (),
            .chainin_underflow  (),
            .chainin_inexact    (),
            // outputs
            .resulta            (add_q),
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
        defparam mac_fp_add.operation_mode       = "sp_add";
        defparam mac_fp_add.use_chainin          = "false";
        defparam mac_fp_add.adder_subtract       = "false";
        defparam mac_fp_add.ax_clock             = "0";
        defparam mac_fp_add.ay_clock             = "0";
        defparam mac_fp_add.az_clock             = "0";
        defparam mac_fp_add.output_clock         = "0";
        defparam mac_fp_add.accumulate_clock     = "none";
        defparam mac_fp_add.ax_chainin_pl_clock  = "0";
        defparam mac_fp_add.accum_pipeline_clock = "none";
        defparam mac_fp_add.mult_pipeline_clock  = "0";
        defparam mac_fp_add.adder_input_clock    = "0";
        defparam mac_fp_add.accum_adder_clock    = "none";

        // ---- sub ----------------------------------------------------------
        twentynm_fp_mac mac_fp_sub (
            // inputs
            .clk                ({2'b00, clk}),
            .ena                ({2'b11, ~stall}),
            .aclr               (2'b00),
            .ax                 (),
            .ay                 (datab[lane]),
            .az                 (dataa[lane]),
            .accumulate         (),
            .chainin            (),
            .chainin_overflow   (),
            .chainin_invalid    (),
            .chainin_underflow  (),
            .chainin_inexact    (),
            // outputs
            .resulta            (sub_q),
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
        defparam mac_fp_sub.operation_mode       = "sp_add";
        defparam mac_fp_sub.use_chainin          = "false";
        defparam mac_fp_sub.adder_subtract       = "true";
        defparam mac_fp_sub.ax_clock             = "0";
        defparam mac_fp_sub.ay_clock             = "0";
        defparam mac_fp_sub.az_clock             = "none";
        defparam mac_fp_sub.output_clock         = "0";
        defparam mac_fp_sub.accumulate_clock     = "none";
        defparam mac_fp_sub.ax_chainin_pl_clock  = "none";
        defparam mac_fp_sub.accum_pipeline_clock = "none";
        defparam mac_fp_sub.mult_pipeline_clock  = "none";
        defparam mac_fp_sub.adder_input_clock    = "0";
        defparam mac_fp_sub.accum_adder_clock    = "none";

        // ---- mul ----------------------------------------------------------
        twentynm_fp_mac mac_fp_mul (
            // inputs
            .clk                ({2'b00, clk}),
            .ena                ({2'b11, ~stall}),
            .aclr               (2'b00),
            .ax                 (),
            .ay                 (datab[lane]),
            .az                 (dataa[lane]),
            .accumulate         (),
            .chainin            (),
            .chainin_overflow   (),
            .chainin_invalid    (),
            .chainin_underflow  (),
            .chainin_inexact    (),
            // outputs
            .resulta            (mul_q),
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
        defparam mac_fp_mul.operation_mode       = "sp_mult";
        defparam mac_fp_mul.use_chainin          = "false";
        defparam mac_fp_mul.adder_subtract       = "false";
        defparam mac_fp_mul.ax_clock             = "none";
        defparam mac_fp_mul.ay_clock             = "0";
        defparam mac_fp_mul.az_clock             = "0";
        defparam mac_fp_mul.output_clock         = "0";
        defparam mac_fp_mul.accumulate_clock     = "none";
        defparam mac_fp_mul.ax_chainin_pl_clock  = "none";
        defparam mac_fp_mul.accum_pipeline_clock = "none";
        defparam mac_fp_mul.mult_pipeline_clock  = "0";
        defparam mac_fp_mul.adder_input_clock    = "none";
        defparam mac_fp_mul.accum_adder_clock    = "none";

        // ---- multiply-add -------------------------------------------------
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .clk                ({2'b00, clk}),
            .ena                ({2'b11, ~stall}),
            .aclr               (2'b00),
            .ax                 (datac[lane]),
            .ay                 (datab[lane]),
            .az                 (dataa[lane]),
            .accumulate         (),
            .chainin            (),
            .chainin_overflow   (),
            .chainin_invalid    (),
            .chainin_underflow  (),
            .chainin_inexact    (),
            // outputs
            .resulta            (madd_q),
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
        defparam mac_fp_madd.operation_mode       = "sp_mult_add";
        defparam mac_fp_madd.use_chainin          = "false";
        defparam mac_fp_madd.adder_subtract       = "false";
        defparam mac_fp_madd.ax_clock             = "0";
        defparam mac_fp_madd.ay_clock             = "0";
        defparam mac_fp_madd.az_clock             = "0";
        defparam mac_fp_madd.output_clock         = "0";
        defparam mac_fp_madd.accumulate_clock     = "none";
        defparam mac_fp_madd.ax_chainin_pl_clock  = "0";
        defparam mac_fp_madd.accum_pipeline_clock = "none";
        defparam mac_fp_madd.mult_pipeline_clock  = "0";
        defparam mac_fp_madd.adder_input_clock    = "0";
        defparam mac_fp_madd.accum_adder_clock    = "none";

        // ---- multiply-sub -------------------------------------------------
        twentynm_fp_mac mac_fp_msub (
            // inputs
            .clk                ({2'b00, clk}),
            .ena                ({2'b11, ~stall}),
            .aclr               (2'b00),
            .ax                 (datac[lane]),
            .ay                 (datab[lane]),
            .az                 (dataa[lane]),
            .accumulate         (),
            .chainin            (),
            .chainin_overflow   (),
            .chainin_invalid    (),
            .chainin_underflow  (),
            .chainin_inexact    (),
            // outputs
            .resulta            (msub_q),
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
        defparam mac_fp_msub.operation_mode       = "sp_mult_add";
        defparam mac_fp_msub.use_chainin          = "false";
        defparam mac_fp_msub.adder_subtract       = "true";
        defparam mac_fp_msub.ax_clock             = "0";
        defparam mac_fp_msub.ay_clock             = "0";
        defparam mac_fp_msub.az_clock             = "0";
        defparam mac_fp_msub.output_clock         = "0";
        defparam mac_fp_msub.accumulate_clock     = "none";
        defparam mac_fp_msub.ax_chainin_pl_clock  = "0";
        defparam mac_fp_msub.accum_pipeline_clock = "none";
        defparam mac_fp_msub.mult_pipeline_clock  = "0";
        defparam mac_fp_msub.adder_input_clock    = "0";
        defparam mac_fp_msub.accum_adder_clock    = "none";
    `else
        // Without QUARTUS the five results come from the dpi_* routines
        // (presumably the software models declared in float_dpi.vh).
        always @(posedge clk) begin
            dpi_fadd (clk, ~stall, dataa[lane], datab[lane], add_q);
            dpi_fsub (clk, ~stall, dataa[lane], datab[lane], sub_q);
            dpi_fmul (clk, ~stall, dataa[lane], datab[lane], mul_q);
            dpi_fmadd(clk, ~stall, dataa[lane], datab[lane], datac[lane], madd_q);
            dpi_fmsub(clk, ~stall, dataa[lane], datab[lane], datac[lane], msub_q);
        end
    `endif

        // Priority-ordered selection; see the table in the module header.
        reg [31:0] sel_q;

        always @(*) begin
            sel_q = 'x;
            if (mul_sel && add_sel)
                sel_q = madd_q;
            else if (mul_sel && sub_sel)
                sel_q = msub_q;
            else if (mul_sel)
                sel_q = mul_q;
            else if (add_sel)
                sel_q = add_q;
            else if (sub_sel)
                sel_q = sub_q;
        end

        assign result[lane] = sel_q;
    end

    // Tag, valid bit and the three operation flags travel with the data.
    VX_shift_register #(
        .DATAW (TAGW + 4),
        .DEPTH (`LATENCY_FMADD)
    ) shift_reg1 (
        .clk    (clk),
        .reset  (reset),
        .enable (~stall),
        .in     ({tag_in,  valid_in,  do_add,  do_sub,  do_mul}),
        .out    ({tag_out, valid_out, add_sel, sub_sel, mul_sel})
    );

    assign ready_in = ~stall;

endmodule
|
191
hw/rtl/fp_cores/VX_fp_nmadd.v
Normal file
191
hw/rtl/fp_cores/VX_fp_nmadd.v
Normal file
|
@ -0,0 +1,191 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
// Multi-lane negated fused multiply-add / multiply-sub stage.
//
// Stage 0 produces both the multiply-add and the multiply-sub of
// (dataa, datab, datac) for each lane; do_sub, registered once, picks one of
// them (result_st0). Stage 1 passes result_st0 through a subtract step whose
// other operand is zero and drives result.
//
// Parameters:
//   TAGW  - width of the request tag carried through the pipeline
//   LANES - number of parallel 32-bit lanes
//
// Tag and valid travel through a `LATENCY_FNMADD-deep shift register.
// Handshake: the whole pipeline freezes (stall) while valid_out is asserted
// and ready_out is low; ready_in is the inverse of stall.
module VX_fp_nmadd #(
    parameter TAGW = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire  do_sub,     // 1: negate the multiply-sub, 0: negate the multiply-add

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    input wire [LANES-1:0][31:0]  datac,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);

    wire stall = ~ready_out && valid_out;

    // do_sub delayed by one enabled clock, to line up with the stage-0 results
    reg do_sub_r;

    for (genvar i = 0; i < LANES; i++) begin

        wire [31:0] result_madd;
        wire [31:0] result_msub;

        // stage-0 output handed to the negation step
        wire [31:0] result_st0 = do_sub_r ? result_msub : result_madd;

    `ifdef QUARTUS
        // All three primitives share the module's single stall signal as
        // their clock enable. The original text had an unbalanced
        // concatenation on the first instance and referenced the undeclared
        // names enable0 / enable1 on the other two.
        twentynm_fp_mac mac_fp_madd (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_madd),
            .chainout()
        );
        defparam mac_fp_madd.operation_mode = "sp_mult_add";
        defparam mac_fp_madd.use_chainin = "false";
        defparam mac_fp_madd.adder_subtract = "false";
        defparam mac_fp_madd.ax_clock = "0";
        defparam mac_fp_madd.ay_clock = "0";
        defparam mac_fp_madd.az_clock = "0";
        defparam mac_fp_madd.output_clock = "0";
        defparam mac_fp_madd.accumulate_clock = "none";
        defparam mac_fp_madd.ax_chainin_pl_clock = "0";
        defparam mac_fp_madd.accum_pipeline_clock = "none";
        defparam mac_fp_madd.mult_pipeline_clock = "0";
        defparam mac_fp_madd.adder_input_clock = "0";
        defparam mac_fp_madd.accum_adder_clock = "none";

        twentynm_fp_mac mac_fp_msub (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(datac[i]),
            .ay(datab[i]),
            .az(dataa[i]),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result_msub),
            .chainout()
        );
        defparam mac_fp_msub.operation_mode = "sp_mult_add";
        defparam mac_fp_msub.use_chainin = "false";
        defparam mac_fp_msub.adder_subtract = "true";
        defparam mac_fp_msub.ax_clock = "0";
        defparam mac_fp_msub.ay_clock = "0";
        defparam mac_fp_msub.az_clock = "0";
        defparam mac_fp_msub.output_clock = "0";
        defparam mac_fp_msub.accumulate_clock = "none";
        defparam mac_fp_msub.ax_chainin_pl_clock = "0";
        defparam mac_fp_msub.accum_pipeline_clock = "none";
        defparam mac_fp_msub.mult_pipeline_clock = "0";
        defparam mac_fp_msub.adder_input_clock = "0";
        defparam mac_fp_msub.accum_adder_clock = "none";

        // NOTE(review): the DPI branch below computes (0 - result_st0) via
        // dpi_fsub(.., 32'b0, result_st0, ..). Here ax = 0 and ay = result_st0;
        // confirm against the twentynm_fp_mac documentation that "sp_add"
        // with adder_subtract = "true" yields ax - ay, otherwise this path
        // does not negate.
        twentynm_fp_mac mac_fp_neg (
            // inputs
            .accumulate(),
            .chainin_overflow(),
            .chainin_invalid(),
            .chainin_underflow(),
            .chainin_inexact(),
            .ax(32'h0),
            .ay(result_st0),
            .az(),
            .clk({2'b00,clk}),
            .ena({2'b11,~stall}),
            .aclr(2'b00),
            .chainin(),
            // outputs
            .overflow(),
            .invalid(),
            .underflow(),
            .inexact(),
            .chainout_overflow(),
            .chainout_invalid(),
            .chainout_underflow(),
            .chainout_inexact(),
            .resulta(result[i]),
            .chainout()
        );
        defparam mac_fp_neg.operation_mode = "sp_add";
        defparam mac_fp_neg.use_chainin = "false";
        defparam mac_fp_neg.adder_subtract = "true";
        defparam mac_fp_neg.ax_clock = "0";
        defparam mac_fp_neg.ay_clock = "0";
        defparam mac_fp_neg.az_clock = "none";
        defparam mac_fp_neg.output_clock = "0";
        defparam mac_fp_neg.accumulate_clock = "none";
        defparam mac_fp_neg.ax_chainin_pl_clock = "none";
        defparam mac_fp_neg.accum_pipeline_clock = "none";
        defparam mac_fp_neg.mult_pipeline_clock = "none";
        defparam mac_fp_neg.adder_input_clock = "0";
        defparam mac_fp_neg.accum_adder_clock = "none";
    `else
        // Non-QUARTUS builds obtain the results from the dpi_* routines
        // (presumably the software models declared in float_dpi.vh).
        always @(posedge clk) begin
            dpi_fmadd(clk, ~stall, dataa[i], datab[i], datac[i], result_madd);
            dpi_fmsub(clk, ~stall, dataa[i], datab[i], datac[i], result_msub);
            dpi_fsub(clk, ~stall, 32'b0, result_st0, result[i]);
        end
    `endif
    end

    // Hold the selector while the pipeline is stalled.
    always @(posedge clk) begin
        if (~stall) begin
            do_sub_r <= do_sub;
        end
    end

    VX_shift_register #(
        .DATAW(TAGW + 1),
        .DEPTH(`LATENCY_FNMADD)
    ) shift_reg1 (
        .clk(clk),
        .reset(reset),
        .enable(~stall),
        .in({tag_in, valid_in}),
        .out({tag_out, valid_out})
    );

    assign ready_in = ~stall;

endmodule
|
|
@ -45,8 +45,8 @@ module VX_fp_noncomp #(
|
|||
reg [LANES-1:0][31:0] datab_r;
|
||||
|
||||
reg [LANES-1:0] a_sign, b_sign;
|
||||
reg [LANES-1:0][7:0] a_exponent, b_exponent;
|
||||
reg [LANES-1:0][22:0] a_mantissa, b_mantissa;
|
||||
reg [LANES-1:0][7:0] a_exponent;
|
||||
reg [LANES-1:0][22:0] a_mantissa;
|
||||
fp_type_t [LANES-1:0] a_type, b_type;
|
||||
reg [LANES-1:0] a_smaller, ab_equal;
|
||||
|
||||
|
@ -60,12 +60,12 @@ module VX_fp_noncomp #(
|
|||
|
||||
// Setup
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
wire tmp_a_sign = dataa[i][31];
|
||||
wire [7:0] tmp_a_exponent = dataa[i][30:23];
|
||||
wire tmp_a_sign = dataa[i][31];
|
||||
wire [7:0] tmp_a_exponent = dataa[i][30:23];
|
||||
wire [22:0] tmp_a_mantissa = dataa[i][22:0];
|
||||
|
||||
wire tmp_b_sign = datab[i][31];
|
||||
wire [7:0] tmp_b_exponent = datab[i][30:23];
|
||||
wire tmp_b_sign = datab[i][31];
|
||||
wire [7:0] tmp_b_exponent = datab[i][30:23];
|
||||
wire [22:0] tmp_b_mantissa = datab[i][22:0];
|
||||
|
||||
fp_type_t tmp_a_type, tmp_b_type;
|
||||
|
@ -86,14 +86,14 @@ module VX_fp_noncomp #(
|
|||
wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]);
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + 1 + 8 + 8 + 23 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1)
|
||||
.N(1 + 1 + 8 + 23 + $bits(fp_type_t) + $bits(fp_type_t) + 1 + 1)
|
||||
) fnc1_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.stall (stall),
|
||||
.flush (1'b0),
|
||||
.in ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_b_exponent, tmp_a_mantissa, tmp_b_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
|
||||
.out ({a_sign[i], b_sign[i], a_exponent[i], b_exponent[i], a_mantissa[i], b_mantissa[i], a_type[i], b_type[i], a_smaller[i], ab_equal[i]})
|
||||
.in ({tmp_a_sign, tmp_b_sign, tmp_a_exponent, tmp_a_mantissa, tmp_a_type, tmp_b_type, tmp_a_smaller, tmp_ab_equal}),
|
||||
.out ({a_sign[i], b_sign[i], a_exponent[i], a_mantissa[i], a_type[i], b_type[i], a_smaller[i], ab_equal[i]})
|
||||
);
|
||||
end
|
||||
|
||||
|
@ -213,8 +213,6 @@ module VX_fp_noncomp #(
|
|||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
always @(*) begin
|
||||
tmp_result[i] = 32'hdeadbeaf;
|
||||
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
|
||||
case (op_type_r)
|
||||
`FPU_CLASS: begin
|
||||
tmp_result[i] = fclass_mask[i];
|
||||
|
@ -224,7 +222,8 @@ module VX_fp_noncomp #(
|
|||
tmp_result[i] = fcmp_res[i];
|
||||
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = fcmp_excp[i];
|
||||
end
|
||||
`FPU_MISC: begin
|
||||
//`FPU_MISC:
|
||||
default: begin
|
||||
case (frm)
|
||||
0,1,2: begin
|
||||
tmp_result[i] = fsgnj_res[i];
|
||||
|
@ -234,7 +233,8 @@ module VX_fp_noncomp #(
|
|||
tmp_result[i] = fminmax_res[i];
|
||||
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = {a_type[i][0] | b_type[i][0], 4'h0};
|
||||
end
|
||||
5,6: begin
|
||||
//5,6,7:
|
||||
default: begin
|
||||
tmp_result[i] = dataa[i];
|
||||
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
|
||||
end
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
module VX_fp_sqrt #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
|
@ -20,18 +24,22 @@ module VX_fp_sqrt #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
assign ready_in = enable;
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
`ifdef QUARTUS
|
||||
acl_fp_sqrt fsqrt (
|
||||
.clk (clk),
|
||||
.areset (1'b0),
|
||||
.en (enable),
|
||||
.en (~stall),
|
||||
.a (dataa[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
always @(posedge clk) begin
|
||||
dpi_fsqrt(clk, ~stall, dataa[i], result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
|
@ -40,9 +48,11 @@ module VX_fp_sqrt #(
|
|||
) shift_reg (
|
||||
.clk(clk),
|
||||
.reset(reset),
|
||||
.enable(enable),
|
||||
.enable(~stall),
|
||||
.in ({tag_in, valid_in}),
|
||||
.out({tag_out, valid_out})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
endmodule
|
|
@ -8,12 +8,20 @@ module VX_fp_type (
|
|||
// outputs
|
||||
output fp_type_t o_type
|
||||
);
|
||||
assign o_type.is_normal = (exponent != 8'd0) && (exponent != 8'hff);
|
||||
assign o_type.is_zero = (exponent == 8'd0) && (mantissa == 23'd0);
|
||||
assign o_type.is_subnormal = (exponent == 8'd0) && !o_type.is_zero;
|
||||
assign o_type.is_inf = ((exponent == 8'hff) && (mantissa == 23'd0));
|
||||
assign o_type.is_nan = ((exponent == 8'hff) && (mantissa != 23'd0));
|
||||
assign o_type.is_signaling = o_type.is_nan && (mantissa[22] == 1'b0);
|
||||
assign o_type.is_quiet = o_type.is_nan && !o_type.is_signaling;
|
||||
wire is_normal = (exponent != 8'd0) && (exponent != 8'hff);
|
||||
wire is_zero = (exponent == 8'd0) && (mantissa == 23'd0);
|
||||
wire is_subnormal = (exponent == 8'd0) && !is_zero;
|
||||
wire is_inf = (exponent == 8'hff) && (mantissa == 23'd0);
|
||||
wire is_nan = (exponent == 8'hff) && (mantissa != 23'd0);
|
||||
wire is_signaling = is_nan && (mantissa[22] == 1'b0);
|
||||
wire is_quiet = is_nan && !is_signaling;
|
||||
|
||||
assign o_type.is_normal = is_normal;
|
||||
assign o_type.is_zero = is_zero;
|
||||
assign o_type.is_subnormal = is_subnormal;
|
||||
assign o_type.is_inf = is_inf;
|
||||
assign o_type.is_nan = is_nan;
|
||||
assign o_type.is_signaling = is_signaling;
|
||||
assign o_type.is_quiet = is_quiet;
|
||||
|
||||
endmodule
|
|
@ -53,10 +53,10 @@ module VX_fpnew #(
|
|||
};
|
||||
|
||||
localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
|
||||
PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL
|
||||
'{default: `LATENCY_FDIVSQRT}, // DIVSQRT
|
||||
'{default: `LATENCY_FNONCOMP}, // NONCOMP
|
||||
'{default: `LATENCY_FCONV}}, // CONV
|
||||
PipeRegs:'{'{`LATENCY_FMADD, 0, 0, 0, 0}, // ADDMUL
|
||||
'{default: `LATENCY_FDIV}, // DIVSQRT
|
||||
'{default: `LATENCY_FNONCOMP}, // NONCOMP
|
||||
'{default: `LATENCY_ITOF}}, // CONV
|
||||
UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL
|
||||
'{default: UNIT_FDIVSQRT}, // DIVSQRT
|
||||
'{default: UNIT_FNONCOMP}, // NONCOMP
|
||||
|
|
|
@ -1,81 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane floating-point adder built on the twentynm_fp_mac DSP primitive,
// wrapped in a ready/valid handshake. The tag and valid bit travel through a
// VX_shift_register that is enabled by the same signal as the DSP blocks.
module VX_fp_add #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        twentynm_fp_mac #(
            .operation_mode       ("sp_add"),
            .use_chainin          ("false"),
            .adder_subtract       ("false"),
            .ax_clock             ("0"),
            .ay_clock             ("0"),
            .az_clock             ("none"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("none"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("none"),
            .adder_input_clock    ("0"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys (
            // operands and result
            .ax      (dataa[lane]),
            .ay      (datab[lane]),
            .az      (),
            .resulta (result[lane]),

            // clk drives bit 0 of the clock bus; the enable for that bit
            // follows the pipeline advance signal, the other enables are high
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, advance}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
    end

    // NOTE(review): DEPTH is fixed at 1 while the primitive enables its
    // ax/ay, adder_input and output clocks - confirm the latencies match.
    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (1)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
|
@ -1,48 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane wrapper around the acl_fp_ftoi core with a ready/valid handshake.
// The tag and valid bit are delayed by `LATENCY_FTOI stages of a
// VX_shift_register that shares the cores' enable.
// NOTE(review): presumably float -> signed integer conversion; confirm
// against the acl_fp_ftoi core.
module VX_fp_ftoi #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_ftoi ftoi (
            .clk    (clk),
            .areset (1'b0),     // core reset is tied off
            .en     (advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_FTOI)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
|
@ -1,48 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane wrapper around the acl_fp_ftou core with a ready/valid handshake.
// The tag and valid bit are delayed by `LATENCY_FTOU stages of a
// VX_shift_register that shares the cores' enable.
// NOTE(review): presumably float -> unsigned integer conversion; confirm
// against the acl_fp_ftou core.
module VX_fp_ftou #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_ftou ftou (
            .clk    (clk),
            .areset (1'b0),     // core reset is tied off
            .en     (advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_FTOU)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
|
@ -1,48 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane wrapper around the acl_fp_itof core with a ready/valid handshake.
// The tag and valid bit are delayed by `LATENCY_ITOF stages of a
// VX_shift_register that shares the cores' enable.
// NOTE(review): presumably signed integer -> float conversion; confirm
// against the acl_fp_itof core.
module VX_fp_itof #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_itof itof (
            .clk    (clk),
            .areset (1'b0),     // core reset is tied off
            .en     (advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_ITOF)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
|
@ -1,146 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Two-stage per-lane multiply-add built from twentynm_fp_mac primitives.
//
// Stage 0 (mac_fp_wys0) runs in "sp_mult_add" mode on datac/datab/dataa.
// Stage 1 (mac_fp_wys1) runs in "sp_add" mode with adder_subtract "true"
// on a zero operand and the stage-0 result; requests with `negate` set take
// their result from this stage, all others leave after stage 0.
// NOTE(review): the exact operand order of the primitive's add/subtract
// (e.g. 0 - x versus x - 0) is not visible here - confirm against the vendor
// documentation.
module VX_fp_madd #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    input wire [LANES-1:0][31:0]  datac,
    output wire [LANES-1:0][31:0] result,

    input wire negate,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    wire [LANES-1:0][31:0] result_st0, result_st1;
    wire [TAGW-1:0]        out_tag_st0, out_tag_st1;
    wire                   in_valid_st0, out_valid_st0, out_valid_st1;
    wire                   enable0, enable1;

    // ---- handshake / stage enables ----------------------------------------

    // A valid output is waiting on a consumer that is not ready.
    wire out_stall = ~ready_out && valid_out;

    assign enable0 = ~out_stall;

    // stall the negate stage if dual outputs
    assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1);

    assign ready_in = enable0 && enable1;

    // ---- datapath ----------------------------------------------------------

    for (genvar lane = 0; lane < LANES; lane++) begin
        twentynm_fp_mac #(
            .operation_mode       ("sp_mult_add"),
            .use_chainin          ("false"),
            .adder_subtract       ("false"),
            .ax_clock             ("0"),
            .ay_clock             ("0"),
            .az_clock             ("0"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("0"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("0"),
            .adder_input_clock    ("0"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys0 (
            // operands and result
            .ax      (datac[lane]),
            .ay      (datab[lane]),
            .az      (dataa[lane]),
            .resulta (result_st0[lane]),

            // clk drives bit 0 of the clock bus, gated by enable0
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, enable0}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );

        twentynm_fp_mac #(
            .operation_mode       ("sp_add"),
            .use_chainin          ("false"),
            .adder_subtract       ("true"),
            .ax_clock             ("0"),
            .ay_clock             ("0"),
            .az_clock             ("none"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("none"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("none"),
            .adder_input_clock    ("0"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys1 (
            // operands and result
            .ax      (32'h0),
            .ay      (result_st0[lane]),
            .az      (),
            .resulta (result_st1[lane]),

            // clk drives bit 0 of the clock bus, gated by enable1
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, enable1}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
    end

    // ---- tag / valid tracking ---------------------------------------------

    // Stage 0 splits the incoming valid by `negate`:
    //   out_valid_st0 - request finishes after stage 0
    //   in_valid_st0  - request continues into stage 1
    VX_shift_register #(
        .DATAW (TAGW + 1 + 1),
        .DEPTH (1)
    ) shift_reg0 (
        .clk    (clk),
        .reset  (reset),
        .enable (enable0),
        .in     ({tag_in,      (valid_in && ~negate), (valid_in && negate)}),
        .out    ({out_tag_st0, out_valid_st0,         in_valid_st0})
    );

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (1)
    ) shift_reg1 (
        .clk    (clk),
        .reset  (reset),
        .enable (enable1),
        .in     ({out_tag_st0, in_valid_st0}),
        .out    ({out_tag_st1, out_valid_st1})
    );

    // ---- output select: a stage-0 result takes priority over stage 1 ------

    assign valid_out = out_valid_st0 || out_valid_st1;
    assign tag_out   = out_valid_st0 ? out_tag_st0 : out_tag_st1;
    assign result    = out_valid_st0 ? result_st0  : result_st1;

endmodule
|
|
@ -1,146 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Two-stage per-lane multiply-subtract built from twentynm_fp_mac primitives.
//
// Stage 0 (mac_fp_wys0) runs in "sp_mult_add" mode with adder_subtract
// "true" on datac/datab/dataa. Stage 1 (mac_fp_wys1) runs in "sp_add" mode
// with adder_subtract "true" on a zero operand and the stage-0 result;
// requests with `negate` set take their result from this stage, all others
// leave after stage 0.
// NOTE(review): the exact operand order of the primitive's subtract
// (e.g. 0 - x versus x - 0) is not visible here - confirm against the vendor
// documentation.
module VX_fp_msub #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    input wire [LANES-1:0][31:0]  datac,
    output wire [LANES-1:0][31:0] result,

    input wire negate,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    wire [LANES-1:0][31:0] result_st0, result_st1;
    wire [TAGW-1:0]        out_tag_st0, out_tag_st1;
    wire                   in_valid_st0, out_valid_st0, out_valid_st1;
    wire                   enable0, enable1;

    // ---- handshake / stage enables ----------------------------------------

    // A valid output is waiting on a consumer that is not ready.
    wire out_stall = ~ready_out && valid_out;

    assign enable0 = ~out_stall;

    // stall the negate stage if dual outputs
    assign enable1 = ~out_stall && ~(out_valid_st0 && out_valid_st1);

    assign ready_in = enable0 && enable1;

    // ---- datapath ----------------------------------------------------------

    for (genvar lane = 0; lane < LANES; lane++) begin
        twentynm_fp_mac #(
            .operation_mode       ("sp_mult_add"),
            .use_chainin          ("false"),
            .adder_subtract       ("true"),
            .ax_clock             ("0"),
            .ay_clock             ("0"),
            .az_clock             ("0"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("0"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("0"),
            .adder_input_clock    ("0"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys0 (
            // operands and result
            .ax      (datac[lane]),
            .ay      (datab[lane]),
            .az      (dataa[lane]),
            .resulta (result_st0[lane]),

            // clk drives bit 0 of the clock bus, gated by enable0
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, enable0}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );

        twentynm_fp_mac #(
            .operation_mode       ("sp_add"),
            .use_chainin          ("false"),
            .adder_subtract       ("true"),
            .ax_clock             ("0"),
            .ay_clock             ("0"),
            .az_clock             ("none"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("none"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("none"),
            .adder_input_clock    ("0"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys1 (
            // operands and result
            .ax      (32'h0),
            .ay      (result_st0[lane]),
            .az      (),
            .resulta (result_st1[lane]),

            // clk drives bit 0 of the clock bus, gated by enable1
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, enable1}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
    end

    // ---- tag / valid tracking ---------------------------------------------

    // Stage 0 splits the incoming valid by `negate`:
    //   out_valid_st0 - request finishes after stage 0
    //   in_valid_st0  - request continues into stage 1
    VX_shift_register #(
        .DATAW (TAGW + 1 + 1),
        .DEPTH (1)
    ) shift_reg0 (
        .clk    (clk),
        .reset  (reset),
        .enable (enable0),
        .in     ({tag_in,      (valid_in && ~negate), (valid_in && negate)}),
        .out    ({out_tag_st0, out_valid_st0,         in_valid_st0})
    );

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (1)
    ) shift_reg1 (
        .clk    (clk),
        .reset  (reset),
        .enable (enable1),
        .in     ({out_tag_st0, in_valid_st0}),
        .out    ({out_tag_st1, out_valid_st1})
    );

    // ---- output select: a stage-0 result takes priority over stage 1 ------

    assign valid_out = out_valid_st0 || out_valid_st1;
    assign tag_out   = out_valid_st0 ? out_tag_st0 : out_tag_st1;
    assign result    = out_valid_st0 ? result_st0  : result_st1;

endmodule
|
|
@ -1,81 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane floating-point multiplier built on the twentynm_fp_mac DSP
// primitive ("sp_mult" mode on the ay/az operands), wrapped in a ready/valid
// handshake. The tag and valid bit travel through a VX_shift_register that
// is enabled by the same signal as the DSP blocks.
module VX_fp_mul #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        twentynm_fp_mac #(
            .operation_mode       ("sp_mult"),
            .use_chainin          ("false"),
            .adder_subtract       ("false"),
            .ax_clock             ("none"),
            .ay_clock             ("0"),
            .az_clock             ("0"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("none"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("0"),
            .adder_input_clock    ("none"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys (
            // operands and result (ax is left unconnected in this mode)
            .ax      (),
            .ay      (datab[lane]),
            .az      (dataa[lane]),
            .resulta (result[lane]),

            // clk drives bit 0 of the clock bus; the enable for that bit
            // follows the pipeline advance signal, the other enables are high
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, advance}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
    end

    // NOTE(review): DEPTH is fixed at 1 while the primitive enables its
    // ay/az, mult_pipeline and output clocks - confirm the latencies match.
    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (1)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
|
@ -1,81 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane floating-point subtractor built on the twentynm_fp_mac DSP
// primitive ("sp_add" mode with adder_subtract "true"), wrapped in a
// ready/valid handshake. The tag and valid bit travel through a
// VX_shift_register that is enabled by the same signal as the DSP blocks.
// NOTE(review): which operand is subtracted from which (ax = dataa,
// ay = datab) is defined by the primitive - confirm against the vendor
// documentation.
module VX_fp_sub #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    input wire [LANES-1:0][31:0]  datab,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        twentynm_fp_mac #(
            .operation_mode       ("sp_add"),
            .use_chainin          ("false"),
            .adder_subtract       ("true"),
            .ax_clock             ("0"),
            .ay_clock             ("0"),
            .az_clock             ("none"),
            .output_clock         ("0"),
            .accumulate_clock     ("none"),
            .ax_chainin_pl_clock  ("none"),
            .accum_pipeline_clock ("none"),
            .mult_pipeline_clock  ("none"),
            .adder_input_clock    ("0"),
            .accum_adder_clock    ("none")
        ) mac_fp_wys (
            // operands and result
            .ax      (dataa[lane]),
            .ay      (datab[lane]),
            .az      (),
            .resulta (result[lane]),

            // clk drives bit 0 of the clock bus; the enable for that bit
            // follows the pipeline advance signal, the other enables are high
            .clk  ({2'b00, clk}),
            .ena  ({2'b11, advance}),
            .aclr (2'b00),

            // unused inputs
            .accumulate        (),
            .chainin           (),
            .chainin_overflow  (),
            .chainin_invalid   (),
            .chainin_underflow (),
            .chainin_inexact   (),

            // unused outputs
            .overflow           (),
            .invalid            (),
            .underflow          (),
            .inexact            (),
            .chainout           (),
            .chainout_overflow  (),
            .chainout_invalid   (),
            .chainout_underflow (),
            .chainout_inexact   ()
        );
    end

    // NOTE(review): DEPTH is fixed at 1 while the primitive enables its
    // ax/ay, adder_input and output clocks - confirm the latencies match.
    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (1)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
|
@ -1,48 +0,0 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
// Per-lane wrapper around the acl_fp_utof core with a ready/valid handshake.
// The tag and valid bit are delayed by `LATENCY_UTOF stages of a
// VX_shift_register that shares the cores' enable.
// NOTE(review): presumably unsigned integer -> float conversion; confirm
// against the acl_fp_utof core.
module VX_fp_utof #(
    parameter TAGW  = 1,
    parameter LANES = 1
) (
    input wire clk,
    input wire reset,

    output wire ready_in,
    input wire  valid_in,

    input wire [TAGW-1:0] tag_in,

    input wire [LANES-1:0][31:0]  dataa,
    output wire [LANES-1:0][31:0] result,

    output wire [TAGW-1:0] tag_out,

    input wire  ready_out,
    output wire valid_out
);
    // The pipeline moves unless a valid output is waiting on a consumer
    // that is not ready.
    wire advance = ready_out || ~valid_out;

    assign ready_in = advance;

    for (genvar lane = 0; lane < LANES; lane++) begin
        acl_fp_utof utof (
            .clk    (clk),
            .areset (1'b0),     // core reset is tied off
            .en     (advance),
            .a      (dataa[lane]),
            .q      (result[lane])
        );
    end

    VX_shift_register #(
        .DATAW (TAGW + 1),
        .DEPTH (`LATENCY_UTOF)
    ) shift_reg (
        .clk    (clk),
        .reset  (reset),
        .enable (advance),
        .in     ({tag_in,  valid_in}),
        .out    ({tag_out, valid_out})
    );

endmodule
|
95
hw/rtl/fp_cores/altera/dspba_delay_ver.sv
Normal file
95
hw/rtl/fp_cores/altera/dspba_delay_ver.sv
Normal file
|
@ -0,0 +1,95 @@
|
|||
// Legal Notice: Copyright 2017 Intel Corporation. All rights reserved.
|
||||
// Your use of Intel Corporation's design tools, logic functions and other
|
||||
// software and tools, and its AMPP partner logic functions, and any output
|
||||
// files any of the foregoing device programming or simulation files), and
|
||||
// any associated documentation or information are expressly subject to the
|
||||
// terms and conditions of the Intel FPGA Software License Agreement,
|
||||
// Intel MegaCore Function License Agreement, or other applicable license
|
||||
// agreement, including, without limitation, that your use is for the sole
|
||||
// purpose of programming logic devices manufactured by Intel and sold by
|
||||
// Intel or its authorized distributors. Please refer to the applicable
|
||||
// agreement for further details.
|
||||
|
||||
// Parameterised delay line: xout is xin delayed by `depth` enabled clock
// cycles (a plain wire when depth is 0).
//
//   width      - data width in bits
//   depth      - number of register stages
//   reset_high - the stages clear while aclr equals this value
//   reset_kind - "ASYNC": clear is in the sensitivity list,
//                "SYNC" : clear is sampled on posedge clk,
//                "NONE" : stages are never cleared
//
// NOTE: the generate labels are historical and swapped relative to what they
// contain ("ASYNC" lives in sync_reset, "SYNC" in async_reset). They are kept
// unchanged so hierarchical names stay stable.
module dspba_delay_ver
#(
    parameter width = 8,
    parameter depth = 1,
    parameter reset_high = 1'b1,
    parameter reset_kind = "ASYNC"
) (
    input clk,
    input aclr,
    input ena,
    input [width-1:0] xin,
    output [width-1:0] xout
);

    reg [width-1:0] delays [depth-1:0];

    // Internal clear is active low: it is 0 exactly when aclr == reset_high.
    wire reset = aclr ^ reset_high;

    generate
        if (depth > 0)
        begin
            genvar i;
            for (i = 0; i < depth; i = i + 1)
            begin : delay_block
                if (reset_kind == "ASYNC")
                begin : sync_reset
                    always @ (posedge clk or negedge reset)
                    begin: a
                        if (!reset)
                            delays[i] <= 0;
                        else if (ena) begin
                            // stage 0 samples the input, later stages shift
                            if (i == 0)
                                delays[i] <= xin;
                            else
                                delays[i] <= delays[i - 1];
                        end
                    end
                end
                else if (reset_kind == "SYNC")
                begin : async_reset
                    always @ (posedge clk)
                    begin: a
                        if (!reset)
                            delays[i] <= 0;
                        else if (ena) begin
                            // stage 0 samples the input, later stages shift
                            if (i == 0)
                                delays[i] <= xin;
                            else
                                delays[i] <= delays[i - 1];
                        end
                    end
                end
                else if (reset_kind == "NONE")
                begin : no_reset
                    always @ (posedge clk)
                    begin: a
                        if (ena) begin
                            // stage 0 samples the input, later stages shift
                            if (i == 0)
                                delays[i] <= xin;
                            else
                                delays[i] <= delays[i - 1];
                        end
                    end
                end
            end

            assign xout = delays[depth - 1];
        end else begin
            assign xout = xin;
        end
    endgenerate

endmodule
|
|
@ -1,392 +0,0 @@
|
|||
// Legal Notice: Copyright 2017 Intel Corporation. All rights reserved.
|
||||
// Your use of Intel Corporation's design tools, logic functions and other
|
||||
// software and tools, and its AMPP partner logic functions, and any output
|
||||
// files any of the foregoing device programming or simulation files), and
|
||||
// any associated documentation or information are expressly subject to the
|
||||
// terms and conditions of the Intel FPGA Software License Agreement,
|
||||
// Intel MegaCore Function License Agreement, or other applicable license
|
||||
// agreement, including, without limitation, that your use is for the sole
|
||||
// purpose of programming logic devices manufactured by Intel and sold by
|
||||
// Intel or its authorized distributors. Please refer to the applicable
|
||||
// agreement for further details.
|
||||
|
||||
|
||||
// dspba_delay_ver: a `depth`-stage delay line for a `width`-bit bus.
//
// Parameters
//   width      - bus width in bits.
//   depth      - number of register stages; 0 makes xout a wire from xin.
//   reset_high - value of aclr that asserts the reset.
//   reset_kind - "ASYNC", "SYNC" or "NONE"; any other string leaves the
//                stage registers without a driver.
//
// Ports
//   clk  - stage clock (rising edge).
//   aclr - reset input, polarity chosen by reset_high.
//   ena  - stages only advance while ena is high.
//   xin  - data entering stage 0.
//   xout - contents of the last stage.
//
// NOTE: the generate labels `sync_reset` / `async_reset` are attached to the
// ASYNC / SYNC branches respectively (i.e. they look swapped). They are kept
// as-is because they are part of the hierarchical instance names.
module dspba_delay_ver
#(
    parameter width      = 8,
    parameter depth      = 1,
    parameter reset_high = 1'b1,
    parameter reset_kind = "ASYNC"
) (
    input              clk,
    input              aclr,
    input              ena,
    input  [width-1:0] xin,
    output [width-1:0] xout
);

    // Internal reset is active-low: it reads 0 exactly when aclr == reset_high.
    wire reset;
    assign reset = aclr ^ reset_high;

    // One register per pipeline stage.
    reg [width-1:0] delays [depth-1:0];

    generate
        if (depth > 0)
        begin
            genvar i;
            for (i = 0; i < depth; i = i + 1)
            begin : delay_block
                if (reset_kind == "ASYNC")
                begin : sync_reset
                    // Asynchronously cleared stage.
                    always @ (posedge clk or negedge reset)
                    begin: a
                        if (!reset)
                            delays[i] <= 0;
                        else if (ena) begin
                            if (i > 0)
                                delays[i] <= delays[i - 1];
                            else
                                delays[i] <= xin;
                        end
                    end
                end
                else if (reset_kind == "SYNC")
                begin : async_reset
                    // Synchronously cleared stage.
                    always @ (posedge clk)
                    begin: a
                        if (!reset)
                            delays[i] <= 0;
                        else if (ena) begin
                            if (i > 0)
                                delays[i] <= delays[i - 1];
                            else
                                delays[i] <= xin;
                        end
                    end
                end
                else if (reset_kind == "NONE")
                begin : no_reset
                    // Stage without any reset.
                    always @ (posedge clk)
                    begin: a
                        if (ena) begin
                            if (i > 0)
                                delays[i] <= delays[i - 1];
                            else
                                delays[i] <= xin;
                        end
                    end
                end
            end

            assign xout = delays[depth - 1];
        end else begin
            // Zero stages: pure pass-through.
            assign xout = xin;
        end
    endgenerate

endmodule
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// dspba_sync_reg_ver: moves a data word from the clk1 domain to the clk2
// domain using a registered enable that is passed through a `depth`-bit
// synchronizer chain.
//
// clk1 side : xin is captured into iclk_data whenever ena[0] is high, and
//             iclk_enable registers ena_internal.
// crossing  : iclk_enable is shifted through sync_regs on clk2.
// clk2 side : when the last synchronizer bit (oclk_enable) is high, the low
//             width2 bits of iclk_data are copied into oclk_data.
//
// Parameters
//   width1 / width2   - data widths on the clk1 / clk2 side.
//   depth             - number of synchronizer flip-flops.
//   pulse_multiplier  - 1: ena_internal follows ena[0]; >1: a single-cycle
//                       ena pulse keeps ena_internal high for
//                       pulse_multiplier clk1 cycles (counter based).
//   counter_width     - width of the pulse-stretching counter.
//   init_value        - reset value for iclk_data / oclk_data.
//   reset1_high / reset2_high - value of aclr1 / aclr2 that asserts reset.
//   reset_kind        - "ASYNC", "SYNC" or "NONE".
//
// Outputs
//   xout  - iclk_data (clk1 domain copy of xin).
//   sxout - oclk_data (clk2 domain copy).
module dspba_sync_reg_ver
#(
    parameter width1           = 8,
    parameter width2           = 8,
    parameter depth            = 2,
    parameter pulse_multiplier = 1,
    parameter counter_width    = 8,
    parameter init_value       = 0,
    parameter reset1_high      = 1'b1,
    parameter reset2_high      = 1'b1,
    parameter reset_kind       = "ASYNC"
) (
    input                   clk1,
    input                   aclr1,
    input  [0 : 0]          ena,
    input  [width1-1 : 0]   xin,
    output [width1-1 : 0]   xout,
    input                   clk2,
    input                   aclr2,
    output [width2-1 : 0]   sxout
);
    wire [width1-1 : 0] init_value_internal;

    wire reset1;
    wire reset2;

    reg                  iclk_enable;
    reg [width1-1 : 0]   iclk_data;
    reg [width2-1 : 0]   oclk_data;

    // For Synthesis this means: preserve these registers and do not merge any other flip-flops with synchronizer flip-flops
    // For TimeQuest this means: identify these flip-flops as synchronizer to enable automatic MTBF analysis
    (* altera_attribute = {"-name ADV_NETLIST_OPT_ALLOWED NEVER_ALLOW; -name SYNCHRONIZER_IDENTIFICATION FORCED; -name DONT_MERGE_REGISTER ON; -name PRESERVE_REGISTER ON"} *) reg [depth-1 : 0] sync_regs;

    wire oclk_enable;

    wire ena_internal;
    reg [counter_width-1 : 0] counter;

    assign init_value_internal = init_value;

    // Both internal resets are active-low: 0 exactly when aclrN == resetN_high.
    assign reset1 = aclr1 ^ reset1_high;
    assign reset2 = aclr2 ^ reset2_high;

    // ------------------------------------------------------------------
    // Enable generation (clk1 domain)
    // ------------------------------------------------------------------
    generate
        if (pulse_multiplier == 1)
        begin: no_multiplication
            assign ena_internal = ena[0];
        end
        else if (pulse_multiplier > 1)
        begin: multiplu_ena_pulse
            // counter is 0 when idle; an ena pulse starts it at 1 and it
            // runs up to pulse_multiplier - 1 before returning to 0.
            if (reset_kind == "ASYNC")
            begin: async_reset
                always @ (posedge clk1 or negedge reset1)
                begin
                    if (!reset1)
                        counter <= 0;
                    else if (counter > 0) begin
                        if (counter == pulse_multiplier - 1)
                            counter <= 0;
                        else
                            counter <= counter + 2'd1;
                    end
                    else if (ena[0])
                        counter <= 1;
                end
            end
            else if (reset_kind == "SYNC")
            begin: sync_reset
                always @ (posedge clk1)
                begin
                    if (!reset1)
                        counter <= 0;
                    else if (counter > 0) begin
                        if (counter == pulse_multiplier - 1)
                            counter <= 0;
                        else
                            counter <= counter + 2'd1;
                    end
                    else if (ena[0])
                        counter <= 1;
                end
            end
            else if (reset_kind == "NONE")
            begin: no_reset
                always @ (posedge clk1)
                begin
                    if (counter > 0) begin
                        if (counter == pulse_multiplier - 1)
                            counter <= 0;
                        else
                            counter <= counter + 2'd1;
                    end
                    else if (ena[0])
                        counter <= 1;
                end
            end

            assign ena_internal = counter > 0 ? 1'b1 : ena[0];
        end
    endgenerate

    // The last synchronizer stage gates the clk2-side capture.
    assign oclk_enable = sync_regs[depth - 1];

    // ------------------------------------------------------------------
    // clk1-side capture registers
    // ------------------------------------------------------------------
    generate
        if (reset_kind == "ASYNC")
        begin: iclk_async_reset
            always @ (posedge clk1 or negedge reset1)
            begin
                if (!reset1) begin
                    iclk_enable <= 1'b0;
                    iclk_data   <= init_value_internal;
                end else begin
                    if (ena[0])
                        iclk_data <= xin;
                    iclk_enable <= ena_internal;
                end
            end
        end
        else if (reset_kind == "SYNC")
        begin: iclk_sync_reset
            always @ (posedge clk1)
            begin
                if (!reset1) begin
                    iclk_enable <= 1'b0;
                    iclk_data   <= init_value_internal;
                end else begin
                    if (ena[0])
                        iclk_data <= xin;
                    iclk_enable <= ena_internal;
                end
            end
        end
        else if (reset_kind == "NONE")
        begin: iclk_no_reset
            always @ (posedge clk1)
            begin
                if (ena[0])
                    iclk_data <= xin;
                iclk_enable <= ena_internal;
            end
        end
    endgenerate

    // ------------------------------------------------------------------
    // Synchronizer chain (clk2 domain): bit 0 samples iclk_enable, every
    // other bit samples its predecessor.
    // ------------------------------------------------------------------
    generate
        genvar i;
        for (i = 0; i < depth; i = i + 1)
        begin: sync_regs_block
            if (reset_kind == "ASYNC")
            begin: sync_reg_async_reset
                always @ (posedge clk2 or negedge reset2) begin
                    if (!reset2)
                        sync_regs[i] <= 1'b0;
                    else if (i > 0)
                        sync_regs[i] <= sync_regs[i - 1];
                    else
                        sync_regs[i] <= iclk_enable;
                end
            end
            else if (reset_kind == "SYNC")
            begin: sync_reg_sync_reset
                always @ (posedge clk2) begin
                    if (!reset2)
                        sync_regs[i] <= 1'b0;
                    else if (i > 0)
                        sync_regs[i] <= sync_regs[i - 1];
                    else
                        sync_regs[i] <= iclk_enable;
                end
            end
            else if (reset_kind == "NONE")
            begin: sync_reg_no_reset
                always @ (posedge clk2) begin
                    if (i > 0)
                        sync_regs[i] <= sync_regs[i - 1];
                    else
                        sync_regs[i] <= iclk_enable;
                end
            end
        end
    endgenerate

    // ------------------------------------------------------------------
    // clk2-side output register
    // ------------------------------------------------------------------
    generate
        if (reset_kind == "ASYNC")
        begin: oclk_async_reset
            always @ (posedge clk2 or negedge reset2)
            begin
                if (!reset2)
                    oclk_data <= init_value_internal[width2-1 : 0];
                else if (oclk_enable)
                    oclk_data <= iclk_data[width2-1 : 0];
            end
        end
        else if (reset_kind == "SYNC")
        begin: oclk_sync_reset
            always @ (posedge clk2)
            begin
                if (!reset2)
                    oclk_data <= init_value_internal[width2-1 : 0];
                else if (oclk_enable)
                    oclk_data <= iclk_data[width2-1 : 0];
            end
        end
        else if (reset_kind == "NONE")
        begin: oclk_no_reset
            always @ (posedge clk2)
            begin
                if (oclk_enable)
                    oclk_data <= iclk_data[width2-1 : 0];
            end
        end
    endgenerate

    assign xout  = iclk_data;
    assign sxout = oclk_data;

endmodule
|
||||
|
||||
//------------------------------------------------------------------------------
|
||||
|
||||
// dspba_pipe: reset-less pipeline of `num_stages` registers, each `num_bits`
// wide, clocked on the rising edge of clk.
//
//   num_stages == 0 : q is wired straight to d.
//   num_stages  > 0 : d enters stage 0, q reads the last stage; every stage
//                     starts out holding init_value replicated over num_bits
//                     (set from an `initial` block).
module dspba_pipe
#(
    parameter num_bits   = 8,
    parameter num_stages = 0,
    parameter init_value = 1'bx
) (
    input                 clk,
    input  [num_bits-1:0] d,
    output [num_bits-1:0] q
);
    // Power-up pattern for a single stage.
    logic [num_bits-1:0] init_stage = { num_bits { init_value } };

    generate
        if (num_stages > 0)
        begin
            reg [num_bits-1:0] stage_array[num_stages-1:0];

            genvar i;
            for (i = 0; i < num_stages; i = i + 1)
            begin : g_pipe
                always @ (posedge clk) begin
                    if (i>0)
                        stage_array[i] <= stage_array[i-1];
                    else
                        stage_array[i] <= d;
                end
            end

            // Give every stage its initial contents.
            initial begin
                stage_array = '{ num_stages { init_stage } };
            end

            assign q = stage_array[num_stages-1];
        end else begin
            assign q = d;
        end
    endgenerate

endmodule
|
210
hw/rtl/fp_cores/svdpi/float_dpi.cpp
Normal file
210
hw/rtl/fp_cores/svdpi/float_dpi.cpp
Normal file
|
@ -0,0 +1,210 @@
|
|||
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <unordered_map>
#include <vector>
#include <mutex>
#include "svdpi.h"
#include "VX_config.h"
|
||||
|
||||
// C-linkage prototypes for the floating-point DPI fallback functions defined
// later in this file. Every function takes the clock level, an enable flag
// and its operands as raw 32-bit words, and writes a 32-bit word to *result.
extern "C" {
|
||||
void dpi_fadd(bool clk, bool enable, int a, int b, int* result);
|
||||
void dpi_fsub(bool clk, bool enable, int a, int b, int* result);
|
||||
void dpi_fmul(bool clk, bool enable, int a, int b, int* result);
|
||||
void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result);
|
||||
void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result);
|
||||
void dpi_fdiv(bool clk, bool enable, int a, int b, int* result);
|
||||
void dpi_fsqrt(bool clk, bool enable, int a, int* result);
|
||||
void dpi_ftoi(bool clk, bool enable, int a, int* result);
|
||||
void dpi_ftou(bool clk, bool enable, int a, int* result);
|
||||
void dpi_itof(bool clk, bool enable, int a, int* result);
|
||||
void dpi_utof(bool clk, bool enable, int a, int* result);
|
||||
}
|
||||
|
||||
// Fixed-depth FIFO of 32-bit words used to model the latency of a pipelined
// unit: values enter at the back via push() and the oldest one is read with
// top().
//
// The depth is fixed by the first ensure_init() call. A requested depth below
// one is clamped to one so that top() and push() never index an empty buffer
// (the previous code wrapped `depth_ - 1` around for depth 0).
class ShiftRegister {
public:
  // Members are initialised in declaration order (buffer_, depth_, init_).
  ShiftRegister() : depth_(0), init_(false) {}

  // Sizes the register on first use; later calls are ignored.
  //   depth - number of slots; values < 1 are treated as 1.
  void ensure_init(int depth) {
    if (init_)
      return;
    depth_ = (depth > 0) ? unsigned(depth) : 1u;
    buffer_.assign(depth_, 0);  // all slots start at zero
    init_ = true;
  }

  // Shifts every slot one position towards the front and stores `value` in
  // the last slot. Does nothing when clk is high, when enable is low, or when
  // ensure_init() has not been called yet.
  void push(int value, bool clk, bool enable) {
    if (clk || !enable || buffer_.empty())
      return;
    // `i + 1 < depth_` cannot wrap, unlike `i < depth_ - 1` with unsigned.
    for (unsigned i = 0; i + 1 < depth_; ++i) {
      buffer_[i] = buffer_[i + 1];
    }
    buffer_[depth_ - 1] = value;
  }

  // Returns the oldest stored word, or 0 if the register is uninitialised.
  int top() const {
    return buffer_.empty() ? 0 : buffer_[0];
  }

private:

  std::vector<int> buffer_;
  unsigned depth_;
  bool init_;
};
|
||||
|
||||
class Instances {
|
||||
public:
|
||||
ShiftRegister& get(svScope scope) {
|
||||
mutex_.lock();
|
||||
ShiftRegister& reg = instances_[scope];
|
||||
mutex_.unlock();
|
||||
return reg;
|
||||
}
|
||||
|
||||
private:
|
||||
std::unordered_map<svScope, ShiftRegister> instances_;
|
||||
std::mutex mutex_;
|
||||
};
|
||||
|
||||
Instances instances;
|
||||
|
||||
void dpi_fadd(bool clk, bool enable, int a, int b, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa + fb;
|
||||
|
||||
inst.ensure_init(LATENCY_FMADD);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_fsub(bool clk, bool enable, int a, int b, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa - fb;
|
||||
|
||||
inst.ensure_init(LATENCY_FMADD);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_fmul(bool clk, bool enable, int a, int b, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa * fb;
|
||||
|
||||
inst.ensure_init(LATENCY_FMADD);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_fmadd(bool clk, bool enable, int a, int b, int c, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fc = *(float*)&c;
|
||||
float fr = fa * fb + fc;
|
||||
|
||||
inst.ensure_init(LATENCY_FMADD);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_fmsub(bool clk, bool enable, int a, int b, int c, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fc = *(float*)&c;
|
||||
float fr = fa * fb - fc;
|
||||
|
||||
inst.ensure_init(LATENCY_FMADD);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_fdiv(bool clk, bool enable, int a, int b, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fb = *(float*)&b;
|
||||
float fr = fa / fb;
|
||||
|
||||
inst.ensure_init(LATENCY_FDIV);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.
|
||||
|
||||
top();
|
||||
}
|
||||
|
||||
void dpi_fsqrt(bool clk, bool enable, int a, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
float fr = sqrt(fa);
|
||||
|
||||
inst.ensure_init(LATENCY_FSQRT);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_ftoi(bool clk, bool enable, int a, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
int ir = int(fa);
|
||||
|
||||
inst.ensure_init(LATENCY_FTOI);
|
||||
inst.push(ir, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_ftou(bool clk, bool enable, int a, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fa = *(float*)&a;
|
||||
unsigned ir = unsigned(fa);
|
||||
|
||||
inst.ensure_init(LATENCY_FTOI);
|
||||
inst.push(ir, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_itof(bool clk, bool enable, int a, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
float fr = float(a);
|
||||
|
||||
inst.ensure_init(LATENCY_ITOF);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
||||
|
||||
void dpi_utof(bool clk, bool enable, int a, int* result) {
|
||||
auto scope = svGetScope();
|
||||
ShiftRegister& inst = instances.get(scope);
|
||||
|
||||
unsigned ua = *(unsigned*)&a;
|
||||
float fr = float(ua);
|
||||
|
||||
inst.ensure_init(LATENCY_ITOF);
|
||||
inst.push(*(int*)&fr, clk, enable);
|
||||
*result = inst.top();
|
||||
}
|
16
hw/rtl/fp_cores/svdpi/float_dpi.vh
Normal file
16
hw/rtl/fp_cores/svdpi/float_dpi.vh
Normal file
|
@ -0,0 +1,16 @@
|
|||
// DPI-C import declarations for the floating-point fallback functions
// (dpi_fadd ... dpi_utof). Each import passes the clock level, an enable
// flag and the 32-bit operands, and returns a 32-bit result through the
// `result` output argument. The imports are declared `context`.
// The FLOAT_DPI guard keeps the imports from being declared twice.
`ifndef FLOAT_DPI
|
||||
`define FLOAT_DPI
|
||||
|
||||
import "DPI-C" context function void dpi_fadd(input logic clk, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fsub(input logic clk, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fmul(input logic clk, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fmadd(input logic clk, input logic enable, input int a, input int b, input int c, output int result);
|
||||
import "DPI-C" context function void dpi_fmsub(input logic clk, input logic enable, input int a, input int b, input int c, output int result);
|
||||
import "DPI-C" context function void dpi_fdiv(input logic clk, input logic enable, input int a, input int b, output int result);
|
||||
import "DPI-C" context function void dpi_fsqrt(input logic clk, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_ftoi(input logic clk, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_ftou(input logic clk, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_itof(input logic clk, input logic enable, input int a, output int result);
|
||||
import "DPI-C" context function void dpi_utof(input logic clk, input logic enable, input int a, output int result);
|
||||
|
||||
`endif
|
|
@ -94,6 +94,7 @@ if args.outc != 'none':
|
|||
// Translated from VX_config.vh:
|
||||
'''[1:].format(date=datetime.now()), file=f)
|
||||
with open(path.join(script_dir, '../rtl/VX_config.vh'), 'r') as r:
|
||||
lineno = 0
|
||||
for line in r:
|
||||
if in_expansion:
|
||||
f.write(post_process_line(line))
|
||||
|
@ -107,7 +108,8 @@ if args.outc != 'none':
|
|||
f.write(post_process_line(pat.sub(repl, line)))
|
||||
break
|
||||
else:
|
||||
raise ValueError('failed to find rule for: ' + line)
|
||||
raise ValueError('failed to find rule for: "' + line + '" (' + str(lineno) + ')')
|
||||
lineno = lineno + 1
|
||||
|
||||
print('''
|
||||
// Misc
|
||||
|
|
|
@ -17,10 +17,11 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
|||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CORE_REQ_INFO
|
||||
|
||||
FPU_INCLUDE = -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src
|
||||
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/fp_cores -I../rtl/simulate $(FPU_INCLUDE)
|
||||
FPU_INCLUDE = -I../rtl/fp_cores -I../rtl/fp_cores/svdpi -I../rtl/fp_cores/fpnew/src/common_cells/include -I../rtl/fp_cores/fpnew/src/common_cells/src -I../rtl/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I../rtl/fp_cores/fpnew/src
|
||||
INCLUDE = -I../rtl/ -I../rtl/libs -I../rtl/interfaces -I../rtl/cache -I../rtl/simulate $(FPU_INCLUDE)
|
||||
|
||||
SRCS = simulator.cpp testbench.cpp
|
||||
SRCS += ../rtl/fp_cores/svdpi/float_dpi.cpp
|
||||
|
||||
all: build-s
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue