mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
RV64F DSP FPU hardware fixes
This commit is contained in:
parent
bd5a52ff9c
commit
34290f7e95
12 changed files with 258 additions and 201 deletions
|
@ -41,22 +41,21 @@ extern "C" {
|
|||
}
|
||||
|
||||
inline uint64_t nan_box(uint32_t value) {
|
||||
uint64_t mask = 0xffffffff00000000;
|
||||
return value | mask;
|
||||
return value | 0xffffffff00000000;
|
||||
}
|
||||
|
||||
inline bool is_nan_boxed(uint64_t value) {
|
||||
#if (XLEN == 64)
|
||||
inline bool is_nan_boxed(uint64_t value) {
|
||||
return (uint32_t(value >> 32) == 0xffffffff);
|
||||
#else
|
||||
return true;
|
||||
#endif
|
||||
}
|
||||
|
||||
inline int64_t check_boxing(int64_t a) {
|
||||
if (is_nan_boxed(a))
|
||||
return a;
|
||||
return nan_box(0x7fc00000); // NaN
|
||||
#if (FLEN == 64)
|
||||
// this check is only needed when both single and double precisions are enabled
|
||||
if (!is_nan_boxed(a)) {
|
||||
return nan_box(0x7fc00000); // NaN
|
||||
}
|
||||
#endif
|
||||
return a;
|
||||
}
|
||||
|
||||
void dpi_fadd(bool enable, int dst_fmt, int64_t a, int64_t b, const svBitVecVal* frm, int64_t* result, svBitVecVal* fflags) {
|
||||
|
|
|
@ -155,14 +155,8 @@
|
|||
`define FPU_DPI
|
||||
`endif
|
||||
|
||||
`ifdef SYNTHESIS
|
||||
`ifndef FPU_DSP
|
||||
`ifndef FPU_FPNEW
|
||||
`define FPU_FPNEW
|
||||
`endif
|
||||
`endif
|
||||
`else
|
||||
`ifndef FPU_DPI
|
||||
`ifndef FPU_DSP
|
||||
`ifndef FPU_FPNEW
|
||||
`define FPU_FPNEW
|
||||
`endif
|
||||
|
@ -329,6 +323,9 @@
|
|||
`ifdef VIVADO
|
||||
`define LATENCY_FMA 16
|
||||
`endif
|
||||
`ifndef LATENCY_FMA
|
||||
`define LATENCY_FMA 4
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -346,6 +343,9 @@
|
|||
`ifdef VIVADO
|
||||
`define LATENCY_FDIV 28
|
||||
`endif
|
||||
`ifndef LATENCY_FDIV
|
||||
`define LATENCY_FDIV 16
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -363,6 +363,9 @@
|
|||
`ifdef VIVADO
|
||||
`define LATENCY_FSQRT 28
|
||||
`endif
|
||||
`ifndef LATENCY_FSQRT
|
||||
`define LATENCY_FSQRT 16
|
||||
`endif
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
|
|
@ -196,9 +196,9 @@
|
|||
`define INST_FPU_MUL 4'b0010
|
||||
`define INST_FPU_DIV 4'b0011
|
||||
`define INST_FPU_SQRT 4'b0100
|
||||
`define INST_FPU_CMP 4'b0101
|
||||
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b0110
|
||||
`define INST_FPU_MISC 4'b0111 // SGNJ, SGNJN, SGNJX, CLASS, MVXW, MVWX, FMIN, FMAX
|
||||
`define INST_FPU_MISC 4'b0111 // mod: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_F2I 4'b1000
|
||||
`define INST_FPU_F2U 4'b1001
|
||||
`define INST_FPU_I2F 4'b1010
|
||||
|
@ -208,6 +208,8 @@
|
|||
`define INST_FPU_NMSUB 4'b1110
|
||||
`define INST_FPU_NMADD 4'b1111
|
||||
`define INST_FPU_IS_W(mod) (mod[4])
|
||||
`define INST_FPU_IS_CLASS(op, mod) (op == `INST_FPU_MISC && mod == 3)
|
||||
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
|
||||
|
||||
`define INST_FPU_BITS 4
|
||||
|
||||
|
|
|
@ -35,7 +35,10 @@ module VX_fpu_cvt #(
|
|||
|
||||
localparam MAN_BITS = 23;
|
||||
localparam EXP_BITS = 8;
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
localparam EXP_BIAS = 2**(EXP_BITS-1)-1;
|
||||
|
||||
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
|
||||
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
|
||||
|
||||
// Use 32-bit integer
|
||||
localparam MAX_INT_WIDTH = 32;
|
||||
|
@ -122,15 +125,15 @@ module VX_fpu_cvt #(
|
|||
wire [NUM_LANES-1:0] mant_is_zero_s0; // for integer zeroes
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire mant_is_nonzero;
|
||||
wire mant_is_nonzero_s0;
|
||||
VX_lzc #(
|
||||
.N (INT_MAN_WIDTH)
|
||||
) lzc (
|
||||
.data_in (encoded_mant_s0[i]),
|
||||
.data_out (renorm_shamt_s0[i]),
|
||||
.valid_out (mant_is_nonzero)
|
||||
.valid_out (mant_is_nonzero_s0)
|
||||
);
|
||||
assign mant_is_zero_s0[i] = ~mant_is_nonzero;
|
||||
assign mant_is_zero_s0[i] = ~mant_is_nonzero_s0;
|
||||
end
|
||||
|
||||
wire [NUM_LANES-1:0][INT_MAN_WIDTH-1:0] input_mant_s0; // normalized input mantissa
|
||||
|
@ -142,10 +145,10 @@ module VX_fpu_cvt #(
|
|||
assign input_mant_s0[i] = encoded_mant_s0[i] << renorm_shamt_s0[i];
|
||||
|
||||
// Unbias exponent and compensate for shift
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};
|
||||
wire [INT_EXP_WIDTH-1:0] int_input_exp = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
|
||||
wire [INT_EXP_WIDTH-1:0] fp_input_exp_s0 = fmt_exponent_s0[i] + INT_EXP_WIDTH'(FMT_SHIFT_COMPENSATION - EXP_BIAS) - {1'b0, renorm_shamt_s0[i]};
|
||||
wire [INT_EXP_WIDTH-1:0] int_input_exp_s0 = (INT_MAN_WIDTH-1) - {1'b0, renorm_shamt_s0[i]};
|
||||
|
||||
assign input_exp_s0[i] = is_itof_s0 ? int_input_exp : fp_input_exp;
|
||||
assign input_exp_s0[i] = is_itof_s0 ? int_input_exp_s0 : fp_input_exp_s0;
|
||||
`IGNORE_WARNINGS_END
|
||||
end
|
||||
|
||||
|
@ -180,54 +183,54 @@ module VX_fpu_cvt #(
|
|||
wire [NUM_LANES-1:0] of_before_round_s1;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [2*INT_MAN_WIDTH:0] preshift_mant; // mantissa before final shift
|
||||
reg [SHAMT_BITS-1:0] denorm_shamt; // shift amount for denormalization
|
||||
reg [INT_EXP_WIDTH-1:0] final_exp; // after eventual adjustments
|
||||
reg of_before_round;
|
||||
reg [2*INT_MAN_WIDTH:0] preshift_mant_s1; // mantissa before final shift
|
||||
reg [SHAMT_BITS-1:0] denorm_shamt_s1; // shift amount for denormalization
|
||||
reg [INT_EXP_WIDTH-1:0] final_exp_tmp_s1; // after eventual adjustments
|
||||
reg of_before_round_tmp_s1;
|
||||
|
||||
always @(*) begin
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
// Default assignment
|
||||
final_exp = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
|
||||
preshift_mant = {input_mant_s1[i], 33'b0}; // Place mantissa to the left of the shifter
|
||||
denorm_shamt = '0; // right of mantissa
|
||||
of_before_round = 1'b0;
|
||||
final_exp_tmp_s1 = input_exp_s1[i] + INT_EXP_WIDTH'(EXP_BIAS); // take exponent as is, only look at lower bits
|
||||
preshift_mant_s1 = {input_mant_s1[i], 33'b0}; // Place mantissa to the left of the shifter
|
||||
denorm_shamt_s1 = '0; // right of mantissa
|
||||
of_before_round_tmp_s1 = 1'b0;
|
||||
|
||||
// Handle INT casts
|
||||
if (is_itof_s1) begin
|
||||
if ($signed(input_exp_s1[i]) >= $signed(2**EXP_BITS-1-EXP_BIAS)) begin
|
||||
// Overflow or infinities (for proper rounding)
|
||||
final_exp = (2**EXP_BITS-2); // largest normal value
|
||||
preshift_mant = ~0; // largest normal value and RS bits set
|
||||
of_before_round = 1'b1;
|
||||
final_exp_tmp_s1 = (2**EXP_BITS-2); // largest normal value
|
||||
preshift_mant_s1 = ~0; // largest normal value and RS bits set
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < $signed(-MAN_BITS-EXP_BIAS)) begin
|
||||
// Limit the shift to retain sticky bits
|
||||
final_exp = '0; // denormal result
|
||||
denorm_shamt = (2 + MAN_BITS); // to sticky
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = (2 + MAN_BITS); // to sticky
|
||||
end else if ($signed(input_exp_s1[i]) < $signed(1-EXP_BIAS)) begin
|
||||
// Denormalize underflowing values
|
||||
final_exp = '0; // denormal result
|
||||
denorm_shamt = SHAMT_BITS'(1-EXP_BIAS - input_exp_s1[i]); // adjust right shifting
|
||||
final_exp_tmp_s1 = '0; // denormal result
|
||||
denorm_shamt_s1 = SHAMT_BITS'(1-EXP_BIAS - input_exp_s1[i]); // adjust right shifting
|
||||
end
|
||||
end else begin
|
||||
if ($signed(input_exp_s1[i]) >= $signed((MAX_INT_WIDTH-1) + unsigned_s1)) begin
|
||||
// overflow: when converting to unsigned the range is larger by one
|
||||
denorm_shamt = SHAMT_BITS'(0); // prevent shifting
|
||||
of_before_round = 1'b1;
|
||||
denorm_shamt_s1 = SHAMT_BITS'(0); // prevent shifting
|
||||
of_before_round_tmp_s1 = 1'b1;
|
||||
end else if ($signed(input_exp_s1[i]) < $signed(-1)) begin
|
||||
// underflow
|
||||
denorm_shamt = MAX_INT_WIDTH+1; // all bits go to the sticky
|
||||
denorm_shamt_s1 = MAX_INT_WIDTH+1; // all bits go to the sticky
|
||||
end else begin
|
||||
// By default right shift mantissa to be an integer
|
||||
denorm_shamt = SHAMT_BITS'((MAX_INT_WIDTH-1) - input_exp_s1[i]);
|
||||
denorm_shamt_s1 = SHAMT_BITS'((MAX_INT_WIDTH-1) - input_exp_s1[i]);
|
||||
end
|
||||
end
|
||||
`IGNORE_WARNINGS_END
|
||||
end
|
||||
|
||||
assign destination_mant_s1[i] = preshift_mant >> denorm_shamt;
|
||||
assign final_exp_s1[i] = final_exp;
|
||||
assign of_before_round_s1[i] = of_before_round;
|
||||
assign destination_mant_s1[i] = preshift_mant_s1 >> denorm_shamt_s1;
|
||||
assign final_exp_s1[i] = final_exp_tmp_s1;
|
||||
assign of_before_round_s1[i] = of_before_round_tmp_s1;
|
||||
end
|
||||
|
||||
// Pipeline stage2
|
||||
|
@ -242,7 +245,7 @@ module VX_fpu_cvt #(
|
|||
wire [NUM_LANES-1:0] input_sign_s2;
|
||||
wire [NUM_LANES-1:0][2*INT_MAN_WIDTH:0] destination_mant_s2;
|
||||
wire [NUM_LANES-1:0][INT_EXP_WIDTH-1:0] final_exp_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
wire [NUM_LANES-1:0] of_before_round_s2;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + TAGW + 1 + 1 + `INST_FRM_BITS + NUM_LANES * ($bits(fclass_t) + 1 + 1 + (2*INT_MAN_WIDTH+1) + INT_EXP_WIDTH + 1)),
|
||||
|
@ -255,47 +258,51 @@ module VX_fpu_cvt #(
|
|||
.data_out ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, rnd_mode_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, destination_mant_s2, final_exp_s2, of_before_round_s2})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] rounded_sign;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0][1:0] fp_round_sticky_bits, int_round_sticky_bits;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s2;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s2; // absolute value of result after rounding
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s2;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s2;
|
||||
|
||||
// Rounding and classification
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire [MAN_BITS-1:0] final_mant; // mantissa after adjustments
|
||||
wire [MAX_INT_WIDTH-1:0] final_int; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits;
|
||||
wire [31:0] fmt_pre_round_abs;
|
||||
wire [31:0] pre_round_abs;
|
||||
wire [MAN_BITS-1:0] final_mant_s2; // mantissa after adjustments
|
||||
wire [MAX_INT_WIDTH-1:0] final_int_s2; // integer shifted in position
|
||||
wire [1:0] round_sticky_bits_s2;
|
||||
wire [31:0] fmt_pre_round_abs_s2;
|
||||
wire [31:0] pre_round_abs_s2;
|
||||
wire [1:0] int_round_sticky_bits_s2, fp_round_sticky_bits_s2;
|
||||
|
||||
// Extract final mantissa and round bit, discard the normal bit (for FP)
|
||||
assign {final_mant, fp_round_sticky_bits[i][1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int, int_round_sticky_bits[i][1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
|
||||
assign {final_mant_s2, fp_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH-1 : 2*INT_MAN_WIDTH-1 - (MAN_BITS+1) + 1];
|
||||
assign {final_int_s2, int_round_sticky_bits_s2[1]} = destination_mant_s2[i][2*INT_MAN_WIDTH : 2*INT_MAN_WIDTH - (MAX_INT_WIDTH+1) + 1];
|
||||
|
||||
// Collapse sticky bits
|
||||
assign fp_round_sticky_bits[i][0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign int_round_sticky_bits[i][0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign fp_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_FP_STICKY-1:0]);
|
||||
assign int_round_sticky_bits_s2[0] = (| destination_mant_s2[i][NUM_INT_STICKY-1:0]);
|
||||
assign fp_round_has_sticky_s2[i] = (| fp_round_sticky_bits_s2);
|
||||
assign int_round_has_sticky_s2[i] = (| int_round_sticky_bits_s2);
|
||||
|
||||
// select RS bits for destination operation
|
||||
assign round_sticky_bits = is_itof_s2 ? fp_round_sticky_bits[i] : int_round_sticky_bits[i];
|
||||
assign round_sticky_bits_s2 = is_itof_s2 ? fp_round_sticky_bits_s2 : int_round_sticky_bits_s2;
|
||||
|
||||
// Pack exponent and mantissa into proper rounding form
|
||||
assign fmt_pre_round_abs = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant[MAN_BITS-1:0]};
|
||||
assign fmt_pre_round_abs_s2 = {1'b0, final_exp_s2[i][EXP_BITS-1:0], final_mant_s2[MAN_BITS-1:0]};
|
||||
|
||||
// Select output with destination format and operation
|
||||
assign pre_round_abs = is_itof_s2 ? fmt_pre_round_abs : final_int;
|
||||
assign pre_round_abs_s2 = is_itof_s2 ? fmt_pre_round_abs_s2 : final_int_s2;
|
||||
|
||||
// Perform the rounding
|
||||
VX_fpu_rounding #(
|
||||
.DAT_WIDTH (32)
|
||||
) fp_rounding (
|
||||
.abs_value_i (pre_round_abs),
|
||||
.abs_value_i (pre_round_abs_s2),
|
||||
.sign_i (input_sign_s2[i]),
|
||||
.round_sticky_bits_i(round_sticky_bits),
|
||||
.round_sticky_bits_i(round_sticky_bits_s2),
|
||||
.rnd_mode_i (rnd_mode_s2),
|
||||
.effective_subtraction_i(1'b0),
|
||||
.abs_rounded_o (rounded_abs[i]),
|
||||
.sign_o (rounded_sign[i]),
|
||||
.abs_rounded_o (rounded_abs_s2[i]),
|
||||
.sign_o (rounded_sign_s2[i]),
|
||||
`UNUSED_PIN (exact_zero_o)
|
||||
);
|
||||
end
|
||||
|
@ -311,117 +318,116 @@ module VX_fpu_cvt #(
|
|||
wire [NUM_LANES-1:0] input_sign_s3;
|
||||
wire [NUM_LANES-1:0] rounded_sign_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_abs_s3;
|
||||
wire [NUM_LANES-1:0] of_before_round_s3;
|
||||
wire [NUM_LANES-1:0] of_before_round_s3;
|
||||
wire [NUM_LANES-1:0] int_round_has_sticky_s3;
|
||||
wire [NUM_LANES-1:0] fp_round_has_sticky_s3;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1)),
|
||||
.DATAW (1 + TAGW + 1 + 1 + NUM_LANES * ($bits(fclass_t) + 1 + 1 + 32 + 1 + 1 + 1 + 1)),
|
||||
.RESETW (1)
|
||||
) pipe_reg3 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall),
|
||||
.data_in ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs, rounded_sign, of_before_round_s2}),
|
||||
.data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3})
|
||||
.data_in ({valid_in_s2, tag_in_s2, is_itof_s2, unsigned_s2, fclass_s2, mant_is_zero_s2, input_sign_s2, rounded_abs_s2, rounded_sign_s2, of_before_round_s2, int_round_has_sticky_s2, fp_round_has_sticky_s2}),
|
||||
.data_out ({valid_in_s3, tag_in_s3, is_itof_s3, unsigned_s3, fclass_s3, mant_is_zero_s3, input_sign_s3, rounded_abs_s3, rounded_sign_s3, of_before_round_s3, int_round_has_sticky_s3, fp_round_has_sticky_s3})
|
||||
);
|
||||
|
||||
wire [NUM_LANES-1:0] of_after_round;
|
||||
wire [NUM_LANES-1:0] uf_after_round;
|
||||
wire [NUM_LANES-1:0][31:0] fmt_result;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_int_res; // after possible inversion
|
||||
wire [NUM_LANES-1:0] rounded_int_res_zero; // after rounding
|
||||
wire [NUM_LANES-1:0] of_after_round_s3;
|
||||
wire [NUM_LANES-1:0] uf_after_round_s3;
|
||||
wire [NUM_LANES-1:0][31:0] fmt_result_s3;
|
||||
wire [NUM_LANES-1:0][31:0] rounded_int_res_s3; // after possible inversion
|
||||
wire [NUM_LANES-1:0] rounded_int_res_zero_s3; // after rounding
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble regular result, nan box short ones. Int zeroes need to be detected
|
||||
assign fmt_result[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
assign fmt_result_s3[i] = (is_itof_s3 & mant_is_zero_s3[i]) ? 0 : {rounded_sign_s3[i], rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:0]};
|
||||
|
||||
// Classification after rounding select by destination format
|
||||
assign uf_after_round[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
|
||||
assign of_after_round[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
|
||||
assign uf_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == 0); // denormal
|
||||
assign of_after_round_s3[i] = (rounded_abs_s3[i][EXP_BITS+MAN_BITS-1:MAN_BITS] == ~0); // inf exp.
|
||||
|
||||
// Negative integer result needs to be brought into two's complement
|
||||
assign rounded_int_res[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
||||
assign rounded_int_res_zero[i] = (rounded_int_res[i] == 0);
|
||||
assign rounded_int_res_s3[i] = rounded_sign_s3[i] ? (-rounded_abs_s3[i]) : rounded_abs_s3[i];
|
||||
assign rounded_int_res_zero_s3[i] = (rounded_int_res_s3[i] == 0);
|
||||
end
|
||||
|
||||
// FP Special case handling
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] fp_special_result;
|
||||
fflags_t [NUM_LANES-1:0] fp_special_status;
|
||||
wire [NUM_LANES-1:0] fp_result_is_special;
|
||||
|
||||
localparam logic [EXP_BITS-1:0] QNAN_EXPONENT = 2**EXP_BITS-1;
|
||||
localparam logic [MAN_BITS-1:0] QNAN_MANTISSA = 2**(MAN_BITS-1);
|
||||
wire [NUM_LANES-1:0][31:0] fp_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] fp_special_status_s3;
|
||||
wire [NUM_LANES-1:0] fp_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Detect special case from source format, I2F casts don't produce a special result
|
||||
assign fp_result_is_special[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
|
||||
assign fp_result_is_special_s3[i] = ~is_itof_s3 & (fclass_s3[i].is_zero | fclass_s3[i].is_nan);
|
||||
|
||||
// Signalling input NaNs raise invalid flag, otherwise no flags set
|
||||
assign fp_special_status[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
|
||||
assign fp_special_status_s3[i] = fclass_s3[i].is_signaling ? {1'b1, 4'h0} : 5'h0; // invalid operation
|
||||
|
||||
// Assemble result according to destination format
|
||||
assign fp_special_result[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3) << 31) // signed zero
|
||||
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
|
||||
assign fp_special_result_s3[i] = fclass_s3[i].is_zero ? (32'(input_sign_s3[i]) << 31) // signed zero, built from this lane's own sign bit (not the whole lane vector)
|
||||
: {1'b0, QNAN_EXPONENT, QNAN_MANTISSA}; // qNaN
|
||||
end
|
||||
|
||||
// INT Special case handling
|
||||
|
||||
reg [NUM_LANES-1:0][31:0] int_special_result;
|
||||
fflags_t [NUM_LANES-1:0] int_special_status;
|
||||
wire [NUM_LANES-1:0] int_result_is_special;
|
||||
reg [NUM_LANES-1:0][31:0] int_special_result_s3;
|
||||
fflags_t [NUM_LANES-1:0] int_special_status_s3;
|
||||
wire [NUM_LANES-1:0] int_result_is_special_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
// Assemble result according to destination format
|
||||
always @(*) begin // value substituted for the integer result when the lane is flagged as a special case
|
||||
if (input_sign_s3[i] && !fclass_s3[i].is_nan) begin
|
||||
int_special_result[i][30:0] = '0; // negative, non-NaN input: low 31 bits cleared
|
||||
int_special_result[i][31] = ~unsigned_s3; // signed cast: only the sign bit set (-2**31); unsigned cast: result is 0
|
||||
int_special_result_s3[i][30:0] = '0; // negative, non-NaN input: low 31 bits cleared
|
||||
int_special_result_s3[i][31] = ~unsigned_s3; // signed cast: only the sign bit set (-2**31); unsigned cast: result is 0
|
||||
end else begin
|
||||
int_special_result[i][30:0] = 2**(31) - 1; // positive or NaN input: low 31 bits all ones, alone yields 2**(31)-1
|
||||
int_special_result[i][31] = unsigned_s3; // unsigned cast: top bit also set, giving 2**32-1
|
||||
int_special_result_s3[i][30:0] = 2**(31) - 1; // positive or NaN input: low 31 bits all ones, alone yields 2**(31)-1
|
||||
int_special_result_s3[i][31] = unsigned_s3; // unsigned cast: top bit also set, giving 2**32-1
|
||||
end
|
||||
end
|
||||
|
||||
// Detect special case from source format (inf, nan, overflow, nan-boxing or negative unsigned)
|
||||
assign int_result_is_special[i] = fclass_s3[i].is_nan
|
||||
| fclass_s3[i].is_inf
|
||||
| of_before_round_s3[i]
|
||||
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero[i]);
|
||||
assign int_result_is_special_s3[i] = fclass_s3[i].is_nan
|
||||
| fclass_s3[i].is_inf
|
||||
| of_before_round_s3[i]
|
||||
| (input_sign_s3[i] & unsigned_s3 & ~rounded_int_res_zero_s3[i]);
|
||||
|
||||
// All integer special cases are invalid
|
||||
assign int_special_status[i] = {1'b1, 4'h0};
|
||||
assign int_special_status_s3[i] = {1'b1, 4'h0};
|
||||
end
|
||||
|
||||
// Result selection and Output handshake
|
||||
|
||||
fflags_t [NUM_LANES-1:0] tmp_fflags;
|
||||
wire [NUM_LANES-1:0][31:0] tmp_result;
|
||||
fflags_t [NUM_LANES-1:0] tmp_fflags_s3;
|
||||
wire [NUM_LANES-1:0][31:0] tmp_result_s3;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
fflags_t fp_regular_status, int_regular_status;
|
||||
fflags_t fp_status, int_status;
|
||||
wire [31:0] fp_result, int_result;
|
||||
fflags_t fp_regular_status_s3, int_regular_status_s3;
|
||||
fflags_t fp_status_s3, int_status_s3;
|
||||
wire [31:0] fp_result_s3, int_result_s3;
|
||||
|
||||
wire inexact = is_itof_s3 ? (| fp_round_sticky_bits[i]) // overflow is invalid in i2f;
|
||||
: (| fp_round_sticky_bits[i]) | (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i]));
|
||||
wire inexact_s3 = is_itof_s3 ? fp_round_has_sticky_s3[i] // overflow is invalid in i2f;
|
||||
: (fp_round_has_sticky_s3[i] || (~fclass_s3[i].is_inf && (of_before_round_s3[i] || of_after_round_s3[i])));
|
||||
|
||||
assign fp_regular_status.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round[i]); // overflow is invalid for I2F casts
|
||||
assign fp_regular_status.DZ = 1'b0; // no divisions
|
||||
assign fp_regular_status.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round[i])); // inf casts no OF
|
||||
assign fp_regular_status.UF = uf_after_round[i] & inexact;
|
||||
assign fp_regular_status.NX = inexact;
|
||||
assign fp_regular_status_s3.NV = is_itof_s3 & (of_before_round_s3[i] | of_after_round_s3[i]); // overflow is invalid for I2F casts
|
||||
assign fp_regular_status_s3.DZ = 1'b0; // no divisions
|
||||
assign fp_regular_status_s3.OF = ~is_itof_s3 & (~fclass_s3[i].is_inf & (of_before_round_s3[i] | of_after_round_s3[i])); // inf casts no OF
|
||||
assign fp_regular_status_s3.UF = uf_after_round_s3[i] & inexact_s3;
|
||||
assign fp_regular_status_s3.NX = inexact_s3;
|
||||
|
||||
assign int_regular_status = (| int_round_sticky_bits[i]) ? {4'h0, 1'b1} : 5'h0;
|
||||
assign int_regular_status_s3 = int_round_has_sticky_s3[i] ? {4'h0, 1'b1} : 5'h0;
|
||||
|
||||
assign fp_result = fp_result_is_special[i] ? fp_special_result[i] : fmt_result[i];
|
||||
assign int_result = int_result_is_special[i] ? int_special_result[i] : rounded_int_res[i];
|
||||
assign fp_result_s3 = fp_result_is_special_s3[i] ? fp_special_result_s3[i] : fmt_result_s3[i];
|
||||
assign int_result_s3 = int_result_is_special_s3[i] ? int_special_result_s3[i] : rounded_int_res_s3[i];
|
||||
|
||||
assign fp_status = fp_result_is_special[i] ? fp_special_status[i] : fp_regular_status;
|
||||
assign int_status = int_result_is_special[i] ? int_special_status[i] : int_regular_status;
|
||||
assign fp_status_s3 = fp_result_is_special_s3[i] ? fp_special_status_s3[i] : fp_regular_status_s3;
|
||||
assign int_status_s3 = int_result_is_special_s3[i] ? int_special_status_s3[i] : int_regular_status_s3;
|
||||
|
||||
// Select output depending on special case detection
|
||||
assign tmp_result[i] = is_itof_s3 ? fp_result : int_result;
|
||||
assign tmp_fflags[i] = is_itof_s3 ? fp_status : int_status;
|
||||
assign tmp_result_s3[i] = is_itof_s3 ? fp_result_s3 : int_result_s3;
|
||||
assign tmp_fflags_s3[i] = is_itof_s3 ? fp_status_s3 : int_status_s3;
|
||||
end
|
||||
|
||||
assign stall = ~ready_out && valid_out;
|
||||
|
@ -433,8 +439,8 @@ module VX_fpu_cvt #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall),
|
||||
.data_in ({valid_in_s3, tag_in_s3, tmp_result, tmp_fflags}),
|
||||
.data_out ({valid_out, tag_out, result, fflags})
|
||||
.data_in ({valid_in_s3, tag_in_s3, tmp_result_s3, tmp_fflags_s3}),
|
||||
.data_out ({valid_out, tag_out, result, fflags})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
|
|
@ -13,4 +13,10 @@
|
|||
import VX_fpu_types::*;
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
`ifdef XLEN_64
|
||||
`ifdef FLEN_32
|
||||
`define FPU_RV64_F
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`endif // VX_FPU_DEFINE_VH
|
||||
|
|
|
@ -59,7 +59,7 @@ module VX_fpu_div #(
|
|||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign fflags = '0;
|
||||
assign fflags = 'x; // has_fflags is tied to 0 just above, so fflags is driven as don't-care
|
||||
|
||||
`elsif VIVADO
|
||||
|
||||
|
@ -86,29 +86,28 @@ module VX_fpu_div #(
|
|||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [`XLEN-1:0] r;
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
|
||||
fflags_t f;
|
||||
`UNUSED_VAR (f)
|
||||
|
||||
always @(*) begin
|
||||
dpi_fdiv (enable && valid_in, dataa[i], datab[i], frm, r, f);
|
||||
dpi_fdiv (enable && valid_in, int'(0), 64'(dataa[i]), 64'(datab[i]), frm, r, f);
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (`XLEN),
|
||||
.DATAW (32 + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FDIV)
|
||||
) shift_req_dpi (
|
||||
.clk (clk),
|
||||
`UNUSED_PIN (reset),
|
||||
.enable (enable),
|
||||
.data_in (r),
|
||||
.data_out (result[i])
|
||||
.data_in ({r[31:0], f}),
|
||||
.data_out ({result[i], fflags[i]})
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign fflags = '0;
|
||||
assign has_fflags = 1;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -2,12 +2,6 @@
|
|||
|
||||
`ifdef FPU_DPI
|
||||
|
||||
`ifdef XLEN_64
|
||||
`ifdef FLEN_32
|
||||
`define ISA_RV64F
|
||||
`endif
|
||||
`endif
|
||||
|
||||
module VX_fpu_dpi #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
|
@ -69,14 +63,6 @@ module VX_fpu_dpi #(
|
|||
operands[0][i] = 64'(dataa[i]);
|
||||
operands[1][i] = 64'(datab[i]);
|
||||
operands[2][i] = 64'(datac[i]);
|
||||
`ifdef ISA_RV64F
|
||||
// apply nan-boxing to floating-point operands
|
||||
if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
|
||||
operands[0][i] |= 64'hffffffff00000000;
|
||||
end
|
||||
operands[1][i] |= 64'hffffffff00000000;
|
||||
operands[2][i] |= 64'hffffffff00000000;
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -118,7 +104,7 @@ module VX_fpu_dpi #(
|
|||
`INST_FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
|
||||
`INST_FPU_DIV: begin core_select = FPU_DIV; end
|
||||
`INST_FPU_SQRT: begin core_select = FPU_SQRT; end
|
||||
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
|
||||
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
|
||||
`INST_FPU_F2I: begin core_select = FPU_CVT; is_ftoi = 1; end
|
||||
`INST_FPU_F2U: begin core_select = FPU_CVT; is_ftou = 1; end
|
||||
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
|
|
|
@ -39,7 +39,9 @@ module VX_fpu_dsp #(
|
|||
localparam NUM_FPC = 5;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
localparam RSP_ARB_DATAW = (NUM_LANES * `XLEN) + 1 + (NUM_LANES * $bits(fflags_t)) + TAGW;
|
||||
localparam RSP_ARB_DATAW = (NUM_LANES * 32) + 1 + (NUM_LANES * $bits(fflags_t)) + TAGW;
|
||||
|
||||
`UNUSED_VAR (fmt)
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
wire [NUM_FPC-1:0][NUM_LANES-1:0][31:0] per_core_result;
|
||||
|
@ -53,8 +55,6 @@ module VX_fpu_dsp #(
|
|||
reg [FPC_BITS-1:0] core_select;
|
||||
reg do_madd, do_sub, do_neg, is_itof, is_signed;
|
||||
|
||||
wire [`INST_FRM_BITS-1:0] frm = `INST_FRM_BITS'(frm);
|
||||
|
||||
always @(*) begin
|
||||
do_madd = 0;
|
||||
do_sub = 0;
|
||||
|
@ -85,6 +85,20 @@ module VX_fpu_dsp #(
|
|||
`RESET_RELAY (cvt_reset, reset);
|
||||
`RESET_RELAY (ncp_reset, reset);
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] dataa_s;
|
||||
wire [NUM_LANES-1:0][31:0] datab_s;
|
||||
wire [NUM_LANES-1:0][31:0] datac_s;
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign dataa_s[i] = dataa[i][31:0];
|
||||
assign datab_s[i] = datab[i][31:0];
|
||||
assign datac_s[i] = datac[i][31:0];
|
||||
end
|
||||
|
||||
`UNUSED_VAR (dataa)
|
||||
`UNUSED_VAR (datab)
|
||||
`UNUSED_VAR (datac)
|
||||
|
||||
VX_fpu_fma #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW)
|
||||
|
@ -98,9 +112,9 @@ module VX_fpu_dsp #(
|
|||
.do_madd (do_madd),
|
||||
.do_sub (do_sub),
|
||||
.do_neg (do_neg),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.datac (datac_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_FMA]),
|
||||
.fflags (per_core_fflags[FPU_FMA]),
|
||||
.result (per_core_result[FPU_FMA]),
|
||||
|
@ -119,8 +133,8 @@ module VX_fpu_dsp #(
|
|||
.ready_in (per_core_ready_in[FPU_DIV]),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_DIV]),
|
||||
.fflags (per_core_fflags[FPU_DIV]),
|
||||
.result (per_core_result[FPU_DIV]),
|
||||
|
@ -139,7 +153,7 @@ module VX_fpu_dsp #(
|
|||
.ready_in (per_core_ready_in[FPU_SQRT]),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.dataa (dataa_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_SQRT]),
|
||||
.fflags (per_core_fflags[FPU_SQRT]),
|
||||
.result (per_core_result[FPU_SQRT]),
|
||||
|
@ -148,59 +162,78 @@ module VX_fpu_dsp #(
|
|||
.ready_out (per_core_ready_out[FPU_SQRT])
|
||||
);
|
||||
|
||||
wire cvt_rt_int_in = ~is_itof;
|
||||
wire cvt_rt_int_out;
|
||||
|
||||
VX_fpu_cvt #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW)
|
||||
.TAGW (TAGW+1)
|
||||
) fp_cvt (
|
||||
.clk (clk),
|
||||
.reset (cvt_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_CVT)),
|
||||
.ready_in (per_core_ready_in[FPU_CVT]),
|
||||
.tag_in (tag_in),
|
||||
.tag_in ({cvt_rt_int_in, tag_in}),
|
||||
.frm (frm),
|
||||
.is_itof (is_itof),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.dataa (dataa_s),
|
||||
.has_fflags (per_core_has_fflags[FPU_CVT]),
|
||||
.fflags (per_core_fflags[FPU_CVT]),
|
||||
.result (per_core_result[FPU_CVT]),
|
||||
.tag_out (per_core_tag_out[FPU_CVT]),
|
||||
.tag_out ({cvt_rt_int_out, per_core_tag_out[FPU_CVT]}),
|
||||
.valid_out (per_core_valid_out[FPU_CVT]),
|
||||
.ready_out (per_core_ready_out[FPU_CVT])
|
||||
);
|
||||
|
||||
wire ncp_rt_int_in = (op_type == `INST_FPU_CMP)
|
||||
|| `INST_FPU_IS_CLASS(op_type, frm)
|
||||
|| `INST_FPU_IS_MVXW(op_type, frm);
|
||||
wire ncp_rt_int_out;
|
||||
|
||||
wire ncp_rt_sext_in = `INST_FPU_IS_MVXW(op_type, frm);
|
||||
wire ncp_rt_sext_out;
|
||||
|
||||
VX_fpu_ncomp #(
|
||||
.NUM_LANES (NUM_LANES),
|
||||
.TAGW (TAGW)
|
||||
.TAGW (TAGW+2)
|
||||
) fp_ncomp (
|
||||
.clk (clk),
|
||||
.reset (ncp_reset),
|
||||
.valid_in (valid_in && (core_select == FPU_NCP)),
|
||||
.ready_in (per_core_ready_in[FPU_NCP]),
|
||||
.tag_in (tag_in),
|
||||
.tag_in ({ncp_rt_sext_in, ncp_rt_int_in, tag_in}),
|
||||
.op_type (op_type),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.dataa (dataa_s),
|
||||
.datab (datab_s),
|
||||
.result (per_core_result[FPU_NCP]),
|
||||
.has_fflags (per_core_has_fflags[FPU_NCP]),
|
||||
.fflags (per_core_fflags[FPU_NCP]),
|
||||
.tag_out (per_core_tag_out[FPU_NCP]),
|
||||
.tag_out ({ncp_rt_sext_out, ncp_rt_int_out, per_core_tag_out[FPU_NCP]}),
|
||||
.valid_out (per_core_valid_out[FPU_NCP]),
|
||||
.ready_out (per_core_ready_out[FPU_NCP])
|
||||
);
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_FPC-1:0][RSP_ARB_DATAW-1:0] per_core_data_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_FPC; ++i) begin
|
||||
assign per_core_data_out[i] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
|
||||
reg [NUM_FPC-1:0][RSP_ARB_DATAW+2-1:0] per_core_data_out;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < NUM_FPC; ++i) begin
|
||||
per_core_data_out[i][RSP_ARB_DATAW+1:2] = {per_core_result[i], per_core_has_fflags[i], per_core_fflags[i], per_core_tag_out[i]};
|
||||
per_core_data_out[i][1:0] = '0;
|
||||
end
|
||||
per_core_data_out[FPU_CVT][1:0] = {1'b1, cvt_rt_int_out};
|
||||
per_core_data_out[FPU_NCP][1:0] = {ncp_rt_sext_out, ncp_rt_int_out};
|
||||
end
|
||||
|
||||
wire [NUM_LANES-1:0][31:0] result_s;
|
||||
wire [1:0] op_rt_int_out;
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (NUM_FPC),
|
||||
.DATAW (RSP_ARB_DATAW),
|
||||
.DATAW (RSP_ARB_DATAW + 2),
|
||||
.ARBITER ("R"),
|
||||
.BUFFERED (2)
|
||||
) rsp_arb (
|
||||
|
@ -209,11 +242,31 @@ module VX_fpu_dsp #(
|
|||
.valid_in (per_core_valid_out),
|
||||
.ready_in (per_core_ready_out),
|
||||
.data_in (per_core_data_out),
|
||||
.data_out ({result, has_fflags, fflags, tag_out}),
|
||||
.data_out ({result_s, has_fflags, fflags, tag_out, op_rt_int_out}),
|
||||
.valid_out (valid_out),
|
||||
.ready_out (ready_out)
|
||||
);
|
||||
|
||||
`ifndef FPU_RV64_F
|
||||
`UNUSED_VAR (op_rt_int_out)
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
`ifdef FPU_RV64_F
|
||||
reg [`XLEN-1:0] result_r;
|
||||
always @(*) begin
|
||||
case (op_rt_int_out)
|
||||
2'b11: result_r = `XLEN'($signed(result_s[i]));
|
||||
2'b01: result_r = {32'h00000000, result_s[i]};
|
||||
default: result_r = {32'hffffffff, result_s[i]};
|
||||
endcase
|
||||
end
|
||||
assign result[i] = result_r;
|
||||
`else
|
||||
assign result[i] = result_s[i];
|
||||
`endif
|
||||
end
|
||||
|
||||
// can accept new request?
|
||||
assign ready_in = per_core_ready_in[core_select];
|
||||
|
||||
|
|
|
@ -88,6 +88,9 @@ module VX_fpu_fma #(
|
|||
.q (result[i])
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign fflags = 'x
|
||||
|
||||
`elsif VIVADO
|
||||
|
||||
|
@ -111,31 +114,33 @@ module VX_fpu_fma #(
|
|||
assign fflags[i] = {tuser[2], 1'b0, tuser[1], tuser[0], 1'b0};
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [`XLEN-1:0] r;
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
|
||||
fflags_t f;
|
||||
`UNUSED_VAR (f)
|
||||
|
||||
always @(*) begin
|
||||
dpi_fmadd (enable && valid_in, a[i], b[i], c[i], frm, r, f);
|
||||
dpi_fmadd (enable && valid_in, int'(0), 64'(a[i]), 64'(b[i]), 64'(c[i]), frm, r, f);
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (`XLEN),
|
||||
.DATAW (32 + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FMA)
|
||||
) shift_req_dpi (
|
||||
.clk (clk),
|
||||
`UNUSED_PIN (reset),
|
||||
.enable (enable),
|
||||
.data_in (r),
|
||||
.data_out (result[i])
|
||||
.data_in ({r[31:0], f}),
|
||||
.data_out ({result[i], fflags[i]})
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 1'b0;
|
||||
assign fflags = '0;
|
||||
assign has_fflags = 1;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
|
@ -6,12 +6,6 @@
|
|||
`include "fpnew_pkg.sv"
|
||||
`include "defs_div_sqrt_mvp.sv"
|
||||
|
||||
`ifdef XLEN_64
|
||||
`ifdef FLEN_32
|
||||
`define ISA_RV64F
|
||||
`endif
|
||||
`endif
|
||||
|
||||
module VX_fpu_fpnew #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAGW = 1
|
||||
|
@ -147,7 +141,7 @@ module VX_fpu_fpnew #(
|
|||
default:;
|
||||
endcase
|
||||
|
||||
`ifdef ISA_RV64F
|
||||
`ifdef FPU_RV64_F
|
||||
// apply nan-boxing to floating-point operands
|
||||
for (integer i = 0; i < NUM_LANES; ++i) begin
|
||||
if (op_type != `INST_FPU_I2F && op_type != `INST_FPU_U2F) begin
|
||||
|
|
|
@ -86,7 +86,7 @@ module VX_fpu_ncomp #(
|
|||
|
||||
wire valid_in_s0;
|
||||
wire [TAGW-1:0] tag_in_s0;
|
||||
wire [4:0] op_mod_s0;
|
||||
wire [3:0] op_mod_s0;
|
||||
wire [NUM_LANES-1:0][31:0] dataa_s0, datab_s0;
|
||||
wire [NUM_LANES-1:0] a_sign_s0, b_sign_s0;
|
||||
wire [NUM_LANES-1:0][7:0] a_exponent_s0;
|
||||
|
@ -96,7 +96,7 @@ module VX_fpu_ncomp #(
|
|||
|
||||
wire stall;
|
||||
|
||||
wire [4:0] op_mod = {(op_type == INST_FPU_CMP), frm};
|
||||
wire [3:0] op_mod = {(op_type == `INST_FPU_CMP), frm};
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + TAGW + 4 + NUM_LANES * (2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1)),
|
||||
|
@ -169,7 +169,7 @@ module VX_fpu_ncomp #(
|
|||
reg [NUM_LANES-1:0] fcmp_fflags_NV; // comparison fflags
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
always @(*) begin
|
||||
case (op_mod_s0[1:0])
|
||||
case (op_mod_s0[2:0])
|
||||
`INST_FRM_RNE: begin // LE
|
||||
if (a_fclass_s0[i].is_nan || b_fclass_s0[i].is_nan) begin
|
||||
fcmp_res[i] = 32'h0;
|
||||
|
|
|
@ -57,6 +57,9 @@ module VX_fpu_sqrt #(
|
|||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 0;
|
||||
assign fflags = 'x;
|
||||
|
||||
`elsif VIVADO
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
|
@ -66,41 +69,42 @@ module VX_fpu_sqrt #(
|
|||
.aclk (clk),
|
||||
.aclken (enable),
|
||||
.s_axis_a_tvalid (1'b1),
|
||||
.s_axis_a_tdata (dataa[i]),
|
||||
.s_axis_a_tdata (dataa[i][31:0]),
|
||||
`UNUSED_PIN (m_axis_result_tvalid),
|
||||
.m_axis_result_tdata (result[i]),
|
||||
.m_axis_result_tdata (result[i][31:0]),
|
||||
.m_axis_result_tuser (tuser)
|
||||
);
|
||||
// NV, DZ, OF, UF, NX
|
||||
assign fflags[i] = {tuser, 1'b0, 1'b0, 1'b0, 1'b0};
|
||||
end
|
||||
|
||||
assign has_fflags = 1;
|
||||
|
||||
`else
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
reg [`XLEN-1:0] r;
|
||||
reg [63:0] r;
|
||||
`UNUSED_VAR (r)
|
||||
|
||||
fflags_t f;
|
||||
`UNUSED_VAR (f)
|
||||
|
||||
always @(*) begin
|
||||
dpi_fsqrt (enable && valid_in, dataa[i], frm, r, f);
|
||||
dpi_fsqrt (enable && valid_in, int'(0), 64'(dataa[i]), frm, r, f);
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (`XLEN),
|
||||
.DATAW (32 + $bits(fflags_t)),
|
||||
.DEPTH (`LATENCY_FSQRT)
|
||||
) shift_req_dpi (
|
||||
.clk (clk),
|
||||
`UNUSED_PIN (reset),
|
||||
.enable (enable),
|
||||
.data_in (r),
|
||||
.data_out (result[i])
|
||||
.data_in ({r[31:0], f}),
|
||||
.data_out ({result[i], fflags[i]})
|
||||
);
|
||||
end
|
||||
|
||||
assign has_fflags = 1'b0;
|
||||
assign fflags = '0;
|
||||
assign has_fflags = 1;
|
||||
|
||||
`endif
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue