fp_noncomp fixes

This commit is contained in:
Blaise Tine 2020-08-23 16:53:28 -07:00
parent 96f5432592
commit 1c9445745f
8 changed files with 170 additions and 50 deletions

View file

@ -55,6 +55,8 @@
`define EXT_F_ENABLE
`define IBUF_ENABLE
// Device identification
`define VENDOR_ID 0
`define ARCHITECTURE_ID 0

View file

@ -111,16 +111,16 @@ module VX_lsu_unit #(
.DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2),
.SIZE (`LSUQ_SIZE)
) lsu_queue (
.clk (clk),
.reset (reset),
.write_addr (req_tag),
.acquire_slot (lsuq_push),
.read_addr (rsp_tag),
.write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),
.read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
.release_addr (rsp_tag),
.release_slot (lsuq_pop),
.full (lsuq_full)
.clk (clk),
.reset (reset),
.write_addr (req_tag),
.acquire_slot (lsuq_push),
.read_addr (rsp_tag),
.write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),
.read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
.release_addr (rsp_tag),
.release_slot (lsuq_pop),
.full (lsuq_full)
);
always @(posedge clk) begin
@ -170,12 +170,12 @@ module VX_lsu_unit #(
wire stall_out = ~lsu_commit_if.ready && lsu_commit_if.valid;
wire mem_rsp_stall = is_load_rsp && is_store_req; // arbitration prioritizes stores
wire arb_valid = is_store_req || is_load_rsp;
wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid;
wire [`NUM_THREADS-1:0] arb_thread_mask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd;
wire arb_wb = is_store_req ? 0 : rsp_wb;
wire arb_valid = is_store_req || is_load_rsp;
wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid;
wire [`NUM_THREADS-1:0] arb_tmask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd;
wire arb_wb = is_store_req ? 0 : rsp_wb;
VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
@ -184,7 +184,7 @@ module VX_lsu_unit #(
.reset (reset),
.stall (stall_out),
.flush (1'b0),
.in ({arb_valid, arb_wid, arb_thread_mask, arb_curr_PC, arb_rd, arb_wb, rsp_data}),
.in ({arb_valid, arb_wid, arb_tmask, arb_curr_PC, arb_rd, arb_wb, rsp_data}),
.out ({lsu_commit_if.valid, lsu_commit_if.wid, lsu_commit_if.thread_mask, lsu_commit_if.curr_PC, lsu_commit_if.rd, lsu_commit_if.wb, lsu_commit_if.data})
);

View file

@ -14,7 +14,7 @@ module VX_scoreboard #(
output wire delay
);
reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0];
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;

View file

@ -286,6 +286,7 @@ module VX_fp_fpga #(
assign per_core_ready_out[i] = ready_out && (i == fp_index);
end
assign ready_in = (& per_core_ready_in);
assign valid_out = fp_valid;
assign tag_out = per_core_tag_out[fp_index];
assign result = per_core_result[fp_index];

View file

@ -38,12 +38,17 @@ module VX_fp_noncomp #(
SIG_NAN = 32'h00000100,
QUT_NAN = 32'h00000200;
wire [LANES-1:0] a_sign, b_sign;
wire [LANES-1:0][7:0] a_exponent, b_exponent;
wire [LANES-1:0][22:0] a_mantissa, b_mantissa;
fp_type_t [LANES-1:0] a_type, b_type;
reg [`FPU_BITS-1:0] op_r;
reg [`FRM_BITS-1:0] frm_r;
wire [LANES-1:0] a_smaller, ab_equal;
reg [LANES-1:0][31:0] dataa_r;
reg [LANES-1:0][31:0] datab_r;
reg [LANES-1:0] a_sign, b_sign;
reg [LANES-1:0][7:0] a_exponent, b_exponent;
reg [LANES-1:0][22:0] a_mantissa, b_mantissa;
fp_type_t [LANES-1:0] a_type, b_type;
reg [LANES-1:0] a_smaller, ab_equal;
reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg
reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax
@ -51,32 +56,60 @@ module VX_fp_noncomp #(
reg [LANES-1:0][31:0] fcmp_res; // result of comparison
reg [LANES-1:0][ 4:0] fcmp_excp; // exception of comparison
wire stall = ~ready_out && valid_out;
// Setup
for (genvar i = 0; i < LANES; i++) begin
assign a_sign[i] = dataa[i][31];
assign a_exponent[i] = dataa[i][30:23];
assign a_mantissa[i] = dataa[i][22:0];
wire tmp_a_sign = dataa[i][31];
wire [7:0] tmp_a_exponent = dataa[i][30:23];
wire [22:0] tmp_a_mantissa = dataa[i][22:0];
assign b_sign[i] = datab[i][31];
assign b_exponent[i] = datab[i][30:23];
assign b_mantissa[i] = datab[i][22:0];
wire tmp_b_sign = datab[i][31];
wire [7:0] tmp_b_exponent = datab[i][30:23];
wire [22:0] tmp_b_mantissa = datab[i][22:0];
assign a_smaller[i] = (dataa[i] < datab[i]) ^ (a_sign[i] || b_sign[i]);
assign ab_equal[i] = (dataa[i] == datab[i]) | (a_type[i][4] & b_type[i][4]);
fp_type_t tmp_a_type, tmp_b_type;
VX_fp_type fp_type_a (
.exponent(a_exponent[i]),
.mantissa(a_mantissa[i]),
.o_type(a_type[i])
.exponent(tmp_a_exponent[i]),
.mantissa(tmp_a_mantissa[i]),
.o_type(tmp_a_type[i])
);
VX_fp_type fp_type_b (
.exponent(b_exponent[i]),
.mantissa(b_mantissa[i]),
.o_type(b_type[i])
.exponent(tmp_b_exponent[i]),
.mantissa(tmp_b_mantissa[i]),
.o_type(tmp_b_type[i])
);
wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign);
wire tmp_ab_equal = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]);
always @(posedge clk) begin
if (~stall) begin
a_sign[i] <= tmp_a_sign;
b_sign[i] <= tmp_b_sign;
a_exponent[i] <= tmp_a_exponent;
b_exponent[i] <= tmp_b_exponent;
a_mantissa[i] <= tmp_a_mantissa;
b_mantissa[i] <= tmp_b_mantissa;
a_type[i] <= tmp_a_type;
b_type[i] <= tmp_b_type;
a_smaller[i] <= tmp_a_smaller;
ab_equal[i] <= tmp_ab_equal;
end
end
end
always @(posedge clk) begin
if (~stall) begin
op_r <= op;
frm_r <= frm;
dataa_r <= dataa;
datab_r <= datab;
end
end
// FCLASS
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
@ -107,13 +140,13 @@ module VX_fp_noncomp #(
if (a_type[i].is_nan && b_type[i].is_nan)
fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
else if (a_type[i].is_nan)
fminmax_res[i] = datab[i];
fminmax_res[i] = datab_r[i];
else if (b_type[i].is_nan)
fminmax_res[i] = dataa[i];
fminmax_res[i] = dataa_r[i];
else begin
case (op) // use LSB to distinguish MIN and MAX
`FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa[i] : datab[i];
`FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab[i] : dataa[i];
case (op_r) // use LSB to distinguish MIN and MAX
`FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa_r[i] : datab_r[i];
`FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab_r[i] : dataa_r[i];
default: fminmax_res[i] = 32'hdeadbeaf; // don't care value
endcase
end
@ -123,7 +156,7 @@ module VX_fp_noncomp #(
// Sign Injection
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (op)
case (op_r)
`FPU_SGNJ: fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]};
`FPU_SGNJN: fsgnj_res[i] = {~b_sign[i], a_exponent[i], a_mantissa[i]};
`FPU_SGNJX: fsgnj_res[i] = { a_sign[i] ^ b_sign[i], a_exponent[i], a_mantissa[i]};
@ -135,7 +168,7 @@ module VX_fp_noncomp #(
// Comparison
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (frm)
case (frm_r)
`FRM_RNE: begin
if (a_type[i].is_nan || b_type[i].is_nan) begin
fcmp_res[i] = 32'h0; // result is 0 when either operand is NaN
@ -183,7 +216,7 @@ module VX_fp_noncomp #(
reg [LANES-1:0][31:0] tmp_result;
always @(*) begin
case (op)
case (op_r)
`FPU_SGNJ: tmp_has_fflags = 0;
`FPU_SGNJN: tmp_has_fflags = 0;
`FPU_SGNJX: tmp_has_fflags = 0;
@ -197,7 +230,7 @@ module VX_fp_noncomp #(
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
tmp_valid = 1'b1;
case (op)
case (op_r)
`FPU_CLASS: begin
tmp_result[i] = fclass_mask[i];
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
@ -227,9 +260,6 @@ module VX_fp_noncomp #(
end
end
wire stall = ~ready_out && valid_out;
assign ready_in = ~stall;
VX_generic_register #(
.N(1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS))
) nc_reg (
@ -241,4 +271,6 @@ module VX_fp_noncomp #(
.out ({valid_out, tag_out, result, has_fflags, fflags})
);
assign ready_in = ~stall;
endmodule

View file

@ -0,0 +1,38 @@
`ifndef VX_ISSUE_IF
`define VX_ISSUE_IF
`include "VX_define.vh"
// VX_issue_if: bundle of wires grouped as one interface.
// No modports are declared here, so signal direction is determined by the
// modules that connect to it. All widths come from macros that are expected
// to be provided by VX_define.vh.
// NOTE(review): the per-signal descriptions below are inferred from the
// signal names only - confirm against the producer/consumer modules.
interface VX_issue_if ();

    wire                            valid;        // presumably qualifies the rest of the bundle
    wire [`ITAG_BITS-1:0]           issue_tag;
    wire [`NW_BITS-1:0]             wid;
    wire [`NUM_THREADS-1:0]         thread_mask;  // one bit per thread
    wire [31:0]                     curr_PC;

    wire [`EX_BITS-1:0]             ex_type;
    wire [`OP_BITS-1:0]             ex_op;

    wire [`FRM_BITS-1:0]            frm;

    wire                            wb;
    wire [`NR_BITS-1:0]             rd;

    // One 32-bit word per thread for each of the three operand buses.
    wire [`NUM_THREADS-1:0][31:0]   rs1_data;
    wire [`NUM_THREADS-1:0][31:0]   rs2_data;
    wire [`NUM_THREADS-1:0][31:0]   rs3_data;

    wire [`NR_BITS-1:0]             rs1;
    wire [31:0]                     imm;
    wire                            rs1_is_PC;
    wire                            rs2_is_imm;

    // Width is taken from the `NT_BITS macro; the range needs the macro
    // backtick to be a legal constant expression.
    wire [`NT_BITS-1:0]             tid;

endinterface
`endif

View file

@ -0,0 +1,47 @@
`include "VX_platform.vh"
// VX_bypass_buffer: single-entry buffer between two valid/ready handshakes.
//
// Parameters:
//   DATAW    - width in bits of data_in / data_out / the internal register.
//   PASSTHRU - when non-zero the module is a pure wire-through and no
//              register is instantiated.
//
// Behaviour when PASSTHRU == 0:
//   * While the buffer is empty, data_in is forwarded combinationally to
//     data_out and valid_in to valid_out.
//   * A word that upstream hands over (valid_in && ready_in) but downstream
//     does not take in the same cycle is parked in `buffer` and presented
//     from there until downstream raises ready_out.
//   * ready_in = ready_out || !buffer_valid, so upstream may hand over a new
//     word in the very cycle the parked word is drained; that word is parked
//     in turn so it is not lost.
module VX_bypass_buffer #(
    parameter DATAW    = 1,
    parameter PASSTHRU = 0
) (
    input  wire             clk,
    input  wire             reset,
    input  wire             valid_in,
    output wire             ready_in,
    input  wire [DATAW-1:0] data_in,
    output wire [DATAW-1:0] data_out,
    input  wire             ready_out,
    output wire             valid_out
);

    if (PASSTHRU) begin

        // No state in this configuration: clk/reset are intentionally unused.
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)

        assign ready_in  = ready_out;
        assign valid_out = valid_in;
        assign data_out  = data_in;

    end else begin

        reg [DATAW-1:0] buffer;        // parked word
        reg             buffer_valid;  // 1 while `buffer` holds an undelivered word

        always @(posedge clk) begin
            if (reset) begin
                buffer_valid <= 0;
                buffer       <= 0;
            end else if (buffer_valid) begin
                if (ready_out) begin
                    // Downstream takes the parked word this cycle. ready_in
                    // is high here, so a word offered by upstream in the
                    // same cycle is accepted and has to be parked, otherwise
                    // it would be dropped (data_out is showing `buffer`).
                    buffer_valid <= valid_in;
                    if (valid_in) begin
                        buffer <= data_in;
                    end
                end
                // else: ready_in is low, nothing is transferred. The parked
                // word is held even if upstream keeps valid_in asserted, so
                // it can never be overwritten before delivery.
            end else if (valid_in && ~ready_out) begin
                // Buffer empty and downstream stalled: accept and park.
                buffer       <= data_in;
                buffer_valid <= 1;
            end
        end

        assign ready_in  = ready_out || !buffer_valid;
        assign data_out  = buffer_valid ? buffer : data_in;
        assign valid_out = valid_in || buffer_valid;

    end

endmodule

View file

@ -1,6 +1,6 @@
set_time_format -unit ns -decimal_places 3
create_clock -name {clk} -period "240 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
derive_pll_clocks -create_base_clocks
derive_clock_uncertainty