fp_noncomp fixes

2025-04-24 05:47:35 -04:00 · 2020-08-23 16:53:28 -07:00 · 2020-08-23 16:53:28 -07:00 · 1c9445745f
commit 1c9445745f
parent 96f5432592
8 changed files with 170 additions and 50 deletions
--- a/hw/rtl/VX_config.vh
+++ b/hw/rtl/VX_config.vh
@ -55,6 +55,8 @@

 `define EXT_F_ENABLE

+`define IBUF_ENABLE
+
 // Device identification
 `define VENDOR_ID           0
 `define ARCHITECTURE_ID     0
--- a/hw/rtl/VX_lsu_unit.v
+++ b/hw/rtl/VX_lsu_unit.v
@ -111,16 +111,16 @@ module VX_lsu_unit #(
        .DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2),
        .SIZE  (`LSUQ_SIZE)
    ) lsu_queue  (
-        .clk            (clk),
-        .reset          (reset),
-        .write_addr     (req_tag),        
-        .acquire_slot   (lsuq_push),       
-        .read_addr      (rsp_tag),
-        .write_data     ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),                    
-        .read_data      ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
-        .release_addr   (rsp_tag),
-        .release_slot   (lsuq_pop),     
-        .full           (lsuq_full)
+        .clk          (clk),
+        .reset        (reset),
+        .write_addr   (req_tag),        
+        .acquire_slot (lsuq_push),       
+        .read_addr    (rsp_tag),
+        .write_data   ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),                    
+        .read_data    ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
+        .release_addr (rsp_tag),
+        .release_slot (lsuq_pop),     
+        .full         (lsuq_full)
    );

    always @(posedge clk) begin
@ -170,12 +170,12 @@ module VX_lsu_unit #(
    wire stall_out = ~lsu_commit_if.ready && lsu_commit_if.valid;
    wire mem_rsp_stall = is_load_rsp && is_store_req; // arbitration prioritizes stores

-    wire                          arb_valid = is_store_req || is_load_rsp;
-    wire [`NW_BITS-1:0]             arb_wid = is_store_req ? req_wid : rsp_wid;
-    wire [`NUM_THREADS-1:0] arb_thread_mask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
-    wire [31:0]                 arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
-    wire [`NR_BITS-1:0]              arb_rd = is_store_req ? 0 : rsp_rd;
-    wire                             arb_wb = is_store_req ? 0 : rsp_wb;
+    wire                    arb_valid = is_store_req || is_load_rsp;
+    wire [`NW_BITS-1:0]       arb_wid = is_store_req ? req_wid : rsp_wid;
+    wire [`NUM_THREADS-1:0] arb_tmask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
+    wire [31:0]           arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
+    wire [`NR_BITS-1:0]        arb_rd = is_store_req ? 0 : rsp_rd;
+    wire                       arb_wb = is_store_req ? 0 : rsp_wb;

    VX_generic_register #(
        .N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
@ -184,7 +184,7 @@ module VX_lsu_unit #(
        .reset (reset),
        .stall (stall_out),
        .flush (1'b0),
-        .in    ({arb_valid,           arb_wid,           arb_thread_mask,           arb_curr_PC,           arb_rd,           arb_wb,           rsp_data}),
+        .in    ({arb_valid,           arb_wid,           arb_tmask,                 arb_curr_PC,           arb_rd,           arb_wb,           rsp_data}),
        .out   ({lsu_commit_if.valid, lsu_commit_if.wid, lsu_commit_if.thread_mask, lsu_commit_if.curr_PC, lsu_commit_if.rd, lsu_commit_if.wb, lsu_commit_if.data})
    );

--- a/hw/rtl/VX_scoreboard.v
+++ b/hw/rtl/VX_scoreboard.v
@ -14,7 +14,7 @@ module VX_scoreboard  #(
    output wire         delay
 );
    reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0];  
-    reg [`NUM_REGS-1:0] inuse_reg_mask  [`NUM_WARPS-1:0];
+    reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
    
    wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;

--- a/hw/rtl/fp_cores/VX_fp_fpga.v
+++ b/hw/rtl/fp_cores/VX_fp_fpga.v
@ -286,6 +286,7 @@ module VX_fp_fpga #(
        assign per_core_ready_out[i] = ready_out && (i == fp_index);
    end

+    assign ready_in   = (& per_core_ready_in);
    assign valid_out  = fp_valid;
    assign tag_out    = per_core_tag_out[fp_index];
    assign result     = per_core_result[fp_index];
--- a/hw/rtl/fp_cores/VX_fp_noncomp.v
+++ b/hw/rtl/fp_cores/VX_fp_noncomp.v
@ -38,12 +38,17 @@ module VX_fp_noncomp #(
                SIG_NAN     = 32'h00000100,
                QUT_NAN     = 32'h00000200;

-    wire [LANES-1:0]       a_sign, b_sign;
-    wire [LANES-1:0][7:0]  a_exponent, b_exponent;
-    wire [LANES-1:0][22:0] a_mantissa, b_mantissa;
-    fp_type_t [LANES-1:0]  a_type, b_type;
+    reg [`FPU_BITS-1:0] op_r;
+    reg [`FRM_BITS-1:0] frm_r;

-    wire [LANES-1:0] a_smaller, ab_equal;
+    reg [LANES-1:0][31:0]  dataa_r;
+    reg [LANES-1:0][31:0]  datab_r;
+
+    reg [LANES-1:0]       a_sign, b_sign;
+    reg [LANES-1:0][7:0]  a_exponent, b_exponent;
+    reg [LANES-1:0][22:0] a_mantissa, b_mantissa;
+    fp_type_t [LANES-1:0]  a_type, b_type;
+    reg [LANES-1:0] a_smaller, ab_equal;

    reg [LANES-1:0][31:0] fclass_mask;  // generate a 10-bit mask for integer reg
    reg [LANES-1:0][31:0] fminmax_res;  // result of fmin/fmax
@ -51,32 +56,60 @@ module VX_fp_noncomp #(
    reg [LANES-1:0][31:0] fcmp_res;     // result of comparison
    reg [LANES-1:0][ 4:0] fcmp_excp;    // exception of comparison

+    wire stall = ~ready_out && valid_out;
+
    // Setup
    for (genvar i = 0; i < LANES; i++) begin
-        assign a_sign[i]     = dataa[i][31]; 
-        assign a_exponent[i] = dataa[i][30:23];
-        assign a_mantissa[i] = dataa[i][22:0];
+        wire tmp_a_sign            = dataa[i][31];    // operand A of lane i, split into sign / exponent / mantissa
+        wire [7:0] tmp_a_exponent  = dataa[i][30:23];
+        wire [22:0] tmp_a_mantissa = dataa[i][22:0];

-        assign b_sign[i]     = datab[i][31]; 
-        assign b_exponent[i] = datab[i][30:23];
-        assign b_mantissa[i] = datab[i][22:0];
+        wire tmp_b_sign            = datab[i][31];    // operand B of lane i, same field split
+        wire [7:0] tmp_b_exponent  = datab[i][30:23];
+        wire [22:0] tmp_b_mantissa = datab[i][22:0];

-        assign a_smaller[i]  = (dataa[i] < datab[i]) ^ (a_sign[i] || b_sign[i]);
-        assign ab_equal[i]   = (dataa[i] == datab[i]) | (a_type[i][4] & b_type[i][4]);
+        fp_type_t tmp_a_type, tmp_b_type;             // per-lane classification, driven by the VX_fp_type instances below

        VX_fp_type fp_type_a (
-            .exponent(a_exponent[i]),
-            .mantissa(a_mantissa[i]),
-            .o_type(a_type[i])
+            .exponent(tmp_a_exponent),                // tmp_* nets are declared per loop iteration, so connect them whole;
+            .mantissa(tmp_a_mantissa),                // an extra [i] would select a single bit of the lane's value
+            .o_type(tmp_a_type)
        );

        VX_fp_type fp_type_b (
-            .exponent(b_exponent[i]),
-            .mantissa(b_mantissa[i]),
-            .o_type(b_type[i])
+            .exponent(tmp_b_exponent),                // same as fp_type_a: full per-lane nets, no lane index
+            .mantissa(tmp_b_mantissa),
+            .o_type(tmp_b_type)
        );
+
+        wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign);
+        wire tmp_ab_equal  = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]); // NOTE(review): bit 4 carried over from the removed a_type[i][4]; confirm against fp_type_t layout
+
+        always @(posedge clk) begin
+            if (~stall) begin                         // hold the registered lane state while stall is asserted
+                a_sign[i]     <= tmp_a_sign;
+                b_sign[i]     <= tmp_b_sign;
+                a_exponent[i] <= tmp_a_exponent;
+                b_exponent[i] <= tmp_b_exponent;
+                a_mantissa[i] <= tmp_a_mantissa;
+                b_mantissa[i] <= tmp_b_mantissa;
+                a_type[i]     <= tmp_a_type;
+                b_type[i]     <= tmp_b_type;
+                a_smaller[i]  <= tmp_a_smaller;
+                ab_equal[i]   <= tmp_ab_equal;
+            end
+        end 
    end   

+    always @(posedge clk) begin
+        if (~stall) begin
+            op_r    <= op;
+            frm_r   <= frm;
+            dataa_r <= dataa;
+            datab_r <= datab;
+        end
+    end 
+
    // FCLASS
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin 
@ -107,13 +140,13 @@ module VX_fp_noncomp #(
            if (a_type[i].is_nan && b_type[i].is_nan)
                fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
            else if (a_type[i].is_nan) 
-                fminmax_res[i] = datab[i];
+                fminmax_res[i] = datab_r[i];
            else if (b_type[i].is_nan) 
-                fminmax_res[i] = dataa[i];
+                fminmax_res[i] = dataa_r[i];
            else begin 
-                case (op) // use LSB to distinguish MIN and MAX
-                    `FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa[i] : datab[i];
-                    `FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab[i] : dataa[i];
+                case (op_r) // use LSB to distinguish MIN and MAX
+                    `FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa_r[i] : datab_r[i];
+                    `FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab_r[i] : dataa_r[i];
                    default:  fminmax_res[i] = 32'hdeadbeaf;  // don't care value
                endcase
            end
@ -123,7 +156,7 @@ module VX_fp_noncomp #(
    // Sign Injection
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
-            case (op)
+            case (op_r)
                `FPU_SGNJ:  fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]};
                `FPU_SGNJN: fsgnj_res[i] = {~b_sign[i], a_exponent[i], a_mantissa[i]};
                `FPU_SGNJX: fsgnj_res[i] = { a_sign[i] ^ b_sign[i], a_exponent[i], a_mantissa[i]};
@ -135,7 +168,7 @@ module VX_fp_noncomp #(
    // Comparison    
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
-            case (frm)
+            case (frm_r)
                `FRM_RNE: begin
                    if (a_type[i].is_nan || b_type[i].is_nan) begin
                        fcmp_res[i]  = 32'h0;        // result is 0 when either operand is NaN
@ -183,7 +216,7 @@ module VX_fp_noncomp #(
    reg [LANES-1:0][31:0] tmp_result;

    always @(*) begin        
-        case (op)
+        case (op_r)
            `FPU_SGNJ:  tmp_has_fflags = 0;
            `FPU_SGNJN: tmp_has_fflags = 0;
            `FPU_SGNJX: tmp_has_fflags = 0;
@ -197,7 +230,7 @@ module VX_fp_noncomp #(
    for (genvar i = 0; i < LANES; i++) begin
        always @(*) begin
            tmp_valid = 1'b1;
-            case (op)
+            case (op_r)
                `FPU_CLASS: begin
                    tmp_result[i] = fclass_mask[i];
                    {tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
@ -227,9 +260,6 @@ module VX_fp_noncomp #(
        end
    end

-    wire stall = ~ready_out && valid_out;
-    assign ready_in = ~stall;
-
    VX_generic_register #(
        .N(1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS))
    ) nc_reg (
@ -241,4 +271,6 @@ module VX_fp_noncomp #(
        .out   ({valid_out, tag_out, result,     has_fflags,     fflags})
    );

+    assign ready_in = ~stall;
+
 endmodule
--- a/hw/rtl/interfaces/VX_issue_if.v
+++ b/hw/rtl/interfaces/VX_issue_if.v
@ -0,0 +1,38 @@
+`ifndef VX_ISSUE_IF
+`define VX_ISSUE_IF
+
+`include "VX_define.vh"
+
+interface VX_issue_if ();  // plain signal bundle: only wires are declared, no modports
+
+    wire                    valid;    
+
+    wire [`ITAG_BITS-1:0]   issue_tag;
+    wire [`NW_BITS-1:0]     wid;
+    wire [`NUM_THREADS-1:0] thread_mask;
+    wire [31:0]             curr_PC;
+
+    wire [`EX_BITS-1:0]     ex_type;    
+    wire [`OP_BITS-1:0]     ex_op; 
+
+    wire [`FRM_BITS-1:0]    frm;
+
+    wire                    wb;
+
+    wire [`NR_BITS-1:0]     rd;
+
+    wire [`NUM_THREADS-1:0][31:0] rs1_data;  // one 32-bit word per thread
+    wire [`NUM_THREADS-1:0][31:0] rs2_data;
+    wire [`NUM_THREADS-1:0][31:0] rs3_data;
+    
+    wire [`NR_BITS-1:0]     rs1;
+    wire [31:0]             imm;
+
+    wire                    rs1_is_PC;
+    wire                    rs2_is_imm;
+
+    wire [`NT_BITS-1:0]     tid;  // width macro needs the backtick; presumably `NT_BITS from VX_define.vh - confirm
+
+endinterface
+
+`endif
--- a/hw/rtl/libs/VX_bypass_buffer.v
+++ b/hw/rtl/libs/VX_bypass_buffer.v
@ -0,0 +1,47 @@
+`include "VX_platform.vh"
+
+module VX_bypass_buffer #(  // valid/ready stage with one storage slot; data_in is forwarded combinationally when the slot is empty
+    parameter DATAW    = 1,  // width of data_in / data_out in bits
+    parameter PASSTHRU = 0   // non-zero: outputs are wired straight to inputs, no storage is generated
+) ( 
+    input  wire             clk,
+    input  wire             reset,
+    input  wire             valid_in,
+    output wire             ready_in,        
+    input  wire [DATAW-1:0] data_in,
+    output wire [DATAW-1:0] data_out,
+    input  wire             ready_out,
+    output wire             valid_out
+); 
+    if (PASSTHRU) begin
+        `UNUSED_VAR (clk)
+        `UNUSED_VAR (reset)
+        assign ready_in  = ready_out;
+        assign valid_out = valid_in;        
+        assign data_out  = data_in;
+    end else begin
+        reg [DATAW-1:0] buffer;  // the single stored word
+        reg buffer_valid;        // set while buffer holds a word not yet taken downstream
+
+        always @(posedge clk) begin
+            if (reset) begin
+                buffer_valid <= 0;
+                buffer <= 0;
+            end else begin            
+                if (ready_out) begin  // downstream accepts this cycle: the slot is empty afterwards
+                    buffer_valid <= 0;
+                end
+                if (valid_in && ~ready_out) begin  // mutually exclusive with the branch above (ready_out vs ~ready_out)
+                    assert(!buffer_valid);  // NOTE(review): if valid_in is held while the slot is full and ready_out is low, buffer is overwritten - confirm upstream never does this
+                    buffer <= data_in;
+                    buffer_valid <= 1;
+                end
+            end
+        end
+
+        assign ready_in  = ready_out || !buffer_valid;  // accept when downstream is ready or the slot is free
+        assign data_out  = buffer_valid ? buffer : data_in;  // stored word takes priority over the incoming one
+        assign valid_out = valid_in || buffer_valid;
+    end
+
+endmodule
--- a/hw/syn/quartus/project.sdc
+++ b/hw/syn/quartus/project.sdc
@ -1,6 +1,6 @@
 set_time_format -unit ns -decimal_places 3

-create_clock -name {clk} -period "240 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
+create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]

 derive_pll_clocks -create_base_clocks
 derive_clock_uncertainty