mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-24 05:47:35 -04:00
fp_noncomp fixes
This commit is contained in:
parent
96f5432592
commit
1c9445745f
8 changed files with 170 additions and 50 deletions
|
@ -55,6 +55,8 @@
|
|||
|
||||
`define EXT_F_ENABLE
|
||||
|
||||
`define IBUF_ENABLE
|
||||
|
||||
// Device identification
|
||||
`define VENDOR_ID 0
|
||||
`define ARCHITECTURE_ID 0
|
||||
|
|
|
@ -111,16 +111,16 @@ module VX_lsu_unit #(
|
|||
.DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2),
|
||||
.SIZE (`LSUQ_SIZE)
|
||||
) lsu_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.write_addr (req_tag),
|
||||
.acquire_slot (lsuq_push),
|
||||
.read_addr (rsp_tag),
|
||||
.write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),
|
||||
.read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
|
||||
.release_addr (rsp_tag),
|
||||
.release_slot (lsuq_pop),
|
||||
.full (lsuq_full)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.write_addr (req_tag),
|
||||
.acquire_slot (lsuq_push),
|
||||
.read_addr (rsp_tag),
|
||||
.write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),
|
||||
.read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
|
||||
.release_addr (rsp_tag),
|
||||
.release_slot (lsuq_pop),
|
||||
.full (lsuq_full)
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -170,12 +170,12 @@ module VX_lsu_unit #(
|
|||
wire stall_out = ~lsu_commit_if.ready && lsu_commit_if.valid;
|
||||
wire mem_rsp_stall = is_load_rsp && is_store_req; // arbitration prioritizes stores
|
||||
|
||||
wire arb_valid = is_store_req || is_load_rsp;
|
||||
wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] arb_thread_mask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
|
||||
wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
|
||||
wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd;
|
||||
wire arb_wb = is_store_req ? 0 : rsp_wb;
|
||||
wire arb_valid = is_store_req || is_load_rsp;
|
||||
wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] arb_tmask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
|
||||
wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
|
||||
wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd;
|
||||
wire arb_wb = is_store_req ? 0 : rsp_wb;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
|
||||
|
@ -184,7 +184,7 @@ module VX_lsu_unit #(
|
|||
.reset (reset),
|
||||
.stall (stall_out),
|
||||
.flush (1'b0),
|
||||
.in ({arb_valid, arb_wid, arb_thread_mask, arb_curr_PC, arb_rd, arb_wb, rsp_data}),
|
||||
.in ({arb_valid, arb_wid, arb_tmask, arb_curr_PC, arb_rd, arb_wb, rsp_data}),
|
||||
.out ({lsu_commit_if.valid, lsu_commit_if.wid, lsu_commit_if.thread_mask, lsu_commit_if.curr_PC, lsu_commit_if.rd, lsu_commit_if.wb, lsu_commit_if.data})
|
||||
);
|
||||
|
||||
|
|
|
@ -14,7 +14,7 @@ module VX_scoreboard #(
|
|||
output wire delay
|
||||
);
|
||||
reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0];
|
||||
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
|
||||
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
|
||||
|
||||
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;
|
||||
|
||||
|
|
|
@ -286,6 +286,7 @@ module VX_fp_fpga #(
|
|||
assign per_core_ready_out[i] = ready_out && (i == fp_index);
|
||||
end
|
||||
|
||||
assign ready_in = (& per_core_ready_in);
|
||||
assign valid_out = fp_valid;
|
||||
assign tag_out = per_core_tag_out[fp_index];
|
||||
assign result = per_core_result[fp_index];
|
||||
|
|
|
@ -38,12 +38,17 @@ module VX_fp_noncomp #(
|
|||
SIG_NAN = 32'h00000100,
|
||||
QUT_NAN = 32'h00000200;
|
||||
|
||||
wire [LANES-1:0] a_sign, b_sign;
|
||||
wire [LANES-1:0][7:0] a_exponent, b_exponent;
|
||||
wire [LANES-1:0][22:0] a_mantissa, b_mantissa;
|
||||
fp_type_t [LANES-1:0] a_type, b_type;
|
||||
reg [`FPU_BITS-1:0] op_r;
|
||||
reg [`FRM_BITS-1:0] frm_r;
|
||||
|
||||
wire [LANES-1:0] a_smaller, ab_equal;
|
||||
reg [LANES-1:0][31:0] dataa_r;
|
||||
reg [LANES-1:0][31:0] datab_r;
|
||||
|
||||
reg [LANES-1:0] a_sign, b_sign;
|
||||
reg [LANES-1:0][7:0] a_exponent, b_exponent;
|
||||
reg [LANES-1:0][22:0] a_mantissa, b_mantissa;
|
||||
fp_type_t [LANES-1:0] a_type, b_type;
|
||||
reg [LANES-1:0] a_smaller, ab_equal;
|
||||
|
||||
reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg
|
||||
reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax
|
||||
|
@ -51,32 +56,60 @@ module VX_fp_noncomp #(
|
|||
reg [LANES-1:0][31:0] fcmp_res; // result of comparison
|
||||
reg [LANES-1:0][ 4:0] fcmp_excp; // exception of comparison
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
|
||||
// Setup
|
||||
for (genvar i = 0; i < LANES; i++) begin
    // Per-lane operand decode. Each lane splits its 32-bit operands into
    // sign / exponent / mantissa fields, classifies them with VX_fp_type,
    // computes the ordering flags, and registers everything on the clock
    // edge unless the stage is stalled.

    // Combinational field extraction for operand A.
    wire        tmp_a_sign     = dataa[i][31];
    wire [7:0]  tmp_a_exponent = dataa[i][30:23];
    wire [22:0] tmp_a_mantissa = dataa[i][22:0];

    // Combinational field extraction for operand B.
    wire        tmp_b_sign     = datab[i][31];
    wire [7:0]  tmp_b_exponent = datab[i][30:23];
    wire [22:0] tmp_b_mantissa = datab[i][22:0];

    // Per-lane classification results (scalar: one struct per loop iteration).
    fp_type_t tmp_a_type, tmp_b_type;

    // The tmp_* signals are already per-lane scalars declared inside this
    // generate iteration, so they are connected whole. Indexing them with
    // [i] would select a single bit of the exponent/mantissa/type instead
    // of the full value.
    VX_fp_type fp_type_a (
        .exponent (tmp_a_exponent),
        .mantissa (tmp_a_mantissa),
        .o_type   (tmp_a_type)
    );

    VX_fp_type fp_type_b (
        .exponent (tmp_b_exponent),
        .mantissa (tmp_b_mantissa),
        .o_type   (tmp_b_type)
    );

    // Raw unsigned compare of the bit patterns, inverted when either
    // operand has its sign bit set.
    wire tmp_a_smaller = (dataa[i] < datab[i]) ^ (tmp_a_sign || tmp_b_sign);

    // Equal when the bit patterns match, or when bit 4 of both type words
    // is set. NOTE(review): bit 4 presumably flags zero so that +0 == -0;
    // confirm against the fp_type_t definition.
    wire tmp_ab_equal  = (dataa[i] == datab[i]) | (tmp_a_type[4] & tmp_b_type[4]);

    // Pipeline register for the decoded lane; holds its value while stalled.
    always @(posedge clk) begin
        if (~stall) begin
            a_sign[i]     <= tmp_a_sign;
            b_sign[i]     <= tmp_b_sign;
            a_exponent[i] <= tmp_a_exponent;
            b_exponent[i] <= tmp_b_exponent;
            a_mantissa[i] <= tmp_a_mantissa;
            b_mantissa[i] <= tmp_b_mantissa;
            a_type[i]     <= tmp_a_type;
            b_type[i]     <= tmp_b_type;
            a_smaller[i]  <= tmp_a_smaller;
            ab_equal[i]   <= tmp_ab_equal;
        end
    end
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (~stall) begin
|
||||
op_r <= op;
|
||||
frm_r <= frm;
|
||||
dataa_r <= dataa;
|
||||
datab_r <= datab;
|
||||
end
|
||||
end
|
||||
|
||||
// FCLASS
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
always @(*) begin
|
||||
|
@ -107,13 +140,13 @@ module VX_fp_noncomp #(
|
|||
if (a_type[i].is_nan && b_type[i].is_nan)
|
||||
fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
|
||||
else if (a_type[i].is_nan)
|
||||
fminmax_res[i] = datab[i];
|
||||
fminmax_res[i] = datab_r[i];
|
||||
else if (b_type[i].is_nan)
|
||||
fminmax_res[i] = dataa[i];
|
||||
fminmax_res[i] = dataa_r[i];
|
||||
else begin
|
||||
case (op) // use LSB to distinguish MIN and MAX
|
||||
`FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa[i] : datab[i];
|
||||
`FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab[i] : dataa[i];
|
||||
case (op_r) // use LSB to distinguish MIN and MAX
|
||||
`FPU_MIN: fminmax_res[i] = a_smaller[i] ? dataa_r[i] : datab_r[i];
|
||||
`FPU_MAX: fminmax_res[i] = a_smaller[i] ? datab_r[i] : dataa_r[i];
|
||||
default: fminmax_res[i] = 32'hdeadbeaf; // don't care value
|
||||
endcase
|
||||
end
|
||||
|
@ -123,7 +156,7 @@ module VX_fp_noncomp #(
|
|||
// Sign Injection
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
always @(*) begin
|
||||
case (op)
|
||||
case (op_r)
|
||||
`FPU_SGNJ: fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]};
|
||||
`FPU_SGNJN: fsgnj_res[i] = {~b_sign[i], a_exponent[i], a_mantissa[i]};
|
||||
`FPU_SGNJX: fsgnj_res[i] = { a_sign[i] ^ b_sign[i], a_exponent[i], a_mantissa[i]};
|
||||
|
@ -135,7 +168,7 @@ module VX_fp_noncomp #(
|
|||
// Comparison
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
always @(*) begin
|
||||
case (frm)
|
||||
case (frm_r)
|
||||
`FRM_RNE: begin
|
||||
if (a_type[i].is_nan || b_type[i].is_nan) begin
|
||||
fcmp_res[i] = 32'h0; // result is 0 when either operand is NaN
|
||||
|
@ -183,7 +216,7 @@ module VX_fp_noncomp #(
|
|||
reg [LANES-1:0][31:0] tmp_result;
|
||||
|
||||
always @(*) begin
|
||||
case (op)
|
||||
case (op_r)
|
||||
`FPU_SGNJ: tmp_has_fflags = 0;
|
||||
`FPU_SGNJN: tmp_has_fflags = 0;
|
||||
`FPU_SGNJX: tmp_has_fflags = 0;
|
||||
|
@ -197,7 +230,7 @@ module VX_fp_noncomp #(
|
|||
for (genvar i = 0; i < LANES; i++) begin
|
||||
always @(*) begin
|
||||
tmp_valid = 1'b1;
|
||||
case (op)
|
||||
case (op_r)
|
||||
`FPU_CLASS: begin
|
||||
tmp_result[i] = fclass_mask[i];
|
||||
{tmp_fflags[i].NV, tmp_fflags[i].DZ, tmp_fflags[i].OF, tmp_fflags[i].UF, tmp_fflags[i].NX} = 5'h0;
|
||||
|
@ -227,9 +260,6 @@ module VX_fp_noncomp #(
|
|||
end
|
||||
end
|
||||
|
||||
wire stall = ~ready_out && valid_out;
|
||||
assign ready_in = ~stall;
|
||||
|
||||
VX_generic_register #(
|
||||
.N(1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS))
|
||||
) nc_reg (
|
||||
|
@ -241,4 +271,6 @@ module VX_fp_noncomp #(
|
|||
.out ({valid_out, tag_out, result, has_fflags, fflags})
|
||||
);
|
||||
|
||||
assign ready_in = ~stall;
|
||||
|
||||
endmodule
|
38
hw/rtl/interfaces/VX_issue_if.v
Normal file
38
hw/rtl/interfaces/VX_issue_if.v
Normal file
|
@ -0,0 +1,38 @@
|
|||
`ifndef VX_ISSUE_IF
`define VX_ISSUE_IF

`include "VX_define.vh"

// Signal bundle describing one issued instruction.
// The interface only declares wires; it has no modports, so direction is
// determined by whichever module drives each signal.
interface VX_issue_if ();

    // Qualifies the rest of the bundle.
    wire                            valid;

    // Instruction identification.
    wire [`ITAG_BITS-1:0]           issue_tag;
    wire [`NW_BITS-1:0]             wid;
    wire [`NUM_THREADS-1:0]         thread_mask;
    wire [31:0]                     curr_PC;

    // Execution unit selector and operation within that unit.
    wire [`EX_BITS-1:0]             ex_type;
    wire [`OP_BITS-1:0]             ex_op;

    // Floating-point rounding mode field.
    wire [`FRM_BITS-1:0]            frm;

    // Write-back enable and destination register.
    wire                            wb;
    wire [`NR_BITS-1:0]             rd;

    // Per-thread source operand values.
    wire [`NUM_THREADS-1:0][31:0]   rs1_data;
    wire [`NUM_THREADS-1:0][31:0]   rs2_data;
    wire [`NUM_THREADS-1:0][31:0]   rs3_data;

    // Source register index and immediate operand.
    wire [`NR_BITS-1:0]             rs1;
    wire [31:0]                     imm;

    // Operand-select flags.
    wire                            rs1_is_PC;
    wire                            rs2_is_imm;

    // Thread index. The width macro needs the backtick like every other
    // width in this interface; "1NT_BITS" is not a legal expression.
    wire [`NT_BITS-1:0]             tid;

endinterface

`endif
|
47
hw/rtl/libs/VX_bypass_buffer.v
Normal file
47
hw/rtl/libs/VX_bypass_buffer.v
Normal file
|
@ -0,0 +1,47 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
// Single-entry bypass buffer on a valid/ready channel.
//
// While the buffer is empty, data_in is forwarded combinationally to
// data_out. If the consumer is not ready when a word arrives, the word is
// parked in the buffer and presented from there until it is taken.
//
// Parameters:
//   DATAW    - width of the data path in bits.
//   PASSTHRU - when non-zero the module degenerates to plain wires.
module VX_bypass_buffer #(
    parameter DATAW    = 1,
    parameter PASSTHRU = 0
) (
    input  wire             clk,
    input  wire             reset,
    input  wire             valid_in,
    output wire             ready_in,
    input  wire [DATAW-1:0] data_in,
    output wire [DATAW-1:0] data_out,
    input  wire             ready_out,
    output wire             valid_out
);
    if (PASSTHRU) begin
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)
        assign ready_in  = ready_out;
        assign valid_out = valid_in;
        assign data_out  = data_in;
    end else begin
        reg [DATAW-1:0] buffer;
        reg             buffer_valid;

        always @(posedge clk) begin
            if (reset) begin
                buffer_valid <= 0;
                buffer       <= 0;
            end else if (buffer_valid) begin
                // The buffered word is on data_out. It leaves when the
                // consumer is ready; in that same cycle ready_in is high,
                // so a word offered by the producer has been acknowledged
                // and must take the freed slot instead of being dropped.
                if (ready_out) begin
                    buffer_valid <= valid_in;
                    if (valid_in) begin
                        buffer <= data_in;
                    end
                end
                // When ready_out is low, ready_in is low as well: nothing
                // was accepted, so the buffered word is held untouched even
                // if the producer keeps valid_in asserted.
            end else if (valid_in && ~ready_out) begin
                // Empty buffer: the word was accepted (ready_in is high)
                // but the consumer could not take it, so park it.
                buffer       <= data_in;
                buffer_valid <= 1;
            end
        end

        assign ready_in  = ready_out || !buffer_valid;
        assign data_out  = buffer_valid ? buffer : data_in;
        assign valid_out = valid_in || buffer_valid;
    end

endmodule
|
|
@ -1,6 +1,6 @@
|
|||
set_time_format -unit ns -decimal_places 3
|
||||
|
||||
create_clock -name {clk} -period "240 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
|
||||
create_clock -name {clk} -period "200 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
|
||||
|
||||
derive_pll_clocks -create_base_clocks
|
||||
derive_clock_uncertainty
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue