ibuffer redesign to reduce critical path

This commit is contained in:
Blaise Tine 2023-07-05 20:01:46 -04:00
parent 3e645cee32
commit 964046dc31
9 changed files with 200 additions and 445 deletions

View file

@ -5,7 +5,8 @@ module VX_dispatch (
input wire reset,
// inputs
VX_dispatch_if.slave dispatch_if,
VX_ibuffer_if.slave ibuffer_if,
VX_gpr_stage_if.slave gpr_stage_if,
// outputs
VX_alu_exe_if.master alu_exe_if,
@ -32,17 +33,17 @@ module VX_dispatch (
.N (`NUM_THREADS),
.REVERSE (1)
) tid_select (
.data_in (dispatch_if.tmask),
.data_in (ibuffer_if.tmask),
.data_out (tid),
`UNUSED_PIN (valid_out)
);
wire [`XLEN-1:0] next_PC = dispatch_if.PC + 4;
wire [`XLEN-1:0] next_PC = ibuffer_if.PC + 4;
// ALU unit
wire alu_req_valid = dispatch_if.valid && (dispatch_if.ex_type == `EX_ALU);
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(dispatch_if.op_type);
wire alu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_ALU);
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `XLEN + `INST_ALU_BITS + `INST_MOD_BITS + `XLEN + 1 + 1 + `NR_BITS + 1 + `UP(`NT_BITS) + (2 * `NUM_THREADS * `XLEN)),
@ -52,16 +53,16 @@ module VX_dispatch (
.reset (reset),
.valid_in (alu_req_valid),
.ready_in (alu_req_ready),
.data_in ({dispatch_if.uuid, dispatch_if.wid, dispatch_if.tmask, dispatch_if.PC, next_PC, alu_op_type, dispatch_if.op_mod, dispatch_if.imm, dispatch_if.use_PC, dispatch_if.use_imm, dispatch_if.rd, dispatch_if.wb, tid, dispatch_if.rs1_data, dispatch_if.rs2_data}),
.data_out ({alu_exe_if.uuid, alu_exe_if.wid, alu_exe_if.tmask, alu_exe_if.PC, alu_exe_if.next_PC, alu_exe_if.op_type, alu_exe_if.op_mod, alu_exe_if.imm, alu_exe_if.use_PC, alu_exe_if.use_imm, alu_exe_if.rd, alu_exe_if.wb, alu_exe_if.tid, alu_exe_if.rs1_data, alu_exe_if.rs2_data}),
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_stage_if.rs1_data, gpr_stage_if.rs2_data}),
.data_out ({alu_exe_if.uuid, alu_exe_if.wid, alu_exe_if.tmask, alu_exe_if.PC, alu_exe_if.next_PC, alu_exe_if.op_type, alu_exe_if.op_mod, alu_exe_if.imm, alu_exe_if.use_PC, alu_exe_if.use_imm, alu_exe_if.rd, alu_exe_if.wb, alu_exe_if.tid, alu_exe_if.rs1_data, alu_exe_if.rs2_data}),
.valid_out (alu_exe_if.valid),
.ready_out (alu_exe_if.ready)
);
// lsu unit
wire lsu_req_valid = dispatch_if.valid && (dispatch_if.ex_type == `EX_LSU);
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(dispatch_if.op_type);
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `INST_LSU_BITS + `XLEN + `NR_BITS + 1 + `NUM_THREADS*`XLEN + `NUM_THREADS*`XLEN),
@ -71,22 +72,22 @@ module VX_dispatch (
.reset (reset),
.valid_in (lsu_req_valid),
.ready_in (lsu_req_ready),
.data_in ({dispatch_if.uuid, dispatch_if.wid, dispatch_if.tmask, dispatch_if.PC, lsu_op_type, dispatch_if.imm, dispatch_if.rd, dispatch_if.wb, dispatch_if.rs1_data, dispatch_if.rs2_data}),
.data_out ({lsu_exe_if.uuid, lsu_exe_if.wid, lsu_exe_if.tmask, lsu_exe_if.PC, lsu_exe_if.op_type, lsu_exe_if.offset, lsu_exe_if.rd, lsu_exe_if.wb, lsu_exe_if.base_addr, lsu_exe_if.store_data}),
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_stage_if.rs1_data, gpr_stage_if.rs2_data}),
.data_out ({lsu_exe_if.uuid, lsu_exe_if.wid, lsu_exe_if.tmask, lsu_exe_if.PC, lsu_exe_if.op_type, lsu_exe_if.offset, lsu_exe_if.rd, lsu_exe_if.wb, lsu_exe_if.base_addr, lsu_exe_if.store_data}),
.valid_out (lsu_exe_if.valid),
.ready_out (lsu_exe_if.ready)
);
// csr unit
wire csr_req_valid = dispatch_if.valid && (dispatch_if.ex_type == `EX_CSR);
wire [`INST_CSR_BITS-1:0] csr_op_type = `INST_CSR_BITS'(dispatch_if.op_type);
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = dispatch_if.imm[`VX_CSR_ADDR_BITS-1:0];
wire [`NRI_BITS-1:0] csr_imm = dispatch_if.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];
wire csr_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_CSR);
wire [`INST_CSR_BITS-1:0] csr_op_type = `INST_CSR_BITS'(ibuffer_if.op_type);
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = ibuffer_if.imm[`VX_CSR_ADDR_BITS-1:0];
wire [`NRI_BITS-1:0] csr_imm = ibuffer_if.imm[`VX_CSR_ADDR_BITS +: `NRI_BITS];
wire [`NUM_THREADS-1:0][31:0] csr_data;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign csr_data[i] = dispatch_if.rs1_data[i][31:0];
assign csr_data[i] = gpr_stage_if.rs1_data[i][31:0];
end
VX_skid_buffer #(
@ -97,8 +98,8 @@ module VX_dispatch (
.reset (reset),
.valid_in (csr_req_valid),
.ready_in (csr_req_ready),
.data_in ({dispatch_if.uuid, dispatch_if.wid, dispatch_if.tmask, dispatch_if.PC, csr_op_type, csr_addr, dispatch_if.rd, dispatch_if.wb, dispatch_if.use_imm, csr_imm, tid, csr_data}),
.data_out ({csr_exe_if.uuid, csr_exe_if.wid, csr_exe_if.tmask, csr_exe_if.PC, csr_exe_if.op_type, csr_exe_if.addr, csr_exe_if.rd, csr_exe_if.wb, csr_exe_if.use_imm, csr_exe_if.imm, csr_exe_if.tid, csr_exe_if.rs1_data}),
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, tid, csr_data}),
.data_out ({csr_exe_if.uuid, csr_exe_if.wid, csr_exe_if.tmask, csr_exe_if.PC, csr_exe_if.op_type, csr_exe_if.addr, csr_exe_if.rd, csr_exe_if.wb, csr_exe_if.use_imm, csr_exe_if.imm, csr_exe_if.tid, csr_exe_if.rs1_data}),
.valid_out (csr_exe_if.valid),
.ready_out (csr_exe_if.ready)
);
@ -106,10 +107,10 @@ module VX_dispatch (
// fpu unit
`ifdef EXT_F_ENABLE
wire fpu_req_valid = dispatch_if.valid && (dispatch_if.ex_type == `EX_FPU);
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(dispatch_if.op_type);
wire [`INST_FMT_BITS-1:0] fpu_fmt = dispatch_if.imm[`INST_FMT_BITS-1:0];
wire [`INST_FRM_BITS-1:0] fpu_frm = dispatch_if.op_mod[`INST_FRM_BITS-1:0];
wire fpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_FPU);
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
wire [`INST_FMT_BITS-1:0] fpu_fmt = ibuffer_if.imm[`INST_FMT_BITS-1:0];
wire [`INST_FRM_BITS-1:0] fpu_frm = ibuffer_if.op_mod[`INST_FRM_BITS-1:0];
VX_skid_buffer #(
.DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + `NR_BITS + (3 * `NUM_THREADS * `XLEN)),
@ -119,19 +120,19 @@ module VX_dispatch (
.reset (reset),
.valid_in (fpu_req_valid),
.ready_in (fpu_req_ready),
.data_in ({dispatch_if.uuid, dispatch_if.wid, dispatch_if.tmask, dispatch_if.PC, fpu_op_type, fpu_fmt, fpu_frm, dispatch_if.rd, dispatch_if.rs1_data, dispatch_if.rs2_data, dispatch_if.rs3_data}),
.data_out ({fpu_exe_if.uuid, fpu_exe_if.wid, fpu_exe_if.tmask, fpu_exe_if.PC, fpu_exe_if.op_type, fpu_exe_if.fmt, fpu_exe_if.frm, fpu_exe_if.rd, fpu_exe_if.rs1_data, fpu_exe_if.rs2_data, fpu_exe_if.rs3_data}),
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, fpu_fmt, fpu_frm, ibuffer_if.rd, gpr_stage_if.rs1_data, gpr_stage_if.rs2_data, gpr_stage_if.rs3_data}),
.data_out ({fpu_exe_if.uuid, fpu_exe_if.wid, fpu_exe_if.tmask, fpu_exe_if.PC, fpu_exe_if.op_type, fpu_exe_if.fmt, fpu_exe_if.frm, fpu_exe_if.rd, fpu_exe_if.rs1_data, fpu_exe_if.rs2_data, fpu_exe_if.rs3_data}),
.valid_out (fpu_exe_if.valid),
.ready_out (fpu_exe_if.ready)
);
`else
`UNUSED_VAR (dispatch_if.rs3_data)
`UNUSED_VAR (gpr_stage_if.rs3_data)
`endif
// gpu unit
wire gpu_req_valid = dispatch_if.valid && (dispatch_if.ex_type == `EX_GPU);
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(dispatch_if.op_type);
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
VX_skid_buffer #(
.DATAW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `XLEN + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `UP(`NT_BITS) + (3 * `NUM_THREADS * `XLEN)),
@ -141,8 +142,8 @@ module VX_dispatch (
.reset (reset),
.valid_in (gpu_req_valid),
.ready_in (gpu_req_ready),
.data_in ({dispatch_if.uuid, dispatch_if.wid, dispatch_if.tmask, dispatch_if.PC, next_PC, gpu_op_type, dispatch_if.op_mod, dispatch_if.rd, dispatch_if.wb, tid, dispatch_if.rs1_data, dispatch_if.rs2_data, dispatch_if.rs3_data}),
.data_out ({gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, gpu_exe_if.next_PC, gpu_exe_if.op_type, gpu_exe_if.op_mod, gpu_exe_if.rd, gpu_exe_if.wb, gpu_exe_if.tid, gpu_exe_if.rs1_data, gpu_exe_if.rs2_data, gpu_exe_if.rs3_data}),
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_stage_if.rs1_data, gpr_stage_if.rs2_data, gpr_stage_if.rs3_data}),
.data_out ({gpu_exe_if.uuid, gpu_exe_if.wid, gpu_exe_if.tmask, gpu_exe_if.PC, gpu_exe_if.next_PC, gpu_exe_if.op_type, gpu_exe_if.op_mod, gpu_exe_if.rd, gpu_exe_if.wb, gpu_exe_if.tid, gpu_exe_if.rs1_data, gpu_exe_if.rs2_data, gpu_exe_if.rs3_data}),
.valid_out (gpu_exe_if.valid),
.ready_out (gpu_exe_if.ready)
);
@ -150,7 +151,7 @@ module VX_dispatch (
// can take next request?
reg ready_r;
always @(*) begin
case (dispatch_if.ex_type)
case (ibuffer_if.ex_type)
`EX_LSU: ready_r = lsu_req_ready;
`EX_CSR: ready_r = csr_req_ready;
`ifdef EXT_F_ENABLE
@ -161,6 +162,6 @@ module VX_dispatch (
default: ready_r = alu_req_ready;
endcase
end
assign dispatch_if.ready = ready_r;
assign ibuffer_if.ready = ready_r;
endmodule

View file

@ -7,6 +7,7 @@ module VX_gpr_stage #(
input wire reset,
VX_writeback_if.slave writeback_if,
VX_ibuffer_if.gpr ibuffer_if,
VX_gpr_stage_if.slave gpr_stage_if
);
@ -27,14 +28,14 @@ module VX_gpr_stage #(
wire [RAM_ADDRW-1:0] waddr, raddr1, raddr2;
if (`NUM_WARPS > 1) begin
assign waddr = {writeback_if.wid, writeback_if.rd};
assign raddr1 = {gpr_stage_if.wid, gpr_stage_if.rs1};
assign raddr2 = {gpr_stage_if.wid, gpr_stage_if.rs2};
assign raddr1 = {ibuffer_if.wid, ibuffer_if.rs1};
assign raddr2 = {ibuffer_if.wid, ibuffer_if.rs2};
end else begin
`UNUSED_VAR (writeback_if.wid)
`UNUSED_VAR (gpr_stage_if.wid)
`UNUSED_VAR (ibuffer_if.wid)
assign waddr = writeback_if.rd;
assign raddr1 = gpr_stage_if.rs1;
assign raddr2 = gpr_stage_if.rs2;
assign raddr1 = ibuffer_if.rs1;
assign raddr2 = ibuffer_if.rs2;
end
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
@ -72,9 +73,9 @@ module VX_gpr_stage #(
`ifdef EXT_F_ENABLE
wire [RAM_ADDRW-1:0] raddr3;
if (`NUM_WARPS > 1) begin
assign raddr3 = {gpr_stage_if.wid, gpr_stage_if.rs3};
assign raddr3 = {ibuffer_if.wid, ibuffer_if.rs3};
end else begin
assign raddr3 = gpr_stage_if.rs3;
assign raddr3 = ibuffer_if.rs3;
end
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
@ -94,7 +95,7 @@ module VX_gpr_stage #(
);
end
`else
`UNUSED_VAR (gpr_stage_if.rs3)
`UNUSED_VAR (ibuffer_if.rs3)
assign gpr_stage_if.rs3_data = '0;
`endif

View file

@ -10,179 +10,19 @@ module VX_ibuffer #(
VX_decode_if.slave decode_if,
// outputs
VX_scoreboard_if.master scoreboard_if,
VX_ibuffer_if.master ibuffer_if
);
`UNUSED_PARAM (CORE_ID)
localparam NW_WIDTH = `UP(`NW_BITS);
localparam SIZE = (`IBUF_SIZE + 1);
localparam ALM_FULL = SIZE - 1;
localparam ALM_EMPTY = 1;
localparam DATAW = `UP(`UUID_BITS) + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
localparam ADDRW = $clog2(SIZE);
localparam NWARPSW = $clog2(`NUM_WARPS+1);
localparam NW_WIDTH = `UP(`NW_BITS);
localparam DATAW = `UP(`UUID_BITS) + `NUM_THREADS + `XLEN + `EX_BITS + `INST_OP_BITS + `INST_MOD_BITS + 1 + (`NR_BITS * 4) + `XLEN + 1 + 1;
`STATIC_ASSERT ((`IBUF_SIZE > 1), ("invalid parameter"))
wire [`NUM_WARPS-1:0] q_full, q_empty, q_alm_full, q_alm_empty;
wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_out;
wire [DATAW-1:0] q_data_in;
wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev;
reg [`NUM_WARPS-1:0][DATAW-1:0] q_data_out;
wire enq_fire = decode_if.valid && decode_if.ready;
wire deq_fire = ibuffer_if.valid && ibuffer_if.ready;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
reg [ADDRW-1:0] used_r;
reg full_r, empty_r, alm_full_r, alm_empty_r;
wire push = enq_fire && (i == decode_if.wid);
wire pop = deq_fire && (i == ibuffer_if.wid);
wire going_empty = empty_r || (alm_empty_r && pop);
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (SIZE-1),
.OUT_REG (1)
) queue (
.clk (clk),
.reset (reset),
.valid_in (push && !going_empty),
.data_in (q_data_in),
.ready_out(pop),
.data_out (q_data_prev[i]),
`UNUSED_PIN (ready_in),
`UNUSED_PIN (valid_out)
);
always @(posedge clk) begin
if (reset) begin
used_r <= '0;
full_r <= 0;
alm_full_r <= 0;
empty_r <= 1;
alm_empty_r <= 1;
end else begin
if (push) begin
if (!pop) begin
empty_r <= 0;
if (used_r == ADDRW'(ALM_EMPTY))
alm_empty_r <= 0;
if (used_r == ADDRW'(SIZE-1))
full_r <= 1;
if (used_r == ADDRW'(ALM_FULL-1))
alm_full_r <= 1;
end
end else if (pop) begin
full_r <= 0;
if (used_r == ADDRW'(ALM_FULL))
alm_full_r <= 0;
if (used_r == ADDRW'(1))
empty_r <= 1;
if (used_r == ADDRW'(ALM_EMPTY+1))
alm_empty_r <= 1;
end
used_r <= $signed(used_r) + ADDRW'($signed(2'(push) - 2'(pop)));
end
if (push && going_empty) begin
q_data_out[i] <= q_data_in;
end else if (pop) begin
q_data_out[i] <= q_data_prev[i];
end
end
wire [`NUM_WARPS-1:0] q_full, q_empty;
wire [`NUM_WARPS-1:0] deq_valid_in, deq_ready_in;
assign q_full[i] = full_r;
assign q_empty[i] = empty_r;
assign q_alm_full[i] = alm_full_r;
assign q_alm_empty[i] = alm_empty_r;
end
`UNUSED_VAR (q_alm_full)
///////////////////////////////////////////////////////////////////////////
reg [`NUM_WARPS-1:0] valid_table, valid_table_n;
reg [NW_WIDTH-1:0] deq_wid, deq_wid_n;
reg [NW_WIDTH-1:0] deq_wid_rr, deq_wid_rr_n;
reg deq_valid, deq_valid_n;
reg [DATAW-1:0] deq_instr, deq_instr_n;
reg [NWARPSW-1:0] num_warps;
`UNUSED_VAR (deq_instr)
// calculate valid table
always @(*) begin
valid_table_n = valid_table;
if (deq_fire) begin
valid_table_n[deq_wid] = !q_alm_empty[deq_wid];
end
if (enq_fire) begin
valid_table_n[decode_if.wid] = 1;
end
end
// round-robin warp scheduling
VX_rr_arbiter #(
.NUM_REQS (`NUM_WARPS)
) rr_arbiter (
.clk (clk),
.reset (reset),
.requests (valid_table_n),
.grant_index (deq_wid_rr_n),
`UNUSED_PIN (grant_valid),
`UNUSED_PIN (grant_onehot),
`UNUSED_PIN (unlock)
);
// schedule the next instruction to issue
always @(*) begin
if (num_warps > 1) begin
deq_valid_n = 1;
deq_wid_n = deq_wid_rr;
deq_instr_n = q_data_out[deq_wid_rr];
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
deq_valid_n = 1;
deq_wid_n = deq_wid;
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
end else begin
deq_valid_n = enq_fire;
deq_wid_n = decode_if.wid;
deq_instr_n = q_data_in;
end
end
wire warp_added = enq_fire && q_empty[decode_if.wid];
wire warp_removed = deq_fire && q_alm_empty[deq_wid] && ~(enq_fire && decode_if.wid == deq_wid);
always @(posedge clk) begin
if (reset) begin
valid_table <= '0;
deq_valid <= 0;
num_warps <= '0;
end else begin
valid_table <= valid_table_n;
deq_valid <= deq_valid_n;
if (warp_added && !warp_removed) begin
num_warps <= num_warps + NWARPSW'(1);
end else if (warp_removed && !warp_added) begin
num_warps <= num_warps - NWARPSW'(1);
end
end
deq_wid <= deq_wid_n;
deq_wid_rr <= deq_wid_rr_n;
deq_instr <= deq_instr_n;
end
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign decode_if.ibuf_pop[i] = deq_fire && (ibuffer_if.wid == NW_WIDTH'(i));
end
assign decode_if.ready = ~q_full[decode_if.wid];
assign q_data_in = {decode_if.uuid,
decode_if.tmask,
decode_if.PC,
@ -198,9 +38,68 @@ module VX_ibuffer #(
decode_if.rs2,
decode_if.rs3};
assign ibuffer_if.valid = deq_valid;
assign ibuffer_if.wid = deq_wid;
assign {ibuffer_if.uuid,
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
wire q_push = decode_if.valid && decode_if.ready && (i == decode_if.wid);
wire q_pop = deq_valid_in[i] && deq_ready_in[i];
VX_fifo_queue #(
.DATAW (DATAW),
.DEPTH (`IBUF_SIZE),
.OUT_REG (1)
) inst_queue (
.clk (clk),
.reset (reset),
.push (q_push),
.pop (q_pop),
.data_in (q_data_in),
.data_out (q_data_out[i]),
.full (q_full[i]),
.empty (q_empty[i]),
`UNUSED_PIN (alm_full),
`UNUSED_PIN (alm_empty),
`UNUSED_PIN (size)
);
assign decode_if.ibuf_pop[i] = q_pop;
end
assign decode_if.ready = ~q_full[decode_if.wid];
// scoreboard access
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign scoreboard_if.valid[i] = ~q_empty[i];
assign scoreboard_if.rd[i] = q_data_out[i][3*`NR_BITS +: `NR_BITS];
assign scoreboard_if.rs1[i] = q_data_out[i][2*`NR_BITS +: `NR_BITS];
assign scoreboard_if.rs2[i] = q_data_out[i][1*`NR_BITS +: `NR_BITS];
assign scoreboard_if.rs3[i] = q_data_out[i][0*`NR_BITS +: `NR_BITS];
end
// round-robin select
wire [`NUM_WARPS-1:0][(NW_WIDTH+DATAW)-1:0] deq_data_in;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign deq_valid_in[i] = scoreboard_if.valid[i] && scoreboard_if.ready[i];
assign deq_data_in[i] = {NW_WIDTH'(i), q_data_out[i]};
end
VX_stream_arb #(
.NUM_INPUTS (`NUM_WARPS),
.DATAW (NW_WIDTH+DATAW),
.ARBITER ("R"),
.LOCK_ENABLE (1),
.BUFFERED (3)
) req_arb (
.clk (clk),
.reset (reset),
.valid_in (deq_valid_in),
.ready_in (deq_ready_in),
.data_in (deq_data_in),
.data_out ({
ibuffer_if.wid,
ibuffer_if.uuid,
ibuffer_if.tmask,
ibuffer_if.PC,
ibuffer_if.ex_type,
@ -213,13 +112,9 @@ module VX_ibuffer #(
ibuffer_if.rd,
ibuffer_if.rs1,
ibuffer_if.rs2,
ibuffer_if.rs3} = deq_instr;
// scoreboard forwarding
assign ibuffer_if.wid_n = deq_wid_n;
assign ibuffer_if.rd_n = deq_instr_n[3*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs1_n = deq_instr_n[2*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs2_n = deq_instr_n[1*`NR_BITS +: `NR_BITS];
assign ibuffer_if.rs3_n = deq_instr_n[0*`NR_BITS +: `NR_BITS];
ibuffer_if.rs3}),
.valid_out (ibuffer_if.valid),
.ready_out (ibuffer_if.ready)
);
endmodule

View file

@ -27,50 +27,9 @@ module VX_issue #(
VX_gpu_exe_if.master gpu_exe_if
);
VX_ibuffer_if ibuffer_if();
VX_gpr_stage_if gpr_stage_if();
VX_scoreboard_if scoreboard_if();
VX_dispatch_if dispatch_if();
// scoreboard interface
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
assign scoreboard_if.uuid = ibuffer_if.uuid;
assign scoreboard_if.wid = ibuffer_if.wid;
assign scoreboard_if.tmask = ibuffer_if.tmask;
assign scoreboard_if.PC = ibuffer_if.PC;
assign scoreboard_if.wb = ibuffer_if.wb;
assign scoreboard_if.rd = ibuffer_if.rd;
assign scoreboard_if.rd_n = ibuffer_if.rd_n;
assign scoreboard_if.rs1_n = ibuffer_if.rs1_n;
assign scoreboard_if.rs2_n = ibuffer_if.rs2_n;
assign scoreboard_if.rs3_n = ibuffer_if.rs3_n;
assign scoreboard_if.wid_n = ibuffer_if.wid_n;
// GPR request interface
assign gpr_stage_if.wid = ibuffer_if.wid;
assign gpr_stage_if.rs1 = ibuffer_if.rs1;
assign gpr_stage_if.rs2 = ibuffer_if.rs2;
assign gpr_stage_if.rs3 = ibuffer_if.rs3;
// dispatch interface
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
assign dispatch_if.uuid = ibuffer_if.uuid;
assign dispatch_if.wid = ibuffer_if.wid;
assign dispatch_if.tmask = ibuffer_if.tmask;
assign dispatch_if.PC = ibuffer_if.PC;
assign dispatch_if.ex_type = ibuffer_if.ex_type;
assign dispatch_if.op_type = ibuffer_if.op_type;
assign dispatch_if.op_mod = ibuffer_if.op_mod;
assign dispatch_if.wb = ibuffer_if.wb;
assign dispatch_if.use_PC = ibuffer_if.use_PC;
assign dispatch_if.use_imm = ibuffer_if.use_imm;
assign dispatch_if.imm = ibuffer_if.imm;
assign dispatch_if.rd = ibuffer_if.rd;
assign dispatch_if.rs1_data = gpr_stage_if.rs1_data;
assign dispatch_if.rs2_data = gpr_stage_if.rs2_data;
assign dispatch_if.rs3_data = gpr_stage_if.rs3_data;
// issue the instruction
assign ibuffer_if.ready = scoreboard_if.ready && dispatch_if.ready;
VX_gpr_stage_if gpr_stage_if();
wire [3:0] used_regs;
`RESET_RELAY (ibuf_reset, reset);
`RESET_RELAY (scoreboard_reset, reset);
@ -83,31 +42,36 @@ module VX_issue #(
.clk (clk),
.reset (ibuf_reset),
.decode_if (decode_if),
.scoreboard_if (scoreboard_if),
.ibuffer_if (ibuffer_if)
);
VX_scoreboard #(
.CORE_ID (CORE_ID)
) scoreboard (
.clk (clk),
.reset (scoreboard_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if)
.clk (clk),
.reset (scoreboard_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if),
.ibuffer_if (ibuffer_if),
.used_regs (used_regs)
);
VX_gpr_stage #(
.CORE_ID (CORE_ID)
) gpr_stage (
.clk (clk),
.reset (gpr_reset),
.clk (clk),
.reset (gpr_reset),
.writeback_if (writeback_if),
.ibuffer_if (ibuffer_if),
.gpr_stage_if (gpr_stage_if)
);
VX_dispatch dispatch (
.clk (clk),
.reset (dispatch_reset),
.dispatch_if(dispatch_if),
.ibuffer_if (ibuffer_if),
.gpr_stage_if (gpr_stage_if),
.alu_exe_if (alu_exe_if),
.lsu_exe_if (lsu_exe_if),
.csr_exe_if (csr_exe_if),
@ -128,7 +92,7 @@ module VX_issue #(
`ifdef DBG_TRACE_CORE_PIPELINE
`TRACE(3, ("%d: *** core%0d-stall: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, wb=%0d, cycles=%0d, inuse=%b%b%b%b, dispatch=%b (#%0d)\n",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.tmask, ibuffer_if.rd, ibuffer_if.wb, timeout_ctr,
scoreboard_if.used_regs[0], scoreboard_if.used_regs[1], scoreboard_if.used_regs[2], scoreboard_if.used_regs[3], ~dispatch_if.ready, ibuffer_if.uuid));
used_regs[0], used_regs[1], used_regs[2], used_regs[3], ~ibuffer_if.ready, ibuffer_if.uuid));
`endif
timeout_ctr <= timeout_ctr + 1;
end else if (ibuffer_if_fire) begin
@ -139,14 +103,14 @@ module VX_issue #(
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT,
("%t: *** core%0d-issue-timeout: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, wb=%0d, inuse=%b%b%b%b, dispatch=%b (#%0d)",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.tmask, ibuffer_if.rd, ibuffer_if.wb,
scoreboard_if.used_regs[0], scoreboard_if.used_regs[1], scoreboard_if.used_regs[2], scoreboard_if.used_regs[3], ~dispatch_if.ready, ibuffer_if.uuid));
used_regs[0], used_regs[1], used_regs[2], used_regs[3], ~ibuffer_if.ready, ibuffer_if.uuid));
`ifdef DBG_SCOPE_ISSUE
if (CORE_ID == 0) begin
`ifdef SCOPE
localparam UUID_WIDTH = `UP(`UUID_BITS);
wire scoreboard_if_not_ready = ~scoreboard_if.ready;
wire dispatch_if_not_ready = ~dispatch_if.ready;
wire ibuffer_if_not_ready = ~ibuffer_if.ready;
wire writeback_if_valid = writeback_if.valid;
VX_scope_tap #(
.SCOPE_ID (2),
@ -163,7 +127,7 @@ module VX_issue #(
reset,
ibuffer_if_fire,
scoreboard_if_not_ready,
dispatch_if_not_ready,
ibuffer_if_not_ready,
writeback_if_valid
}),
.probes({
@ -180,9 +144,9 @@ module VX_issue #(
ibuffer_if.imm,
ibuffer_if.use_PC,
ibuffer_if.use_imm,
dispatch_if.rs1_data,
dispatch_if.rs2_data,
dispatch_if.rs3_data,
ibuffer_if.rs1_data,
ibuffer_if.rs2_data,
ibuffer_if.rs3_data,
writeback_if.uuid,
writeback_if.tmask,
writeback_if.rd,
@ -196,7 +160,7 @@ module VX_issue #(
`ifdef CHIPSCOPE
ila_issue ila_issue_inst (
.clk (clk),
.probe0 ({ibuffer_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, ibuffer_if.PC, ibuffer_if.tmask, ibuffer_if.wid, ibuffer_if.ex_type, ibuffer_if.op_type, ibuffer_if.ready, ibuffer_if.valid, scoreboard_if.used_regs, scoreboard_if.ready, dispatch_if.ready, ibuffer_if.ready, ibuffer_if.valid}),
.probe0 ({ibuffer_if.uuid, ibuffer.rs3, ibuffer.rs2, ibuffer.rs1, ibuffer_if.PC, ibuffer_if.tmask, ibuffer_if.wid, ibuffer_if.ex_type, ibuffer_if.op_type, ibuffer_if.ready, ibuffer_if.valid, used_regs, scoreboard_if.ready, ibuffer_if.ready, ibuffer_if.ready, ibuffer_if.valid}),
.probe1 ({writeback_if.uuid, writeback_if.data[0], writeback_if.PC, writeback_if.tmask, writeback_if.wid, writeback_if.eop, writeback_if.valid})
);
`endif
@ -234,8 +198,8 @@ module VX_issue #(
if (scoreboard_if.valid && ~scoreboard_if.ready) begin
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'(1);
end
if (dispatch_if.valid && ~dispatch_if.ready) begin
case (dispatch_if.ex_type)
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
case (ibuffer_if.ex_type)
`EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'(1);
`ifdef EXT_F_ENABLE
`EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'(1);
@ -262,18 +226,18 @@ module VX_issue #(
`ifdef DBG_TRACE_CORE_PIPELINE
always @(posedge clk) begin
if (dispatch_if.valid && dispatch_if.ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, dispatch_if.wid, dispatch_if.PC));
trace_ex_type(1, dispatch_if.ex_type);
if (ibuffer_if.valid && ibuffer_if.ready) begin
`TRACE(1, ("%d: core%0d-issue: wid=%0d, PC=0x%0h, ex=", $time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC));
trace_ex_type(1, ibuffer_if.ex_type);
`TRACE(1, (", op="));
trace_ex_op(1, dispatch_if.ex_type, dispatch_if.op_type, dispatch_if.op_mod, dispatch_if.imm);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", dispatch_if.op_mod, dispatch_if.tmask, dispatch_if.wb, dispatch_if.rd));
`TRACE_ARRAY1D(1, dispatch_if.rs1_data, `NUM_THREADS);
trace_ex_op(1, ibuffer_if.ex_type, ibuffer_if.op_type, ibuffer_if.op_mod, ibuffer_if.imm);
`TRACE(1, (", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1_data=", ibuffer_if.op_mod, ibuffer_if.tmask, ibuffer_if.wb, ibuffer_if.rd));
`TRACE_ARRAY1D(1, gpr_stage_if.rs1_data, `NUM_THREADS);
`TRACE(1, (", rs2_data="));
`TRACE_ARRAY1D(1, dispatch_if.rs2_data, `NUM_THREADS);
`TRACE_ARRAY1D(1, gpr_stage_if.rs2_data, `NUM_THREADS);
`TRACE(1, (", rs3_data="));
`TRACE_ARRAY1D(1, dispatch_if.rs3_data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", dispatch_if.uuid));
`TRACE_ARRAY1D(1, gpr_stage_if.rs3_data, `NUM_THREADS);
`TRACE(1, (" (#%0d)\n", ibuffer_if.uuid));
end
end
`endif

View file

@ -6,18 +6,22 @@ module VX_scoreboard #(
input wire clk,
input wire reset,
VX_writeback_if.slave writeback_if,
VX_scoreboard_if.slave scoreboard_if,
VX_writeback_if.slave writeback_if
VX_ibuffer_if.scoreboard ibuffer_if,
output wire [3:0] used_regs
);
localparam NW_WIDTH = `UP(`NW_BITS);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
wire reserve_reg = scoreboard_if.valid && scoreboard_if.ready && scoreboard_if.wb;
wire reserve_reg = ibuffer_if.valid && ibuffer_if.ready && ibuffer_if.wb;
wire release_reg = writeback_if.valid && writeback_if.ready && writeback_if.eop;
always @(*) begin
inuse_regs_n = inuse_regs;
if (reserve_reg) begin
inuse_regs_n[scoreboard_if.wid][scoreboard_if.rd] = 1;
inuse_regs_n[ibuffer_if.wid][ibuffer_if.rd] = 1;
end
if (release_reg) begin
inuse_regs_n[writeback_if.wid][writeback_if.rd] = 0;
@ -31,32 +35,30 @@ module VX_scoreboard #(
inuse_regs <= inuse_regs_n;
end
end
reg deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3;
always @(posedge clk) begin
deq_inuse_rd <= inuse_regs_n[scoreboard_if.wid_n][scoreboard_if.rd_n];
deq_inuse_rs1 <= inuse_regs_n[scoreboard_if.wid_n][scoreboard_if.rs1_n];
deq_inuse_rs2 <= inuse_regs_n[scoreboard_if.wid_n][scoreboard_if.rs2_n];
deq_inuse_rs3 <= inuse_regs_n[scoreboard_if.wid_n][scoreboard_if.rs3_n];
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
assign scoreboard_if.ready[i] = ~(inuse_regs_n[i][scoreboard_if.rd[i]]
| inuse_regs_n[i][scoreboard_if.rs1[i]]
| inuse_regs_n[i][scoreboard_if.rs2[i]]
| inuse_regs_n[i][scoreboard_if.rs3[i]]);
end
assign writeback_if.ready = 1'b1;
wire [NW_WIDTH-1:0] wid_sel;
VX_lzc #(
.N (`NUM_WARPS),
.REVERSE (1)
) wid_select (
.data_in (scoreboard_if.valid),
.data_out (wid_sel),
`UNUSED_PIN (valid_out)
);
assign scoreboard_if.ready = ~(deq_inuse_rd
| deq_inuse_rs1
| deq_inuse_rs2
| deq_inuse_rs3);
assign scoreboard_if.used_regs[0] = deq_inuse_rd;
assign scoreboard_if.used_regs[1] = deq_inuse_rs1;
assign scoreboard_if.used_regs[2] = deq_inuse_rs2;
assign scoreboard_if.used_regs[3] = deq_inuse_rs3;
assign used_regs[0] = inuse_regs_n[wid_sel][scoreboard_if.rd[wid_sel]];
assign used_regs[1] = inuse_regs_n[wid_sel][scoreboard_if.rs1[wid_sel]];
assign used_regs[2] = inuse_regs_n[wid_sel][scoreboard_if.rs2[wid_sel]];
assign used_regs[3] = inuse_regs_n[wid_sel][scoreboard_if.rs3[wid_sel]];
`UNUSED_VAR (writeback_if.PC)
`UNUSED_VAR (scoreboard_if.PC)
`UNUSED_VAR (scoreboard_if.tmask)
`UNUSED_VAR (scoreboard_if.uuid)
always @(posedge clk) begin
if (release_reg) begin

View file

@ -1,65 +0,0 @@
`include "VX_define.vh"
// Signal bundle with a valid/ready handshake.
// Every signal except `ready` is an output of the `master` modport and an
// input of the `slave` modport; `ready` goes the opposite way.
interface VX_dispatch_if ();

    // request strobe
    wire                                valid;

    // scalar fields (one value per transfer)
    wire [`UP(`UUID_BITS)-1:0]          uuid;
    wire [`UP(`NW_BITS)-1:0]            wid;
    wire [`NUM_THREADS-1:0]             tmask;      // one bit per thread
    wire [`XLEN-1:0]                    PC;
    wire [`EX_BITS-1:0]                 ex_type;
    wire [`INST_OP_BITS-1:0]            op_type;
    wire [`INST_MOD_BITS-1:0]           op_mod;
    wire                                wb;
    wire                                use_PC;
    wire                                use_imm;
    wire [`XLEN-1:0]                    imm;
    wire [`NR_BITS-1:0]                 rd;

    // packed arrays: `NUM_THREADS entries of `XLEN bits each
    wire [`NUM_THREADS-1:0][`XLEN-1:0]  rs1_data;
    wire [`NUM_THREADS-1:0][`XLEN-1:0]  rs2_data;
    wire [`NUM_THREADS-1:0][`XLEN-1:0]  rs3_data;

    // back-pressure response
    wire                                ready;

    // Producer side: drives the payload, samples `ready`.
    modport master (
        output valid, uuid, wid, tmask, PC,
               ex_type, op_type, op_mod,
               wb, use_PC, use_imm, imm, rd,
               rs1_data, rs2_data, rs3_data,
        input  ready
    );

    // Consumer side: samples the payload, drives `ready`.
    modport slave (
        input  valid, uuid, wid, tmask, PC,
               ex_type, op_type, op_mod,
               wb, use_PC, use_imm, imm, rd,
               rs1_data, rs2_data, rs3_data,
        output ready
    );

endinterface

View file

@ -2,32 +2,17 @@
interface VX_gpr_stage_if ();
wire [`UP(`NW_BITS)-1:0] wid;
wire [`NR_BITS-1:0] rs1;
wire [`NR_BITS-1:0] rs2;
wire [`NR_BITS-1:0] rs3;
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
wire [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
modport master (
output wid,
output rs1,
output rs2,
output rs3,
input rs1_data,
input rs2_data,
input rs3_data
);
modport slave (
input wid,
input rs1,
input rs2,
input rs3,
output rs1_data,
output rs2_data,
output rs3_data

View file

@ -18,12 +18,6 @@ interface VX_ibuffer_if ();
wire [`NR_BITS-1:0] rs1;
wire [`NR_BITS-1:0] rs2;
wire [`NR_BITS-1:0] rs3;
wire [`NR_BITS-1:0] rd_n;
wire [`NR_BITS-1:0] rs1_n;
wire [`NR_BITS-1:0] rs2_n;
wire [`NR_BITS-1:0] rs3_n;
wire [`UP(`NW_BITS)-1:0] wid_n;
wire ready;
@ -44,12 +38,7 @@ interface VX_ibuffer_if ();
output rs1,
output rs2,
output rs3,
output rd_n,
output rs1_n,
output rs2_n,
output rs3_n,
output wid_n,
input ready
input ready
);
modport slave (
@ -68,13 +57,23 @@ interface VX_ibuffer_if ();
input rd,
input rs1,
input rs2,
input rs3,
input rd_n,
input rs1_n,
input rs2_n,
input rs3_n,
input wid_n,
input rs3,
output ready
);
modport scoreboard (
input valid,
input wid,
input wb,
input rd,
output ready
);
modport gpr (
input wid,
input rs1,
input rs2,
input rs3
);
endinterface

View file

@ -2,55 +2,28 @@
interface VX_scoreboard_if ();
wire valid;
wire [`UP(`UUID_BITS)-1:0] uuid;
wire [`UP(`NW_BITS)-1:0] wid;
wire [`NUM_THREADS-1:0] tmask;
wire [`XLEN-1:0] PC;
wire wb;
wire [`NR_BITS-1:0] rd;
wire [`NR_BITS-1:0] rd_n;
wire [`NR_BITS-1:0] rs1_n;
wire [`NR_BITS-1:0] rs2_n;
wire [`NR_BITS-1:0] rs3_n;
wire [`UP(`NW_BITS)-1:0] wid_n;
wire [3:0] used_regs;
wire ready;
wire [`NUM_WARPS-1:0] valid;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] rd;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] rs1;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] rs2;
wire [`NUM_WARPS-1:0][`NR_BITS-1:0] rs3;
wire [`NUM_WARPS-1:0] ready;
modport master (
output valid,
output uuid,
output wid,
output tmask,
output PC,
output wb,
output rd,
output rd_n,
output rs1_n,
output rs2_n,
output rs3_n,
output wid_n,
input used_regs,
output rs1,
output rs2,
output rs3,
input ready
);
modport slave (
input valid,
input uuid,
input wid,
input tmask,
input PC,
input wb,
input rd,
input rd_n,
input rs1_n,
input rs2_n,
input rs3_n,
input wid_n,
output used_regs,
input rs1,
input rs2,
input rs3,
output ready
);