mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
memory fence update
This commit is contained in:
parent
b1ae82bae5
commit
f6663d6618
4 changed files with 98 additions and 64 deletions
|
@ -132,21 +132,21 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
//`define INST_ALU_UNUSED 4'b0001
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
//`define INST_ALU_UNUSED 4'b0110
|
||||
`define INST_ALU_SUB 4'b0111
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_CZEQ 4'b1010
|
||||
`define INST_ALU_CZNE 4'b1011
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
`define INST_ALU_CZEQ 4'b1010
|
||||
`define INST_ALU_CZNE 4'b1011
|
||||
//`define INST_ALU_UNUSED 4'b0001
|
||||
//`define INST_ALU_UNUSED 4'b0110
|
||||
|
||||
|
||||
`define ALU_TYPE_BITS 2
|
||||
|
@ -300,9 +300,10 @@
|
|||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`define ADDR_TYPE_IO 0
|
||||
`define ADDR_TYPE_LOCAL 1
|
||||
`define ADDR_TYPE_WIDTH (`LMEM_ENABLED + 1)
|
||||
`define ADDR_TYPE_FLUSH 0
|
||||
`define ADDR_TYPE_IO 1
|
||||
`define ADDR_TYPE_LOCAL 2 // should be last since optional
|
||||
`define ADDR_TYPE_WIDTH (`ADDR_TYPE_LOCAL + `LMEM_ENABLED)
|
||||
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
|
|
|
@ -92,7 +92,8 @@ package VX_gpu_pkg;
|
|||
} fpu_mod_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [($bits(alu_mod_t)-1-`OFFSET_BITS)-1:0] __padding;
|
||||
logic [($bits(alu_mod_t)-1-1-`OFFSET_BITS)-1:0] __padding;
|
||||
logic is_store;
|
||||
logic is_float;
|
||||
logic [`OFFSET_BITS-1:0] offset;
|
||||
} lsu_mod_t;
|
||||
|
|
|
@ -304,6 +304,9 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_FENCE: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_LSU_FENCE;
|
||||
op_mod.lsu.is_store = 0;
|
||||
op_mod.lsu.is_float = 0;
|
||||
op_mod.lsu.offset = 0;
|
||||
end
|
||||
`INST_SYS : begin
|
||||
if (func3[1:0] != 0) begin
|
||||
|
@ -338,6 +341,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_L: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b0, func3});
|
||||
op_mod.lsu.is_store = 0;
|
||||
op_mod.lsu.is_float = opcode[2];
|
||||
op_mod.lsu.offset = u_12;
|
||||
use_rd = 1;
|
||||
|
@ -355,6 +359,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`INST_S: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b1, func3});
|
||||
op_mod.lsu.is_store = 1;
|
||||
op_mod.lsu.is_float = opcode[2];
|
||||
op_mod.lsu.offset = s_imm;
|
||||
`USED_IREG (rs1);
|
||||
|
|
|
@ -38,19 +38,19 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
|
||||
|
||||
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW;
|
||||
// tag_id = wid + PC + wb + rd + op_type + align + pid + pkt_addr + fence
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + 1 + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW + 1;
|
||||
|
||||
// tag = uuid + tag_id
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_st_if();
|
||||
) commit_rsp_if();
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
) commit_ld_if();
|
||||
) commit_no_rsp_if();
|
||||
|
||||
`UNUSED_VAR (execute_if.data.op_mod)
|
||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||
|
@ -58,6 +58,8 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
// full address calculation
|
||||
|
||||
wire req_is_fence, rsp_is_fence;
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
assign full_addr[i] = execute_if.data.rs1_data[i] + `SEXT(`XLEN, execute_if.data.op_mod.lsu.offset);
|
||||
|
@ -70,6 +72,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
|
||||
// is I/O address
|
||||
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
|
||||
assign mem_req_atype[i][`ADDR_TYPE_FLUSH] = req_is_fence;
|
||||
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start);
|
||||
`ifdef LMEM_ENABLE
|
||||
// is local memory address
|
||||
|
@ -79,17 +82,6 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
end
|
||||
|
||||
wire mem_req_empty;
|
||||
wire st_rsp_ready;
|
||||
wire lsu_valid, lsu_ready;
|
||||
|
||||
// fence: stall the pipeline until all pending requests are sent
|
||||
wire is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
|
||||
wire fence_wait = is_fence && ~mem_req_empty;
|
||||
|
||||
assign lsu_valid = execute_if.valid && ~fence_wait;
|
||||
assign execute_if.ready = lsu_ready && ~fence_wait;
|
||||
|
||||
// schedule memory request
|
||||
|
||||
wire mem_req_valid;
|
||||
|
@ -109,18 +101,53 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire mem_rsp_eop;
|
||||
wire mem_rsp_ready;
|
||||
|
||||
assign mem_req_valid = lsu_valid;
|
||||
assign lsu_ready = mem_req_ready
|
||||
&& (~mem_req_rw || st_rsp_ready); // writes commit directly
|
||||
|
||||
assign mem_req_mask = execute_if.data.tmask;
|
||||
assign mem_req_rw = ~execute_if.data.wb;
|
||||
|
||||
wire mem_req_fire = mem_req_valid && mem_req_ready;
|
||||
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
|
||||
`UNUSED_VAR (mem_req_fire)
|
||||
`UNUSED_VAR (mem_rsp_fire)
|
||||
|
||||
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
|
||||
wire no_rsp_buf_valid, no_rsp_buf_ready;
|
||||
|
||||
// fence handling
|
||||
|
||||
reg fence_lock;
|
||||
|
||||
assign req_is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
fence_lock <= 0;
|
||||
end else begin
|
||||
if (mem_req_fire && req_is_fence && execute_if.data.eop) begin
|
||||
fence_lock <= 1;
|
||||
end
|
||||
if (mem_rsp_fire && rsp_is_fence && mem_rsp_eop_pkt) begin
|
||||
fence_lock <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire req_skip = req_is_fence && ~execute_if.data.eop;
|
||||
wire no_rsp_buf_use = (mem_req_rw && ~execute_if.data.wb) || req_skip;
|
||||
|
||||
assign mem_req_valid = execute_if.valid
|
||||
&& ~req_skip
|
||||
&& ~(no_rsp_buf_use && ~no_rsp_buf_ready)
|
||||
&& ~fence_lock;
|
||||
|
||||
assign no_rsp_buf_valid = execute_if.valid
|
||||
&& no_rsp_buf_use
|
||||
&& (req_skip || mem_req_ready)
|
||||
&& ~fence_lock;
|
||||
|
||||
assign execute_if.ready = (mem_req_ready || req_skip)
|
||||
&& ~(no_rsp_buf_use && ~no_rsp_buf_ready)
|
||||
&& ~fence_lock;
|
||||
|
||||
assign mem_req_mask = execute_if.data.tmask;
|
||||
assign mem_req_rw = execute_if.data.op_mod.lsu.is_store;
|
||||
|
||||
// address formatting
|
||||
|
||||
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
|
||||
|
@ -158,7 +185,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
// memory misalignment not supported!
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin
|
||||
wire lsu_req_fire = execute_if.valid && execute_if.ready;
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
|
||||
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
|
||||
execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
|
||||
end
|
||||
|
@ -185,13 +212,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
// track SOP/EOP for out-of-order memory responses
|
||||
|
||||
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
|
||||
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
|
||||
|
||||
if (PID_BITS != 0) begin
|
||||
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
|
||||
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
|
||||
|
||||
wire mem_req_rd_fire = mem_req_fire && execute_if.data.wb;
|
||||
wire mem_req_rd_fire = mem_req_fire && ~mem_req_rw;
|
||||
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
|
||||
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
|
||||
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
|
||||
|
@ -258,10 +284,13 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
execute_if.data.uuid,
|
||||
execute_if.data.wid,
|
||||
execute_if.data.PC,
|
||||
execute_if.data.wb,
|
||||
execute_if.data.rd,
|
||||
execute_if.data.op_type,
|
||||
req_align, execute_if.data.pid,
|
||||
pkt_waddr
|
||||
req_align,
|
||||
execute_if.data.pid,
|
||||
pkt_waddr,
|
||||
req_is_fence
|
||||
};
|
||||
|
||||
wire lsu_mem_req_valid;
|
||||
|
@ -311,7 +340,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.core_req_data (mem_req_data),
|
||||
.core_req_tag (mem_req_tag),
|
||||
.core_req_ready (mem_req_ready),
|
||||
.core_req_empty (mem_req_empty),
|
||||
`UNUSED_PIN (core_req_empty),
|
||||
`UNUSED_PIN (core_req_sent),
|
||||
|
||||
// Output response
|
||||
|
@ -361,6 +390,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] rsp_wid;
|
||||
wire [`PC_BITS-1:0] rsp_pc;
|
||||
wire rsp_wb;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire [`INST_LSU_BITS-1:0] rsp_op_type;
|
||||
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
|
||||
|
@ -371,11 +401,14 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
assign {
|
||||
rsp_uuid,
|
||||
rsp_wid,
|
||||
rsp_pc, rsp_rd,
|
||||
rsp_pc,
|
||||
rsp_wb,
|
||||
rsp_rd,
|
||||
rsp_op_type,
|
||||
rsp_align,
|
||||
rsp_pid,
|
||||
pkt_raddr
|
||||
pkt_raddr,
|
||||
rsp_is_fence
|
||||
} = mem_rsp_tag;
|
||||
|
||||
// load response formatting
|
||||
|
@ -419,44 +452,38 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
// load commit
|
||||
// commit
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.SIZE (2)
|
||||
) ld_rsp_buf (
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_rsp_valid),
|
||||
.ready_in (mem_rsp_ready),
|
||||
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
|
||||
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
|
||||
.valid_out (commit_ld_if.valid),
|
||||
.ready_out (commit_ld_if.ready)
|
||||
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_wb, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
|
||||
.data_out ({commit_rsp_if.data.uuid, commit_rsp_if.data.wid, commit_rsp_if.data.tmask, commit_rsp_if.data.PC, commit_rsp_if.data.wb, commit_rsp_if.data.rd, commit_rsp_if.data.data, commit_rsp_if.data.pid, commit_rsp_if.data.sop, commit_rsp_if.data.eop}),
|
||||
.valid_out (commit_rsp_if.valid),
|
||||
.ready_out (commit_rsp_if.ready)
|
||||
);
|
||||
|
||||
assign commit_ld_if.data.wb = 1'b1;
|
||||
|
||||
// store commit
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (2)
|
||||
) st_rsp_buf (
|
||||
) no_rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (mem_req_fire && mem_req_rw),
|
||||
.ready_in (st_rsp_ready),
|
||||
.valid_in (no_rsp_buf_valid),
|
||||
.ready_in (no_rsp_buf_ready),
|
||||
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
|
||||
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
|
||||
.valid_out (commit_st_if.valid),
|
||||
.ready_out (commit_st_if.ready)
|
||||
.data_out ({commit_no_rsp_if.data.uuid, commit_no_rsp_if.data.wid, commit_no_rsp_if.data.tmask, commit_no_rsp_if.data.PC, commit_no_rsp_if.data.pid, commit_no_rsp_if.data.sop, commit_no_rsp_if.data.eop}),
|
||||
.valid_out (commit_no_rsp_if.valid),
|
||||
.ready_out (commit_no_rsp_if.ready)
|
||||
);
|
||||
assign commit_st_if.data.rd = '0;
|
||||
assign commit_st_if.data.wb = 1'b0;
|
||||
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
|
||||
|
||||
// lsu commit
|
||||
assign commit_no_rsp_if.data.rd = '0;
|
||||
assign commit_no_rsp_if.data.wb = 1'b0;
|
||||
assign commit_no_rsp_if.data.data = commit_rsp_if.data.data; // arbiter MUX optimization
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
|
@ -465,9 +492,9 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
|
||||
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
|
||||
.data_in ({commit_st_if.data, commit_ld_if.data}),
|
||||
.valid_in ({commit_no_rsp_if.valid, commit_rsp_if.valid}),
|
||||
.ready_in ({commit_no_rsp_if.ready, commit_rsp_if.ready}),
|
||||
.data_in ({commit_no_rsp_if.data, commit_rsp_if.data}),
|
||||
.data_out (commit_if.data),
|
||||
.valid_out (commit_if.valid),
|
||||
.ready_out (commit_if.ready),
|
||||
|
@ -476,7 +503,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
always @(posedge clk) begin
|
||||
if (execute_if.valid && fence_wait) begin
|
||||
if (execute_if.valid && fence_lock) begin
|
||||
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
|
||||
end
|
||||
if (mem_req_fire) begin
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue