adding lsu_slice

This commit is contained in:
Blaise Tine 2024-03-18 01:11:03 -07:00
parent 6556e8c66d
commit d65f6e064a
2 changed files with 561 additions and 520 deletions

543
hw/rtl/core/VX_lsu_slice.sv Normal file
View file

@ -0,0 +1,543 @@
// Copyright © 2019-2023
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
`include "VX_define.vh"
module VX_lsu_slice import VX_gpu_pkg::*; #(
parameter CORE_ID = 0,
parameter BLOCK_ID = 0
) (
`SCOPE_IO_DECL
input wire clk,
input wire reset,
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_commit_if.master commit_if,
VX_mem_bus_if.master cache_bus_if [DCACHE_CHANNELS]
);
localparam WORD_SIZE = `XLEN / 8;
localparam ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(WORD_SIZE);
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + PID_WIDTH + LSUQ_SIZEW;
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_st_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_ld_if();
`UNUSED_VAR (execute_if.data.op_mod)
`UNUSED_VAR (execute_if.data.use_PC)
`UNUSED_VAR (execute_if.data.use_imm)
`UNUSED_VAR (execute_if.data.rs3_data)
`UNUSED_VAR (execute_if.data.tid)
// full address calculation
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if.data.rs1_data[i][`XLEN-1:0] + execute_if.data.imm;
end
// address type calculation
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
wire mem_req_empty;
wire st_rsp_ready;
wire lsu_valid, lsu_ready;
// fence: stall the pipeline until all pending requests are sent
wire is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
wire fence_wait = is_fence && ~mem_req_empty;
assign lsu_valid = execute_if.valid && ~fence_wait;
assign execute_if.ready = lsu_ready && ~fence_wait;
// schedule memory request
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
reg [NUM_LANES-1:0][WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
wire mem_rsp_ready;
assign mem_req_valid = lsu_valid;
assign lsu_ready = mem_req_ready
&& (~mem_req_rw || st_rsp_ready); // writes commit directly
assign mem_req_mask = execute_if.data.tmask;
assign mem_req_rw = ~execute_if.data.wb;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
// address formatting
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_byteen[i] = '0;
case (`INST_LSU_WSIZE(execute_if.data.op_type))
0: begin // 8-bit
mem_req_byteen[i][req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {WORD_SIZE{1'b1}};
endcase
end
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if.valid && execute_if.ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if.data.wid, execute_if.data.PC, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid));
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if.data.rs2_data[i];
case (req_align[i])
1: mem_req_data[i][`XLEN-1:8] = execute_if.data.rs2_data[i][`XLEN-9:0];
2: mem_req_data[i][`XLEN-1:16] = execute_if.data.rs2_data[i][`XLEN-17:0];
3: mem_req_data[i][`XLEN-1:24] = execute_if.data.rs2_data[i][`XLEN-25:0];
`ifdef XLEN_64
4: mem_req_data[i][`XLEN-1:32] = execute_if.data.rs2_data[i][`XLEN-33:0];
5: mem_req_data[i][`XLEN-1:40] = execute_if.data.rs2_data[i][`XLEN-41:0];
6: mem_req_data[i][`XLEN-1:48] = execute_if.data.rs2_data[i][`XLEN-49:0];
7: mem_req_data[i][`XLEN-1:56] = execute_if.data.rs2_data[i][`XLEN-57:0];
`endif
default:;
endcase
end
end
// track SOP/EOP for out-of-order memory responses
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
if (PID_BITS != 0) begin
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && execute_if.data.wb;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if.data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if.data.eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire full;
VX_allocator #(
.SIZE (`LSUQ_IN_SIZE)
) pkt_allocator (
.clk (clk),
.reset (reset),
.acquire_en (mem_req_rd_eop_fire),
.acquire_addr(pkt_waddr),
.release_en (mem_rsp_eop_pkt),
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
);
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rd_during_wr) begin
if (mem_req_rd_fire) begin
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
end
end
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (pkt_raddr)
end
// pack memory request tag
assign mem_req_tag = {
execute_if.data.uuid,
execute_if.data.wid,
execute_if.data.PC,
execute_if.data.rd,
execute_if.data.op_type,
req_align, execute_if.data.pid,
pkt_waddr
};
wire [DCACHE_CHANNELS-1:0] cache_req_valid;
wire [DCACHE_CHANNELS-1:0] cache_req_rw;
wire [DCACHE_CHANNELS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
wire [DCACHE_CHANNELS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_CHANNELS-1:0][`ADDR_TYPE_WIDTH-1:0] cache_req_atype;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_CHANNELS-1:0] cache_req_ready;
wire [DCACHE_CHANNELS-1:0] cache_rsp_valid;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_rsp_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_CHANNELS-1:0] cache_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched%0d", CORE_ID, BLOCK_ID)),
.CORE_REQS (`NUM_LSU_LANES),
.MEM_CHANNELS(DCACHE_CHANNELS),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_BUF (2)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
// Input request
.core_req_valid (mem_req_valid),
.core_req_rw (mem_req_rw),
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
.core_req_empty (mem_req_empty),
`UNUSED_PIN (core_req_sent),
// Output response
.core_rsp_valid (mem_rsp_valid),
.core_rsp_mask (mem_rsp_mask),
.core_rsp_data (mem_rsp_data),
.core_rsp_tag (mem_rsp_tag),
.core_rsp_sop (mem_rsp_sop),
.core_rsp_eop (mem_rsp_eop),
.core_rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (cache_req_valid),
.mem_req_rw (cache_req_rw),
.mem_req_byteen (cache_req_byteen),
.mem_req_addr (cache_req_addr),
.mem_req_atype (cache_req_atype),
.mem_req_data (cache_req_data),
.mem_req_tag (cache_req_tag),
.mem_req_ready (cache_req_ready),
// Memory response
.mem_rsp_valid (cache_rsp_valid),
.mem_rsp_data (cache_rsp_data),
.mem_rsp_tag (cache_rsp_tag),
.mem_rsp_ready (cache_rsp_ready)
);
for (genvar i = 0; i < DCACHE_CHANNELS; ++i) begin
assign cache_bus_if[i].req_valid = cache_req_valid[i];
assign cache_bus_if[i].req_data.rw = cache_req_rw[i];
assign cache_bus_if[i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[i].req_data.atype = cache_req_atype[i];
assign cache_bus_if[i].req_data.data = cache_req_data[i];
assign cache_bus_if[i].req_data.tag = cache_req_tag[i];
assign cache_req_ready[i] = cache_bus_if[i].req_ready;
assign cache_rsp_valid[i] = cache_bus_if[i].rsp_valid;
assign cache_rsp_data[i] = cache_bus_if[i].rsp_data.data;
assign cache_rsp_tag[i] = cache_bus_if[i].rsp_data.tag;
assign cache_bus_if[i].rsp_ready = cache_rsp_ready[i];
end
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [`XLEN-1:0] rsp_pc;
wire [`NR_BITS-1:0] rsp_rd;
wire [`INST_LSU_BITS-1:0] rsp_op_type;
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
wire [PID_WIDTH-1:0] rsp_pid;
`UNUSED_VAR (rsp_op_type)
// unpack memory response tag
assign {
rsp_uuid,
rsp_wid,
rsp_pc, rsp_rd,
rsp_op_type,
rsp_align,
rsp_pid,
pkt_raddr
} = mem_rsp_tag;
// load response formatting
reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
`ifdef XLEN_64
`ifdef EXT_F_ENABLE
// apply nan-boxing to flw outputs
wire rsp_is_float = rsp_rd[5];
`else
wire rsp_is_float = 0;
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
`else
wire [31:0] rsp_data32 = mem_rsp_data[i];
`endif
wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_op_type))
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
`ifdef XLEN_64
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
`else
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
`endif
default: rsp_data[i] = 'x;
endcase
end
end
// load commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (2)
) ld_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
.valid_out (commit_ld_if.valid),
.ready_out (commit_ld_if.ready)
);
assign commit_ld_if.data.wb = 1'b1;
// store commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
.SIZE (2)
) st_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_fire && mem_req_rw),
.ready_in (st_rsp_ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop}),
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
.valid_out (commit_st_if.valid),
.ready_out (commit_st_if.ready)
);
assign commit_st_if.data.rd = '0;
assign commit_st_if.data.wb = 1'b0;
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
// lsu commit
`RESET_RELAY (commit_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (commit_arb_reset),
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
.data_in ({commit_st_if.data, commit_ld_if.data}),
.data_out (commit_if.data),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready),
`UNUSED_PIN (sel_out)
);
`ifdef DBG_TRACE_CORE_DCACHE
always @(posedge clk) begin
if (execute_if.valid && fence_wait) begin
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if.data.wid, execute_if.data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if.data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if.data.wid, execute_if.data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if.data.rd, mem_req_tag, execute_if.data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
end
`endif
`ifdef DBG_SCOPE_LSU
if (CORE_ID == 0 && BLOCK_ID == 0) begin
`ifdef SCOPE
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes({execute_if.data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
wire [31:0] full_addr_0 = full_addr[0];
wire [31:0] mem_req_data_0 = mem_req_data[0];
wire [31:0] rsp_data_0 = rsp_data[0];
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({mem_req_data_0, execute_if.data.uuid, execute_if.data.wid, execute_if.data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, mem_rsp_mask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
.probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
endmodule

View file

@ -28,29 +28,12 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_mem_bus_if.master cache_bus_if [DCACHE_NUM_REQS]
);
localparam WORD_SIZE = `XLEN / 8;
localparam ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(WORD_SIZE);
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
localparam REQ_ASHIFT = `CLOG2(WORD_SIZE);
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
// tag_id = wid + PC + rd + op_type + align + pid + pkt_addr
localparam TAG_ID_WIDTH = `NW_WIDTH + `XLEN + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * (REQ_ASHIFT)) + PID_WIDTH + LSUQ_SIZEW;
// tag = uuid + tag_id
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
localparam BLOCK_SIZE = `NUM_LSU_BLOCKS;
localparam NUM_LANES = `NUM_LSU_LANES;
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) execute_if[BLOCK_SIZE]();
`RESET_RELAY (dispatch_reset, reset);
) per_block_execute_if[BLOCK_SIZE]();
VX_dispatch_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
@ -58,522 +41,37 @@ module VX_lsu_unit import VX_gpu_pkg::*; #(
.OUT_BUF (1)
) dispatch_unit (
.clk (clk),
.reset (dispatch_reset),
.reset (reset),
.dispatch_if(dispatch_if),
.execute_if (execute_if)
.execute_if (per_block_execute_if)
);
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_block_if[BLOCK_SIZE]();
) per_block_commit_if[BLOCK_SIZE]();
for (genvar block_idx = 0; block_idx < BLOCK_SIZE; ++block_idx) begin
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_st_if();
VX_commit_if #(
.NUM_LANES (NUM_LANES)
) commit_ld_if();
`UNUSED_VAR (execute_if[block_idx].data.op_mod)
`UNUSED_VAR (execute_if[block_idx].data.use_PC)
`UNUSED_VAR (execute_if[block_idx].data.use_imm)
`UNUSED_VAR (execute_if[block_idx].data.rs3_data)
`UNUSED_VAR (execute_if[block_idx].data.tid)
// full address calculation
wire [NUM_LANES-1:0][`XLEN-1:0] full_addr;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign full_addr[i] = execute_if[block_idx].data.rs1_data[i][`XLEN-1:0] + execute_if[block_idx].data.imm;
end
// address type calculation
wire [NUM_LANES-1:0][`ADDR_TYPE_WIDTH-1:0] mem_req_atype;
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
// is I/O address
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_IO] = (block_addr >= io_addr_start);
`ifdef LMEM_ENABLE
// is local memory address
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
assign mem_req_atype[i][`ADDR_TYPE_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
`endif
end
wire mem_req_empty;
wire st_rsp_ready;
wire lsu_valid, lsu_ready;
// fence: stall the pipeline until all pending requests are sent
wire is_fence = `INST_LSU_IS_FENCE(execute_if[block_idx].data.op_type);
wire fence_wait = is_fence && ~mem_req_empty;
assign lsu_valid = execute_if[block_idx].valid && ~fence_wait;
assign execute_if[block_idx].ready = lsu_ready && ~fence_wait;
// schedule memory request
wire mem_req_valid;
wire [NUM_LANES-1:0] mem_req_mask;
wire mem_req_rw;
wire [NUM_LANES-1:0][`MEM_ADDR_WIDTH-REQ_ASHIFT-1:0] mem_req_addr;
reg [NUM_LANES-1:0][WORD_SIZE-1:0] mem_req_byteen;
reg [NUM_LANES-1:0][`XLEN-1:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
wire mem_rsp_valid;
wire [NUM_LANES-1:0] mem_rsp_mask;
wire [NUM_LANES-1:0][`XLEN-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_sop;
wire mem_rsp_eop;
wire mem_rsp_ready;
assign mem_req_valid = lsu_valid;
assign lsu_ready = mem_req_ready
&& (~mem_req_rw || st_rsp_ready); // writes commit directly
assign mem_req_mask = execute_if[block_idx].data.tmask;
assign mem_req_rw = ~execute_if[block_idx].data.wb;
wire mem_req_fire = mem_req_valid && mem_req_ready;
wire mem_rsp_fire = mem_rsp_valid && mem_rsp_ready;
`UNUSED_VAR (mem_req_fire)
`UNUSED_VAR (mem_rsp_fire)
// address formatting
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign req_align[i] = full_addr[i][REQ_ASHIFT-1:0];
assign mem_req_addr[i] = full_addr[i][`MEM_ADDR_WIDTH-1:REQ_ASHIFT];
end
// byte enable formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_byteen[i] = '0;
case (`INST_LSU_WSIZE(execute_if[block_idx].data.op_type))
0: begin // 8-bit
mem_req_byteen[i][req_align[i]] = 1'b1;
end
1: begin // 16 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b0}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:1], 1'b1}] = 1'b1;
end
`ifdef XLEN_64
2: begin // 32 bit
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b00}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b01}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b10}] = 1'b1;
mem_req_byteen[i][{req_align[i][REQ_ASHIFT-1:2], 2'b11}] = 1'b1;
end
`endif
default : mem_req_byteen[i] = {WORD_SIZE{1'b1}};
endcase
end
end
// memory misalignment not supported!
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire lsu_req_fire = execute_if[block_idx].valid && execute_if[block_idx].ready;
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if[block_idx].data.tmask[i] || is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if[block_idx].data.op_type))) == 0),
("misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
execute_if[block_idx].data.wid, execute_if[block_idx].data.PC, full_addr[i], `INST_LSU_WSIZE(execute_if[block_idx].data.op_type), execute_if[block_idx].data.uuid));
end
// store data formatting
for (genvar i = 0; i < NUM_LANES; ++i) begin
always @(*) begin
mem_req_data[i] = execute_if[block_idx].data.rs2_data[i];
case (req_align[i])
1: mem_req_data[i][`XLEN-1:8] = execute_if[block_idx].data.rs2_data[i][`XLEN-9:0];
2: mem_req_data[i][`XLEN-1:16] = execute_if[block_idx].data.rs2_data[i][`XLEN-17:0];
3: mem_req_data[i][`XLEN-1:24] = execute_if[block_idx].data.rs2_data[i][`XLEN-25:0];
`ifdef XLEN_64
4: mem_req_data[i][`XLEN-1:32] = execute_if[block_idx].data.rs2_data[i][`XLEN-33:0];
5: mem_req_data[i][`XLEN-1:40] = execute_if[block_idx].data.rs2_data[i][`XLEN-41:0];
6: mem_req_data[i][`XLEN-1:48] = execute_if[block_idx].data.rs2_data[i][`XLEN-49:0];
7: mem_req_data[i][`XLEN-1:56] = execute_if[block_idx].data.rs2_data[i][`XLEN-57:0];
`endif
default:;
endcase
end
end
// track SOP/EOP for out-of-order memory responses
wire [LSUQ_SIZEW-1:0] pkt_waddr, pkt_raddr;
wire mem_rsp_sop_pkt, mem_rsp_eop_pkt;
if (PID_BITS != 0) begin
reg [`LSUQ_IN_SIZE-1:0][PID_BITS:0] pkt_ctr;
reg [`LSUQ_IN_SIZE-1:0] pkt_sop, pkt_eop;
wire mem_req_rd_fire = mem_req_fire && execute_if[block_idx].data.wb;
wire mem_req_rd_sop_fire = mem_req_rd_fire && execute_if[block_idx].data.sop;
wire mem_req_rd_eop_fire = mem_req_rd_fire && execute_if[block_idx].data.eop;
wire mem_rsp_eop_fire = mem_rsp_fire && mem_rsp_eop;
wire full;
VX_allocator #(
.SIZE (`LSUQ_IN_SIZE)
) pkt_allocator (
.clk (clk),
.reset (reset),
.acquire_en (mem_req_rd_eop_fire),
.acquire_addr(pkt_waddr),
.release_en (mem_rsp_eop_pkt),
.release_addr(pkt_raddr),
`UNUSED_PIN (empty),
.full (full)
);
wire rd_during_wr = mem_req_rd_fire && mem_rsp_eop_fire && (pkt_raddr == pkt_waddr);
always @(posedge clk) begin
if (reset) begin
pkt_ctr <= '0;
pkt_sop <= '0;
pkt_eop <= '0;
end else begin
if (mem_req_rd_sop_fire) begin
pkt_sop[pkt_waddr] <= 1;
end
if (mem_req_rd_eop_fire) begin
pkt_eop[pkt_waddr] <= 1;
end
if (mem_rsp_fire) begin
pkt_sop[pkt_raddr] <= 0;
end
if (mem_rsp_eop_pkt) begin
pkt_eop[pkt_raddr] <= 0;
end
if (~rd_during_wr) begin
if (mem_req_rd_fire) begin
pkt_ctr[pkt_waddr] <= pkt_ctr[pkt_waddr] + PID_BITS'(1);
end
if (mem_rsp_eop_fire) begin
pkt_ctr[pkt_raddr] <= pkt_ctr[pkt_raddr] - PID_BITS'(1);
end
end
end
end
assign mem_rsp_sop_pkt = pkt_sop[pkt_raddr];
assign mem_rsp_eop_pkt = mem_rsp_eop_fire && pkt_eop[pkt_raddr] && (pkt_ctr[pkt_raddr] == 1);
`RUNTIME_ASSERT(~(mem_req_rd_fire && full), ("allocator full!"))
`RUNTIME_ASSERT(~mem_req_rd_sop_fire || 0 == pkt_ctr[pkt_waddr], ("Oops!"))
`UNUSED_VAR (mem_rsp_sop)
end else begin
assign pkt_waddr = 0;
assign mem_rsp_sop_pkt = mem_rsp_sop;
assign mem_rsp_eop_pkt = mem_rsp_eop;
`UNUSED_VAR (pkt_raddr)
end
// pack memory request tag
assign mem_req_tag = {
execute_if[block_idx].data.uuid,
execute_if[block_idx].data.wid,
execute_if[block_idx].data.PC,
execute_if[block_idx].data.rd,
execute_if[block_idx].data.op_type,
req_align, execute_if[block_idx].data.pid,
pkt_waddr
};
wire [DCACHE_CHANNELS-1:0] cache_req_valid;
wire [DCACHE_CHANNELS-1:0] cache_req_rw;
wire [DCACHE_CHANNELS-1:0][DCACHE_WORD_SIZE-1:0] cache_req_byteen;
wire [DCACHE_CHANNELS-1:0][DCACHE_ADDR_WIDTH-1:0] cache_req_addr;
wire [DCACHE_CHANNELS-1:0][`ADDR_TYPE_WIDTH-1:0] cache_req_atype;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_req_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_req_tag;
wire [DCACHE_CHANNELS-1:0] cache_req_ready;
wire [DCACHE_CHANNELS-1:0] cache_rsp_valid;
wire [DCACHE_CHANNELS-1:0][(DCACHE_WORD_SIZE*8)-1:0] cache_rsp_data;
wire [DCACHE_CHANNELS-1:0][DCACHE_TAG_WIDTH-1:0] cache_rsp_tag;
wire [DCACHE_CHANNELS-1:0] cache_rsp_ready;
`RESET_RELAY (mem_scheduler_reset, reset);
VX_mem_scheduler #(
.INSTANCE_ID ($sformatf("core%0d-lsu-memsched%0d", CORE_ID, block_idx)),
.CORE_REQS (`NUM_LSU_LANES),
.MEM_CHANNELS(DCACHE_CHANNELS),
.WORD_SIZE (WORD_SIZE),
.LINE_SIZE (DCACHE_WORD_SIZE),
.ADDR_WIDTH (ADDR_WIDTH),
.ATYPE_WIDTH (`ADDR_TYPE_WIDTH),
.TAG_WIDTH (TAG_WIDTH),
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
.UUID_WIDTH (`UUID_WIDTH),
.RSP_PARTIAL (1),
.MEM_OUT_BUF (2)
) mem_scheduler (
.clk (clk),
.reset (mem_scheduler_reset),
// Input request
.core_req_valid (mem_req_valid),
.core_req_rw (mem_req_rw),
.core_req_mask (mem_req_mask),
.core_req_byteen(mem_req_byteen),
.core_req_addr (mem_req_addr),
.core_req_atype (mem_req_atype),
.core_req_data (mem_req_data),
.core_req_tag (mem_req_tag),
.core_req_ready (mem_req_ready),
.core_req_empty (mem_req_empty),
`UNUSED_PIN (core_req_sent),
// Output response
.core_rsp_valid (mem_rsp_valid),
.core_rsp_mask (mem_rsp_mask),
.core_rsp_data (mem_rsp_data),
.core_rsp_tag (mem_rsp_tag),
.core_rsp_sop (mem_rsp_sop),
.core_rsp_eop (mem_rsp_eop),
.core_rsp_ready (mem_rsp_ready),
// Memory request
.mem_req_valid (cache_req_valid),
.mem_req_rw (cache_req_rw),
.mem_req_byteen (cache_req_byteen),
.mem_req_addr (cache_req_addr),
.mem_req_atype (cache_req_atype),
.mem_req_data (cache_req_data),
.mem_req_tag (cache_req_tag),
.mem_req_ready (cache_req_ready),
// Memory response
.mem_rsp_valid (cache_rsp_valid),
.mem_rsp_data (cache_rsp_data),
.mem_rsp_tag (cache_rsp_tag),
.mem_rsp_ready (cache_rsp_ready)
`RESET_RELAY (slice_reset, reset);
VX_lsu_slice #(
.CORE_ID (CORE_ID),
.BLOCK_ID (block_idx)
) lsu_slice(
.clk (clk),
.reset (slice_reset),
.execute_if (per_block_execute_if[block_idx]),
.commit_if (per_block_commit_if[block_idx]),
.cache_bus_if (cache_bus_if[block_idx * DCACHE_CHANNELS +: DCACHE_CHANNELS])
);
for (genvar i = 0; i < DCACHE_CHANNELS; ++i) begin
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_valid = cache_req_valid[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.rw = cache_req_rw[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.byteen = cache_req_byteen[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.addr = cache_req_addr[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.atype = cache_req_atype[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.data = cache_req_data[i];
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_data.tag = cache_req_tag[i];
assign cache_req_ready[i] = cache_bus_if[block_idx * DCACHE_CHANNELS + i].req_ready;
assign cache_rsp_valid[i] = cache_bus_if[block_idx * DCACHE_CHANNELS + i].rsp_valid;
assign cache_rsp_data[i] = cache_bus_if[block_idx * DCACHE_CHANNELS + i].rsp_data.data;
assign cache_rsp_tag[i] = cache_bus_if[block_idx * DCACHE_CHANNELS + i].rsp_data.tag;
assign cache_bus_if[block_idx * DCACHE_CHANNELS + i].rsp_ready = cache_rsp_ready[i];
end
wire [`UUID_WIDTH-1:0] rsp_uuid;
wire [`NW_WIDTH-1:0] rsp_wid;
wire [`XLEN-1:0] rsp_pc;
wire [`NR_BITS-1:0] rsp_rd;
wire [`INST_LSU_BITS-1:0] rsp_op_type;
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
wire [PID_WIDTH-1:0] rsp_pid;
`UNUSED_VAR (rsp_op_type)
// unpack memory response tag
assign {
rsp_uuid,
rsp_wid,
rsp_pc, rsp_rd,
rsp_op_type,
rsp_align,
rsp_pid,
pkt_raddr
} = mem_rsp_tag;
// load response formatting
reg [NUM_LANES-1:0][`XLEN-1:0] rsp_data;
`ifdef XLEN_64
`ifdef EXT_F_ENABLE
// apply nan-boxing to flw outputs
wire rsp_is_float = rsp_rd[5];
`else
wire rsp_is_float = 0;
`endif
`endif
for (genvar i = 0; i < NUM_LANES; i++) begin
`ifdef XLEN_64
wire [63:0] rsp_data64 = mem_rsp_data[i];
wire [31:0] rsp_data32 = (rsp_align[i][2] ? mem_rsp_data[i][63:32] : mem_rsp_data[i][31:0]);
`else
wire [31:0] rsp_data32 = mem_rsp_data[i];
`endif
wire [15:0] rsp_data16 = rsp_align[i][1] ? rsp_data32[31:16] : rsp_data32[15:0];
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
always @(*) begin
case (`INST_LSU_FMT(rsp_op_type))
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
`ifdef XLEN_64
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
`else
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
`endif
default: rsp_data[i] = 'x;
endcase
end
end
// load commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
.SIZE (2)
) ld_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_rsp_valid),
.ready_in (mem_rsp_ready),
.data_in ({rsp_uuid, rsp_wid, mem_rsp_mask, rsp_pc, rsp_rd, rsp_data, rsp_pid, mem_rsp_sop_pkt, mem_rsp_eop_pkt}),
.data_out ({commit_ld_if.data.uuid, commit_ld_if.data.wid, commit_ld_if.data.tmask, commit_ld_if.data.PC, commit_ld_if.data.rd, commit_ld_if.data.data, commit_ld_if.data.pid, commit_ld_if.data.sop, commit_ld_if.data.eop}),
.valid_out (commit_ld_if.valid),
.ready_out (commit_ld_if.ready)
);
assign commit_ld_if.data.wb = 1'b1;
// store commit
VX_elastic_buffer #(
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + PID_WIDTH + 1 + 1),
.SIZE (2)
) st_rsp_buf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_fire && mem_req_rw),
.ready_in (st_rsp_ready),
.data_in ({execute_if[block_idx].data.uuid, execute_if[block_idx].data.wid, execute_if[block_idx].data.tmask, execute_if[block_idx].data.PC, execute_if[block_idx].data.pid, execute_if[block_idx].data.sop, execute_if[block_idx].data.eop}),
.data_out ({commit_st_if.data.uuid, commit_st_if.data.wid, commit_st_if.data.tmask, commit_st_if.data.PC, commit_st_if.data.pid, commit_st_if.data.sop, commit_st_if.data.eop}),
.valid_out (commit_st_if.valid),
.ready_out (commit_st_if.ready)
);
assign commit_st_if.data.rd = '0;
assign commit_st_if.data.wb = 1'b0;
assign commit_st_if.data.data = commit_ld_if.data.data; // force arbiter passthru
// lsu commit
`RESET_RELAY (commit_arb_reset, reset);
VX_stream_arb #(
.NUM_INPUTS (2),
.DATAW (RSP_ARB_DATAW),
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (commit_arb_reset),
.valid_in ({commit_st_if.valid, commit_ld_if.valid}),
.ready_in ({commit_st_if.ready, commit_ld_if.ready}),
.data_in ({commit_st_if.data, commit_ld_if.data}),
.data_out (commit_block_if[block_idx].data),
.valid_out (commit_block_if[block_idx].valid),
.ready_out (commit_block_if[block_idx].ready),
`UNUSED_PIN (sel_out)
);
`ifdef DBG_TRACE_CORE_DCACHE
always @(posedge clk) begin
if (execute_if[block_idx].valid && fence_wait) begin
`TRACE(1, ("%d: *** D$%0d fence wait\n", $time, CORE_ID));
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[block_idx].data.wid, execute_if[block_idx].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, data=", mem_req_byteen));
`TRACE_ARRAY1D(1, "0x%0h", mem_req_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_req_tag, execute_if[block_idx].data.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, execute_if[block_idx].data.wid, execute_if[block_idx].data.PC, mem_req_mask));
`TRACE_ARRAY1D(1, "0x%h", full_addr, NUM_LANES);
`TRACE(1, (", atype="));
`TRACE_ARRAY1D(1, "%b", mem_req_atype, NUM_LANES);
`TRACE(1, (", byteen=0x%0h, rd=%0d, tag=0x%0h (#%0d)\n", mem_req_byteen, execute_if[block_idx].data.rd, mem_req_tag, execute_if[block_idx].data.uuid));
end
end
if (mem_rsp_fire) begin
`TRACE(1, ("%d: D$%0d Rsp: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, sop=%b, eop=%b, data=",
$time, CORE_ID, rsp_wid, rsp_pc, mem_rsp_mask, rsp_rd, mem_rsp_sop, mem_rsp_eop));
`TRACE_ARRAY1D(1, "0x%0h", mem_rsp_data, NUM_LANES);
`TRACE(1, (", tag=0x%0h (#%0d)\n", mem_rsp_tag, rsp_uuid));
end
end
`endif
`ifdef DBG_SCOPE_LSU
if (CORE_ID == 0 && block_idx == 0) begin
`ifdef SCOPE
VX_scope_tap #(
.SCOPE_ID (3),
.TRIGGERW (3),
.PROBEW (`UUID_WIDTH+NUM_LANES*(`XLEN+4+`XLEN)+1+`UUID_WIDTH+NUM_LANES*`XLEN)
) scope_tap (
.clk(clk),
.reset(scope_reset),
.start(1'b0),
.stop(1'b0),
.triggers({reset, mem_req_fire, mem_rsp_fire}),
.probes({execute_if[0].data.uuid, full_addr, mem_req_rw, mem_req_byteen, mem_req_data, rsp_uuid, rsp_data}),
.bus_in(scope_bus_in),
.bus_out(scope_bus_out)
);
`endif
`ifdef CHIPSCOPE
wire [31:0] full_addr_0 = full_addr[0];
wire [31:0] mem_req_data_0 = mem_req_data[0];
wire [31:0] rsp_data_0 = rsp_data[0];
ila_lsu ila_lsu_inst (
.clk (clk),
.probe0 ({mem_req_data_0, execute_if[0].data.uuid, execute_if[0].data.wid, execute_if[0].data.PC, mem_req_mask, full_addr_0, mem_req_byteen, mem_req_rw, mem_req_ready, mem_req_valid}),
.probe1 ({rsp_data_0, rsp_uuid, mem_rsp_eop, rsp_pc, rsp_rd, mem_rsp_mask, rsp_wid, mem_rsp_ready, mem_rsp_valid}),
.probe2 ({cache_bus_if.req_data.data, cache_bus_if.req_data.tag, cache_bus_if.req_data.byteen, cache_bus_if.req_data.addr, cache_bus_if.req_data.rw, cache_bus_if.req_ready, cache_bus_if.req_valid}),
.probe3 ({cache_bus_if.rsp_data.data, cache_bus_if.rsp_data.tag, cache_bus_if.rsp_ready, cache_bus_if.rsp_valid})
);
`endif
end
`else
`SCOPE_IO_UNUSED()
`endif
end
`RESET_RELAY (commit_reset, reset);
VX_gather_unit #(
.BLOCK_SIZE (BLOCK_SIZE),
.NUM_LANES (NUM_LANES),
.OUT_BUF (3)
) gather_unit (
.clk (clk),
.reset (commit_reset),
.commit_in_if (commit_block_if),
.reset (reset),
.commit_in_if (per_block_commit_if),
.commit_out_if (commit_if)
);