minor updates

This commit is contained in:
Blaise Tine 2022-05-30 06:25:48 -04:00
parent 2cfae6313e
commit 0c92092af6
17 changed files with 188 additions and 185 deletions

View file

@ -1,4 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
`IGNORE_WARNINGS_END
module VX_core #(
parameter CORE_ID = 0

View file

@ -1,7 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`include "VX_fpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
import VX_fpu_types::*;
`IGNORE_WARNINGS_END

View file

@ -1,4 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
`IGNORE_WARNINGS_END
module VX_csr_unit #(
parameter CORE_ID = 0

View file

@ -1,4 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
`IGNORE_WARNINGS_END
module VX_execute #(
parameter CORE_ID = 0

View file

@ -1,4 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
`IGNORE_WARNINGS_END
module VX_fetch #(
parameter CORE_ID = 0

View file

@ -31,7 +31,7 @@ module VX_lsu_unit #(
localparam STACK_START_W = MEM_ADDRW'(`STACK_BASE_ADDR >> MEM_ASHIFT);
localparam STACK_END_W = MEM_ADDRW'((`STACK_BASE_ADDR - TOTAL_STACK_SIZE) >> MEM_ASHIFT);
// req_uuid, req_addr_type, req_wid, req_pc, req_tmask, req_rd, req_op_type, req_align, req_is_dup
// uuid, addr_type, wid, PC, tmask, rd, op_type, align, is_dup
localparam TAG_WIDTH = `UUID_BITS + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1;
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
@ -40,23 +40,7 @@ module VX_lsu_unit #(
`STATIC_ASSERT(`STACK_SIZE == `MEM_BLOCK_SIZE * (`STACK_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter"))
`STATIC_ASSERT(`SMEM_LOCAL_SIZE >= `MEM_BLOCK_SIZE, ("invalid parameter"))
wire req_valid;
wire [`UUID_BITS-1:0] req_uuid;
wire [`NUM_THREADS-1:0] req_tmask;
wire [`NUM_THREADS-1:0][31:0] req_addr;
wire [`INST_LSU_BITS-1:0] req_op_type;
wire [`NUM_THREADS-1:0][31:0] req_data;
wire [`NR_BITS-1:0] req_rd;
wire req_wb;
wire [`NW_BITS-1:0] req_wid;
wire [31:0] req_pc;
wire req_is_dup;
wire req_ready;
wire mem_req_empty;
wire mem_rsp_eop;
wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type, req_addr_type;
wire [`NUM_THREADS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] lsu_addr_type;
// full address calculation
@ -98,26 +82,14 @@ module VX_lsu_unit #(
`endif
end
wire mem_req_empty;
wire lsu_valid, lsu_ready;
// fence: stall the pipeline until all pending requests are sent
wire fence_wait = lsu_req_if.is_fence && (req_valid || ~mem_req_empty);
wire fence_wait = lsu_req_if.is_fence && ~mem_req_empty;
wire stall_in = req_valid && ~req_ready;
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
VX_pipe_register #(
.DATAW (1 + 1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1)
) req_pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_in),
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.store_data}),
.data_out ({req_valid, req_is_dup, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_op_type, req_rd, req_wb, req_data})
);
// Can accept new request?
assign lsu_req_if.ready = ~stall_in && ~fence_wait;
assign lsu_valid = lsu_req_if.valid && ~fence_wait;
assign lsu_req_if.ready = lsu_ready && ~fence_wait;
// schedule memory request
@ -125,8 +97,8 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0] mem_req_mask;
wire mem_req_rw;
wire [`NUM_THREADS-1:0][29:0] mem_req_addr;
wire [`NUM_THREADS-1:0][3:0] mem_req_byteen;
wire [`NUM_THREADS-1:0][31:0] mem_req_data;
reg [`NUM_THREADS-1:0][3:0] mem_req_byteen;
reg [`NUM_THREADS-1:0][31:0] mem_req_data;
wire [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
@ -134,32 +106,33 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0] mem_rsp_mask;
wire [`NUM_THREADS-1:0][31:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_eop;
wire mem_rsp_ready;
assign mem_req_valid = req_valid;
assign req_ready = mem_req_ready;
assign mem_req_valid = lsu_valid;
assign lsu_ready = mem_req_ready;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign mem_req_mask[i] = req_tmask[i] && (~req_is_dup || (i == 0));
assign mem_req_mask[i] = lsu_req_if.tmask[i] && (~lsu_is_dup || (i == 0));
end
assign mem_req_rw = ~req_wb;
assign mem_req_rw = ~lsu_req_if.wb;
// address formatting
wire [`NUM_THREADS-1:0][REQ_ASHIFT-1:0] req_align;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign mem_req_addr[i] = req_addr[i][31:2];
assign req_align[i] = req_addr[i][1:0];
assign req_align[i] = full_addr[i][1:0];
assign mem_req_addr[i] = full_addr[i][31:2];
end
// data formatting
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
always @(*) begin
mem_req_byteen[i] = {4{req_wb}};
case (`INST_LSU_WSIZE(req_op_type))
mem_req_byteen[i] = {4{lsu_req_if.wb}};
case (`INST_LSU_WSIZE(lsu_req_if.op_type))
0: mem_req_byteen[i][req_align[i]] = 1;
1: begin
mem_req_byteen[i][req_align[i]] = 1;
@ -170,17 +143,17 @@ module VX_lsu_unit #(
end
always @(*) begin
mem_req_data[i] = req_data[i];
mem_req_data[i] = lsu_req_if.store_data[i];
case (req_align[i])
1: mem_req_data[i][31:8] = req_data[i][23:0];
2: mem_req_data[i][31:16] = req_data[i][15:0];
3: mem_req_data[i][31:24] = req_data[i][7:0];
1: mem_req_data[i][31:8] = lsu_req_if.store_data[i][23:0];
2: mem_req_data[i][31:16] = lsu_req_if.store_data[i][15:0];
3: mem_req_data[i][31:24] = lsu_req_if.store_data[i][7:0];
default:;
endcase
end
end
assign mem_req_tag = {req_uuid, req_addr_type, req_wid, req_tmask, req_pc, req_rd, req_op_type, req_align, req_is_dup};
assign mem_req_tag = {lsu_req_if.uuid, lsu_addr_type, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.rd, lsu_req_if.op_type, req_align, lsu_is_dup};
VX_cache_req_if #(
.NUM_REQS (`DCACHE_NUM_REQS),
@ -195,6 +168,7 @@ module VX_lsu_unit #(
) cache_rsp_tmp_if();
VX_mem_scheduler #(
.INSTANCE_ID($sformatf("core%0d-lsu-memsched", CORE_ID)),
.NUM_REQS (`LSU_MEM_REQS),
.NUM_BANKS (`DCACHE_NUM_REQS),
.ADDR_WIDTH (`DCACHE_ADDR_WIDTH),
@ -257,30 +231,33 @@ module VX_lsu_unit #(
wire [`CLOG2(`LSUQ_SIZE)-1:0] cache_req_tag, cache_rsp_tag;
if (`DCACHE_NUM_BATCHES > 1) begin
wire [`DCACHE_NUM_BATCHES-1:0][`DCACHE_NUM_REQS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_w, cache_rsp_type_w;
wire [`DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
wire [`DCACHE_NUM_BATCHES-1:0][`DCACHE_NUM_REQS-1:0][`CACHE_ADDR_TYPE_BITS-1:0] cache_req_type_b, cache_rsp_type_b;
wire [`DCACHE_BATCH_SEL_BITS-1:0] cache_req_bid, cache_rsp_bid;
for (genvar j = 0; j < `DCACHE_NUM_BATCHES; ++j) begin
localparam k = j * `DCACHE_NUM_REQS + i;
if (k < `NUM_THREADS) begin
assign cache_req_type_w[j][i] = cache_req_type[k];
assign cache_rsp_type[k] = cache_rsp_type_w[j][i];
assign cache_req_type_b[j][i] = cache_req_type[k];
assign cache_rsp_type[k] = cache_rsp_type_b[j][i];
end else begin
assign cache_req_type_w[j][i] = 'x;
`UNUSED_VAR (cache_rsp_type_w[j][i])
assign cache_req_type_b[j][i] = 'x;
`UNUSED_VAR (cache_rsp_type_b[j][i])
end
end
wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_rsp_type_bi = cache_rsp_type_b[cache_rsp_bid][i];
assign {cache_req_uuid, cache_req_type, cache_req_bid, cache_req_tag} = cache_req_tmp_if.tag[i];
assign cache_rsp_tmp_if.tag[i] = {cache_rsp_uuid, cache_rsp_type, cache_rsp_bid, cache_rsp_tag};
assign cache_req_if.tag[i] = {cache_req_uuid, cache_req_bid, cache_req_tag, cache_req_type_w[cache_req_bid][i]};
assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag, cache_rsp_type_w[cache_rsp_bid][i]} = cache_rsp_if.tag[i];
assign cache_req_if.tag[i] = {cache_req_uuid, cache_req_bid, cache_req_tag, cache_req_type_b[cache_req_bid][i]};
assign {cache_rsp_uuid, cache_rsp_bid, cache_rsp_tag, cache_rsp_type_bi} = cache_rsp_if.tag[i];
for (genvar j = 0; j < `DCACHE_NUM_REQS; ++j) begin
if (i != j) begin
`UNUSED_VAR (cache_req_type_w[cache_req_bid][j])
assign cache_rsp_type_w[cache_req_bid][j] = 0;
if (i != j) begin
wire [`CACHE_ADDR_TYPE_BITS-1:0] cache_rsp_type_bj = cache_rsp_type_b[cache_rsp_bid][j];
`UNUSED_VAR (cache_req_type_b[cache_req_bid][j])
assign cache_rsp_type_bj = 0;
end
end
end else begin
@ -316,10 +293,10 @@ module VX_lsu_unit #(
// send store commit
assign st_commit_if.valid = mem_req_fire && mem_req_rw;
assign st_commit_if.uuid = req_uuid;
assign st_commit_if.wid = req_wid;
assign st_commit_if.tmask = req_tmask;
assign st_commit_if.PC = req_pc;
assign st_commit_if.uuid = lsu_req_if.uuid;
assign st_commit_if.wid = lsu_req_if.wid;
assign st_commit_if.tmask = lsu_req_if.tmask;
assign st_commit_if.PC = lsu_req_if.PC;
assign st_commit_if.rd = 0;
assign st_commit_if.wb = 0;
assign st_commit_if.eop = 1'b1;
@ -366,9 +343,9 @@ module VX_lsu_unit #(
// scope registration
`SCOPE_ASSIGN (dcache_req_fire, mem_req_fire);
`SCOPE_ASSIGN (dcache_req_uuid, req_uuid);
`SCOPE_ASSIGN (dcache_req_addr, req_addr);
`SCOPE_ASSIGN (dcache_req_rw, ~req_wb);
`SCOPE_ASSIGN (dcache_req_uuid, lsu_req_if.uuid);
`SCOPE_ASSIGN (dcache_req_addr, full_addr);
`SCOPE_ASSIGN (dcache_req_rw, ~lsu_req_if.wb);
`SCOPE_ASSIGN (dcache_req_byteen, mem_req_byteen);
`SCOPE_ASSIGN (dcache_req_data, mem_req_data);
`SCOPE_ASSIGN (dcache_rsp_fire, mem_rsp_fire);
@ -382,19 +359,19 @@ module VX_lsu_unit #(
end
if (mem_req_fire) begin
if (mem_req_rw) begin
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, mem_req_mask));
`TRACE_ARRAY1D(1, req_addr, `NUM_THREADS);
`TRACE(1, ("%d: D$%0d Wr Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, `NUM_THREADS);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, req_addr_type, `NUM_THREADS);
`TRACE_ARRAY1D(1, lsu_addr_type, `NUM_THREADS);
`TRACE(1, (", data="));
`TRACE_ARRAY1D(1, mem_req_data, `NUM_THREADS);
`TRACE(1, (", is_dup=%b (#%0d)\n", req_is_dup, req_uuid));
`TRACE(1, (", is_dup=%b (#%0d)\n", lsu_is_dup, lsu_req_if.uuid));
end else begin
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, mem_req_mask));
`TRACE_ARRAY1D(1, req_addr, `NUM_THREADS);
`TRACE(1, ("%d: D$%0d Rd Req: wid=%0d, PC=0x%0h, tmask=%b, addr=", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.PC, mem_req_mask));
`TRACE_ARRAY1D(1, full_addr, `NUM_THREADS);
`TRACE(1, (", tag=0x%0h, byteen=0x%0h, type=", mem_req_tag, mem_req_byteen));
`TRACE_ARRAY1D(1, req_addr_type, `NUM_THREADS);
`TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid));
`TRACE_ARRAY1D(1, lsu_addr_type, `NUM_THREADS);
`TRACE(1, (", rd=%0d, is_dup=%b (#%0d)\n", lsu_req_if.rd, lsu_is_dup, lsu_req_if.uuid));
end
end
if (mem_rsp_fire) begin

View file

@ -1,4 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
`IGNORE_WARNINGS_END
module VX_pipeline #(
parameter CORE_ID = 0

View file

@ -1,4 +1,9 @@
`include "VX_define.vh"
`include "VX_gpu_types.vh"
`IGNORE_WARNINGS_BEGIN
import VX_gpu_types::*;
`IGNORE_WARNINGS_END
module VX_warp_sched #(
parameter CORE_ID = 0

View file

@ -219,7 +219,7 @@ module Vortex (
end
`endif
`ifndef NDEBUG
`ifdef SIMULATION
always @(posedge clk) begin
$fflush(); // flush stdout buffer
end

View file

@ -164,8 +164,9 @@ assign cmd_scope_write = cp2af_sRxPort.c0.mmioWrValid && (MMIO_SCOPE_WRITE == mm
wire [COUT_QUEUE_DATAW-1:0] cout_q_dout;
wire cout_q_full, cout_q_empty;
// disable assertions until full reset
`ifdef SIMULATION
`ifndef VERILATOR
// disable assertions until full reset
reg [$clog2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
initial begin
$assertoff;
@ -174,13 +175,14 @@ always @(posedge clk) begin
if (reset) begin
assert_delay_ctr <= 0;
end else begin
assert_delay_ctr <= assert_delay_ctr + 1;
assert_delay_ctr <= assert_delay_ctr + $bits(assert_delay_ctr)'(1);
if (assert_delay_ctr == (`RESET_DELAY-1)) begin
$asserton; // enable assertions
end
end
end
`endif
`endif
always @(posedge clk) begin
if (reset) begin
@ -303,7 +305,7 @@ always @(posedge clk) begin
if (state == STATE_IDLE) begin
vx_reset_ctr <= 0;
end else if (state == STATE_RUN) begin
vx_reset_ctr <= vx_reset_ctr + 1;
vx_reset_ctr <= vx_reset_ctr + $bits(vx_reset_ctr)'(1);
end
end
@ -686,7 +688,7 @@ always @(posedge clk) begin
if (cci_rd_req_fire) begin
cci_rd_req_addr <= cci_rd_req_addr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + 1;
cci_rd_req_ctr <= cci_rd_req_ctr + $bits(cci_rd_req_ctr)'(1);
`ifdef DBG_TRACE_AFU
`TRACE(2, ("%d: CCI Rd Req: addr=0x%0h, tag=0x%0h, rem=%0d, pending=%0d\n", $time, cci_rd_req_addr, cci_rd_req_tag, (cmd_data_size - cci_rd_req_ctr - 1), cci_pending_reads));
`endif

View file

@ -2,6 +2,7 @@
`TRACING_OFF
module VX_mem_scheduler #(
parameter string INSTANCE_ID = "",
parameter NUM_REQS = 4,
parameter NUM_BANKS = 4,
parameter ADDR_WIDTH = 32,
@ -10,7 +11,6 @@ module VX_mem_scheduler #(
parameter UUID_WIDTH = 0,
parameter QUEUE_SIZE = 16,
parameter PARTIAL_RESPONSE = 0,
parameter DUPLICATE_ADDR = 0,
parameter OUT_REG = 0,
localparam BYTEENW = DATA_WIDTH / 8,
@ -64,7 +64,6 @@ module VX_mem_scheduler #(
`STATIC_ASSERT (DATA_WIDTH == 8 * (DATA_WIDTH / 8), ("invalid parameter"))
`STATIC_ASSERT ((0 == PARTIAL_RESPONSE) || (1 == PARTIAL_RESPONSE), ("invalid parameter"))
`STATIC_ASSERT ((0 == DUPLICATE_ADDR) || (1 == DUPLICATE_ADDR), ("invalid parameter"))
`RUNTIME_ASSERT ((~req_valid || req_mask != 0), ("invalid input"));
wire mem_rsp_valid_s;
@ -99,49 +98,13 @@ module VX_mem_scheduler #(
wire [NUM_REQS-1:0][DATA_WIDTH-1:0] crsp_data;
wire [TAG_WIDTH-1:0] crsp_tag;
wire crsp_ready;
// Duplicate address optimization /////////////////////////////////////////
wire [NUM_REQS-1:0] req_dup_mask;
wire rsp_dup;
if (DUPLICATE_ADDR) begin
reg [QUEUE_SIZE-1:0] rsp_dups;
wire [NUM_REQS-2:0] addr_matches;
for (genvar i = 0; i < NUM_REQS-1; ++i) begin
assign addr_matches[i] = (req_addr[i+1] == req_addr[0]) || ~req_mask[i+1];
end
wire req_dup = req_mask[0] && (& addr_matches);
always @(posedge clk) begin
if (reset) begin
rsp_dups <= '0;
end else begin
if (sreq_push) begin
rsp_dups[stag_waddr] <= req_dup;
end
end
end
assign req_dup_mask = req_mask & {{(NUM_REQS-1){~req_dup}}, 1'b1};
assign rsp_dup = rsp_dups[stag_raddr];
end else begin
assign req_dup_mask = req_mask;
assign rsp_dup = 1'b0;
end
// Store request //////////////////////////////////////////////////////
wire req_complete;
assign sreq_push = req_valid && !sreq_full && !stag_full;
assign sreq_pop = req_complete;
assign sreq_push = req_valid && ~sreq_full && (req_rw || ~stag_full);
assign sreq_pop = ~sreq_empty && req_complete;
assign req_ready = !sreq_full && !stag_full;
wire [`UP(UUID_WIDTH)-1:0] req_uuid;
@ -160,8 +123,8 @@ module VX_mem_scheduler #(
.reset (reset),
.push (sreq_push),
.pop (sreq_pop),
.data_in ({req_rw, req_dup_mask, req_byteen, req_addr, req_data, req_uuid, stag_waddr}),
.data_out ({sreq_rw, sreq_mask, sreq_byteen, sreq_addr, sreq_data, sreq_uuid, sreq_tag}),
.data_in ({req_rw, req_mask, req_byteen, req_addr, req_data, req_uuid, stag_waddr}),
.data_out ({sreq_rw, sreq_mask, sreq_byteen, sreq_addr, sreq_data, sreq_uuid, sreq_tag}),
.full (sreq_full),
.empty (sreq_empty),
`UNUSED_PIN (alm_full),
@ -205,43 +168,60 @@ module VX_mem_scheduler #(
// Handle memory requests /////////////////////////////////////////////////
wire [NUM_BATCHES-1:0][BATCH_DATAW-1:0] mem_req_data_b;
wire [NUM_BATCHES-1:0][NUM_BANKS-1:0] mem_req_mask_b;
wire [NUM_BANKS-1:0] mem_req_fire;
reg [NUM_BANKS-1:0] req_sent_mask;
wire [NUM_BANKS-1:0] req_sent_mask_n;
wire [NUM_BATCHES-1:0][NUM_BANKS-1:0] mem_req_mask_b;
reg [NUM_BANKS-1:0] req_sent_mask;
wire [NUM_BANKS-1:0] req_sent_mask_n;
reg [`UP(BATCH_SEL_BITS)-1:0] req_batch_idx;
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
localparam SIZE = ((i + 1) * NUM_BANKS > NUM_REQS) ? REM_BATCH_SIZE : NUM_BANKS;
assign mem_req_mask_b[i] = NUM_BANKS'(sreq_mask[i * NUM_BANKS +: SIZE]);
assign mem_req_data_b[i] = {
{NUM_BANKS{sreq_rw}},
(NUM_BANKS * BYTEENW)'(sreq_byteen[i * NUM_BANKS +: SIZE]),
(NUM_BANKS * ADDR_WIDTH)'(sreq_addr[i * NUM_BANKS +: SIZE]),
(NUM_BANKS * DATA_WIDTH)'(sreq_data[i * NUM_BANKS +: SIZE])
};
end
wire [NUM_REQS-1:0] req_sent_mask_all;
for (genvar i = 0; i < NUM_REQS; ++i) begin
if (NUM_BATCHES > 1) begin
localparam j = i / NUM_BANKS;
localparam k = i % NUM_BANKS;
wire [BATCH_SEL_BITS-1:0] batch_idx = BATCH_SEL_BITS'(i / NUM_BANKS);
if (j < (NUM_BATCHES-1)) begin
assign req_sent_mask_all[i] = (batch_idx < req_batch_idx) ? sreq_mask[i] : ((batch_idx == req_batch_idx) & req_sent_mask_n[k]);
end else begin
assign req_sent_mask_all[i] = (batch_idx == req_batch_idx) & req_sent_mask_n[k];
end
end else begin
assign req_sent_mask_all[i] = req_sent_mask_n[i];
end
end
assign mem_req_fire = mem_req_valid & mem_req_ready;
wire [NUM_BANKS-1:0] mem_req_fire = mem_req_valid & mem_req_ready;
assign mem_req_mask_b = (NUM_BATCHES * NUM_BANKS)'(sreq_mask);
assign req_sent_mask_n = req_sent_mask | mem_req_fire;
assign req_complete = (req_sent_mask_all == sreq_mask);
wire req_complete_b = ~sreq_empty && (req_sent_mask_n == mem_req_mask_b[req_batch_idx]);
assign req_complete = req_complete_b && (req_batch_idx == `UP(BATCH_SEL_BITS)'(NUM_BATCHES-1));
always @(posedge clk) begin
if (reset) begin
req_sent_mask <= '0;
req_batch_idx <= 0;
end else begin
if (req_complete_b) begin
req_sent_mask <= '0;
if (req_batch_idx == `UP(BATCH_SEL_BITS)'(NUM_BATCHES-1)) begin
if (req_complete
|| (req_batch_idx == `UP(BATCH_SEL_BITS)'(NUM_BATCHES-1))) begin
req_batch_idx <= 0;
end else begin
req_batch_idx <= req_batch_idx + `UP(BATCH_SEL_BITS)'(1);
end
req_sent_mask <= 0;
end else begin
req_sent_mask <= req_sent_mask_n;
end
@ -269,9 +249,10 @@ module VX_mem_scheduler #(
// Handle memory responses ////////////////////////////////////////////////
reg [QUEUE_SIZE-1:0][REQ_SIZEW-1:0] rsp_rem_size;
reg [QUEUE_SIZE-1:0][NUM_REQS-1:0] rsp_orig_mask;
wire [`UP(BATCH_SEL_BITS)-1:0] rsp_batch_idx;
reg [QUEUE_SIZE-1:0][NUM_BATCHES-1:0][NUM_BANKS-1:0] rsp_rem_mask;
wire [NUM_BATCHES-1:0][NUM_BANKS-1:0] rsp_rem_mask_n;
reg [QUEUE_SIZE-1:0][NUM_REQS-1:0] rsp_orig_mask;
wire [`UP(BATCH_SEL_BITS)-1:0] rsp_batch_idx;
// Select memory response
VX_mem_rsp_sel #(
@ -296,29 +277,26 @@ module VX_mem_scheduler #(
if (NUM_BATCHES > 1) begin
assign rsp_batch_idx = mem_rsp_tag_s[QUEUE_ADDRW +: BATCH_SEL_BITS];
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
assign rsp_rem_mask_n[i] = rsp_rem_mask[stag_raddr][i] & ~({NUM_BANKS{(i == rsp_batch_idx)}} & mem_rsp_mask_s);
end
end else begin
assign rsp_batch_idx = 0;
end
assign rsp_rem_mask_n = rsp_rem_mask[stag_raddr] & ~mem_rsp_mask_s;
end
wire [REQ_SIZEW-1:0] rsp_rem_size_0;
`POP_COUNT(rsp_rem_size_0, req_dup_mask);
wire [$clog2(NUM_BANKS+1)-1:0] rsp_rem_size_r;
`POP_COUNT(rsp_rem_size_r, mem_rsp_mask_s);
assign rsp_complete = (rsp_rem_size[stag_raddr] == REQ_SIZEW'(rsp_rem_size_r));
assign rsp_complete = (0 == rsp_rem_mask_n);
always @(posedge clk) begin
if (reset) begin
rsp_orig_mask <= '0;
rsp_rem_size <= '0;
rsp_rem_mask <= '0;
end else begin
if (sreq_push) begin
rsp_orig_mask[stag_waddr] <= req_mask;
rsp_rem_size[stag_waddr] <= rsp_rem_size_0;
rsp_rem_mask[stag_waddr] <= (NUM_BATCHES * NUM_BANKS)'(req_mask);
end
if (mem_rsp_fire) begin
rsp_rem_size[stag_raddr] <= rsp_rem_size[stag_raddr] - REQ_SIZEW'(rsp_rem_size_r);
rsp_rem_mask[stag_raddr] <= rsp_rem_mask_n;
end
end
end
@ -333,8 +311,8 @@ module VX_mem_scheduler #(
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
localparam SIZE = ((i + 1) * NUM_BANKS > NUM_REQS) ? REM_BATCH_SIZE : NUM_BANKS;
assign crsp_mask[i] = rsp_dup ? rsp_orig_mask[stag_raddr][i]: ((i == rsp_batch_idx) ? mem_rsp_mask_s : '0);
assign crsp_data[i * NUM_BANKS +: SIZE] = rsp_dup ? {SIZE{mem_rsp_data_s[0]}} : mem_rsp_data_s[0 +: SIZE];
assign crsp_mask[i] = {NUM_BANKS{(i == rsp_batch_idx)}} & mem_rsp_mask_s;
assign crsp_data[i * NUM_BANKS +: SIZE] = mem_rsp_data_s[0 +: SIZE];
end
end else begin
@ -350,13 +328,13 @@ module VX_mem_scheduler #(
assign crsp_mask = rsp_orig_mask[stag_raddr];
for (genvar i = 0; i < NUM_BANKS; ++i) begin
assign mem_rsp_data_m[i] = mem_rsp_mask_s[i] ? mem_rsp_data_s[i] : '0;
assign mem_rsp_data_m[i] = {DATA_WIDTH{mem_rsp_mask_s[i]}} & mem_rsp_data_s[i];
end
for (genvar i = 0; i < NUM_BATCHES; ++i) begin
localparam SIZE = ((i + 1) * NUM_BANKS > NUM_REQS) ? REM_BATCH_SIZE : NUM_BANKS;
assign rsp_store_n[i] = rsp_store[stag_raddr][i] | ((i == rsp_batch_idx) ? mem_rsp_data_m : '0);
assign crsp_data[i * NUM_BANKS +: SIZE] = rsp_dup ? {SIZE{mem_rsp_data_s[0]}} : rsp_store_n[i][0 +: SIZE];
assign rsp_store_n[i] = rsp_store[stag_raddr][i] | ({NUM_BANKS * DATA_WIDTH{(i == rsp_batch_idx)}} & mem_rsp_data_m);
assign crsp_data[i * NUM_BANKS +: SIZE] = rsp_store_n[i][0 +: SIZE];
end
always @(posedge clk) begin
@ -436,7 +414,7 @@ module VX_mem_scheduler #(
if (pending_reqs[i][0]) begin
`ASSERT(($time - pending_reqs[i][1 +: 64]) < `STALL_TIMEOUT,
("%t: *** mem_scheduler response timeout: remaining=%b, tag=0x%0h (#%0d)",
$time, rsp_rem_size[i], pending_reqs[i][1+64 +: TAG_ONLY_WIDTH],
$time, rsp_rem_mask[i], pending_reqs[i][1+64 +: TAG_ONLY_WIDTH],
pending_reqs[i][1+64+TAG_ONLY_WIDTH +: `UUID_BITS]));
end
end
@ -447,7 +425,7 @@ module VX_mem_scheduler #(
/*always @(posedge clk) begin
if (req_valid && req_ready) begin
dpi_trace(1, "%d: mem_scheduler req: rw=%b, mask=%b, byteen=", $time, req_rw, req_mask);
dpi_trace(1, "%d: %s-req: rw=%b, mask=%b, byteen=", $time, INSTANCE_ID, req_rw, req_mask);
`TRACE_ARRAY1D(1, req_byteen, NUM_REQS);
dpi_trace(1, ", addr=");
`TRACE_ARRAY1D(1, req_addr, NUM_REQS);
@ -456,13 +434,13 @@ module VX_mem_scheduler #(
dpi_trace(1, ", tag=0x%0h (#%0d)\n", req_tag, req_dbg_uuid);
end
if (rsp_valid && rsp_ready) begin
dpi_trace(1, "%d: mem_scheduler rsp: mask=%b, data=", $time, rsp_mask);
dpi_trace(1, "%d: %s-rsp: mask=%b, data=", $time, INSTANCE_ID, rsp_mask);
`TRACE_ARRAY1D(1, rsp_data, NUM_REQS);
dpi_trace(1, ", tag=0x%0h (#%0d)\n", rsp_tag, rsp_dbg_uuid);
end
if (| mem_req_fire) begin
if (| mem_req_rw) begin
dpi_trace(1, "%d: mem_scheduler mem-wr-req: valid=%b, byteen=", $time, mem_req_fire);
dpi_trace(1, "%d: %s-mem-wr: valid=%b, byteen=", $time, INSTANCE_ID, mem_req_fire);
`TRACE_ARRAY1D(1, mem_req_byteen, NUM_BANKS);
dpi_trace(1, ", addr=");
`TRACE_ARRAY1D(1, mem_req_addr, NUM_BANKS);
@ -472,7 +450,7 @@ module VX_mem_scheduler #(
`TRACE_ARRAY1D(1, stag_waddr, NUM_BANKS);
dpi_trace(1, ", batch=%0d (#%0d)\n", req_batch_idx, mem_req_dbg_uuid);
end else begin
dpi_trace(1, "%d: mem_scheduler mem-rd-req: valid=%b, addr=", $time, mem_req_fire);
dpi_trace(1, "%d: %s-mem-rd: valid=%b, addr=", $time, INSTANCE_ID, mem_req_fire);
`TRACE_ARRAY1D(1, mem_req_addr, NUM_BANKS);
dpi_trace(1, ", tag=");
`TRACE_ARRAY1D(1, stag_waddr, NUM_BANKS);
@ -480,7 +458,7 @@ module VX_mem_scheduler #(
end
end
if (mem_rsp_fire) begin
dpi_trace(1, "%d: mem_scheduler mem-rd-rsp: mask=%b, data=", $time, mem_rsp_mask_s);
dpi_trace(1, "%d: %s-mem-rsp: mask=%b, data=", $time, INSTANCE_ID, mem_rsp_mask_s);
`TRACE_ARRAY1D(1, mem_rsp_data_s, NUM_BANKS);
dpi_trace(1, ", tag=0x%0h, batch=%0d (#%0d)\n", stag_raddr, rsp_batch_idx, mem_rsp_dbg_uuid);
end

View file

@ -246,6 +246,7 @@ module VX_raster_mem #(
// schedule memory request
VX_mem_scheduler #(
.INSTANCE_ID($sformatf("%s-memsched", INSTANCE_ID)),
.NUM_REQS (NUM_REQS),
.NUM_BANKS (`RCACHE_NUM_REQS),
.ADDR_WIDTH (`RCACHE_ADDR_WIDTH),

View file

@ -186,13 +186,14 @@ module VX_rop_mem #(
// schedule memory request
VX_mem_scheduler #(
.NUM_REQS (NUM_REQS),
.NUM_BANKS (`OCACHE_NUM_REQS),
.ADDR_WIDTH (`OCACHE_ADDR_WIDTH),
.DATA_WIDTH (32),
.TAG_WIDTH (TAG_WIDTH),
.QUEUE_SIZE (`ROP_MEM_PENDING_SIZE),
.OUT_REG (1)
.INSTANCE_ID($sformatf("%s-memsched", INSTANCE_ID)),
.NUM_REQS (NUM_REQS),
.NUM_BANKS (`OCACHE_NUM_REQS),
.ADDR_WIDTH (`OCACHE_ADDR_WIDTH),
.DATA_WIDTH (32),
.TAG_WIDTH (TAG_WIDTH),
.QUEUE_SIZE (`ROP_MEM_PENDING_SIZE),
.OUT_REG (1)
) mem_scheduler (
.clk (clk),
.reset (reset),

View file

@ -97,6 +97,7 @@ module VX_tex_mem #(
// schedule memory request
VX_mem_scheduler #(
.INSTANCE_ID($sformatf("core%0d-tex-memsched", CORE_ID)),
.NUM_REQS (`TEX_MEM_REQS),
.NUM_BANKS (`TCACHE_NUM_REQS),
.ADDR_WIDTH (`TCACHE_ADDR_WIDTH),

View file

@ -105,25 +105,34 @@ $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_32c/build/dcp.qpf:
$(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_64c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_64c
gen-sources-1c:
gen-sources-ase-1c:
./gen_sources.sh $(CFLAGS) $(CONFIG1) -DSIMULATION > sources.txt
gen-sources-ase-2c:
./gen_sources.sh $(CFLAGS) $(CONFIG2) -DSIMULATION > sources.txt
gen-sources-ase-4c:
./gen_sources.sh $(CFLAGS) $(CONFIG4) -DSIMULATION > sources.txt
gen-sources-fpga-1c:
./gen_sources.sh $(CFLAGS) $(CONFIG1) > sources.txt
gen-sources-2c:
gen-sources-fpga-2c:
./gen_sources.sh $(CFLAGS) $(CONFIG2) > sources.txt
gen-sources-4c:
gen-sources-fpga-4c:
./gen_sources.sh $(CFLAGS) $(CONFIG4) > sources.txt
gen-sources-8c:
gen-sources-fpga-8c:
./gen_sources.sh $(CFLAGS) $(CONFIG8) > sources.txt
gen-sources-16c:
gen-sources-fpga-16c:
./gen_sources.sh $(CFLAGS) $(CONFIG16) > sources.txt
gen-sources-32c:
gen-sources-fpga-32c:
./gen_sources.sh $(CFLAGS) $(CONFIG32) > sources.txt
gen-sources-64c:
gen-sources-fpga-64c:
./gen_sources.sh $(CFLAGS) $(CONFIG64) > sources.txt
# setup
@ -150,37 +159,37 @@ setup-fpga-64c: $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_64c/build/dcp.qpf
# build
ase-1c: gen-sources-1c setup-ase-1c
ase-1c: gen-sources-ase-1c setup-ase-1c
make -C $(BUILD_DIR)_$(DEVICE_FAMILY)_ase_1c
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(BUILD_DIR)_$(DEVICE_FAMILY)_ase_1c/work
ase-2c: gen-sources-2c setup-ase-2c
ase-2c: gen-sources-ase-2c setup-ase-2c
make -C $(BUILD_DIR)_$(DEVICE_FAMILY)_ase_2c
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(BUILD_DIR)_$(DEVICE_FAMILY)_ase_2c/work
ase-4c: gen-sources-4c setup-ase-4c
ase-4c: gen-sources-ase-4c setup-ase-4c
make -C $(BUILD_DIR)_$(DEVICE_FAMILY)_ase_4c
cp $(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)/*.hex $(BUILD_DIR)_$(DEVICE_FAMILY)_ase_4c/work
fpga-1c: gen-sources-1c setup-fpga-1c
fpga-1c: gen-sources-fpga-1c setup-fpga-1c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_1c && $(RUN_SYNTH)
fpga-2c: gen-sources-2c setup-fpga-2c
fpga-2c: gen-sources-fpga-2c setup-fpga-2c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_2c && $(RUN_SYNTH)
fpga-4c: gen-sources-4c setup-fpga-4c
fpga-4c: gen-sources-fpga-4c setup-fpga-4c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_4c && $(RUN_SYNTH)
fpga-8c: gen-sources-8c setup-fpga-8c
fpga-8c: gen-sources-fpga-8c setup-fpga-8c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_8c && $(RUN_SYNTH)
fpga-16c: gen-sources-16c setup-fpga-16c
fpga-16c: gen-sources-fpga-16c setup-fpga-16c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_16c && $(RUN_SYNTH)
fpga-32c: gen-sources-32c setup-fpga-32c
fpga-32c: gen-sources-fpga-32c setup-fpga-32c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_32c && $(RUN_SYNTH)
fpga-64c: gen-sources-64c setup-fpga-64c
fpga-64c: gen-sources-fpga-64c setup-fpga-64c
cd $(BUILD_DIR)_$(DEVICE_FAMILY)_fpga_64c && $(RUN_SYNTH)
# cleanup

View file

@ -51,6 +51,7 @@ VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
VL_FLAGS += --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)

View file

@ -48,6 +48,7 @@ VL_FLAGS = --exe --cc $(TOP) --top-module $(TOP)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
VL_FLAGS += --x-initial unique --x-assign unique
VL_FLAGS += -DSIMULATION
VL_FLAGS += verilator.vlt
VL_FLAGS += $(RTL_INCLUDE)