raster unit refactoring

This commit is contained in:
Blaise Tine 2022-05-20 23:08:01 -04:00
parent 55eb8562a7
commit e11bc9d0fb
18 changed files with 271 additions and 194 deletions

View file

@ -145,6 +145,9 @@ export PERF_CLASS=$PERF_CLASS
status=0
# ensure config update
make -C hw config
# ensure the stub driver is present
make -C $VORTEX_HOME/driver/stub

View file

@ -1,7 +1,9 @@
RTL_DIR=./rtl
SCRIPT_DIR=./scripts
all: VX_config.h VX_types.h
all: config
config: VX_config.h VX_types.h
VX_config.h: $(RTL_DIR)/VX_config.vh
$(SCRIPT_DIR)/gen_config.py -i $(RTL_DIR)/VX_config.vh -o VX_config.h

View file

@ -68,7 +68,6 @@ module VX_cluster #(
.TILE_LOGSIZE (`RASTER_TILE_LOGSIZE),
.BLOCK_LOGSIZE (`RASTER_BLOCK_LOGSIZE),
.MEM_FIFO_DEPTH (`RASTER_MEM_FIFO_DEPTH),
.TILE_FIFO_DEPTH (`RASTER_TILE_FIFO_DEPTH),
.QUAD_FIFO_DEPTH (`RASTER_QUAD_FIFO_DEPTH),
.OUTPUT_QUADS (`NUM_THREADS)
) raster_unit (

View file

@ -258,6 +258,10 @@
`define RESET_DELAY 6
// Stall timeout threshold, overridable from the build command line.
// NOTE(review): the power operator with a base of 1 yields 1 for any exponent,
// so this expression evaluates to 10000 whether or not L2/L3 are enabled.
// Confirm whether a larger base (e.g. 2 ** n) was intended to scale the timeout.
`ifndef STALL_TIMEOUT
`define STALL_TIMEOUT (10000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
`endif
// Pipeline Queues ////////////////////////////////////////////////////////////
// Size of Instruction Buffer
@ -292,7 +296,7 @@
// RASTER tile size
`ifndef RASTER_TILE_LOGSIZE
`define RASTER_TILE_LOGSIZE 4
`define RASTER_TILE_LOGSIZE 5
`endif
// RASTER block size
@ -305,19 +309,14 @@
`define RASTER_MEM_FIFO_DEPTH 8
`endif
// RASTER tile queue size
`ifndef RASTER_TILE_FIFO_DEPTH
`define RASTER_TILE_FIFO_DEPTH (1 << (2 * (`RASTER_TILE_LOGSIZE - `RASTER_BLOCK_LOGSIZE)))
// ROP memory pending size
`ifndef ROP_MEM_PENDING_SIZE
`define ROP_MEM_PENDING_SIZE 4
`endif
// RASTER quad queue size
`ifndef RASTER_QUAD_FIFO_DEPTH
`define RASTER_QUAD_FIFO_DEPTH 16
`endif
// ROP memory pending size
`ifndef ROP_MEM_PENDING_SIZE
`define ROP_MEM_PENDING_SIZE 4
`define RASTER_QUAD_FIFO_DEPTH 8
`endif
// ROP number of slices

View file

@ -18,7 +18,6 @@ module VX_icache_stage #(
// response
VX_ifetch_rsp_if.master ifetch_rsp_if
);
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (reset)
@ -50,7 +49,7 @@ module VX_icache_stage #(
// Ensure that the ibuffer doesn't fill up.
// This will resolve potential deadlock if ibuffer fills and the LSU stalls the execute stage due to pending dcache request.
// This issue is particularly prevalent when the icache and dcache are disabled and both requests share the same bus.
wire [`NUM_WARPS-1:0] pending_reads_full;
wire [`NUM_WARPS-1:0] pending_ibuf_full;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
VX_pending_size #(
.SIZE (`IBUF_SIZE + 1)
@ -59,7 +58,7 @@ module VX_icache_stage #(
.reset (reset),
.incr (icache_req_fire && (ifetch_req_if.wid == `NW_BITS'(i))),
.decr (ifetch_rsp_if.ibuf_pop[i]),
.full (pending_reads_full[i]),
.full (pending_ibuf_full[i]),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
@ -69,7 +68,7 @@ module VX_icache_stage #(
("%t: *** invalid PC=0x%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid))
// Icache Request
assign icache_req_if.valid = ifetch_req_if.valid && ~pending_reads_full[ifetch_req_if.wid];
assign icache_req_if.valid = ifetch_req_if.valid && ~pending_ibuf_full[ifetch_req_if.wid];
assign icache_req_if.rw = 0;
assign icache_req_if.byteen = '0;
assign icache_req_if.addr = ifetch_req_if.PC[31:2];
@ -77,7 +76,7 @@ module VX_icache_stage #(
assign icache_req_if.tag = {ifetch_req_if.uuid, req_tag};
// Can accept new request?
assign ifetch_req_if.ready = icache_req_if.ready && ~pending_reads_full[ifetch_req_if.wid];
assign ifetch_req_if.ready = icache_req_if.ready && ~pending_ibuf_full[ifetch_req_if.wid];
wire [`NW_BITS-1:0] rsp_wid = rsp_tag;

View file

@ -23,6 +23,7 @@ module VX_issue #(
`endif
VX_gpu_req_if.master gpu_req_if
);
VX_ibuffer_if ibuffer_if();
VX_gpr_req_if gpr_req_if();
VX_gpr_rsp_if gpr_rsp_if();
@ -30,6 +31,8 @@ module VX_issue #(
VX_scoreboard_if scoreboard_if();
VX_dispatch_if dispatch_if();
wire [3:0] in_use_regs;
// GPR request interface
assign gpr_req_if.wid = ibuffer_if.wid;
assign gpr_req_if.rs1 = ibuffer_if.rs1;
@ -99,7 +102,8 @@ module VX_issue #(
.clk (clk),
.reset (scoreboard_reset),
.writeback_if (writeback_if),
.scoreboard_if (scoreboard_if)
.scoreboard_if (scoreboard_if),
.in_use_regs (in_use_regs)
);
VX_gpr_stage #(
@ -125,6 +129,28 @@ module VX_issue #(
`endif
.gpu_req_if (gpu_req_if)
);
// Issue-stage stall watchdog.
// Counts clock cycles in which ibuffer_if asserts valid while ready is low,
// and checks on each such cycle that the count is still below STALL_TIMEOUT.
reg [31:0] timeout_ctr;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= 0;
end else begin
// stalled: request is valid but not accepted this cycle
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
`ifdef DBG_TRACE_CORE_PIPELINE
// optional per-cycle stall trace; prints in_use_regs[0..3] and the inverted dispatch ready
dpi_trace(3, "%d: *** core%0d-stall: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, wb=%0d, cycles=%0d, inuse=%b%b%b%b, dispatch=%b (#%0d)\n",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.tmask, ibuffer_if.rd, ibuffer_if.wb, timeout_ctr,
in_use_regs[0], in_use_regs[1], in_use_regs[2], in_use_regs[3], ~dispatch_if.ready, ibuffer_if.uuid);
`endif
`ASSERT(timeout_ctr < `STALL_TIMEOUT,
("%t: *** core%0d-issue-timeout: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, wb=%0d, inuse=%b%b%b%b, dispatch=%b (#%0d)",
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.tmask, ibuffer_if.rd, ibuffer_if.wb,
in_use_regs[0], in_use_regs[1], in_use_regs[2], in_use_regs[3], ~dispatch_if.ready, ibuffer_if.uuid));
timeout_ctr <= timeout_ctr + 1;
// handshake completed: restart the count
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
timeout_ctr <= 0;
end
// when valid is low neither branch is taken, so the counter holds its value
end
end
`SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready);
`SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid);

View file

@ -3,11 +3,12 @@
module VX_scoreboard #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
VX_scoreboard_if.slave scoreboard_if,
VX_writeback_if.slave writeback_if
VX_scoreboard_if.slave scoreboard_if,
VX_writeback_if.slave writeback_if,
output wire [3:0] in_use_regs
);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
@ -45,40 +46,25 @@ module VX_scoreboard #(
assign writeback_if.ready = 1'b1;
assign scoreboard_if.ready = ~(deq_inuse_rd
| deq_inuse_rs1
| deq_inuse_rs2
| deq_inuse_rs3);
| deq_inuse_rs1
| deq_inuse_rs2
| deq_inuse_rs3);
`UNUSED_VAR (writeback_if.PC)
`UNUSED_VAR (scoreboard_if.PC)
`UNUSED_VAR (scoreboard_if.tmask)
`UNUSED_VAR (scoreboard_if.uuid)
reg [31:0] deadlock_ctr;
wire [31:0] deadlock_timeout = 10000 * (1 ** (`L2_ENABLED + `L3_ENABLED));
assign in_use_regs[0] = deq_inuse_rd;
assign in_use_regs[1] = deq_inuse_rs1;
assign in_use_regs[2] = deq_inuse_rs2;
assign in_use_regs[3] = deq_inuse_rs3;
always @(posedge clk) begin
if (reset) begin
deadlock_ctr <= 0;
end else begin
`ifdef DBG_TRACE_CORE_PIPELINE
if (scoreboard_if.valid && ~scoreboard_if.ready) begin
dpi_trace(3, "%d: *** core%0d-stall: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n",
$time, CORE_ID, scoreboard_if.wid, scoreboard_if.PC, scoreboard_if.tmask, scoreboard_if.rd, scoreboard_if.wb,
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, scoreboard_if.uuid);
end
`endif
if (release_reg) begin
`ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.uuid));
end
if (scoreboard_if.valid && ~scoreboard_if.ready) begin
deadlock_ctr <= deadlock_ctr + 1;
`ASSERT(deadlock_ctr < deadlock_timeout,
("%t: *** core%0d-deadlock: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)",
$time, CORE_ID, scoreboard_if.wid, scoreboard_if.PC, scoreboard_if.tmask, scoreboard_if.rd, scoreboard_if.wb,
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, scoreboard_if.uuid));
end else if (scoreboard_if.valid && scoreboard_if.ready) begin
deadlock_ctr <= 0;
end
always @(posedge clk) begin
if (release_reg) begin
`ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=0x%0h, tmask=%b, rd=%0d (#%0d)",
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.tmask, writeback_if.rd, writeback_if.uuid));
end
end

View file

@ -245,12 +245,27 @@ module VX_warp_sched #(
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (!stall_out),
.enable (~stall_out),
.data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}),
.data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
);
assign busy = (active_warps != 0);
assign busy = (active_warps != 0);
// Scheduler stall watchdog.
// Counts clock cycles in which active_warps is non-zero and equal to stalled_warps
// (presumably: every active warp is stalled - confirm against the mask definitions),
// and checks on each such cycle that the count is still below STALL_TIMEOUT.
reg [31:0] timeout_ctr;
always @(posedge clk) begin
if (reset) begin
timeout_ctr <= 0;
end else begin
if (active_warps !=0 && active_warps == stalled_warps) begin
`ASSERT(timeout_ctr < `STALL_TIMEOUT,
("%t: *** core%0d-scheduler-timeout: stalled_warps=%b", $time, CORE_ID, stalled_warps));
timeout_ctr <= timeout_ctr + 1;
// this condition is the logical complement of the one above, so for known (non-X)
// values the counter is cleared on every cycle that is not counted
end else if (active_warps == 0 || active_warps != stalled_warps) begin
timeout_ctr <= 0;
end
end
end
`SCOPE_ASSIGN (wsched_scheduled, warp_scheduled);
`SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid);

View file

@ -7,10 +7,10 @@
`include "VX_raster_define.vh"
module VX_raster_be #(
parameter SLICE_ID = 1,
parameter BLOCK_LOGSIZE = 6,
parameter SLICE_ID = 0,
parameter BLOCK_LOGSIZE = 5,
parameter OUTPUT_QUADS = 2,
parameter QUAD_FIFO_DEPTH = 16
parameter QUAD_FIFO_DEPTH = 4
) (
// Standard inputs
input wire clk,

View file

@ -15,21 +15,23 @@ module VX_raster_edge_function #(
output wire [2:0][`RASTER_DATA_BITS-1:0] result
);
localparam PROD_WIDTH = `RASTER_DATA_BITS + `RASTER_DIM_BITS;
`UNUSED_VAR (reset)
`STATIC_ASSERT((LATENCY >= `LATENCY_IMUL), ("invalid parameter"))
wire [2:0][`RASTER_DATA_BITS-1:0] prod_x;
wire [2:0][`RASTER_DATA_BITS-1:0] prod_y;
wire [2:0][`RASTER_DATA_BITS-1:0] edge_c;
wire [2:0][PROD_WIDTH-1:0] prod_x;
wire [2:0][PROD_WIDTH-1:0] prod_y;
wire [2:0][`RASTER_DATA_BITS-1:0] edge_c, edge_c_s;
wire [2:0][`RASTER_DATA_BITS-1:0] edge_c_s, result_s;
wire [2:0][`RASTER_DATA_BITS-1:0] result_s;
for (genvar i = 0; i < 3; ++i) begin
VX_multiplier #(
.WIDTHA (`RASTER_DATA_BITS),
.WIDTHB (`RASTER_DIM_BITS),
.WIDTHP (`RASTER_DATA_BITS),
.WIDTHP (PROD_WIDTH),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
) x_multiplier (
@ -43,7 +45,7 @@ module VX_raster_edge_function #(
VX_multiplier #(
.WIDTHA (`RASTER_DATA_BITS),
.WIDTHB (`RASTER_DIM_BITS),
.WIDTHP (`RASTER_DATA_BITS),
.WIDTHP (PROD_WIDTH),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
) y_multiplier (
@ -69,7 +71,9 @@ module VX_raster_edge_function #(
);
for (genvar i = 0; i < 3; ++i) begin
assign result_s[i] = prod_x[i] + prod_y[i] + edge_c_s[i];
wire [PROD_WIDTH-1:0] sum = prod_x[i] + prod_y[i] + PROD_WIDTH'(edge_c_s[i]);
`UNUSED_VAR (sum)
assign result_s[i] = sum[`RASTER_DATA_BITS-1:0];
end
VX_pipe_register #(

View file

@ -1,7 +1,7 @@
`include "VX_raster_define.vh"
module VX_raster_extents #(
parameter TILE_LOGSIZE = 64
parameter TILE_LOGSIZE = 5
) (
input wire signed [2:0][2:0][`RASTER_DATA_BITS-1:0] edges,
output wire signed [2:0][`RASTER_DATA_BITS-1:0] extents

View file

@ -7,7 +7,7 @@
// 3. Store primitive data in an elastic buffer
module VX_raster_mem #(
parameter TILE_LOGSIZE = 16,
parameter TILE_LOGSIZE = 5,
parameter QUEUE_SIZE = 8
) (
input wire clk,
@ -34,61 +34,69 @@ module VX_raster_mem #(
);
`UNUSED_VAR (dcrs)
localparam MUL_LATENCY = 3;
localparam NUM_REQS = `RASTER_MEM_REQS;
localparam FSM_BITS = 2;
localparam TAG_WIDTH = `RASTER_PID_BITS;
localparam MUL_LATENCY = 3;
localparam NUM_REQS = `RASTER_MEM_REQS;
localparam FSM_BITS = 2;
localparam FETCH_FLAG_BITS = 2;
localparam TAG_WIDTH = `RASTER_PID_BITS + FETCH_FLAG_BITS;
localparam STATE_IDLE = 2'b00;
localparam STATE_TILE = 2'b01;
localparam STATE_PRIM = 2'b10;
localparam STATE_IDLE = 2'b00;
localparam STATE_TILE = 2'b01;
localparam STATE_PRIM = 2'b10;
localparam TILE_FETCH_MASK = 9'(2'b11);
localparam PID_FETCH_MASK = 9'(1'b01);
localparam PDATA_FETCH_MASK = {9{1'b1}};
localparam FETCH_FLAG_TILE = 2'b00;
localparam FETCH_FLAG_PID = 2'b01;
localparam FETCH_FLAG_PDATA = 2'b10;
// A primitive data contains (x_loc, y_loc, pid, edges)
localparam PRIM_DATA_WIDTH = 2 * `RASTER_DIM_BITS + `RASTER_PID_BITS + 9 * `RASTER_DATA_BITS;
localparam PRIM_DATA_WIDTH = 2 * `RASTER_DIM_BITS+ 9 * `RASTER_DATA_BITS + `RASTER_PID_BITS ;
// Storage to cycle through all primitives and tiles
reg [`RASTER_DCR_DATA_BITS-1:0] curr_tbuf_addr;
reg [`RASTER_PID_BITS-1:0] curr_num_prims;
reg [`RASTER_PID_BITS-1:0] rem_num_prims;
reg [`RASTER_PID_BITS-1:0] curr_pid_reqs;
reg [`RASTER_PID_BITS-1:0] curr_pid_rsps;
reg [`RASTER_TILE_BITS-1:0] curr_num_tiles;
reg [`RASTER_DIM_BITS-1:0] curr_x_loc;
reg [`RASTER_DIM_BITS-1:0] curr_y_loc;
// Output buffer
wire buf_out_valid;
wire buf_out_ready;
wire buf_in_valid;
wire buf_in_ready;
// Memory request
reg mem_req_valid;
reg mem_req_valid, mem_req_valid_qual;
reg [NUM_REQS-1:0] mem_req_mask;
reg [8:0][`RASTER_DCR_DATA_BITS-1:0] mem_req_addr;
reg [TAG_WIDTH-1:0] mem_req_tag;
wire mem_req_ready;
// Memory response
wire mem_rsp_valid;
reg [NUM_REQS-1:0] mem_rsp_mask;
wire mem_rsp_valid;
wire [8:0][`RASTER_DATA_BITS-1:0] mem_rsp_data;
wire [TAG_WIDTH-1:0] mem_rsp_tag;
wire mem_rsp_ready;
// Primitive info
wire prim_id_rsp_valid;
wire prim_data_rsp_valid;
wire prim_addr_rsp_valid;
wire prim_addr_rsp_ready;
wire [8:0][`RASTER_DATA_BITS-1:0] prim_mem_addr;
wire [`RASTER_PID_BITS-1:0] prim_id;
wire [`RASTER_PID_BITS-1:0] primitive_id;
// Memory fetch FSM
reg [FSM_BITS-1:0] state;
wire is_prim_id_req = (mem_req_tag[FETCH_FLAG_BITS-1:0] == FETCH_FLAG_PID);
wire is_prim_id_rsp = (mem_rsp_tag[FETCH_FLAG_BITS-1:0] == FETCH_FLAG_PID);
wire fsm_req_fire = mem_req_valid && mem_req_ready;
wire is_prim_data_req = (mem_req_tag[FETCH_FLAG_BITS-1:0] == FETCH_FLAG_PDATA);
wire is_prim_data_rsp = (mem_rsp_tag[FETCH_FLAG_BITS-1:0] == FETCH_FLAG_PDATA);
wire prim_data_rsp_valid = mem_rsp_valid
&& (state == STATE_PRIM)
&& mem_rsp_mask[1];
wire mem_req_fire = mem_req_valid_qual && mem_req_ready;
wire prim_addr_rsp_fire = prim_addr_rsp_valid && prim_addr_rsp_ready;
wire prim_data_rsp_fire = prim_data_rsp_valid && mem_rsp_ready;
@ -97,26 +105,28 @@ module VX_raster_mem #(
state <= STATE_IDLE;
mem_req_valid <= 0;
curr_tbuf_addr <= 0;
curr_num_prims <= 0;
rem_num_prims <= 0;
curr_pid_reqs <= 0;
curr_pid_rsps <= 0;
curr_num_tiles <= 0;
end begin
// deassert valid when request is sent
if (fsm_req_fire) begin
// deassert memory request when fired
if (mem_req_fire) begin
mem_req_valid <= 0;
end
case (state)
STATE_IDLE: begin
if (start && (dcrs.tile_count != 0)) begin
// fetch the next tile header
state <= STATE_TILE;
mem_req_valid <= 1;
curr_num_tiles <= dcrs.tile_count;
mem_req_addr[0] <= dcrs.tbuf_addr;
mem_req_addr[1] <= dcrs.tbuf_addr + 4;
mem_req_mask <= TILE_FETCH_MASK;
mem_req_tag <= 'x;
mem_req_mask <= 9'b11;
mem_req_tag <= TAG_WIDTH'(FETCH_FLAG_TILE);
// set tile counters
curr_tbuf_addr <= dcrs.tbuf_addr + 4 + 4;
curr_num_tiles <= dcrs.tile_count;
end
end
STATE_TILE: begin
@ -125,54 +135,64 @@ module VX_raster_mem #(
state <= STATE_PRIM;
curr_x_loc <= `RASTER_DIM_BITS'(mem_rsp_data[0][0 +: 16] << TILE_LOGSIZE);
curr_y_loc <= `RASTER_DIM_BITS'(mem_rsp_data[0][16 +: 16] << TILE_LOGSIZE);
// send next primitive address
// fetch next primitive pid
mem_req_valid <= 1;
mem_req_addr[0] <= curr_tbuf_addr;
mem_req_mask <= PID_FETCH_MASK;
mem_req_tag <= 'x;
curr_tbuf_addr <= curr_tbuf_addr + 4;
curr_num_prims <= mem_rsp_data[1][`RASTER_PID_BITS-1:0];
rem_num_prims <= mem_rsp_data[1][`RASTER_PID_BITS-1:0];
mem_req_mask <= 9'b1;
mem_req_tag <= TAG_WIDTH'(FETCH_FLAG_PID);
// set primitive counters
curr_pid_reqs <= mem_rsp_data[1][`RASTER_PID_BITS-1:0];
curr_pid_rsps <= mem_rsp_data[1][`RASTER_PID_BITS-1:0];
end
end
STATE_PRIM: begin
if (prim_addr_rsp_valid) begin
// handle primitive address response
mem_req_valid <= 1;
// handle memory submissions
if (mem_req_fire) begin
if (is_prim_id_req) begin
// update pid counters
curr_tbuf_addr <= curr_tbuf_addr + 4;
curr_pid_reqs <= curr_pid_reqs - `RASTER_PID_BITS'(1);
end
if ((curr_pid_reqs > 1)
|| (curr_pid_reqs == 1 && ~is_prim_id_req)) begin
// fetch next primitive pid
mem_req_valid <= 1;
mem_req_mask <= 9'b1;
mem_req_addr[0] <= curr_tbuf_addr + (is_prim_id_req ? 4 : 0);
mem_req_tag <= TAG_WIDTH'(FETCH_FLAG_PID);
end
end
// handle primitive address response
if (prim_addr_rsp_fire) begin
mem_req_valid <= 1;
mem_req_mask <= 9'b111111111;
mem_req_addr <= prim_mem_addr;
mem_req_mask <= PDATA_FETCH_MASK;
mem_req_tag <= prim_id;
end else
mem_req_tag <= TAG_WIDTH'({primitive_id, FETCH_FLAG_PDATA});
end
// handle primitive data response
if (prim_data_rsp_fire) begin
// handle primitive data response
if (rem_num_prims == 1) begin
if (curr_num_tiles != 1) begin
// Fetch the next tile
state <= STATE_TILE;
mem_req_valid <= 1;
mem_req_addr[0] <= curr_tbuf_addr;
mem_req_addr[1] <= curr_tbuf_addr + 4;
mem_req_mask <= TILE_FETCH_MASK;
mem_req_tag <= 'x;
curr_tbuf_addr <= curr_tbuf_addr + 4 + 4;
curr_num_tiles <= curr_num_tiles - `RASTER_TILE_BITS'(1);
end else begin
if (curr_pid_rsps == 1) begin
if (curr_num_tiles == 1) begin
// done, return to idle
state <= STATE_IDLE;
end else begin
// fetch the next tile header
state <= STATE_TILE;
mem_req_valid <= 1;
mem_req_mask <= 9'b11;
mem_req_addr[0] <= curr_tbuf_addr;
mem_req_addr[1] <= curr_tbuf_addr + 4;
mem_req_tag <= TAG_WIDTH'(FETCH_FLAG_TILE);
curr_tbuf_addr <= curr_tbuf_addr + 4 + 4;
end
// update tile counter
curr_num_tiles <= curr_num_tiles - `RASTER_TILE_BITS'(1);
end
rem_num_prims <= rem_num_prims - `RASTER_PID_BITS'(1);
end else
if (fsm_req_fire) begin
// send next primitive address
if (curr_num_prims != 1) begin
mem_req_valid <= 1;
mem_req_addr[0] <= curr_tbuf_addr;
mem_req_mask <= PID_FETCH_MASK;
mem_req_tag <= 'x;
curr_tbuf_addr <= curr_tbuf_addr + 4;
curr_num_prims <= curr_num_prims - `RASTER_PID_BITS'(1);
end
// update pid counter
curr_pid_rsps <= curr_pid_rsps - `RASTER_PID_BITS'(1);
end
end
default:;
@ -182,9 +202,37 @@ module VX_raster_mem #(
// Memory streamer
// stall the memory response only if edge data cannot be taken
assign mem_rsp_ready = (~prim_data_rsp_valid || buf_out_ready)
&& ~prim_addr_rsp_valid;
// ensure that we have space in the output buffer to prevent memory deadlock
// The tracker is incremented when a primitive-id request fires and decremented
// when an entry leaves the output interface (valid_out && ready_out).
wire pending_output_full;
VX_pending_size #(
// NOTE(review): sized one below QUEUE_SIZE - confirm this matches the elastic buffer capacity
.SIZE (QUEUE_SIZE-1)
) pending_reads (
.clk (clk),
.reset (reset),
.incr (mem_req_fire && is_prim_id_req),
.decr (valid_out && ready_out),
.full (pending_output_full),
`UNUSED_PIN (size),
`UNUSED_PIN (empty)
);
// hold back primitive-id requests while the tracker reports full; other request types pass through
assign mem_req_valid_qual = mem_req_valid && (~is_prim_id_req || ~pending_output_full);
// the memory response is for primitive id
assign prim_id_rsp_valid = mem_rsp_valid && is_prim_id_rsp;
// the memory response is for primitive data
assign prim_data_rsp_valid = mem_rsp_valid && is_prim_data_rsp;
// stall primitive address handling if primitive data fetch stalls
wire prim_data_req_stall = mem_req_valid && is_prim_data_req && ~mem_req_ready;
assign prim_addr_rsp_ready = ~prim_data_req_stall || ~prim_addr_rsp_valid;
// Push primitive data into output buffer
assign buf_in_valid = prim_data_rsp_valid;
// stall the memory response
assign mem_rsp_ready = (~prim_id_rsp_valid || prim_addr_rsp_ready)
&& (~prim_data_rsp_valid || buf_in_ready);
wire [8:0][`RCACHE_ADDR_WIDTH-1:0] mem_req_addr_w;
for (genvar i = 0; i < 9; ++i) begin
@ -204,7 +252,7 @@ module VX_raster_mem #(
.reset (reset),
// Input request
.req_valid (mem_req_valid),
.req_valid (mem_req_valid_qual),
.req_rw (1'b0),
.req_mask (mem_req_mask),
`UNUSED_PIN (req_byteen),
@ -215,7 +263,7 @@ module VX_raster_mem #(
// Output response
.rsp_valid (mem_rsp_valid),
.rsp_mask (mem_rsp_mask),
`UNUSED_PIN (rsp_mask),
.rsp_data (mem_rsp_data),
.rsp_tag (mem_rsp_tag),
.rsp_ready (mem_rsp_ready),
@ -246,7 +294,7 @@ module VX_raster_mem #(
.LATENCY (MUL_LATENCY)
) multiplier (
.clk (clk),
.enable (1'b1),
.enable (prim_addr_rsp_ready),
.dataa (mem_rsp_data[0]),
.datab (dcrs.pbuf_stride),
.result (prim_mem_offset)
@ -256,9 +304,6 @@ module VX_raster_mem #(
assign prim_mem_addr[i] = dcrs.pbuf_addr + prim_mem_offset + 4 * i;
end
// only delay primitive addresses for multiplication (mask = 1)
wire mem_rsp_valid_p = mem_rsp_valid && ~mem_rsp_mask[1];
VX_shift_register #(
.DATAW (1 + `RASTER_PID_BITS),
.DEPTH (MUL_LATENCY),
@ -266,17 +311,12 @@ module VX_raster_mem #(
) mul_shift_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({mem_rsp_valid_p, mem_rsp_data[0][`RASTER_PID_BITS-1:0]}),
.data_out ({prim_addr_rsp_valid, prim_id})
.enable (prim_addr_rsp_ready),
.data_in ({prim_id_rsp_valid, mem_rsp_data[0][`RASTER_PID_BITS-1:0]}),
.data_out ({prim_addr_rsp_valid, primitive_id})
);
// Output buffer
assign buf_out_valid = prim_data_rsp_valid
&& ~prim_addr_rsp_valid;
`UNUSED_VAR (mem_rsp_mask)
// Output buffer
VX_elastic_buffer #(
.DATAW (PRIM_DATA_WIDTH),
@ -285,10 +325,10 @@ module VX_raster_mem #(
) buf_out (
.clk (clk),
.reset (reset),
.valid_in (buf_out_valid),
.ready_in (buf_out_ready),
.data_in ({curr_x_loc, curr_y_loc, mem_rsp_tag, mem_rsp_data}),
.data_out ({x_loc_out, y_loc_out, pid_out, edges_out}),
.valid_in (buf_in_valid),
.ready_in (buf_in_ready),
.data_in ({curr_x_loc, curr_y_loc, mem_rsp_data, mem_rsp_tag[FETCH_FLAG_BITS +: `RASTER_PID_BITS]}),
.data_out ({x_loc_out, y_loc_out, edges_out, pid_out}),
.valid_out (valid_out),
.ready_out (ready_out)
);

View file

@ -5,7 +5,7 @@
`include "VX_raster_define.vh"
module VX_raster_qe #(
parameter SLICE_ID = 1,
parameter SLICE_ID = 0,
parameter NUM_QUADS = 4
) (
input wire clk,

View file

@ -9,12 +9,11 @@
module VX_raster_slice #(
parameter CLUSTER_ID = 0,
parameter SLICE_ID = 1,
parameter TILE_LOGSIZE = 6, // tile log size
parameter SLICE_ID = 0,
parameter TILE_LOGSIZE = 5, // tile log size
parameter BLOCK_LOGSIZE = 2, // block log size
parameter OUTPUT_QUADS = 4,
parameter QUAD_FIFO_DEPTH = 1,
parameter TILE_FIFO_DEPTH = 16
parameter QUAD_FIFO_DEPTH = 4
) (
input wire clk,
input wire reset,

View file

@ -7,7 +7,7 @@
`include "VX_raster_define.vh"
module VX_raster_te #(
parameter TILE_LOGSIZE = 6,
parameter TILE_LOGSIZE = 5,
parameter BLOCK_LOGSIZE = 2
) (
input wire clk,
@ -32,8 +32,8 @@ module VX_raster_te #(
output wire [2:0][2:0][`RASTER_DATA_BITS-1:0] edges_out,
input wire ready_out
);
localparam LEVEL_BITS = (TILE_LOGSIZE - BLOCK_LOGSIZE) + 1;
localparam TILE_FIFO_DEPTH = 1 << (TILE_LOGSIZE - BLOCK_LOGSIZE);
localparam LEVEL_BITS = (TILE_LOGSIZE - BLOCK_LOGSIZE) + 1;
localparam TILE_FIFO_DEPTH = 1 << (2 * (TILE_LOGSIZE - BLOCK_LOGSIZE));
localparam FIFO_DATA_WIDTH = 2 * `RASTER_DIM_BITS + 3 * `RASTER_DATA_BITS + LEVEL_BITS;
wire stall;
@ -80,32 +80,32 @@ module VX_raster_te #(
tile_valid <= 0;
if (fifo_arb_valid) begin
// select fifo input
tile_valid <= 1;
tile_x_loc <= fifo_x_loc;
tile_y_loc <= fifo_y_loc;
tile_edge_eval <= fifo_edge_eval;
tile_level <= fifo_level;
tile_valid <= 1;
tile_x_loc <= fifo_x_loc;
tile_y_loc <= fifo_y_loc;
tile_edge_eval <= fifo_edge_eval;
tile_level <= fifo_level;
end else
if (is_fifo_bypass) begin
// fifo bypass first sub-tile
tile_valid <= 1;
tile_x_loc <= subtile_x_loc_r[0];
tile_y_loc <= subtile_y_loc_r[0];
tile_edge_eval <= subtile_edge_eval_r[0];
tile_level <= subtile_level_r;
tile_valid <= 1;
tile_x_loc <= subtile_x_loc_r[0];
tile_y_loc <= subtile_y_loc_r[0];
tile_edge_eval <= subtile_edge_eval_r[0];
tile_level <= subtile_level_r;
end else
if (valid_in && ~tile_valid) begin
// select new tile input
tile_valid <= 1;
tile_extents <= extents_in;
tile_edges <= edges_in;
tile_pid <= pid_in;
tile_x_loc <= x_loc_in;
tile_y_loc <= y_loc_in;
tile_edge_eval[0]<= edges_in[0][2];
tile_edge_eval[1]<= edges_in[1][2];
tile_edge_eval[2]<= edges_in[2][2];
tile_level <= 0;
tile_valid <= 1;
tile_extents <= extents_in;
tile_edges <= edges_in;
tile_pid <= pid_in;
tile_x_loc <= x_loc_in;
tile_y_loc <= y_loc_in;
tile_edge_eval[0] <= edges_in[0][2];
tile_edge_eval[1] <= edges_in[1][2];
tile_edge_eval[2] <= edges_in[2][2];
tile_level <= 0;
end
end
end

View file

@ -3,12 +3,11 @@
module VX_raster_unit #(
parameter CLUSTER_ID = 0,
parameter NUM_SLICES = 1, // number of raster slices
parameter TILE_LOGSIZE = 6, // tile log size
parameter TILE_LOGSIZE = 5, // tile log size
parameter BLOCK_LOGSIZE = 2, // block log size
parameter MEM_FIFO_DEPTH = 8, // memory queue size
parameter TILE_FIFO_DEPTH = (1 << (2 * (TILE_LOGSIZE - BLOCK_LOGSIZE))), // tile queue size
parameter QUAD_FIFO_DEPTH = 16, // quad queue size
parameter OUTPUT_QUADS = 4 // number of output quads
parameter MEM_FIFO_DEPTH = 4, // memory queue size
parameter QUAD_FIFO_DEPTH = 4, // quad queue size
parameter OUTPUT_QUADS = 4 // number of output quads
) (
input wire clk,
@ -35,7 +34,6 @@ module VX_raster_unit #(
localparam PRIM_DATA_WIDTH = 2 * `RASTER_DIM_BITS + `RASTER_PID_BITS + 9 * `RASTER_DATA_BITS + 3 * `RASTER_DATA_BITS;
`STATIC_ASSERT(TILE_LOGSIZE > BLOCK_LOGSIZE, ("invalid parameter"))
`STATIC_ASSERT(TILE_FIFO_DEPTH >= (1 << (2 * (TILE_LOGSIZE - BLOCK_LOGSIZE))), ("invalid parameter"))
raster_dcrs_t raster_dcrs;
assign raster_dcrs = raster_dcr_if.data;
@ -191,8 +189,7 @@ module VX_raster_unit #(
.TILE_LOGSIZE (TILE_LOGSIZE),
.BLOCK_LOGSIZE (BLOCK_LOGSIZE),
.OUTPUT_QUADS (OUTPUT_QUADS),
.QUAD_FIFO_DEPTH (QUAD_FIFO_DEPTH),
.TILE_FIFO_DEPTH (TILE_FIFO_DEPTH)
.QUAD_FIFO_DEPTH (QUAD_FIFO_DEPTH)
) raster_slice (
.clk (clk),
.reset (reset),

View file

@ -71,7 +71,7 @@ double sc_time_stamp() {
static bool trace_enabled = false;
static uint64_t trace_start_time = TRACE_START_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
static uint64_t trace_stop_time = TRACE_STOP_TIME;
bool sim_trace_enabled() {
if (timestamp >= trace_start_time

View file

@ -134,6 +134,8 @@ int render(const CGLTrace& trace) {
std::cout << "render" << std::endl;
auto time_begin = std::chrono::high_resolution_clock::now();
uint32_t draw_idx = 0;
// render each draw call
for (auto& drawcall : trace.drawcalls) {
auto& states = drawcall.states;
@ -143,7 +145,7 @@ int render(const CGLTrace& trace) {
// Perform tile binning
auto num_tiles = Binning(tilebuf, primbuf, drawcall.vertices, drawcall.primitives, dst_width, dst_height, drawcall.viewport.near, drawcall.viewport.far, tile_size);
std::cout << "Binning allocated " << std::dec << num_tiles << " tiles with " << primbuf.size() << " total primitives." << std::endl;
std::cout << "Binning allocated " << std::dec << num_tiles << " tiles with " << (primbuf.size() / sizeof(rast_prim_t)) << " total primitives." << std::endl;
if (0 == num_tiles)
continue;
@ -337,6 +339,12 @@ int render(const CGLTrace& trace) {
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
printf("Elapsed time: %lg ms\n", elapsed);
if (draw_idx < trace.drawcalls.size()-1) {
vx_dump_perf(device, stdout);
}
++draw_idx;
}
// download destination buffer