synthesis optimizations

This commit is contained in:
Blaise Tine 2021-06-17 16:43:43 -07:00
parent 1e677c8e5e
commit 57143f5889
16 changed files with 173 additions and 229 deletions

View file

@ -36,19 +36,16 @@ module VX_ibuffer #(
wire writing = enq_fire && (i == ibuf_enq_if.wid);
wire reading = deq_fire && (i == ibuf_deq_if.wid);
wire is_slot0 = empty_r[i] || (alm_empty_r[i] && reading);
wire push = writing && !is_slot0;
wire pop = reading && !alm_empty_r[i];
wire is_head_ptr = empty_r[i] || (alm_empty_r[i] && reading);
VX_skid_buffer #(
.DATAW (DATAW)
) queue (
.clk (clk),
.reset (reset),
.valid_in (push),
.valid_in (writing && !is_head_ptr),
.data_in (q_data_in),
.ready_out(pop),
.ready_out(reading),
.data_out (q_data_prev[i]),
`UNUSED_PIN (ready_in),
`UNUSED_PIN (valid_out)
@ -79,9 +76,9 @@ module VX_ibuffer #(
used_r[i] <= used_r[i] + ADDRW'($signed(2'(writing) - 2'(reading)));
end
if (writing && is_slot0) begin
if (writing && is_head_ptr) begin
q_data_out[i] <= q_data_in;
end else if (pop) begin
end else if (reading) begin
q_data_out[i] <= q_data_prev[i];
end
end
@ -111,26 +108,17 @@ module VX_ibuffer #(
end
// schedule the next instruction to issue
// do round-robin when multiple warps are active
always @(*) begin
deq_valid_n = 0;
deq_wid_n = 'x;
deq_instr_n = 'x;
schedule_table_n = 'x;
always @(*) begin
deq_valid_n = 1;
if (num_warps > 1) begin
deq_valid_n = (| schedule_table);
schedule_table_n = schedule_table;
for (integer i = 0; i < `NUM_WARPS; i++) begin
if (schedule_table[i]) begin
deq_wid_n = `NW_BITS'(i);
deq_instr_n = q_data_out[i];
schedule_table_n[i] = 0;
break;
end
end
end else if (1 == num_warps && !(deq_fire && q_alm_empty[deq_wid])) begin
deq_valid_n = 1;
deq_wid_n = deq_wid;
deq_instr_n = deq_fire ? q_data_prev[deq_wid] : q_data_out[deq_wid];
end else begin
@ -139,6 +127,17 @@ module VX_ibuffer #(
deq_instr_n = q_data_in;
end
end
// do round-robin with multiple active warps
// Combinationally derives the next schedule table from the current one:
// the table is copied as-is, then the first set entry found when scanning
// from index 0 upward is cleared. At most one entry is cleared per evaluation.
// NOTE(review): presumably the cleared entry is the warp selected for issue
// this cycle - confirm against the logic that consumes schedule_table_n.
always @(*) begin
schedule_table_n = schedule_table;
for (integer i = 0; i < `NUM_WARPS; i++) begin
if (schedule_table[i]) begin
schedule_table_n[i] = 0;
break; // stop after the lowest-index set entry
end
end
end
wire warp_added = enq_fire && q_empty[ibuf_enq_if.wid];
wire warp_removed = deq_fire && ~(enq_fire && ibuf_enq_if.wid == deq_wid) && q_alm_empty[deq_wid];

View file

@ -38,8 +38,7 @@ module VX_instr_demux (
wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
.USE_FASTREG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `ALU_BITS + `MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32))
) alu_buffer (
.clk (clk),
.reset (reset),
@ -56,8 +55,7 @@ module VX_instr_demux (
wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
.USE_FASTREG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `LSU_BITS + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32))
) lsu_buffer (
.clk (clk),
.reset (reset),
@ -74,8 +72,7 @@ module VX_instr_demux (
wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32),
.USE_FASTREG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NR_BITS + 32)
) csr_buffer (
.clk (clk),
.reset (reset),
@ -93,8 +90,7 @@ module VX_instr_demux (
wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
.USE_FASTREG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32))
) fpu_buffer (
.clk (clk),
.reset (reset),
@ -115,8 +111,7 @@ module VX_instr_demux (
wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)),
.USE_FASTREG (1)
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32))
) gpu_buffer (
.clk (clk),
.reset (reset),

View file

@ -11,11 +11,17 @@ module VX_scoreboard #(
output wire delay
);
reg [`NUM_WARPS-1:0][`NUM_REGS-1:0] inuse_regs;
wire [`NUM_REGS-1:0] deq_inuse_regs;
assign deq_inuse_regs = inuse_regs[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;
assign delay = (| deq_inuse_regs);
// Combinational register-busy check for the instruction at the ibuffer
// dequeue interface. The flag defaults to 0 and is assigned only by the loop
// iteration whose index equals ibuf_deq_if.wid; it becomes 1 when any bit of
// ibuf_deq_if.used_regs is also set in that warp's inuse_regs entry.
reg is_reg_busy;
always @(*) begin
is_reg_busy = 0;
for (integer i = 0; i < `NUM_WARPS; ++i) begin
if (ibuf_deq_if.wid == `NW_BITS'(i)) begin
// OR-reduce the overlap between in-use and requested registers
is_reg_busy = | (inuse_regs[i] & ibuf_deq_if.used_regs);
end
end
end
// delay is driven directly by the busy flag
assign delay = is_reg_busy;
wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0);
@ -37,6 +43,8 @@ module VX_scoreboard #(
end
end
wire [`NUM_REGS-1:0] deq_inuse_regs = inuse_regs[ibuf_deq_if.wid];
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin

View file

@ -32,8 +32,8 @@ module VX_smem_arb (
VX_stream_demux #(
.NUM_REQS (2),
.DATAW (REQ_DATAW),
.BUFFERED (0)
) rsp_demux (
.BUFFERED (1)
) req_demux (
.clk (clk),
.reset (reset),
.sel (core_req_if.tag[i][0]),

View file

@ -53,7 +53,6 @@ localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LME
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
localparam CCI_RD_WINDOW_SIZE = 8;
localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE;
localparam CCI_RW_PENDING_SIZE= 256;
localparam AFU_ID_L = 16'h0002; // AFU ID Lower
@ -78,15 +77,15 @@ localparam MMIO_SCOPE_WRITE = `AFU_IMAGE_MMIO_SCOPE_WRITE;
localparam MMIO_DEV_CAPS = `AFU_IMAGE_MMIO_DEV_CAPS;
localparam CCI_RD_RQ_TAGW = $clog2(CCI_RD_WINDOW_SIZE);
localparam CCI_RD_RQ_DATAW = CCI_LINE_WIDTH + CCI_RD_RQ_TAGW;
localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE;
localparam CCI_RD_QUEUE_TAGW = $clog2(CCI_RD_WINDOW_SIZE);
localparam CCI_RD_QUEUE_DATAW = CCI_LINE_WIDTH + CCI_ADDR_WIDTH;
localparam STATE_IDLE = 0;
localparam STATE_READ = 1;
localparam STATE_WRITE = 2;
localparam STATE_WRITE = 1;
localparam STATE_READ = 2;
localparam STATE_START = 3;
localparam STATE_RUN = 4;
localparam STATE_MAX_VALUE = 5;
localparam STATE_MAX_VALUE = 4;
localparam STATE_WIDTH = $clog2(STATE_MAX_VALUE);
`ifdef SCOPE
@ -114,11 +113,9 @@ wire [`VX_MEM_LINE_WIDTH-1:0] vx_mem_rsp_data;
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag;
wire vx_mem_rsp_ready;
reg vx_reset;
wire vx_busy;
reg vx_reset;
reg vx_mem_en;
// CMD variables //////////////////////////////////////////////////////////////
t_ccip_clAddr cmd_io_addr;
@ -292,8 +289,9 @@ end
// COMMAND FSM ////////////////////////////////////////////////////////////////
wire cmd_read_done;
wire cmd_write_done;
reg cmd_write_done;
wire cmd_run_done;
reg vx_started;
reg [$clog2(RESET_DELAY+1)-1:0] vx_reset_ctr;
always @(posedge clk) begin
@ -306,9 +304,9 @@ end
always @(posedge clk) begin
if (reset) begin
state <= STATE_IDLE;
state <= STATE_IDLE;
vx_started <= 0;
vx_reset <= 0;
vx_mem_en <= 0;
end else begin
case (state)
STATE_IDLE: begin
@ -358,21 +356,20 @@ always @(posedge clk) begin
STATE_START: begin
// vortex reset cycles
if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin
vx_reset <= 0;
vx_mem_en <= 1;
state <= STATE_RUN;
end
end
STATE_RUN: begin
if (cmd_run_done) begin
vx_mem_en <= 0;
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
`endif
end
if (vx_started) begin
if (cmd_run_done) begin
vx_started <= 0;
state <= STATE_IDLE;
`ifdef DBG_PRINT_OPAE
$display("%t: STATE IDLE", $time);
`endif
end
end else begin
if (vx_reset_ctr == $bits(vx_reset_ctr)'(RESET_DELAY)) begin
vx_started <= 1;
vx_reset <= 0;
end
end
end
default: begin
@ -387,11 +384,12 @@ end
wire cci_mem_rd_req_valid;
wire cci_mem_wr_req_valid;
wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_dout;
wire [CCI_RD_QUEUE_DATAW-1:0] cci_rdq_dout;
wire cci_mem_req_valid;
wire cci_mem_req_rw;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_addr;
wire [CCI_LINE_WIDTH-1:0] cci_mem_req_data;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_req_tag;
wire cci_mem_req_ready;
@ -430,7 +428,7 @@ VX_to_mem #(
.mem_req_addr_in (cci_mem_req_addr),
.mem_req_rw_in (cci_mem_req_rw),
.mem_req_byteen_in ({CCI_LINE_SIZE{1'b1}}),
.mem_req_data_in (cci_rdq_dout[CCI_RD_RQ_DATAW-1:CCI_RD_RQ_TAGW]),
.mem_req_data_in (cci_mem_req_data),
.mem_req_tag_in (cci_mem_req_tag),
.mem_req_ready_in (cci_mem_req_ready),
@ -473,7 +471,7 @@ wire vx_mem_req_valid_qual;
wire vx_mem_req_ready_qual;
assign vx_mem_req_valid_qual = vx_mem_req_valid
&& vx_mem_en
&& vx_started
&& ~vx_mem_is_cout;
assign vx_mem_req_ready = vx_mem_is_cout ? ~cout_q_full : vx_mem_req_ready_qual;
@ -617,19 +615,20 @@ VX_avs_wrapper #(
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_ctr;
wire [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_unqual;
reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr;
wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next;
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_req_tag;
wire [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_tag;
reg [CCI_RD_RQ_TAGW-1:0] cci_rd_rsp_ctr;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_wr_req_addr_base;
wire cci_rd_req_fire;
t_ccip_clAddr cci_rd_req_addr;
reg cci_rd_req_valid, cci_rd_req_wait;
reg [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr;
wire [CCI_ADDR_WIDTH-1:0] cci_rd_req_ctr_next;
wire [CCI_RD_QUEUE_TAGW-1:0] cci_rd_req_tag;
wire [CCI_RD_QUEUE_TAGW-1:0] cci_rd_rsp_tag;
reg [CCI_RD_QUEUE_TAGW-1:0] cci_rd_rsp_ctr;
wire cci_rdq_push, cci_rdq_pop;
wire [CCI_RD_RQ_DATAW-1:0] cci_rdq_din;
wire [CCI_RD_QUEUE_DATAW-1:0] cci_rdq_din;
wire cci_rdq_empty;
always @(*) begin
@ -641,16 +640,15 @@ end
wire cci_mem_wr_req_fire = cci_mem_wr_req_valid && cci_mem_req_ready;
wire cci_rd_rsp_fire = (STATE_WRITE == state)
&& cp2af_sRxPort.c0.rspValid
wire cci_rd_rsp_fire = cp2af_sRxPort.c0.rspValid
&& (cp2af_sRxPort.c0.hdr.resp_type == eRSP_RDLINE);
assign cci_rd_req_tag = CCI_RD_RQ_TAGW'(cci_rd_req_ctr);
assign cci_rd_rsp_tag = CCI_RD_RQ_TAGW'(cp2af_sRxPort.c0.hdr.mdata);
assign cci_rd_req_tag = CCI_RD_QUEUE_TAGW'(cci_rd_req_ctr);
assign cci_rd_rsp_tag = CCI_RD_QUEUE_TAGW'(cp2af_sRxPort.c0.hdr.mdata);
assign cci_rdq_push = cci_rd_rsp_fire;
assign cci_rdq_pop = cci_mem_wr_req_fire;
assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_rd_rsp_tag};
assign cci_rdq_din = {cp2af_sRxPort.c0.data, cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(cci_rd_rsp_tag)};
wire [$clog2(CCI_RD_QUEUE_SIZE+1)-1:0] cci_pending_reads;
wire cci_pending_reads_full;
@ -673,9 +671,7 @@ assign cci_rd_req_fire = cci_rd_req_valid && !(cci_rd_req_wait || cci_pending_re
assign cci_mem_wr_req_valid = !cci_rdq_empty;
assign cci_mem_wr_req_addr = cci_mem_wr_req_addr_unqual + (CCI_ADDR_WIDTH'(CCI_RD_RQ_TAGW'(cci_rdq_dout)));
assign cmd_write_done = (cci_mem_wr_req_ctr == cmd_data_size);
assign cci_mem_wr_req_addr = cci_rdq_dout[CCI_ADDR_WIDTH-1:0];
// Send read requests to CCI
always @(posedge clk) begin
@ -693,11 +689,11 @@ always @(posedge clk) begin
&& (cci_rd_req_ctr_next != cmd_data_size)
&& !cp2af_sRxPort.c0TxAlmFull;
if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
if (cci_rd_req_fire && (cci_rd_req_tag == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
cci_rd_req_wait <= 1; // end current request batch
end
if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
if (cci_rd_rsp_fire && (cci_rd_rsp_ctr == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1))) begin
cci_rd_req_wait <= 0; // begin new request batch
end
end
@ -708,7 +704,8 @@ always @(posedge clk) begin
cci_rd_req_ctr <= 0;
cci_rd_rsp_ctr <= 0;
cci_mem_wr_req_ctr <= 0;
cci_mem_wr_req_addr_unqual <= cmd_mem_addr;
cci_mem_wr_req_addr_base <= cmd_mem_addr;
cmd_write_done <= 0;
end
if (cci_rd_req_fire) begin
@ -720,7 +717,7 @@ always @(posedge clk) begin
end
if (cci_rd_rsp_fire) begin
cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_RQ_TAGW'(1);
cci_rd_rsp_ctr <= cci_rd_rsp_ctr + CCI_RD_QUEUE_TAGW'(1);
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Rd Rsp: idx=%0d, ctr=%0d, data=%0h", $time, cci_rd_rsp_tag, cci_rd_rsp_ctr, cp2af_sRxPort.c0.data);
`endif
@ -733,13 +730,18 @@ always @(posedge clk) begin
end
if (cci_mem_wr_req_fire) begin
cci_mem_wr_req_addr_unqual <= cci_mem_wr_req_addr_unqual + ((CCI_RD_RQ_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_RQ_TAGW'(CCI_RD_WINDOW_SIZE-1)) ? CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE) : CCI_ADDR_WIDTH'(0));
cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1);
if (CCI_RD_QUEUE_TAGW'(cci_mem_wr_req_ctr) == CCI_RD_QUEUE_TAGW'(CCI_RD_WINDOW_SIZE-1)) begin
cci_mem_wr_req_addr_base <= cci_mem_wr_req_addr_base + CCI_ADDR_WIDTH'(CCI_RD_WINDOW_SIZE);
end
cci_mem_wr_req_ctr <= cci_mem_wr_req_ctr + CCI_ADDR_WIDTH'(1);
if (cci_mem_wr_req_ctr == (cmd_data_size-1)) begin
cmd_write_done <= 1;
end
end
end
VX_fifo_queue #(
.DATAW (CCI_RD_RQ_DATAW),
.DATAW (CCI_RD_QUEUE_DATAW),
.SIZE (CCI_RD_QUEUE_SIZE)
) cci_rd_req_queue (
.clk (clk),
@ -779,11 +781,13 @@ VX_fifo_queue #(
reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_ctr;
reg [CCI_ADDR_WIDTH-1:0] cci_mem_rd_req_addr;
reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr;
reg cci_mem_rd_req_done;
reg [CCI_ADDR_WIDTH-1:0] cci_wr_req_ctr;
reg cci_wr_req_fire;
t_ccip_clAddr cci_wr_req_addr;
t_ccip_clData cci_wr_req_data;
reg cci_wr_req_done;
always @(*) begin
af2cp_sTxPort.c1.valid = cci_wr_req_fire;
@ -818,12 +822,12 @@ VX_pending_size #(
`UNUSED_VAR (cci_pending_writes)
assign cci_mem_rd_req_valid = (STATE_READ == state)
&& (cci_mem_rd_req_ctr != cmd_data_size);
&& !cci_mem_rd_req_done;
assign cci_mem_rsp_ready = !cp2af_sRxPort.c1TxAlmFull
&& !cci_pending_writes_full;
assign cmd_read_done = (0 == cci_wr_req_ctr)
assign cmd_read_done = cci_wr_req_done
&& cci_pending_writes_empty;
// Send write requests to CCI
@ -839,12 +843,17 @@ begin
&& (CMD_MEM_READ == cmd_type)) begin
cci_mem_rd_req_ctr <= 0;
cci_mem_rd_req_addr <= cmd_mem_addr;
cci_mem_rd_req_done <= 0;
cci_wr_req_ctr <= cmd_data_size;
cci_wr_req_done <= 0;
end
if (cci_mem_rd_req_fire) begin
cci_mem_rd_req_addr <= cci_mem_rd_req_addr + CCI_ADDR_WIDTH'(1);
cci_mem_rd_req_ctr <= cci_mem_rd_req_ctr + CCI_ADDR_WIDTH'(1);
if (cci_mem_rd_req_ctr == (cmd_data_size-1)) begin
cci_mem_rd_req_done <= 1;
end
end
cci_wr_req_addr <= cmd_io_addr + t_ccip_clAddr'(cci_mem_rsp_tag);
@ -853,6 +862,9 @@ begin
if (cci_wr_req_fire) begin
assert(cci_wr_req_ctr != 0);
cci_wr_req_ctr <= cci_wr_req_ctr - CCI_ADDR_WIDTH'(1);
if (cci_wr_req_ctr == CCI_ADDR_WIDTH'(1)) begin
cci_wr_req_done <= 1;
end
`ifdef DBG_PRINT_OPAE
$display("%t: CCI Wr Req: addr=%0h, rem=%0d, pending=%0d, data=%0h", $time, cci_wr_req_addr, (cci_wr_req_ctr - 1), cci_pending_writes, af2cp_sTxPort.c1.data);
`endif
@ -867,9 +879,10 @@ end
//--
assign cci_mem_req_rw = (CMD_MEM_WRITE == state);
// Request mux between the CCI write path and the CCI read path.
// The write/read select is taken from bit 0 of the FSM state.
// NOTE(review): STATE_START=3 also has bit 0 set - confirm that is harmless.
assign cci_mem_req_rw = state[0]; // STATE_WRITE=01, STATE_READ=10
assign cci_mem_req_valid = cci_mem_req_rw ? cci_mem_wr_req_valid : cci_mem_rd_req_valid;
assign cci_mem_req_addr = cci_mem_req_rw ? cci_mem_wr_req_addr : cci_mem_rd_req_addr;
// Write data is the upper field of the read-queue output, above the
// CCI_ADDR_WIDTH-bit address field held in the low bits.
assign cci_mem_req_data = cci_rdq_dout[CCI_RD_QUEUE_DATAW-1:CCI_ADDR_WIDTH];
assign cci_mem_req_tag = cci_mem_req_rw ? cci_mem_wr_req_ctr : cci_mem_rd_req_ctr;
// Vortex /////////////////////////////////////////////////////////////////////
@ -920,7 +933,7 @@ assign cout_char = vx_mem_req_data_ar[cout_tid];
assign vx_mem_is_cout = (vx_mem_req_addr == `VX_MEM_ADDR_WIDTH'(`IO_COUT_ADDR >> (32 - `VX_MEM_ADDR_WIDTH)));
wire cout_q_push = vx_mem_req_valid
&& vx_mem_en
&& vx_started
&& vx_mem_is_cout
&& ~cout_q_full;

View file

@ -475,8 +475,7 @@ module VX_bank #(
end
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS),
.USE_FASTREG (NUM_BANKS == 1)
.DATAW (CORE_TAG_WIDTH + (1 + `WORD_WIDTH + `REQS_BITS) * NUM_PORTS)
) core_rsp_req (
.clk (clk),
.reset (reset),

View file

@ -106,8 +106,7 @@ module VX_cache_core_rsp_merge #(
wire core_rsp_valid_any = (| per_bank_core_rsp_valid);
VX_skid_buffer #(
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)),
.USE_FASTREG (1)
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH))
) pipe_reg (
.clk (clk),
.reset (reset),
@ -155,8 +154,7 @@ module VX_cache_core_rsp_merge #(
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH),
.USE_FASTREG (1)
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH)
) pipe_reg (
.clk (clk),
.reset (reset),

View file

@ -48,7 +48,7 @@ module VX_priority_encoder #(
VX_onehot_encoder #(
.N (N),
.REVERSE (REVERSE)
) b (
) onehot_encoder (
.data_in (onehot),
.data_out (index),
`UNUSED_PIN (valid)

View file

@ -3,7 +3,8 @@
module VX_rr_arbiter #(
parameter NUM_REQS = 1,
parameter LOCK_ENABLE = 0,
parameter LOG_NUM_REQS = $clog2(NUM_REQS)
parameter LOG_NUM_REQS = $clog2(NUM_REQS),
parameter FAST = 1
) (
input wire clk,
input wire reset,
@ -23,6 +24,58 @@ module VX_rr_arbiter #(
assign grant_onehot = requests;
assign grant_valid = requests[0];
end else if (FAST == 1) begin
wire [NUM_REQS-1:0] req_masked;
wire [NUM_REQS-1:0] grant, grant_masked, grant_unmasked;
/* verilator lint_off UNOPTFLAT */
wire [NUM_REQS-1:0] mask_higher_pri_reqs;
/* verilator lint_off UNOPTFLAT */
wire [NUM_REQS-1:0] unmask_higher_pri_reqs;
wire no_req_masked;
reg [NUM_REQS-1:0] pointer_reg;
// Simple priority arbitration for masked portion
assign req_masked = requests & pointer_reg;
assign mask_higher_pri_reqs[NUM_REQS-1:1] = mask_higher_pri_reqs[NUM_REQS-2:0] | req_masked[NUM_REQS-2:0];
assign mask_higher_pri_reqs[0] = 1'b0;
assign grant_masked[NUM_REQS-1:0] = req_masked[NUM_REQS-1:0] & ~mask_higher_pri_reqs[NUM_REQS-1:0];
// Simple priority arbitration for unmasked portion
assign unmask_higher_pri_reqs[NUM_REQS-1:1] = unmask_higher_pri_reqs[NUM_REQS-2:0] | requests[NUM_REQS-2:0];
assign unmask_higher_pri_reqs[0] = 1'b0;
assign grant_unmasked[NUM_REQS-1:0] = requests[NUM_REQS-1:0] & ~unmask_higher_pri_reqs[NUM_REQS-1:0];
// Use grant_masked if there is any there, otherwise use grant_unmasked.
assign no_req_masked = ~(| req_masked);
assign grant = ({NUM_REQS{no_req_masked}} & grant_unmasked) | grant_masked;
// Generate arbiter pointer update
wire mask_ptr_sel = (| req_masked) & (!LOCK_ENABLE || enable);
wire unmask_ptr_sel = (| requests) & (!LOCK_ENABLE || enable);
// Pointer update
// pointer_reg is the mask ANDed with requests to form req_masked.
// - reset: all ones, so every request passes the mask.
// - mask_ptr_sel: a masked request exists (and the arbiter is enabled when
//   LOCK_ENABLE is set); load the mask of indices above the lowest masked
//   request.
// - unmask_ptr_sel: otherwise, if any request exists, load the mask of
//   indices above the lowest raw request.
// - neither: the pointer holds its value.
always @(posedge clk) begin
if (reset) begin
pointer_reg <= {NUM_REQS{1'b1}};
end else if (mask_ptr_sel) begin // select if masked arbiter used
pointer_reg <= mask_higher_pri_reqs;
end else if (unmask_ptr_sel) begin // select if unmasked arbiter used
pointer_reg <= unmask_higher_pri_reqs;
end
end
VX_onehot_encoder #(
.N (NUM_REQS)
) onehot_encoder (
.data_in (grant),
.data_out (grant_index),
`UNUSED_PIN (valid)
);
assign grant_onehot = grant;
assign grant_valid = (| requests);
end else begin
reg [LOG_NUM_REQS-1:0] grant_table [NUM_REQS-1:0];

View file

@ -83,7 +83,7 @@ module VX_skid_buffer #(
end
if (pop && !use_buffer) begin
data_out_r <= data_in;
end else if (pop) begin
end else if (ready_out) begin
data_out_r <= buffer;
end
end

View file

@ -76,16 +76,16 @@ $(FPGA_BUILD_DIR)_4c/build/dcp.qpf:
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_4c
$(FPGA_BUILD_DIR)_8c/build/dcp.qpf:
afu_synth_setup -s setup8.cfg $(FPGA_BUILD_DIR)_8c
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_8c
$(FPGA_BUILD_DIR)_16c/build/dcp.qpf:
afu_synth_setup -s setup16.cfg $(FPGA_BUILD_DIR)_16c
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_16c
$(FPGA_BUILD_DIR)_32c/build/dcp.qpf:
afu_synth_setup -s setup16.cfg $(FPGA_BUILD_DIR)_32c
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_32c
$(FPGA_BUILD_DIR)_64c/build/dcp.qpf:
afu_synth_setup -s setup16.cfg $(FPGA_BUILD_DIR)_64c
afu_synth_setup -s setup.cfg $(FPGA_BUILD_DIR)_64c
gen-sources-1c:
./gen_sources.sh $(CFLAGS) $(CONFIG1) > sources.txt

View file

@ -1,7 +0,0 @@
+define+SYNTHESIS
+define+QUARTUS
vortex_afu16.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -1,7 +0,0 @@
+define+SYNTHESIS
+define+QUARTUS
vortex_afu8.json
QI:vortex_afu.qsf
C:sources.txt

View file

@ -2,8 +2,8 @@
"version": 1,
"afu-image": {
"power": 0,
"clock-frequency-high": "auto-220",
"clock-frequency-low": "auto-220",
"clock-frequency-high": "auto-210",
"clock-frequency-low": "auto-210",
"cmd-mem-read": 1,
"cmd-mem-write": 2,

View file

@ -1,53 +0,0 @@
{
"version": 1,
"afu-image": {
"power": 0,
"clock-frequency-high": "auto-200",
"clock-frequency-low": "auto-200",
"cmd-mem-read": 1,
"cmd-mem-write": 2,
"cmd-run": 3,
"cmd-csr-read": 4,
"cmd-csr-write": 5,
"mmio-cmd-type": 10,
"mmio-io-addr": 12,
"mmio-mem-addr": 14,
"mmio-data-size": 16,
"mmio-status": 18,
"mmio-scope-read": 20,
"mmio-scope-write": 22,
"mmio-dev-caps": 24,
"afu-top-interface":
{
"class": "ccip_std_afu_avalon_mm",
"module-ports" :
[
{
"class": "cci-p",
"params":
{
"clock": "uClk_usr"
}
},
{
"class": "local-memory",
"params":
{
"clock": "uClk_usr"
}
}
]
},
"accelerator-clusters":
[
{
"name": "vortex_afu",
"total-contexts": 1,
"accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c"
}
]
}
}

View file

@ -1,54 +0,0 @@
{
"version": 1,
"afu-image": {
"power": 0,
"clock-frequency-high": "auto-210",
"clock-frequency-low": "auto-210",
"cmd-mem-read": 1,
"cmd-mem-write": 2,
"cmd-run": 3,
"cmd-csr-read": 4,
"cmd-csr-write": 5,
"mmio-cmd-type": 10,
"mmio-io-addr": 12,
"mmio-mem-addr": 14,
"mmio-data-size": 16,
"mmio-status": 18,
"mmio-scope-read": 20,
"mmio-scope-write": 22,
"mmio-dev-caps": 24,
"afu-top-interface":
{
"class": "ccip_std_afu_avalon_mm",
"module-ports" :
[
{
"class": "cci-p",
"params":
{
"clock": "uClk_usr"
}
},
{
"class": "local-memory",
"params":
{
"clock": "uClk_usr"
}
}
]
},
"accelerator-clusters":
[
{
"name": "vortex_afu",
"total-contexts": 1,
"accelerator-type-uuid": "35f9452b-25c2-434c-93d5-6f8c60db361c"
}
]
}
}