optimized opae cci to dev memcpy using double buffering and request window to work around unordered read requests

This commit is contained in:
Blaise Tine 2020-04-23 01:30:45 -07:00
parent 3e64cb4380
commit 77a52ea20b
10 changed files with 249 additions and 110 deletions

View file

@ -57,6 +57,12 @@ int vx_start(vx_device_h hdevice);
// Wait for device ready with milliseconds timeout
int vx_ready_wait(vx_device_h hdevice, long long timeout);
// set device constant registers
int vx_set_regiters(int state, int value);
// get device constant registers
int vx_get_regiters(int state, int* value);
////////////////////////////// UTILITY FUNCTIONS ///////////////////////////////
// upload kernel bytes to device

Binary file not shown.

View file

@ -38,9 +38,13 @@ make clean
make
./basic
ASE build instructions
#ASE build instructions
vcd file vortex.vcd
vcd add -r /*/Vortex/hw/rtl/*
run -all
run -all
#compress
tar -zcvf vortex.vcd.tar.gz work/vortex.vcd
# decompress
tar -zxvf vortex.vcd.tar.gz work/vortex.vcd

View file

@ -72,7 +72,7 @@ vortex_afu.json
../rtl/VX_dmem_ctrl.v
../rtl/VX_alu_unit.v
../rtl/VX_csr_data.v
../rtl/VX_lsu_uint.v
../rtl/VX_lsu_unit.v
../rtl/VX_decode.v
../rtl/VX_inst_multiplex.v
../rtl/VX_csr_wrapper.v

View file

@ -1,12 +1,7 @@
// Interface between CSR and FSM
// All the MMIOs read/write are done from CSR and passed to the FSM for state transitions
// To be done:
// Change address size to buffer's address size and data size based on IO address size. Check from hello_world
`include "platform_if.vh"
import local_mem_cfg_pkg::*;
`include "afu_json_info.vh"
`include "VX_define.vh"
module vortex_afu #(
parameter NUM_LOCAL_MEM_BANKS = 2
@ -35,6 +30,9 @@ module vortex_afu #(
localparam AVS_RD_QUEUE_SIZE = 16;
localparam CCI_RD_WINDOW_SIZE = 8;
localparam CCI_RD_QUEUE_SIZE = 2 * CCI_RD_WINDOW_SIZE;
localparam VX_SNOOP_DELAY = 300;
localparam VX_SNOOP_LEVELS = 2;
@ -62,6 +60,9 @@ typedef enum logic[3:0] {
STATE_CLFLUSH
} state_t;
typedef logic [`LOG2UP(CCI_RD_WINDOW_SIZE)-1:0] t_cci_rdq_tag;
typedef logic [$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:0] t_cci_rdq_data;
state_t state;
// Vortex signals /////////////////////////////////////////////////////////////
@ -117,8 +118,7 @@ end
always_ff @(posedge clk)
begin
if (SoftReset)
begin
if (SoftReset) begin
af2cp_sTxPort.c2.hdr <= 0;
af2cp_sTxPort.c2.data <= 0;
af2cp_sTxPort.c2.mmioRdValid <= 0;
@ -152,12 +152,17 @@ begin
csr_cmd <= $bits(csr_cmd)'(cp2af_sRxPort.c0.data);
$display("%t: CSR_CMD: %0d", $time, $bits(csr_cmd)'(cp2af_sRxPort.c0.data));
end
default: begin
// user-defined CSRs
//if (mmioHdr.address >= MMIO_CSR_USER) begin
// write Vortex CSR
//end
end
endcase
end
// serve MMIO read requests
if (cp2af_sRxPort.c0.mmioRdValid)
begin
if (cp2af_sRxPort.c0.mmioRdValid) begin
af2cp_sTxPort.c2.hdr.tid <= mmioHdr.tid; // copy TID
case (mmioHdr.address)
// AFU header
@ -176,8 +181,9 @@ begin
16'h0006: af2cp_sTxPort.c2.data <= 64'h0; // next AFU
16'h0008: af2cp_sTxPort.c2.data <= 64'h0; // reserved
MMIO_CSR_STATUS: begin
if (state != af2cp_sTxPort.c2.data)
if (state != af2cp_sTxPort.c2.data) begin
$display("%t: STATUS: state=%0d", $time, state);
end
af2cp_sTxPort.c2.data <= state;
end
default: af2cp_sTxPort.c2.data <= 64'h0;
@ -198,8 +204,7 @@ logic vx_reset;
always_ff @(posedge clk)
begin
if (SoftReset)
begin
if (SoftReset) begin
state <= STATE_IDLE;
vx_reset <= 0;
end
@ -217,7 +222,7 @@ begin
CMD_TYPE_WRITE: begin
$display("%t: STATE WRITE: ia=%h da=%h sz=%0d", $time, csr_io_addr, csr_mem_addr, csr_data_size);
state <= STATE_WRITE;
end
end
CMD_TYPE_RUN: begin
$display("%t: STATE START", $time);
vx_reset <= 1;
@ -231,29 +236,25 @@ begin
end
STATE_READ: begin
if (cci_write_ctr >= csr_data_size)
begin
if (cci_write_ctr >= csr_data_size) begin
state <= STATE_IDLE;
end
end
STATE_WRITE: begin
if (avs_write_ctr >= csr_data_size)
begin
if (avs_write_ctr >= csr_data_size) begin
state <= STATE_IDLE;
end
end
STATE_RUN: begin
if (vx_ebreak)
begin
if (vx_ebreak) begin
state <= STATE_IDLE;
end
end
STATE_CLFLUSH: begin
if (vx_snoop_delay >= VX_SNOOP_DELAY)
begin
if (vx_snoop_delay >= VX_SNOOP_DELAY) begin
state <= STATE_IDLE;
end
end
@ -264,6 +265,20 @@ end
// AVS Controller /////////////////////////////////////////////////////////////
logic cci_rdq_empty;
t_cci_rdq_data cci_rdq_dout;
logic cci_rdq_pop;
t_ccip_clAddr next_avs_address;
// Combinational helpers for the AVS write path (host-to-device copy).
// - next_avs_address: local-memory line address for the entry at the head of
//   the CCI read-response queue.
// - cci_rdq_pop: asserted when that head entry can be consumed this cycle.
always_comb
begin
// The low $bits(t_cci_rdq_tag) bits of a queue entry hold the tag echoed back
// with the CCI response (entries are packed as {data, tag}); the remaining
// upper address bits come from the running AVS write counter.
// NOTE(review): this assumes every entry in the queue belongs to the batch
// currently indicated by avs_write_ctr's upper bits - confirm against the
// request-window logic.
next_avs_address = csr_mem_addr + {avs_write_ctr[31:$bits(t_cci_rdq_tag)], t_cci_rdq_tag'(cci_rdq_dout)};
// Pop only in STATE_WRITE, when data is available, the AVS port is not
// stalling, and there are still lines left to write.
cci_rdq_pop = (state == STATE_WRITE
&& !cci_rdq_empty
&& !avs_waitrequest
&& avs_write_ctr < csr_data_size);
end
always_ff @(posedge clk)
begin
if (SoftReset)
@ -295,22 +310,21 @@ begin
&& !avs_waitrequest
&& avs_read_ctr < csr_data_size)
begin
avs_address <= csr_mem_addr + avs_read_ctr;
avs_read <= 1;
avs_address <= csr_mem_addr + avs_read_ctr;
avs_read_ctr <= avs_read_ctr + 1;
avs_read <= 1;
$display("%t: AVS Rd Req: addr=%h", $time, csr_mem_addr + avs_read_ctr);
end
end
STATE_WRITE: begin
if (cp2af_sRxPort.c0.rspValid
&& avs_write_ctr < csr_data_size)
begin
avs_writedata <= cp2af_sRxPort.c0.data;
avs_address <= csr_mem_addr + avs_write_ctr;
avs_write <= 1;
if (cci_rdq_pop)
begin
avs_writedata <= cci_rdq_dout[$bits(t_ccip_clData) + $bits(t_cci_rdq_tag)-1:$bits(t_cci_rdq_tag)];
avs_address <= next_avs_address;
avs_write_ctr <= avs_write_ctr + 1;
$display("%t: AVS Wr Req: addr=%h (%0d/%0d)", $time, csr_mem_addr + avs_write_ctr, avs_write_ctr + 1, csr_data_size);
avs_write <= 1;
$display("%t: AVS Wr Req: addr=%h (%0d/%0d)", $time, next_avs_address, avs_write_ctr + 1, csr_data_size);
end
end
@ -319,7 +333,7 @@ begin
&& vx_dram_req_ready)
begin
avs_address <= (vx_dram_req_addr >> 6);
avs_read <= 1;
avs_read <= 1;
$display("%t: AVS Rd Req: addr=%h", $time, vx_dram_req_addr >> 6);
end
@ -327,8 +341,8 @@ begin
&& vx_dram_req_ready)
begin
avs_writedata <= vx_dram_req_data;
avs_address <= (vx_dram_req_addr >> 6);
avs_write <= 1;
avs_address <= (vx_dram_req_addr >> 6);
avs_write <= 1;
$display("%t: AVS Wr Req: addr=%h", $time, vx_dram_req_addr >> 6);
end
end
@ -362,11 +376,11 @@ end
// AVS address read request queue /////////////////////////////////////////////
logic cci_write_req;
logic cci_wr_req;
always_comb
begin
avs_raq_pop = vx_dram_rsp_valid || cci_write_req;
avs_raq_pop = vx_dram_rsp_valid || cci_wr_req;
avs_raq_din = avs_address;
avs_raq_push = avs_read;
end
@ -374,7 +388,7 @@ end
VX_generic_queue #(
.DATAW($bits(t_local_mem_addr)),
.SIZE(AVS_RD_QUEUE_SIZE)
) vx_rd_addr_queue (
) avs_rd_req_queue (
.clk (clk),
.reset (SoftReset),
.push (avs_raq_push),
@ -397,7 +411,7 @@ end
VX_generic_queue #(
.DATAW($bits(t_local_mem_data)),
.SIZE(AVS_RD_QUEUE_SIZE)
) vx_rd_data_queue (
) avs_rd_rsp_queue (
.clk (clk),
.reset (SoftReset),
.push (avs_rdq_push),
@ -410,101 +424,134 @@ VX_generic_queue #(
// CCI Read Request ///////////////////////////////////////////////////////////
t_ccip_c0_ReqMemHdr rd_hdr;
t_ccip_c0_ReqMemHdr cci_read_hdr;
logic cci_read_pending;
logic [31:0] cci_read_ctr;
t_cci_rdq_tag cci_rdq_ctr;
logic cci_rdq_full;
logic cci_rdq_push;
t_cci_rdq_data cci_rdq_din;
logic cci_read_wait;
always_comb
begin
rd_hdr = t_ccip_c0_ReqMemHdr'(0);
rd_hdr.address = csr_io_addr + avs_write_ctr;
cci_read_hdr = t_ccip_c0_ReqMemHdr'(0);
cci_read_hdr.address = csr_io_addr + cci_read_ctr;
cci_read_hdr.mdata = t_cci_rdq_tag'(cci_read_ctr);
cci_rdq_push = (STATE_WRITE == state) && cp2af_sRxPort.c0.rspValid;
cci_rdq_din = {cp2af_sRxPort.c0.data, t_cci_rdq_tag'(cp2af_sRxPort.c0.hdr.mdata)};
end
// Send read requests to CCI
always_ff @(posedge clk)
begin
if (SoftReset)
begin
if (SoftReset) begin
af2cp_sTxPort.c0.hdr <= 0;
af2cp_sTxPort.c0.valid <= 0;
cci_read_pending <= 0;
cci_read_ctr <= 0;
cci_rdq_ctr <= 0;
cci_read_wait <= 0;
end
else begin
af2cp_sTxPort.c0.valid <= 0;
if (STATE_WRITE == state
&& !cp2af_sRxPort.c0TxAlmFull // ensure read queue not full
&& !avs_waitrequest // ensure AVS write queue not full
&& !cci_read_pending // ensure no read pending
&& avs_write_ctr < csr_data_size) // ensure not done
begin
af2cp_sTxPort.c0.hdr <= rd_hdr;
af2cp_sTxPort.c0.valid <= 1;
cci_read_pending <= 1;
$display("%t: CCI Rd Req: addr=%h", $time, rd_hdr.address);
if (STATE_IDLE == state) begin
cci_read_ctr <= 0;
cci_rdq_ctr <= 0;
cci_read_wait <= 0;
end
if (cci_read_pending
&& cp2af_sRxPort.c0.rspValid)
if (STATE_WRITE == state
&& !cp2af_sRxPort.c0TxAlmFull // ensure read queue not full
&& !cci_rdq_full // ensure destination queue not full
&& !cci_read_wait // ensure the last batch has arrived
&& cci_read_ctr < csr_data_size) // ensure not done
begin
$display("%t: CCI Rd Rsp", $time);
cci_read_pending <= 0;
end
af2cp_sTxPort.c0.hdr <= cci_read_hdr;
af2cp_sTxPort.c0.valid <= 1;
cci_read_ctr <= cci_read_ctr + 1;
// End the current request batch on every CCI_RD_WINDOW_SIZE-th request.
// cci_read_ctr counts requests for the whole transfer (it is only cleared in
// STATE_IDLE), so comparing its full value against the window size would match
// once, in the first batch only, and later batches would never be throttled.
// Compare just the tag-width low bits, i.e. the same value sent as mdata.
// NOTE(review): like cci_rdq_ctr, this relies on CCI_RD_WINDOW_SIZE being a
// power of two so that the tag wraps exactly at the window boundary.
if (t_cci_rdq_tag'(cci_read_ctr) == t_cci_rdq_tag'(CCI_RD_WINDOW_SIZE-1)) begin
cci_read_wait <= 1; // end current request batch
end
$display("%t: CCI Rd Req: addr=%h", $time, cci_read_hdr.address);
end
if (cci_rdq_push) begin
cci_rdq_ctr <= cci_rdq_ctr + 1;
if (cci_rdq_ctr == (CCI_RD_WINDOW_SIZE-1)) begin
cci_read_wait <= 0; // restart new request batch
end
$display("%t: CCI Rd Rsp: idx=%d, ctr=%d", $time, t_cci_rdq_tag'(cp2af_sRxPort.c0.hdr.mdata), cci_rdq_ctr);
end
end
end
// FIFO between the CCI read-response port and the AVS write logic.
// Each entry is {cache-line data, response tag}. Despite the "req" in the
// instance name, entries are pushed when a CCI read response arrives
// (cci_rdq_push) and popped by the AVS write path (cci_rdq_pop).
VX_generic_queue #(
.DATAW($bits(t_ccip_clData) + $bits(t_cci_rdq_tag)), // same width as t_cci_rdq_data
.SIZE(CCI_RD_QUEUE_SIZE) // 2 * CCI_RD_WINDOW_SIZE entries
) cci_rd_req_queue (
.clk (clk),
.reset (SoftReset),
.push (cci_rdq_push),
.data_in (cci_rdq_din),
.pop (cci_rdq_pop),
.data_out (cci_rdq_dout),
.empty (cci_rdq_empty),
.full (cci_rdq_full) // gates issuing of new CCI read requests
);
// CCI Write Request //////////////////////////////////////////////////////////
t_ccip_c1_ReqMemHdr wr_hdr;
t_ccip_c1_ReqMemHdr cci_write_hdr;
logic cci_write_pending;
logic cci_write_wait;
always_comb
begin
cci_write_req = (STATE_READ == state)
&& !avs_rdq_empty
&& !cp2af_sRxPort.c1TxAlmFull
&& !cci_write_pending
&& cci_write_ctr < csr_data_size;
cci_wr_req = (STATE_READ == state)
&& !avs_rdq_empty
&& !cp2af_sRxPort.c1TxAlmFull
&& !cci_write_wait
&& cci_write_ctr < csr_data_size;
wr_hdr = t_ccip_c1_ReqMemHdr'(0);
wr_hdr.address = csr_io_addr + cci_write_ctr;
wr_hdr.sop = 1; // single line write mode
cci_write_hdr = t_ccip_c1_ReqMemHdr'(0);
cci_write_hdr.address = csr_io_addr + cci_write_ctr;
cci_write_hdr.sop = 1; // single line write mode
end
// Send write requests to CCI
always_ff @(posedge clk)
begin
if (SoftReset)
begin
if (SoftReset) begin
af2cp_sTxPort.c1.hdr <= 0;
af2cp_sTxPort.c1.data <= 0;
af2cp_sTxPort.c1.valid <= 0;
cci_write_ctr <= 0;
cci_write_pending <= 0;
cci_write_wait <= 0;
end
else begin
af2cp_sTxPort.c1.valid <= 0;
if (STATE_IDLE == state)
begin
if (STATE_IDLE == state) begin
cci_write_ctr <= 0;
end
if (cci_write_req)
begin
af2cp_sTxPort.c1.hdr <= wr_hdr;
if (cci_wr_req) begin
af2cp_sTxPort.c1.hdr <= cci_write_hdr;
af2cp_sTxPort.c1.data <= t_ccip_clData'(avs_rdq_dout);
af2cp_sTxPort.c1.valid <= 1;
cci_write_pending <= 1;
$display("%t: CCI Wr Req: addr=%h", $time, wr_hdr.address);
cci_write_wait <= 1;
$display("%t: CCI Wr Req: addr=%h", $time, cci_write_hdr.address);
end
if (cci_write_pending
if (cci_write_wait
&& cp2af_sRxPort.c1.rspValid)
begin
cci_write_ctr <= cci_write_ctr + 1;
cci_write_pending <= 0;
cci_write_ctr <= cci_write_ctr + 1;
cci_write_wait <= 0;
$display("%t: CCI Wr Rsp (%0d/%0d)", $time, cci_write_ctr + 1, csr_data_size);
end
end
@ -514,15 +561,13 @@ end
always_ff @(posedge clk)
begin
if (SoftReset)
begin
if (SoftReset) begin
vx_snp_req <= 0;
vx_snoop_ctr <= 0;
vx_snoop_delay <= 0;
end
else begin
if (STATE_IDLE == state)
begin
if (STATE_IDLE == state) begin
vx_snoop_ctr <= 0;
vx_snoop_delay <= 0;
end
@ -532,14 +577,13 @@ begin
if ((STATE_CLFLUSH == state)
&& vx_snoop_ctr < csr_data_size
&& vx_snp_req_ready)
begin
begin
vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6;
vx_snp_req <= 1;
vx_snoop_ctr <= vx_snoop_ctr + 1;
end
if (vx_snoop_ctr == csr_data_size)
begin
if (vx_snoop_ctr == csr_data_size) begin
vx_snoop_delay <= vx_snoop_delay + 1;
end
end

View file

@ -29,7 +29,12 @@
if (!(cond)) $error(msg); \
endgenerate
`define LOG2UP(x) ((x > 1) ? $clog2(x) : 1)
// Integer math helper macros.
// None of these end in ';' so they can be used inside expressions
// (ranges, parameter values, comparisons). Arguments are parenthesized so
// that compound expressions passed as arguments keep their precedence.

// Ceiling of log2(x).
`define CLOG2(x) $clog2(x)

// Floor of log2(x): $clog2 rounds up, so subtract one when x is not a power of two.
`define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > (x)) ? 1 : 0))

// Ceiling of log2(x), but never less than 1 (usable as a bit width).
`define LOG2UP(x) (((x) > 1) ? $clog2(x) : 1)

// Smaller / larger of two values.
`define MIN(x, y) (((x) < (y)) ? (x) : (y))
`define MAX(x, y) (((x) > (y)) ? (x) : (y))
///////////////////////////////////////////////////////////////////////////////

View file

@ -43,6 +43,15 @@ module Vortex #(
input wire [31:0] llc_snp_req_addr,
output wire llc_snp_req_ready,
// CSR request
//input wire csr_read_valid;
//input wire csr_write_valid;
//input wire [`CSR_WIDTH-1:0] csr_index;
//input wire csr_data_in;
//output wire [15:0] csr_data_out;
output wire ebreak
);
`DEBUG_BEGIN

View file

@ -10,16 +10,16 @@ module VX_generic_queue #(
input wire push,
input wire pop,
output wire empty,
output wire full,
output wire full,
`IGNORE_WARNINGS_END
input wire [DATAW-1:0] data_in,
output wire [DATAW-1:0] data_out
);
if (SIZE == 0) begin
assign empty = 1;
assign data_out = data_in;
assign full = 0;
assign empty = 1;
assign data_out = data_in;
assign full = 0;
end else begin // (SIZE > 0)
@ -56,10 +56,9 @@ module VX_generic_queue #(
end
end
assign data_out = head_r;
assign empty = (size_r == 0);
assign full = (size_r != 0) && !pop;
assign data_out = head_r;
assign empty = (size_r == 0);
assign full = (size_r != 0);
end else begin // (SIZE > 1)
reg [DATAW-1:0] curr_r;
@ -82,18 +81,21 @@ module VX_generic_queue #(
always @(posedge clk) begin
if (reset) begin
size_r <= 0;
empty_r <= 1;
empty_r <= 1;
full_r <= 0;
end else begin
if (writing && !reading) begin
size_r <= size_r + 1;
empty_r <= 0;
if (size_r == SIZE-1)
if (size_r == SIZE-1) begin
full_r <= 1;
end else if (reading && !writing) begin
end
end else
if (reading && !writing) begin
size_r <= size_r - 1;
if (size_r == 1)
empty_r <= 1;
if (size_r == 1) begin
empty_r <= 1;
end;
full_r <= 0;
end
end
@ -133,5 +135,5 @@ module VX_generic_queue #(
assign full = full_r;
end
end
endmodule

View file

@ -0,0 +1,19 @@
`include "VX_define.vh"
// Placeholder module: declares only the clock and reset ports and contains
// no logic yet.
module VX_tex_mgr (
    input wire clk,
    input wire reset    // last port: no trailing comma allowed before ')'
);
//--
endmodule

View file

@ -0,0 +1,50 @@
`include "VX_define.vh"
// Interface-only stub: the module declares its parameters and ports
// (texture request/response, cache request/response) but has no body yet.
//
// Parameters (each one sizes the ports noted):
//   TADDRW  - width of tex_req_u / tex_req_v
//   MADDRW  - width of tex_req_addr, cache_req_addrs entries, cache_rsp_addr
//   DATAW   - width of tex_rsp_data and cache_rsp_data
//   MAXWTW  - width of tex_req_width
//   MAXHTW  - width of tex_req_height
//   MAXFTW  - width of tex_req_format
//   MAXFMW  - width of tex_req_filter
//   MAXAMW  - width of tex_req_clamp
//   TAGW    - width of tex_req_tag / tex_rsp_tag
//   NUMCRQS - number of cache request lanes (cache_req_valids / cache_req_addrs)
module VX_tex_unit #(
    parameter TADDRW  = 32,
    parameter MADDRW  = 32,
    parameter DATAW   = 32,
    parameter MAXWTW  = 8,
    parameter MAXHTW  = 8,
    parameter MAXFTW  = 2,
    parameter MAXFMW  = 1,
    parameter MAXAMW  = 2,
    parameter TAGW    = 16,
    parameter NUMCRQS = 32    // last parameter: no trailing comma allowed before ')'
) (
    input wire clk,
    input wire reset,

    // Texture Request
    input wire                tex_req_valid,
    input wire [TADDRW-1:0]   tex_req_u,
    input wire [TADDRW-1:0]   tex_req_v,
    input wire [MADDRW-1:0]   tex_req_addr,
    input wire [MAXWTW-1:0]   tex_req_width,
    input wire [MAXHTW-1:0]   tex_req_height,
    input wire [MAXFTW-1:0]   tex_req_format,
    input wire [MAXFMW-1:0]   tex_req_filter,
    input wire [MAXAMW-1:0]   tex_req_clamp,
    input wire [TAGW-1:0]     tex_req_tag,
    output wire               tex_req_ready,

    // Texture Response
    output wire               tex_rsp_valid,
    output wire [TAGW-1:0]    tex_rsp_tag,
    // NOTE(review): declared as an input although it sits with the response
    // outputs (tex_rsp_valid / tex_rsp_tag) - confirm the intended direction.
    input wire [DATAW-1:0]    tex_rsp_data,
    input wire                tex_rsp_ready,

    // Cache Request
    output wire [NUMCRQS-1:0]             cache_req_valids,
    output wire [NUMCRQS-1:0][MADDRW-1:0] cache_req_addrs,
    input wire                            cache_req_ready,

    // Cache Response
    input wire                cache_rsp_valid,
    input wire [MADDRW-1:0]   cache_rsp_addr,
    input wire [DATAW-1:0]    cache_rsp_data,
    output wire               cache_rsp_ready
);
endmodule