Updated README and synthesis scripts

This commit is contained in:
Blaise Tine 2021-09-22 07:50:47 -07:00
parent feca2db24e
commit 9b04f3d9d6
11 changed files with 134 additions and 62 deletions

View file

@ -8,7 +8,11 @@ Vortex is a full-system RISCV-based GPGPU processor.
## Specifications
- Support RISC-V RV32IMF ISA
- Scalability: 1 to 32 cores with optional L2 and L3 caches
- Performance:
  - 1024 total threads running at 250 MHz
  - 128 GFLOPS of compute throughput
  - 16 GB/s of memory bandwidth
- Scalability: up to 64 cores with optional L2 and L3 caches
- Software: OpenCL 1.2 Support
- Supported FPGAs:
  - Intel Arria 10

View file

@ -431,7 +431,7 @@ module VX_bank #(
VX_elastic_buffer #(
.DATAW (NUM_PORTS * (CORE_TAG_WIDTH + 1 + `WORD_WIDTH + `REQS_BITS)),
.SIZE (CRSQ_SIZE),
.OUT_REG (1 == NUM_BANKS)
.OUT_REG (1)
) core_rsp_req (
.clk (clk),
.reset (reset),
@ -470,7 +470,8 @@ module VX_bank #(
VX_fifo_queue #(
.DATAW (1 + `LINE_ADDR_WIDTH + MSHR_ADDR_WIDTH + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)),
.SIZE (MREQ_SIZE),
.ALM_FULL (MREQ_SIZE-2)
.ALM_FULL (MREQ_SIZE-2),
.OUT_REG (1 == NUM_BANKS)
) mem_req_queue (
.clk (clk),
.reset (reset),

View file

@ -105,6 +105,71 @@ module VX_cache #(
wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank;
`endif
///////////////////////////////////////////////////////////////////////////
// Memory request output stage.
// The "_sb" wires are the buffer-side copy of the memory request interface:
// they feed the skid buffer below, and the buffer's outputs drive the
// mem_req_* signals.
wire mem_req_valid_sb;
wire mem_req_rw_sb;
wire [CACHE_LINE_SIZE-1:0] mem_req_byteen_sb;
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_sb;
wire [`CACHE_LINE_WIDTH-1:0] mem_req_data_sb;
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_sb;
wire mem_req_ready_sb;
// DATAW is the sum of the widths of the packed payload fields, in the same
// order as the data_in/data_out concatenations: {rw, byteen, addr, data, tag}.
// PASSTHRU is set when NUM_BANKS == 1.
// NOTE(review): presumably PASSTHRU bypasses the buffer storage; confirm
// against the VX_skid_buffer implementation.
VX_skid_buffer #(
.DATAW (1+CACHE_LINE_SIZE+`MEM_ADDR_WIDTH+`CACHE_LINE_WIDTH+MEM_TAG_WIDTH),
.PASSTHRU (1 == NUM_BANKS)
) mem_req_sbuf (
.clk (clk),
.reset (reset),
.valid_in (mem_req_valid_sb),
.ready_in (mem_req_ready_sb),
.data_in ({mem_req_rw_sb, mem_req_byteen_sb, mem_req_addr_sb, mem_req_data_sb, mem_req_tag_sb}),
.data_out ({mem_req_rw, mem_req_byteen, mem_req_addr, mem_req_data, mem_req_tag}),
.valid_out (mem_req_valid),
.ready_out (mem_req_ready)
);
///////////////////////////////////////////////////////////////////////////
// Core response output stage.
// The "_sb" wires are the buffer-side copy of the core response interface:
// they feed the skid buffer(s) below, and the buffer outputs drive the
// core_rsp_* signals.
wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_sb;
wire [NUM_REQS-1:0] core_rsp_tmask_sb;
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_sb;
wire [`CORE_RSP_TAGS-1:0][CORE_TAG_WIDTH-1:0] core_rsp_tag_sb;
wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_sb;
// Two buffering layouts, selected at elaboration time by CORE_TAG_ID_BITS.
// In both branches PASSTHRU is set when NUM_BANKS == 1.
if (CORE_TAG_ID_BITS != 0) begin
// Single buffer for the whole response: the payload packs the full thread
// mask, all NUM_REQS data words and one tag.
// NOTE(review): DATAW counts a single CORE_TAG_WIDTH tag, so this branch
// assumes the CORE_RSP_TAGS macro evaluates to 1 here; confirm against the
// macro definition.
VX_skid_buffer #(
.DATAW (NUM_REQS + NUM_REQS*`WORD_WIDTH + CORE_TAG_WIDTH),
.PASSTHRU (1 == NUM_BANKS)
) core_rsp_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_sb),
.ready_in (core_rsp_ready_sb),
.data_in ({core_rsp_tmask_sb, core_rsp_data_sb, core_rsp_tag_sb}),
.data_out ({core_rsp_tmask, core_rsp_data, core_rsp_tag}),
.valid_out (core_rsp_valid),
.ready_out (core_rsp_ready)
);
end else begin
// One independent buffer per request lane: each payload packs that lane's
// mask bit, data word and tag, and each lane has its own valid/ready pair.
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (1 + `WORD_WIDTH + CORE_TAG_WIDTH),
.PASSTHRU (1 == NUM_BANKS)
) core_rsp_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_sb[i]),
.ready_in (core_rsp_ready_sb[i]),
.data_in ({core_rsp_tmask_sb[i], core_rsp_data_sb[i], core_rsp_tag_sb[i]}),
.data_out ({core_rsp_tmask[i], core_rsp_data[i], core_rsp_tag[i]}),
.valid_out (core_rsp_valid[i]),
.ready_out (core_rsp_ready[i])
);
end
end
///////////////////////////////////////////////////////////////////////////
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_p;
@ -129,15 +194,15 @@ module VX_cache #(
end
end
assign mem_req_rw = mem_req_rw_p;
assign mem_req_byteen = mem_req_byteen_r;
assign mem_req_data = mem_req_data_r;
assign mem_req_rw_sb = mem_req_rw_p;
assign mem_req_byteen_sb = mem_req_byteen_r;
assign mem_req_data_sb = mem_req_data_r;
end else begin
`UNUSED_VAR (mem_req_pmask_p)
`UNUSED_VAR (mem_req_wsel_p)
assign mem_req_rw = mem_req_rw_p;
assign mem_req_byteen = mem_req_byteen_p;
assign mem_req_data = mem_req_data_p;
assign mem_req_rw_sb = mem_req_rw_p;
assign mem_req_byteen_sb = mem_req_byteen_p;
assign mem_req_data_sb = mem_req_data_p;
end
end else begin
`UNUSED_VAR (mem_req_byteen_p)
@ -146,13 +211,11 @@ module VX_cache #(
`UNUSED_VAR (mem_req_data_p)
`UNUSED_VAR (mem_req_rw_p)
assign mem_req_rw = 0;
assign mem_req_byteen = 'x;
assign mem_req_data = 'x;
assign mem_req_rw_sb = 0;
assign mem_req_byteen_sb = 'x;
assign mem_req_data_sb = 'x;
end
///////////////////////////////////////////////////////////////////////////
// Core request
@ -233,11 +296,11 @@ module VX_cache #(
.core_rsp_ready_in (core_rsp_ready_nc),
// Core response out
.core_rsp_valid_out (core_rsp_valid),
.core_rsp_tmask_out (core_rsp_tmask),
.core_rsp_data_out (core_rsp_data),
.core_rsp_tag_out (core_rsp_tag),
.core_rsp_ready_out (core_rsp_ready),
.core_rsp_valid_out (core_rsp_valid_sb),
.core_rsp_tmask_out (core_rsp_tmask_sb),
.core_rsp_data_out (core_rsp_data_sb),
.core_rsp_tag_out (core_rsp_tag_sb),
.core_rsp_ready_out (core_rsp_ready_sb),
// Memory request in
.mem_req_valid_in (mem_req_valid_nc),
@ -251,15 +314,15 @@ module VX_cache #(
.mem_req_ready_in (mem_req_ready_nc),
// Memory request out
.mem_req_valid_out (mem_req_valid),
.mem_req_addr_out (mem_req_addr),
.mem_req_valid_out (mem_req_valid_sb),
.mem_req_addr_out (mem_req_addr_sb),
.mem_req_rw_out (mem_req_rw_p),
.mem_req_pmask_out (mem_req_pmask_p),
.mem_req_byteen_out (mem_req_byteen_p),
.mem_req_wsel_out (mem_req_wsel_p),
.mem_req_data_out (mem_req_data_p),
.mem_req_tag_out (mem_req_tag),
.mem_req_ready_out (mem_req_ready),
.mem_req_tag_out (mem_req_tag_sb),
.mem_req_ready_out (mem_req_ready_sb),
// Memory response in
.mem_rsp_valid_in (mem_rsp_valid),
@ -282,21 +345,21 @@ module VX_cache #(
assign core_req_tag_nc = core_req_tag;
assign core_req_ready = core_req_ready_nc;
assign core_rsp_valid = core_rsp_valid_nc;
assign core_rsp_tmask = core_rsp_tmask_nc;
assign core_rsp_data = core_rsp_data_nc;
assign core_rsp_tag = core_rsp_tag_nc;
assign core_rsp_ready_nc = core_rsp_ready;
assign core_rsp_valid_sb = core_rsp_valid_nc;
assign core_rsp_tmask_sb = core_rsp_tmask_nc;
assign core_rsp_data_sb = core_rsp_data_nc;
assign core_rsp_tag_sb = core_rsp_tag_nc;
assign core_rsp_ready_nc = core_rsp_ready_sb;
assign mem_req_valid = mem_req_valid_nc;
assign mem_req_addr = mem_req_addr_nc;
assign mem_req_valid_sb = mem_req_valid_nc;
assign mem_req_addr_sb = mem_req_addr_nc;
assign mem_req_rw_p = mem_req_rw_nc;
assign mem_req_pmask_p = mem_req_pmask_nc;
assign mem_req_byteen_p = mem_req_byteen_nc;
assign mem_req_wsel_p = mem_req_wsel_nc;
assign mem_req_data_p = mem_req_data_nc;
assign mem_req_tag = mem_req_tag_nc;
assign mem_req_ready_nc = mem_req_ready;
assign mem_req_tag_sb = mem_req_tag_nc;
assign mem_req_ready_nc = mem_req_ready_sb;
assign mem_rsp_valid_nc = mem_rsp_valid;
assign mem_rsp_data_nc = mem_rsp_data;
@ -617,7 +680,6 @@ module VX_cache #(
VX_stream_arbiter #(
.NUM_REQS (NUM_BANKS),
.DATAW (`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH)),
.BUFFERED (1),
.TYPE ("R")
) mem_req_arb (
.clk (clk),

View file

@ -14,7 +14,9 @@ module VX_core_rsp_merge #(
// core request tag size
parameter CORE_TAG_WIDTH = 1,
// size of tag id in core request tag
parameter CORE_TAG_ID_BITS = 0
parameter CORE_TAG_ID_BITS = 0,
// output register
parameter OUT_REG = 0
) (
input wire clk,
input wire reset,
@ -151,8 +153,9 @@ module VX_core_rsp_merge #(
wire core_rsp_valid_any = (| per_bank_core_rsp_valid);
VX_skid_buffer #(
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH))
) skid_buf (
.DATAW (NUM_REQS + CORE_TAG_WIDTH + (NUM_REQS *`WORD_WIDTH)),
.PASSTHRU (0 == OUT_REG)
) out_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_any),
@ -259,8 +262,9 @@ module VX_core_rsp_merge #(
for (genvar i = 0; i < NUM_REQS; i++) begin
VX_skid_buffer #(
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH)
) skid_buf (
.DATAW (CORE_TAG_WIDTH + `WORD_WIDTH),
.PASSTHRU (0 == OUT_REG)
) out_sbuf (
.clk (clk),
.reset (reset),
.valid_in (core_rsp_valid_unqual[i]),

View file

@ -14,7 +14,9 @@ module VX_scan #(
);
`IGNORE_WARNINGS_BEGIN
wire [$clog2(N):0][N-1:0] t;
localparam LOGN = $clog2(N);
wire [LOGN:0][N-1:0] t;
// reverses bits
if (REVERSE) begin
@ -25,15 +27,15 @@ module VX_scan #(
// optimize for the common case of small and-scans
if ((N == 2) && (OP == 1)) begin
assign t[$clog2(N)] = {t[0][1], &t[0][1:0]};
assign t[LOGN] = {t[0][1], &t[0][1:0]};
end else if ((N == 3) && (OP == 1)) begin
assign t[$clog2(N)] = {t[0][2], &t[0][2:1], &t[0][2:0]};
assign t[LOGN] = {t[0][2], &t[0][2:1], &t[0][2:0]};
end else if ((N == 4) && (OP == 1)) begin
assign t[$clog2(N)] = {t[0][3], &t[0][3:2], &t[0][3:1], &t[0][3:0]};
assign t[LOGN] = {t[0][3], &t[0][3:2], &t[0][3:1], &t[0][3:0]};
end else begin
// general case
wire [N-1:0] fill;
for (genvar i = 0; i < $clog2(N); i++) begin
for (genvar i = 0; i < LOGN; i++) begin
wire [N-1:0] shifted = N'({fill, t[i]} >> (1<<i));
if (OP == 0) begin
assign fill = {N{1'b0}};
@ -50,10 +52,10 @@ module VX_scan #(
// reverse bits
if (REVERSE) begin
assign data_out = t[$clog2(N)];
assign data_out = t[LOGN];
end else begin
for (genvar i = 0; i < N; i++) begin
assign data_out[i] = t[$clog2(N)][N-1-i];
assign data_out[i] = t[LOGN][N-1-i];
end
end

View file

@ -26,11 +26,11 @@ DBG_FLAGS += -DDBG_CACHE_REQ_INFO
CONFIG1 := -DNUM_CLUSTERS=1 -DNUM_CORES=1 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG2 := -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG4 := -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG8 := -DNUM_CLUSTERS=2 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG16 := -DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG32 := -DNUM_CLUSTERS=8 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
CONFIG4 := -DNUM_CLUSTERS=1 -DNUM_CORES=4 -DL2_ENABLE=1 -DL3_ENABLE=0 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL2CACHE_SIZE=131072 $(CONFIGS)
CONFIG8 := -DNUM_CLUSTERS=1 -DNUM_CORES=8 -DL2_ENABLE=1 -DL3_ENABLE=0 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL2CACHE_SIZE=131072 $(CONFIGS)
CONFIG16 := -DNUM_CLUSTERS=4 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3CACHE_SIZE=262144 $(CONFIGS)
CONFIG32 := -DNUM_CLUSTERS=4 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3CACHE_SIZE=262144 $(CONFIGS)
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=1 -DICACHE_SIZE=8192 -DDCACHE_SIZE=8192 -DL3CACHE_SIZE=524288 $(CONFIGS)
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu

View file

@ -58,7 +58,7 @@ smart.log: $(PROJECT_FILES)
# Project initialization
$(PROJECT_FILES):
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=4"
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=4" -set "L2_ENABLE=0" -set "L3_ENABLE=1" -set "ICACHE_SIZE=8192" -set "DCACHE_SIZE=8192" -set "L3CACHE_SIZE=262144"
syn.chg:
$(STAMP) syn.chg

View file

@ -3,13 +3,13 @@ TOP_LEVEL_ENTITY = vortex_afu
SRC_FILE = vortex_afu.sv
RTL_DIR = ../../../../rtl
#FAMILY = "Arria 10"
#DEVICE = 10AX115N3F40E2SG
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
FAMILY = "Arria 10"
DEVICE = 10AX115N3F40E2SG
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
FAMILY = "Stratix 10"
DEVICE = 1SX280HN2F43E2VG
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
#FAMILY = "Stratix 10"
#DEVICE = 1SX280HN2F43E2VG
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
@ -58,7 +58,7 @@ smart.log: $(PROJECT_FILES)
# Project initialization
$(PROJECT_FILES):
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=8"
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8" -set "NUM_CLUSTERS=4" -set "L2_ENABLE=0" -set "L3_ENABLE=1" -set "ICACHE_SIZE=8192" -set "DCACHE_SIZE=8192" -set "L3CACHE_SIZE=262144"
syn.chg:
$(STAMP) syn.chg

View file

@ -58,8 +58,7 @@ smart.log: $(PROJECT_FILES)
# Project initialization
$(PROJECT_FILES):
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4"
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=1" -set "L2_ENABLE=1" -set "L3_ENABLE=0" -set "ICACHE_SIZE=8192" -set "DCACHE_SIZE=8192" -set "L2CACHE_SIZE=65536"
syn.chg:
$(STAMP) syn.chg

View file

@ -58,7 +58,7 @@ smart.log: $(PROJECT_FILES)
# Project initialization
$(PROJECT_FILES):
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8" -set "NUM_CLUSTERS=8"
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8" -set "NUM_CLUSTERS=8" -set "L2_ENABLE=0" -set "L3_ENABLE=1" -set "ICACHE_SIZE=8192" -set "DCACHE_SIZE=8192" -set "L3CACHE_SIZE=524288"
syn.chg:
$(STAMP) syn.chg

View file

@ -58,7 +58,7 @@ smart.log: $(PROJECT_FILES)
# Project initialization
$(PROJECT_FILES):
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=4" -set "NUM_CLUSTERS=2"
quartus_sh -t ../../project.tcl -project $(PROJECT) -family $(FAMILY) -device $(DEVICE) -top $(TOP_LEVEL_ENTITY) -src "$(SRC_FILE)" -sdc ../../project.sdc -inc "$(RTL_INCLUDE)" -set "NOPAE" -set "NUM_CORES=8" -set "NUM_CLUSTERS=1" -set "L2_ENABLE=1" -set "L3_ENABLE=0" -set "ICACHE_SIZE=8192" -set "DCACHE_SIZE=8192" -set "L2CACHE_SIZE=131072"
syn.chg:
$(STAMP) syn.chg