RTL code refactoring

This commit is contained in:
Blaise Tine 2020-04-20 06:47:24 -04:00
parent 5671b08a5e
commit 07135263f5
22 changed files with 334 additions and 474 deletions

View file

@ -34,7 +34,7 @@ gen-singlecore-t: build_config
verilator $(VF) -cc $(SINGLE_CORE) -CFLAGS '$(CF) -DNDEBUG -O2' --threads $(THREADS)
gen-singlecore-d: build_config
verilator $(VF) -cc $(SINGLE_CORE) -CFLAGS '$(CF) -DVCD_OUTPUT' $(DBG)
verilator $(VF) -cc $(SINGLE_CORE) -CFLAGS '$(CF) -g -O0 -DVCD_OUTPUT' $(DBG)
gen-multicore: build_config
verilator $(VF) -DNDEBUG -cc $(MULTI_CORE) -CFLAGS '$(CF) -DNDEBUG -DUSE_MULTICORE'
@ -43,7 +43,7 @@ gen-multicore-t: build_config
verilator $(VF) -DNDEBUG -cc $(MULTI_CORE) -CFLAGS '$(CF) -DNDEBUG -O2 -DUSE_MULTICORE' --threads $(THREADS)
gen-multicore-d: build_config
verilator $(VF) -cc $(MULTI_CORE) -CFLAGS '$(CF) -DVCD_OUTPUT -DUSE_MULTICORE' $(DBG)
verilator $(VF) -cc $(MULTI_CORE) -CFLAGS '$(CF) -g -O0 -DVCD_OUTPUT -DUSE_MULTICORE' $(DBG)
singlecore: gen-singlecore
(cd obj_dir && make -j -f VVortex.mk)

View file

@ -347,15 +347,15 @@ logic vortex_enabled;
always_comb
begin
vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state);
vx_dram_req_full = !vortex_enabled || avs_waitrequest || avs_raq_full || avs_rdq_full;
vortex_enabled = (STATE_RUN == state) || (STATE_CLFLUSH == state);
vx_dram_req_ready = vortex_enabled && !avs_waitrequest && !avs_raq_full && !avs_rdq_full;
end
// Vortex DRAM fill response
always_comb
begin
vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready;
vx_dram_rsp_valid = vortex_enabled && !avs_rdq_empty && vx_dram_rsp_ready;
vx_dram_rsp_addr = (avs_raq_dout << 6);
{>>{vx_dram_rsp_data}} = avs_rdq_dout;
end
@ -531,7 +531,7 @@ begin
if ((STATE_CLFLUSH == state)
&& vx_snoop_ctr < csr_data_size
&& !vx_snp_req_full)
&& vx_snp_req_ready)
begin
vx_snp_req_addr <= (csr_mem_addr + vx_snoop_ctr) << 6;
vx_snp_req <= 1;
@ -556,7 +556,7 @@ Vortex_Socket #() vx_socket (
.dram_req_read (vx_dram_req_read),
.dram_req_addr (vx_dram_req_addr),
.dram_req_data (vx_dram_req_data),
.dram_req_full (vx_dram_req_full),
.dram_req_ready (vx_dram_req_ready),
// DRAM Rsp
.out_dram_rsp_ready (vx_dram_rsp_ready),
@ -567,7 +567,7 @@ Vortex_Socket #() vx_socket (
// Cache Snooping Req
.llc_snp_req_valid (vx_snp_req),
.llc_snp_req_addr (vx_snp_req_addr),
.llc_snp_req_full (vx_snp_req_full),
.llc_snp_req_ready (vx_snp_req_ready),
// program exit signal
.out_ebreak (vx_ebreak)

View file

@ -32,53 +32,53 @@ assign writeback_if.wb_pc = writeback_temp_if.wb_pc;
// assign VX_writeback_if(writeback_temp_if);
wire no_slot_mem;
wire no_slot_exec;
wire no_slot_mem;
wire no_slot_exec;
// LSU input + output
VX_lsu_req_if lsu_req_if();
VX_inst_mem_wb_if mem_wb_if();
VX_lsu_req_if lsu_req_if();
VX_inst_mem_wb_if mem_wb_if();
// Exec unit input + output
VX_exec_unit_req_if exec_unit_req_if();
VX_inst_exec_wb_if inst_exec_wb_if();
VX_exec_unit_req_if exec_unit_req_if();
VX_inst_exec_wb_if inst_exec_wb_if();
// GPU unit input
VX_gpu_inst_req_if gpu_inst_req_if();
VX_gpu_inst_req_if gpu_inst_req_if();
// CSR unit inputs
VX_csr_req_if csr_req_if();
VX_csr_wb_if csr_wb_if();
wire no_slot_csr;
wire stall_gpr_csr;
VX_csr_req_if csr_req_if();
VX_csr_wb_if csr_wb_if();
wire no_slot_csr;
wire stall_gpr_csr;
VX_gpr_stage gpr_stage (
.clk (clk),
.reset (reset),
.schedule_delay (schedule_delay),
.writeback_if (writeback_temp_if),
.bckE_req_if (bckE_req_if),
.clk (clk),
.reset (reset),
.schedule_delay (schedule_delay),
.writeback_if (writeback_temp_if),
.bckE_req_if (bckE_req_if),
// New
.exec_unit_req_if(exec_unit_req_if),
.lsu_req_if (lsu_req_if),
.gpu_inst_req_if (gpu_inst_req_if),
.csr_req_if (csr_req_if),
.stall_gpr_csr (stall_gpr_csr),
.exec_unit_req_if (exec_unit_req_if),
.lsu_req_if (lsu_req_if),
.gpu_inst_req_if (gpu_inst_req_if),
.csr_req_if (csr_req_if),
.stall_gpr_csr (stall_gpr_csr),
// End new
.memory_delay (out_mem_delay),
.exec_delay (out_exec_delay),
.gpr_stage_delay (gpr_stage_delay)
.memory_delay (out_mem_delay),
.exec_delay (out_exec_delay),
.gpr_stage_delay (gpr_stage_delay)
);
VX_lsu load_store_unit (
.clk (clk),
.reset (reset),
.lsu_req_if (lsu_req_if),
.mem_wb_if (mem_wb_if),
.dcache_rsp_if(dcache_rsp_if),
.dcache_req_if(dcache_req_if),
.out_delay (out_mem_delay),
.no_slot_mem (no_slot_mem)
.clk (clk),
.reset (reset),
.lsu_req_if (lsu_req_if),
.mem_wb_if (mem_wb_if),
.dcache_rsp_if (dcache_rsp_if),
.dcache_req_if (dcache_req_if),
.out_delay (out_mem_delay),
.no_slot_mem (no_slot_mem)
);
VX_execute_unit execUnit (
@ -97,11 +97,6 @@ VX_gpgpu_inst gpgpu_inst (
.warp_ctl_if (warp_ctl_if)
);
// VX_csr_wrapper csr_wrapper(
// .csr_req_if(csr_req_if),
// .csr_wb_if (csr_wb_if)
// );
VX_csr_pipe #(
.CORE_ID(CORE_ID)
) csr_pipe (

View file

@ -23,8 +23,6 @@
`define NUM_BARRIERS 4
`endif
// `define SINGLE_CORE_BENCH
`ifndef GLOBAL_BLOCK_SIZE_BYTES
`define GLOBAL_BLOCK_SIZE_BYTES 16
`endif

View file

@ -11,16 +11,15 @@ module VX_decode(
VX_join_if join_if,
output wire terminate_sim
);
wire[31:0] in_instruction = fd_inst_meta_de.instruction;
wire[31:0] in_curr_PC = fd_inst_meta_de.inst_pc;
wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num;
wire[31:0] in_instruction = fd_inst_meta_de.instruction;
wire[31:0] in_curr_PC = fd_inst_meta_de.inst_pc;
wire[`NW_BITS-1:0] in_warp_num = fd_inst_meta_de.warp_num;
assign frE_to_bckE_req_if.curr_PC = in_curr_PC;
assign frE_to_bckE_req_if.curr_PC = in_curr_PC;
wire[`NUM_THREADS-1:0] in_valid = fd_inst_meta_de.valid;
wire[`NUM_THREADS-1:0] in_valid = fd_inst_meta_de.valid;
wire[6:0] curr_opcode;
@ -122,28 +121,22 @@ module VX_decode(
assign is_split = is_gpgpu && (func3 == 2); // Goes to BE
assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE
assign join_if.is_join = is_join;
assign join_if.join_warp_num = in_warp_num;
assign frE_to_bckE_req_if.is_wspawn = is_wspawn;
assign frE_to_bckE_req_if.is_tmc = is_tmc;
assign frE_to_bckE_req_if.is_split = is_split;
assign frE_to_bckE_req_if.is_barrier = is_barrier;
assign frE_to_bckE_req_if.csr_immed = is_csr_immed;
assign frE_to_bckE_req_if.is_csr = is_csr;
assign frE_to_bckE_req_if.wb = (is_jal || is_jalr || is_e_inst) ? `WB_JAL :
is_linst ? `WB_MEM :
(is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU :
`NO_WB;
assign frE_to_bckE_req_if.rs2_src = (is_itype || is_stype) ? `RS2_IMMED : `RS2_REG;
// MEM signals
@ -161,7 +154,6 @@ module VX_decode(
assign frE_to_bckE_req_if.upper_immed = temp_upper_immed;
assign jal_b_19_to_12 = in_instruction[19:12];
assign jal_b_11 = in_instruction[20];
assign jal_b_10_to_1 = in_instruction[30:21];
@ -170,11 +162,9 @@ module VX_decode(
assign jal_unsigned_offset = {jal_b_20, jal_b_19_to_12, jal_b_11, jal_b_10_to_1, jal_b_0};
assign jal_1_offset = {{11{jal_b_20}}, jal_unsigned_offset};
assign jalr_immed = {func7, frE_to_bckE_req_if.rs2};
assign jal_2_offset = {{20{jalr_immed[11]}}, jalr_immed};
assign jal_sys_cond1 = func3 == 3'h0;
assign jal_sys_cond2 = u_12 < 12'h2;
@ -214,13 +204,11 @@ module VX_decode(
// wire is_ebreak;
// assign is_ebreak = is_e_inst;
wire ebreak = (curr_opcode == `SYS_INST) && (jal_sys_jal && (|in_valid));
assign frE_to_bckE_req_if.ebreak = ebreak;
assign terminate_sim = is_e_inst;
// CSR
assign csr_cond1 = func3 != 3'h0;
@ -228,13 +216,11 @@ module VX_decode(
assign frE_to_bckE_req_if.csr_address = (csr_cond1 && csr_cond2) ? u_12 : 12'h55;
// ITYPE IMMED
assign alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5);
assign alu_shift_i_immed = {{7{1'b0}}, frE_to_bckE_req_if.rs2};
assign alu_tempp = alu_shift_i ? alu_shift_i_immed : u_12;
always @(*) begin
case(curr_opcode)
`ALU_INST: temp_itype_immed = {{20{alu_tempp[11]}}, alu_tempp};
@ -331,11 +317,11 @@ module VX_decode(
wire[4:0] temp_final_alu;
assign temp_final_alu = is_btype ? ((frE_to_bckE_req_if.branch_type < `BLTU) ? `SUB : `SUBU) :
is_lui ? `LUI_ALU :
is_auipc ? `AUIPC_ALU :
is_csr ? csr_alu :
(is_stype || is_linst) ? `ADD :
alu_op;
is_lui ? `LUI_ALU :
is_auipc ? `AUIPC_ALU :
is_csr ? csr_alu :
(is_stype || is_linst) ? `ADD :
alu_op;
assign frE_to_bckE_req_if.alu_op = ((func7[0] == 1'b1) && is_rtype) ? mul_alu : temp_final_alu;

View file

@ -135,6 +135,9 @@
`define ZERO_REG 5'h0
// IO BUS
`define IO_BUS_ADDR 32'h00010000
// ======================= Dcache Configurable Knobs ==========================
// Function ID

View file

@ -23,28 +23,29 @@ module VX_dmem_controller (
VX_gpu_dcache_req_if icache_req_if
);
VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_smem_if();
VX_gpu_dcache_req_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_req_smem_if();
VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_dcache_if();
VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_smem_if();
VX_gpu_dcache_req_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_req_dcache_if();
VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`DNUM_REQUESTS)) dcache_rsp_dcache_if();
wire to_shm = dcache_req_if.core_req_addr[0][31:24] == 8'hFF;
wire dcache_wants_wb = (|dcache_rsp_dcache_if.core_wb_valid);
// Dcache Request
assign dcache_req_dcache_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{~to_shm}};
assign dcache_req_dcache_if.core_req_addr = dcache_req_if.core_req_addr;
assign dcache_req_dcache_if.core_req_writedata = dcache_req_if.core_req_writedata;
assign dcache_req_dcache_if.core_req_mem_read = dcache_req_if.core_req_mem_read;
assign dcache_req_dcache_if.core_req_mem_write = dcache_req_if.core_req_mem_write;
assign dcache_req_dcache_if.core_req_rd = dcache_req_if.core_req_rd;
assign dcache_req_dcache_if.core_req_addr = dcache_req_if.core_req_addr;
assign dcache_req_dcache_if.core_req_writedata = dcache_req_if.core_req_writedata;
assign dcache_req_dcache_if.core_req_rd = dcache_req_if.core_req_rd;
assign dcache_req_dcache_if.core_req_wb = dcache_req_if.core_req_wb;
assign dcache_req_dcache_if.core_req_warp_num = dcache_req_if.core_req_warp_num;
assign dcache_req_dcache_if.core_req_pc = dcache_req_if.core_req_pc;
assign dcache_req_dcache_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot;
// Shred Memory Request
assign dcache_req_dcache_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot;
// Shared Memory Request
assign dcache_req_smem_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{to_shm}};
assign dcache_req_smem_if.core_req_addr = dcache_req_if.core_req_addr;
assign dcache_req_smem_if.core_req_writedata = dcache_req_if.core_req_writedata;
@ -54,17 +55,18 @@ module VX_dmem_controller (
assign dcache_req_smem_if.core_req_wb = dcache_req_if.core_req_wb;
assign dcache_req_smem_if.core_req_warp_num = dcache_req_if.core_req_warp_num;
assign dcache_req_smem_if.core_req_pc = dcache_req_if.core_req_pc;
assign dcache_req_smem_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot || dcache_wants_wb;
// Dcache Response
assign dcache_req_smem_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot || dcache_wants_wb;
// Dcache Response
assign dcache_rsp_if.core_wb_valid = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_valid : dcache_rsp_smem_if.core_wb_valid;
assign dcache_rsp_if.core_wb_req_rd = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_req_rd : dcache_rsp_smem_if.core_wb_req_rd;
assign dcache_rsp_if.core_wb_req_wb = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_req_wb : dcache_rsp_smem_if.core_wb_req_wb;
assign dcache_rsp_if.core_wb_warp_num = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_warp_num : dcache_rsp_smem_if.core_wb_warp_num;
assign dcache_rsp_if.core_wb_pc = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_pc : dcache_rsp_smem_if.core_wb_pc;
assign dcache_rsp_if.core_wb_readdata = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_readdata : dcache_rsp_smem_if.core_wb_readdata;
assign dcache_rsp_if.core_wb_pc = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_pc : dcache_rsp_smem_if.core_wb_pc;
assign dcache_rsp_if.core_wb_warp_num = dcache_wants_wb ? dcache_rsp_dcache_if.core_wb_warp_num : dcache_rsp_smem_if.core_wb_warp_num;
assign dcache_rsp_if.delay_req = to_shm ? dcache_rsp_smem_if.delay_req : dcache_rsp_dcache_if.delay_req;
assign dcache_rsp_if.core_req_ready = to_shm ? dcache_rsp_smem_if.core_req_ready : dcache_rsp_dcache_if.core_req_ready;
VX_gpu_dcache_dram_req_if #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) gpu_smem_dram_req_if();
VX_gpu_dcache_dram_rsp_if #(.BANK_LINE_WORDS(`DBANK_LINE_WORDS)) gpu_smem_dram_res_if();
@ -105,8 +107,8 @@ module VX_dmem_controller (
.core_req_warp_num (dcache_req_smem_if.core_req_warp_num),
.core_req_pc (dcache_req_smem_if.core_req_pc),
// Delay Core Req
.delay_req (dcache_rsp_smem_if.delay_req),
// Can submit core Req
.core_req_ready (dcache_rsp_smem_if.core_req_ready),
// Core Cache Can't WB
.core_no_wb_slot (dcache_req_smem_if.core_no_wb_slot),
@ -135,7 +137,7 @@ module VX_dmem_controller (
.dram_req_write (gpu_smem_dram_req_if.dram_req_write),
.dram_req_addr (gpu_smem_dram_req_if.dram_req_addr),
.dram_req_data (gpu_smem_dram_req_if.dram_req_data),
.dram_req_full (1),
.dram_req_ready (0),
// Snoop Request
.snp_req_valid (0),
@ -188,8 +190,8 @@ module VX_dmem_controller (
.core_req_warp_num (dcache_req_dcache_if.core_req_warp_num),
.core_req_pc (dcache_req_dcache_if.core_req_pc),
// Delay Core Req
.delay_req (dcache_rsp_dcache_if.delay_req),
// Can submit core Req
.core_req_ready (dcache_rsp_dcache_if.core_req_ready),
// Core Cache Can't WB
.core_no_wb_slot (dcache_req_dcache_if.core_no_wb_slot),
@ -218,7 +220,7 @@ module VX_dmem_controller (
.dram_req_write (gpu_dcache_dram_req_if.dram_req_write),
.dram_req_addr (gpu_dcache_dram_req_if.dram_req_addr),
.dram_req_data (gpu_dcache_dram_req_if.dram_req_data),
.dram_req_full (gpu_dcache_dram_req_if.dram_req_full),
.dram_req_ready (gpu_dcache_dram_req_if.dram_req_ready),
// Snoop Request
.snp_req_valid (gpu_dcache_snp_req_if.snp_req_valid),
@ -269,8 +271,8 @@ module VX_dmem_controller (
.core_req_warp_num (icache_req_if.core_req_warp_num),
.core_req_pc (icache_req_if.core_req_pc),
// Delay Core Req
.delay_req (icache_rsp_if.delay_req),
// Can submit core Req
.core_req_ready (icache_rsp_if.core_req_ready),
// Core Cache Can't WB
.core_no_wb_slot (icache_req_if.core_no_wb_slot),
@ -299,7 +301,7 @@ module VX_dmem_controller (
.dram_req_write (gpu_icache_dram_req_if.dram_req_write),
.dram_req_addr (gpu_icache_dram_req_if.dram_req_addr),
.dram_req_data (gpu_icache_dram_req_if.dram_req_data),
.dram_req_full (gpu_icache_dram_req_if.dram_req_full),
.dram_req_ready (gpu_icache_dram_req_if.dram_req_ready),
// Snoop Request
.snp_req_valid (gpu_icache_snp_req_if.snp_req_valid),

View file

@ -1,22 +1,22 @@
`include "VX_define.vh"
module VX_front_end (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
input wire schedule_delay,
input wire schedule_delay,
VX_warp_ctl_if warp_ctl_if,
VX_warp_ctl_if warp_ctl_if,
VX_gpu_dcache_rsp_if icache_rsp_if,
VX_gpu_dcache_req_if icache_req_if,
VX_gpu_dcache_rsp_if icache_rsp_if,
VX_gpu_dcache_req_if icache_req_if,
VX_jal_response_if jal_rsp_if,
VX_branch_response_if branch_rsp_if,
VX_jal_response_if jal_rsp_if,
VX_branch_response_if branch_rsp_if,
VX_frE_to_bckE_req_if bckE_req_if,
VX_frE_to_bckE_req_if bckE_req_if,
output wire fetch_ebreak
output wire fetch_ebreak
);
VX_inst_meta_if fe_inst_meta_fi();
@ -35,16 +35,7 @@ module VX_front_end (
wire[`NW_BITS-1:0] icache_stage_wid;
wire[`NUM_THREADS-1:0] icache_stage_valids;
reg old_ebreak; // This should be eventually removed
always @(posedge clk) begin
if (reset) begin
old_ebreak <= 0;
end else begin
old_ebreak <= old_ebreak || fetch_ebreak;
end
end
assign fetch_ebreak = vortex_ebreak || terminate_sim || old_ebreak;
assign fetch_ebreak = vortex_ebreak || terminate_sim;
VX_wstall_if wstall_if();
VX_join_if join_if();

View file

@ -39,7 +39,7 @@ module VX_icache_stage (
assign icache_stage_valids = fe_inst_meta_id.valid & {`NUM_THREADS{!icache_stage_delay}};
// Cache can't accept request
assign icache_stage_delay = icache_rsp_if.delay_req;
assign icache_stage_delay = ~icache_rsp_if.core_req_ready;
// Core can't accept response
assign icache_req_if.core_no_wb_slot = total_freeze;

View file

@ -59,7 +59,7 @@ module VX_lsu (
assign dcache_req_if.core_no_wb_slot = no_slot_mem;
// Cache can't accept request
assign out_delay = dcache_rsp_if.delay_req;
assign out_delay = ~dcache_rsp_if.core_req_ready;
// Core Response
assign mem_wb_if.rd = dcache_rsp_if.core_wb_req_rd;

View file

@ -1,26 +1,23 @@
`include "VX_define.vh"
`include "VX_cache_config.vh"
module Vortex
#(
parameter CORE_ID = 0
) (
`ifdef SINGLE_CORE_BENCH
module Vortex #(
parameter CORE_ID = 0
) (
// Clock
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// IO
output wire io_valid,
output wire [31:0] io_data,
output wire io_valid,
output wire [31:0] io_data,
// DRAM Dcache Req
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
input wire dram_req_ready,
// DRAM Dcache Rsp
input wire dram_rsp_valid,
@ -33,7 +30,7 @@ module Vortex
output wire I_dram_req_write,
output wire [31:0] I_dram_req_addr,
output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data,
input wire I_dram_req_full,
input wire I_dram_req_ready,
// DRAM Icache Rsp
input wire I_dram_rsp_valid,
@ -42,52 +39,11 @@ module Vortex
output wire I_dram_rsp_ready,
// LLC Snooping
input wire snp_req_valid,
input wire [31:0] snp_req_addr,
output wire snp_req_full,
input wire llc_snp_req_valid,
input wire [31:0] llc_snp_req_addr,
output wire llc_snp_req_full,
output wire out_ebreak
`else
input wire clk,
input wire reset,
// IO
output wire io_valid,
output wire[31:0] io_data,
// DRAM Dcache Req
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
// DRAM Dcache Rsp
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`DBANK_LINE_SIZE-1:0] dram_rsp_data,
output wire dram_rsp_ready,
// DRAM Icache Req
output wire I_dram_req_read,
output wire I_dram_req_write,
output wire [31:0] I_dram_req_addr,
output wire [`IBANK_LINE_SIZE-1:0] I_dram_req_data,
input wire I_dram_req_full,
// DRAM Icache Rsp
output wire I_dram_rsp_ready,
input wire I_dram_rsp_valid,
input wire [31:0] I_dram_rsp_addr,
input wire [`IBANK_LINE_SIZE-1:0] I_dram_rsp_data,
input wire snp_req_valid,
input wire [31:0] snp_req_addr,
output wire snp_req_full,
output wire out_ebreak
`endif
);
`DEBUG_BEGIN
wire scheduler_empty;
@ -114,36 +70,37 @@ module Vortex
assign dram_req_addr = gpu_dcache_dram_req_if.dram_req_addr;
assign dram_rsp_ready = gpu_dcache_dram_req_if.dram_rsp_ready;
assign gpu_dcache_dram_req_if.dram_req_full = dram_req_full;
assign gpu_dcache_dram_req_if.dram_req_ready = dram_req_ready;
genvar i;
generate
for (i = 0; i < `DBANK_LINE_WORDS; i=i+1) begin
assign gpu_dcache_dram_res_if.dram_rsp_data[i] = dram_rsp_data[i * 32 +: 32];
assign dram_req_data[i * 32 +: 32] = gpu_dcache_dram_req_if.dram_req_data[i];
assign dram_req_data[i * 32 +: 32] = gpu_dcache_dram_req_if.dram_req_data[i];
end
endgenerate
wire temp_io_valid = (!memory_delay)
&& (|dcache_req_if.core_req_valid)
&& (dcache_req_if.core_req_mem_write[0] != `NO_MEM_WRITE)
&& (dcache_req_if.core_req_addr[0] == 32'h00010000);
&& (dcache_req_if.core_req_addr[0] == `IO_BUS_ADDR);
wire[31:0] temp_io_data = dcache_req_if.core_req_writedata[0];
assign io_valid = temp_io_valid;
assign io_data = temp_io_data;
wire [31:0] temp_io_data = dcache_req_if.core_req_writedata[0];
assign io_valid = temp_io_valid;
assign io_data = temp_io_data;
assign dcache_req_qual_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{~io_valid}};
assign dcache_req_qual_if.core_req_addr = dcache_req_if.core_req_addr;
assign dcache_req_qual_if.core_req_writedata = dcache_req_if.core_req_writedata;
assign dcache_req_qual_if.core_req_valid = dcache_req_if.core_req_valid & {`NUM_THREADS{~io_valid}};
assign dcache_req_qual_if.core_req_mem_read = dcache_req_if.core_req_mem_read;
assign dcache_req_qual_if.core_req_mem_write = dcache_req_if.core_req_mem_write;
assign dcache_req_qual_if.core_req_addr = dcache_req_if.core_req_addr;
assign dcache_req_qual_if.core_req_writedata = dcache_req_if.core_req_writedata;
assign dcache_req_qual_if.core_req_rd = dcache_req_if.core_req_rd;
assign dcache_req_qual_if.core_req_wb = dcache_req_if.core_req_wb;
assign dcache_req_qual_if.core_req_warp_num = dcache_req_if.core_req_warp_num;
assign dcache_req_qual_if.core_req_pc = dcache_req_if.core_req_pc;
assign dcache_req_qual_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot;
assign dcache_req_qual_if.core_no_wb_slot = dcache_req_if.core_no_wb_slot;
VX_gpu_dcache_rsp_if #(.NUM_REQUESTS(`INUM_REQUESTS)) icache_rsp_if();
VX_gpu_dcache_req_if #(.NUM_REQUESTS(`INUM_REQUESTS)) icache_req_if();
@ -158,7 +115,7 @@ module Vortex
assign I_dram_req_addr = gpu_icache_dram_req_if.dram_req_addr;
assign I_dram_rsp_ready = gpu_icache_dram_req_if.dram_rsp_ready;
assign gpu_icache_dram_req_if.dram_req_full = I_dram_req_full;
assign gpu_icache_dram_req_if.dram_req_ready = I_dram_req_ready;
genvar j;
generate
@ -168,42 +125,41 @@ module Vortex
end
endgenerate
/////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
// Front-end to Back-end
VX_frE_to_bckE_req_if bckE_req_if(); // New instruction request to EXE/MEM
VX_frE_to_bckE_req_if bckE_req_if(); // New instruction request to EXE/MEM
// Back-end to Front-end
VX_wb_if writeback_if(); // Writeback to GPRs
VX_branch_response_if branch_rsp_if(); // Branch Resolution to Fetch
VX_jal_response_if jal_rsp_if(); // Jump resolution to Fetch
// CSR Buses
// VX_csr_write_request_if csr_w_req_if();
VX_wb_if writeback_if(); // Writeback to GPRs
VX_branch_response_if branch_rsp_if(); // Branch Resolution to Fetch
VX_jal_response_if jal_rsp_if(); // Jump resolution to Fetch
// Warp controls
VX_warp_ctl_if warp_ctl_if();
// Cache snooping
VX_gpu_snp_req_rsp_if gpu_icache_snp_req_if();
VX_gpu_snp_req_rsp_if gpu_dcache_snp_req_if();
assign gpu_dcache_snp_req_if.snp_req_valid = llc_snp_req_valid;
assign gpu_dcache_snp_req_if.snp_req_addr = llc_snp_req_addr;
assign llc_snp_req_full = gpu_dcache_snp_req_if.snp_req_full;
assign gpu_dcache_snp_req_if.snp_req_valid = snp_req_valid;
assign gpu_dcache_snp_req_if.snp_req_addr = snp_req_addr;
assign snp_req_full = gpu_dcache_snp_req_if.snp_req_full;
VX_front_end front_end(
.clk (clk),
.reset (reset),
.warp_ctl_if (warp_ctl_if),
.bckE_req_if (bckE_req_if),
.schedule_delay (schedule_delay),
.icache_rsp_if (icache_rsp_if),
.icache_req_if (icache_req_if),
.jal_rsp_if (jal_rsp_if),
.branch_rsp_if (branch_rsp_if),
.fetch_ebreak (out_ebreak)
VX_front_end front_end (
.clk (clk),
.reset (reset),
.warp_ctl_if (warp_ctl_if),
.bckE_req_if (bckE_req_if),
.schedule_delay (schedule_delay),
.icache_rsp_if (icache_rsp_if),
.icache_req_if (icache_req_if),
.jal_rsp_if (jal_rsp_if),
.branch_rsp_if (branch_rsp_if),
.fetch_ebreak (out_ebreak)
);
VX_scheduler schedule(
.clk (clk),
VX_scheduler schedule (
.clk (clk),
.reset (reset),
.memory_delay (memory_delay),
.exec_delay (exec_delay),
@ -214,7 +170,9 @@ VX_scheduler schedule(
.is_empty (scheduler_empty)
);
VX_back_end #(.CORE_ID(CORE_ID)) back_end(
VX_back_end #(
.CORE_ID(CORE_ID)
) back_end (
.clk (clk),
.reset (reset),
.schedule_delay (schedule_delay),
@ -230,7 +188,7 @@ VX_back_end #(.CORE_ID(CORE_ID)) back_end(
.gpr_stage_delay (gpr_stage_delay)
);
VX_dmem_controller dmem_controller(
VX_dmem_controller dmem_controller (
.clk (clk),
.reset (reset),
@ -253,14 +211,6 @@ VX_dmem_controller dmem_controller(
.dcache_rsp_if (dcache_rsp_if)
);
// VX_csr_handler csr_handler(
// .clk (clk),
// .in_decode_csr_address(decode_csr_address),
// .csr_w_req_if (csr_w_req_if),
// .in_wb_valid (writeback_if.wb_valid[0]),
// .out_decode_csr_data (csr_decode_csr_data)
// );
endmodule // Vortex

View file

@ -1,14 +1,12 @@
`include "VX_define.vh"
`include "VX_cache_config.vh"
module Vortex_Cluster
#(
parameter CLUSTER_ID = 0
) (
module Vortex_Cluster #(
parameter CLUSTER_ID = 0
) (
// Clock
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// IO
output wire[`NUM_CORES_PER_CLUSTER-1:0] io_valid,
@ -19,7 +17,7 @@ module Vortex_Cluster
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
input wire dram_req_ready,
// DRAM Rsp
input wire dram_rsp_valid,
@ -28,11 +26,11 @@ module Vortex_Cluster
output wire dram_rsp_ready,
// LLC Snooping
input wire llc_snp_req_valid,
input wire[31:0] llc_snp_req_addr,
output wire llc_snp_req_full,
input wire llc_snp_req_valid,
input wire[31:0] llc_snp_req_addr,
output wire llc_snp_req_full,
output wire out_ebreak
output wire out_ebreak
);
// DRAM Dcache Req
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_dram_req_read;
@ -64,7 +62,7 @@ module Vortex_Cluster
wire[`NUM_CORES_PER_CLUSTER-1:0] per_core_io_valid;
wire[`NUM_CORES_PER_CLUSTER-1:0][31:0] per_core_io_data;
wire l2c_core_accept;
wire l2c_core_req_ready;
wire snp_fwd_valid;
wire[31:0] snp_fwd_addr;
@ -94,7 +92,7 @@ module Vortex_Cluster
.dram_req_write (per_core_dram_req_write [curr_core]),
.dram_req_addr (per_core_dram_req_addr [curr_core]),
.dram_req_data (curr_core_dram_req_data ),
.dram_req_full (l2c_core_accept ),
.dram_req_ready (l2c_core_req_ready ),
.dram_rsp_valid (per_core_dram_rsp_valid [curr_core]),
.dram_rsp_addr (per_core_dram_rsp_addr [curr_core]),
.dram_rsp_data (per_core_dram_rsp_data [curr_core]),
@ -103,14 +101,14 @@ module Vortex_Cluster
.I_dram_req_write (per_core_I_dram_req_write [curr_core]),
.I_dram_req_addr (per_core_I_dram_req_addr [curr_core]),
.I_dram_req_data (curr_core_I_dram_req_data ),
.I_dram_req_full (l2c_core_accept ),
.I_dram_req_ready (l2c_core_req_ready ),
.I_dram_rsp_valid (per_core_I_dram_rsp_valid [curr_core]),
.I_dram_rsp_addr (per_core_I_dram_rsp_addr [curr_core]),
.I_dram_rsp_data (per_core_I_dram_rsp_data [curr_core]),
.I_dram_rsp_ready (per_core_I_dram_rsp_ready [curr_core]),
.snp_req_valid (snp_fwd_valid),
.snp_req_addr (snp_fwd_addr),
.snp_req_full (snp_fwd_full [curr_core]),
.llc_snp_req_valid (snp_fwd_valid),
.llc_snp_req_addr (snp_fwd_addr),
.llc_snp_req_full (snp_fwd_full [curr_core]),
.out_ebreak (per_core_out_ebreak [curr_core])
);
@ -220,7 +218,7 @@ module Vortex_Cluster
.core_req_pc (0),
// L2 can't accept Core Request
.delay_req (l2c_core_accept),
.core_req_ready (l2c_core_req_ready),
// Core can't accept L2 Request
.core_no_wb_slot (|l2c_core_no_wb_slot),
@ -249,7 +247,7 @@ module Vortex_Cluster
.dram_req_write (dram_req_write),
.dram_req_addr (dram_req_addr),
.dram_req_data ({dram_req_data_port}),
.dram_req_full (dram_req_full),
.dram_req_ready (dram_req_ready),
// Snoop Request
.snp_req_valid (llc_snp_req_valid),

View file

@ -2,21 +2,20 @@
`include "VX_cache_config.vh"
module Vortex_Socket (
// Clock
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// IO
output wire io_valid[`NUM_CORES-1:0],
output wire[31:0] io_data [`NUM_CORES-1:0],
output wire io_valid[`NUM_CORES-1:0],
output wire[31:0] io_data [`NUM_CORES-1:0],
// DRAM Req
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`DBANK_LINE_SIZE-1:0] dram_req_data,
input wire dram_req_full,
input wire dram_req_ready,
// DRAM Rsp
input wire dram_rsp_valid,
@ -25,11 +24,11 @@ module Vortex_Socket (
output wire dram_rsp_ready,
// LLC Snooping
input wire llc_snp_req_valid,
input wire[31:0] llc_snp_req_addr,
output wire llc_snp_req_full,
input wire llc_snp_req_valid,
input wire[31:0] llc_snp_req_addr,
output wire llc_snp_req_full,
output wire out_ebreak
output wire out_ebreak
);
if (`NUM_CLUSTERS == 1) begin
@ -53,7 +52,7 @@ module Vortex_Socket (
.dram_req_write (dram_req_write),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_full (dram_req_full),
.dram_req_ready (dram_req_ready),
.dram_rsp_valid (dram_rsp_valid),
.dram_rsp_addr (dram_rsp_addr),
@ -85,7 +84,7 @@ module Vortex_Socket (
wire[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0][31:0] per_cluster_dram_req_data;
wire[31:0] per_cluster_dram_req_data_up[`NUM_CLUSTERS-1:0][`DBANK_LINE_WORDS-1:0];
wire l3c_core_req_full;
wire l3c_core_req_ready;
// DRAM Dcache Rsp
wire[`NUM_CLUSTERS-1:0] per_cluster_dram_rsp_ready;
@ -113,7 +112,9 @@ module Vortex_Socket (
genvar curr_cluster;
for (curr_cluster = 0; curr_cluster < `NUM_CLUSTERS; curr_cluster=curr_cluster+1) begin
Vortex_Cluster #(.CLUSTER_ID(curr_cluster)) Vortex_Cluster(
Vortex_Cluster #(
.CLUSTER_ID(curr_cluster)
) Vortex_Cluster(
.clk (clk),
.reset (reset),
.io_valid (per_cluster_io_valid [curr_cluster]),
@ -123,7 +124,7 @@ module Vortex_Socket (
.dram_req_read (per_cluster_dram_req_read [curr_cluster]),
.dram_req_addr (per_cluster_dram_req_addr [curr_cluster]),
.dram_req_data (per_cluster_dram_req_data_up [curr_cluster]),
.dram_req_full (l3c_core_req_full),
.dram_req_ready (l3c_core_req_ready),
.dram_rsp_valid (per_cluster_dram_rsp_valid [curr_cluster]),
.dram_rsp_addr (per_cluster_dram_rsp_addr [curr_cluster]),
@ -139,6 +140,7 @@ module Vortex_Socket (
end
//////////////////// L3 Cache ////////////////////
wire[`L3NUM_REQUESTS-1:0] l3c_core_req_valid;
wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_write;
wire[`L3NUM_REQUESTS-1:0][2:0] l3c_core_req_mem_read;
@ -161,25 +163,24 @@ module Vortex_Socket (
assign dram_rsp_data_port[llb_index] = dram_rsp_data[llb_index];
end
//
genvar l3c_curr_cluster;
for (l3c_curr_cluster = 0; l3c_curr_cluster < `L3NUM_REQUESTS; l3c_curr_cluster=l3c_curr_cluster+1) begin
// Core Request
assign l3c_core_req_valid [l3c_curr_cluster] = per_cluster_dram_req_valid[l3c_curr_cluster];
assign l3c_core_req_mem_read [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? `LW_MEM_READ : `NO_MEM_READ;
assign l3c_core_req_mem_write [l3c_curr_cluster] = per_cluster_dram_req_write[l3c_curr_cluster] ? `SW_MEM_WRITE : `NO_MEM_WRITE;
assign l3c_core_req_wb [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? 1 : 0;
assign l3c_core_req_addr [l3c_curr_cluster] = per_cluster_dram_req_addr [l3c_curr_cluster];
assign l3c_core_req_data [l3c_curr_cluster] = per_cluster_dram_req_data [l3c_curr_cluster];
for (l3c_curr_cluster = 0; l3c_curr_cluster < `L3NUM_REQUESTS; l3c_curr_cluster=l3c_curr_cluster+1) begin
// Core Request
assign l3c_core_req_valid [l3c_curr_cluster] = per_cluster_dram_req_valid[l3c_curr_cluster];
assign l3c_core_req_mem_read [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? `LW_MEM_READ : `NO_MEM_READ;
assign l3c_core_req_mem_write [l3c_curr_cluster] = per_cluster_dram_req_write[l3c_curr_cluster] ? `SW_MEM_WRITE : `NO_MEM_WRITE;
assign l3c_core_req_wb [l3c_curr_cluster] = per_cluster_dram_req_read [l3c_curr_cluster] ? 1 : 0;
assign l3c_core_req_addr [l3c_curr_cluster] = per_cluster_dram_req_addr [l3c_curr_cluster];
assign l3c_core_req_data [l3c_curr_cluster] = per_cluster_dram_req_data [l3c_curr_cluster];
// Core can't accept Response
assign l3c_core_no_wb_slot [l3c_curr_cluster] = ~per_cluster_dram_rsp_ready[l3c_curr_cluster];
// Core can't accept Response
assign l3c_core_no_wb_slot [l3c_curr_cluster] = ~per_cluster_dram_rsp_ready[l3c_curr_cluster];
// Cache Fill Response
assign per_cluster_dram_rsp_valid [l3c_curr_cluster] = l3c_wb [l3c_curr_cluster];
assign per_cluster_dram_rsp_data [l3c_curr_cluster] = l3c_wb_data [l3c_curr_cluster];
assign per_cluster_dram_rsp_addr [l3c_curr_cluster] = l3c_wb_addr [l3c_curr_cluster];
end
// Cache Fill Response
assign per_cluster_dram_rsp_valid [l3c_curr_cluster] = l3c_wb [l3c_curr_cluster];
assign per_cluster_dram_rsp_data [l3c_curr_cluster] = l3c_wb_data [l3c_curr_cluster];
assign per_cluster_dram_rsp_addr [l3c_curr_cluster] = l3c_wb_addr [l3c_curr_cluster];
end
VX_cache #(
.CACHE_SIZE_BYTES (`L3CACHE_SIZE_BYTES),
@ -203,8 +204,8 @@ module Vortex_Socket (
.FILL_INVALIDAOR_SIZE (`L3FILL_INVALIDAOR_SIZE),
.SIMULATED_DRAM_LATENCY_CYCLES(`L3SIMULATED_DRAM_LATENCY_CYCLES)
) gpu_l3cache (
.clk (clk),
.reset (reset),
.clk (clk),
.reset (reset),
// Core Req (DRAM Fills/WB) To L2 Request
.core_req_valid (l3c_core_req_valid),
@ -218,7 +219,7 @@ module Vortex_Socket (
.core_req_pc (0),
// L2 can't accept Core Request
.delay_req (l3c_core_req_full),
.core_req_ready (l3c_core_req_ready),
// Core can't accept L2 Request
.core_no_wb_slot (|l3c_core_no_wb_slot),
@ -247,7 +248,7 @@ module Vortex_Socket (
.dram_req_read (dram_req_read),
.dram_req_addr (dram_req_addr),
.dram_req_data ({dram_req_data_port}),
.dram_req_full (dram_req_full),
.dram_req_ready (dram_req_ready),
// Snoop Request
.snp_req_valid (llc_snp_req_valid),

View file

@ -48,7 +48,7 @@ module VX_bank #(
input wire reset,
// Input Core Request
input wire delay_req,
input wire req_ready,
input wire [NUM_REQUESTS-1:0] bank_valids,
input wire [NUM_REQUESTS-1:0][31:0] bank_addr,
input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] bank_writedata,
@ -168,7 +168,7 @@ module VX_bank #(
wire [2:0] reqq_req_mem_write_st0;
wire [31:0] reqq_req_pc_st0;
assign reqq_push = !delay_req && (|bank_valids);
assign reqq_push = req_ready && (|bank_valids);
VX_cache_req_queue #(
.CACHE_SIZE_BYTES (CACHE_SIZE_BYTES),

View file

@ -52,44 +52,46 @@ module VX_cache #(
input wire clk,
input wire reset,
// Req Info
// Core request
input wire [NUM_REQUESTS-1:0] core_req_valid,
input wire [NUM_REQUESTS-1:0][31:0] core_req_addr,
input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_req_writedata,
input wire [NUM_REQUESTS-1:0][2:0] core_req_mem_read,
input wire [NUM_REQUESTS-1:0][2:0] core_req_mem_write,
input wire [NUM_REQUESTS-1:0][31:0] core_req_addr,
input wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_req_writedata,
output wire core_req_ready,
// Req meta
// Core request meta data
input wire [4:0] core_req_rd,
input wire [NUM_REQUESTS-1:0][1:0] core_req_wb,
input wire [`NW_BITS-1:0] core_req_warp_num,
input wire [31:0] core_req_pc,
output wire delay_req,
// Core Writeback
input wire core_no_wb_slot,
// Core response
output wire [NUM_REQUESTS-1:0] core_wb_valid,
output wire [4:0] core_wb_req_rd,
output wire [1:0] core_wb_req_wb,
output wire [`NW_BITS-1:0] core_wb_warp_num,
output wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata,
output wire [NUM_REQUESTS-1:0][31:0] core_wb_pc,
output wire [NUM_REQUESTS-1:0][31:0] core_wb_address,
output wire [NUM_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata,
input wire core_no_wb_slot,
// Dram Fill Response
// Core response meta data
output wire [`NW_BITS-1:0] core_wb_warp_num,
output wire [NUM_REQUESTS-1:0][31:0] core_wb_pc,
// DRAM request
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data,
input wire dram_req_ready,
// DRAM response
input wire dram_rsp_valid,
input wire [31:0] dram_rsp_addr,
input wire [`IBANK_LINE_WORDS-1:0][31:0] dram_rsp_data,
output wire dram_rsp_ready,
// Dram request
output wire dram_req_read,
output wire dram_req_write,
output wire [31:0] dram_req_addr,
output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data,
input wire dram_req_full,
// Snoop Req
input wire snp_req_valid,
input wire [31:0] snp_req_addr,
@ -132,7 +134,7 @@ module VX_cache #(
wire [NUM_BANKS-1:0][31:0] per_bank_snp_fwd_addr;
wire [NUM_BANKS-1:0] per_bank_snp_fwd_pop;
assign delay_req = (|per_bank_reqq_full);
assign core_req_ready = ~(|per_bank_reqq_full);
assign snp_req_full = (|per_bank_snrq_full);
// assign dram_rsp_ready = (NUM_BANKS == 1) ? per_bank_dram_rsp_ready[0] : per_bank_dram_rsp_ready[dram_rsp_addr[`BANK_SELECT_ADDR_RNG]];
@ -171,7 +173,7 @@ module VX_cache #(
.dram_req_write (dram_req_write),
.dram_req_addr (dram_req_addr),
.dram_req_data (dram_req_data),
.dram_req_full (dram_req_full)
.dram_req_ready (dram_req_ready)
);
VX_cache_core_req_bank_sel #(
@ -372,7 +374,7 @@ module VX_cache #(
.clk (clk),
.reset (reset),
// Core req
.delay_req (delay_req),
.req_ready (core_req_ready),
.bank_valids (curr_bank_valids),
.bank_addr (curr_bank_addr),
.bank_writedata (curr_bank_writedata),

View file

@ -50,14 +50,14 @@ module VX_cache_dram_req_arb #(
// Fill Request
output wire dfqq_full,
input wire[NUM_BANKS-1:0] per_bank_dram_fill_req_valid,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr,
input wire [NUM_BANKS-1:0] per_bank_dram_fill_req_valid,
input wire [NUM_BANKS-1:0][31:0] per_bank_dram_fill_req_addr,
// DFQ Request
output wire[NUM_BANKS-1:0] per_bank_dram_wb_queue_pop,
input wire[NUM_BANKS-1:0] per_bank_dram_wb_req_valid,
input wire[NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr,
input wire[NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data,
output wire [NUM_BANKS-1:0] per_bank_dram_wb_queue_pop,
input wire [NUM_BANKS-1:0] per_bank_dram_wb_req_valid,
input wire [NUM_BANKS-1:0][31:0] per_bank_dram_wb_req_addr,
input wire [NUM_BANKS-1:0][`BANK_LINE_WORDS-1:0][`WORD_SIZE-1:0] per_bank_dram_wb_req_data,
// real Dram request
output wire dram_req_read,
@ -65,7 +65,7 @@ module VX_cache_dram_req_arb #(
output wire [31:0] dram_req_addr,
output wire [`IBANK_LINE_WORDS-1:0][31:0] dram_req_data,
input wire dram_req_full
input wire dram_req_ready
);
wire pref_pop;
@ -75,7 +75,8 @@ module VX_cache_dram_req_arb #(
wire dwb_valid;
wire dfqq_req;
assign pref_pop = !dwb_valid && !dfqq_req && !dram_req_full && pref_valid;
assign pref_pop = !dwb_valid && !dfqq_req && dram_req_ready && pref_valid;
VX_prefetcher #(
.PRFQ_SIZE (PRFQ_SIZE),
.PRFQ_STRIDE (PRFQ_STRIDE),
@ -99,7 +100,7 @@ module VX_cache_dram_req_arb #(
wire dfqq_empty;
`DEBUG_END
wire dfqq_pop = !dwb_valid && dfqq_req && !dram_req_full; // If no dwb, and dfqq has valids, then pop
wire dfqq_pop = !dwb_valid && dfqq_req && dram_req_ready; // If no dwb, and dfqq has valids, then pop
wire dfqq_push = (|per_bank_dram_fill_req_valid);
VX_cache_dfq_queue cache_dfq_queue(
@ -115,9 +116,9 @@ module VX_cache_dram_req_arb #(
.dfqq_full (dfqq_full)
);
wire[`LOG2UP(NUM_BANKS)-1:0] dwb_bank;
wire [`LOG2UP(NUM_BANKS)-1:0] dwb_bank;
wire[NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req_valid;
wire [NUM_BANKS-1:0] use_wb_valid = per_bank_dram_wb_req_valid;
VX_generic_priority_encoder #(
.N(NUM_BANKS)
@ -127,7 +128,7 @@ module VX_cache_dram_req_arb #(
.found (dwb_valid)
);
assign per_bank_dram_wb_queue_pop = dram_req_full ? 0 : use_wb_valid & ((1 << dwb_bank));
assign per_bank_dram_wb_queue_pop = dram_req_ready ? (use_wb_valid & ((1 << dwb_bank))) : 0;
wire dram_req = dwb_valid || dfqq_req || pref_pop;
assign dram_req_read = ((dfqq_req && !dwb_valid) || pref_pop) && dram_req;

View file

@ -12,7 +12,7 @@ interface VX_gpu_dcache_dram_req_if #(
wire dram_req_read;
wire [31:0] dram_req_addr;
wire [BANK_LINE_WORDS-1:0][31:0] dram_req_data;
wire dram_req_full;
wire dram_req_ready;
wire dram_rsp_ready;

View file

@ -7,21 +7,21 @@ interface VX_gpu_dcache_req_if #(
parameter NUM_REQUESTS = 32
) ();
// Core Request
// Core request
wire [NUM_REQUESTS-1:0] core_req_valid;
wire [NUM_REQUESTS-1:0][31:0] core_req_addr;
wire [NUM_REQUESTS-1:0][31:0] core_req_writedata;
wire [NUM_REQUESTS-1:0][2:0] core_req_mem_read;
wire [NUM_REQUESTS-1:0][2:0] core_req_mem_write;
wire [NUM_REQUESTS-1:0][31:0] core_req_addr;
wire [NUM_REQUESTS-1:0][31:0] core_req_writedata;
// Core request Meta data
wire [4:0] core_req_rd;
wire [NUM_REQUESTS-1:0][1:0] core_req_wb;
wire [`NW_BITS-1:0] core_req_warp_num;
wire [31:0] core_req_pc;
// Can't WB
wire core_no_wb_slot;
wire core_no_wb_slot;
endinterface
`endif

View file

@ -7,18 +7,19 @@ interface VX_gpu_dcache_rsp_if #(
parameter NUM_REQUESTS = 32
) ();
// Cache WB
// Core response
wire [NUM_REQUESTS-1:0] core_wb_valid;
`IGNORE_WARNINGS_BEGIN
wire [4:0] core_wb_req_rd;
wire [1:0] core_wb_req_wb;
`IGNORE_WARNINGS_END
wire [`NW_BITS-1:0] core_wb_warp_num;
`IGNORE_WARNINGS_END
wire [NUM_REQUESTS-1:0][31:0] core_wb_pc;
wire [NUM_REQUESTS-1:0][31:0] core_wb_readdata;
wire [NUM_REQUESTS-1:0][31:0] core_wb_pc;
// Core response meta data
wire [`NW_BITS-1:0] core_wb_warp_num;
// Cache Full
wire delay_req;
wire core_req_ready;
endinterface

View file

@ -34,6 +34,77 @@ void Simulator::print_stats(std::ostream& out) {
out << std::setw(24) << "# of total cycles:" << std::dec << total_cycles_ << std::endl;
}
// Services the data-side DRAM bus for one simulated cycle.
//
// Steps, in order:
//   1. age every queued read request and pick the first one whose latency
//      has elapsed as this cycle's response candidate;
//   2. (ENABLE_DRAM_STALLS only) recompute the stall flag;
//   3. unless stalled, accept a read and/or write request from the design;
//   4. return at most one completed read as a DRAM response;
//   5. publish the ready signal back to the design.
void Simulator::dbus_driver() {
  // Age the in-flight reads and remember the first completed one.
  size_t dequeue_index = 0;
  bool dequeue_valid = false;
  for (size_t i = 0; i < dram_req_vec_.size(); i++) {
    if (dram_req_vec_[i].cycles_left > 0) {
      dram_req_vec_[i].cycles_left -= 1;
    }
    if ((dram_req_vec_[i].cycles_left == 0) && (!dequeue_valid)) {
      dequeue_index = i;
      dequeue_valid = true;
    }
  }

#ifdef ENABLE_DRAM_STALLS
  // Stall on every DRAM_STALLS_MODULO-th cycle, or whenever DRAM_RQ_SIZE
  // requests are already queued.
  dram_stalled_ = false;
  if (0 == (total_cycles_ % DRAM_STALLS_MODULO)) {
    dram_stalled_ = true;
  } else if (dram_req_vec_.size() >= DRAM_RQ_SIZE) {
    dram_stalled_ = true;
  }
#endif
  // NOTE(review): without ENABLE_DRAM_STALLS this function never writes
  // dram_stalled_; it keeps whatever value it was given elsewhere - confirm
  // it is initialized to false.

  if (!dram_stalled_) {
    if (vortex_->dram_req_read) {
      // Snapshot the whole block from RAM now; it is handed back to the
      // design once DRAM_LATENCY cycles have been counted down above.
      dram_req_t dram_req;
      dram_req.cycles_left = DRAM_LATENCY;
      dram_req.base_addr = vortex_->dram_req_addr;
      dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES);

      for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
        unsigned curr_addr = dram_req.base_addr + (i * 4);
        unsigned data_rd;
        ram_->getWord(curr_addr, &data_rd);
        dram_req.data[i] = data_rd;
      }
      dram_req_vec_.push_back(dram_req);
    }

    if (vortex_->dram_req_write) {
      // Writes are applied to RAM immediately, one 32-bit word at a time.
      unsigned base_addr = vortex_->dram_req_addr;

      for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
        unsigned curr_addr = base_addr + (i * 4);
        unsigned data_wr = vortex_->dram_req_data[i];
        ram_->writeWord(curr_addr, &data_wr);
      }
    }
  }

  if (vortex_->dram_rsp_ready && dequeue_valid) {
    // Hand the completed block back to the design and retire the request.
    vortex_->dram_rsp_valid = 1;
    vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr;

    for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
      vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
    }
    free(dram_req_vec_[dequeue_index].data);
    dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index);
  } else {
    vortex_->dram_rsp_valid = 0;
    vortex_->dram_rsp_addr = 0;
  }

  // Logical NOT, not bitwise: `~` on a bool promotes to int and yields -1 or
  // -2, so the signal would never receive a clean 0/1 value.
  vortex_->dram_req_ready = !dram_stalled_;
}
#ifndef USE_MULTICORE
void Simulator::ibus_driver() {
@ -51,6 +122,16 @@ void Simulator::ibus_driver() {
}
}
#ifdef ENABLE_DRAM_STALLS
I_dram_stalled_ = false;
if (0 == (total_cycles_ % DRAM_STALLS_MODULO)) {
I_dram_stalled_ = true;
} else
if (I_dram_req_vec_.size() >= DRAM_RQ_SIZE) {
I_dram_stalled_ = true;
}
#endif
if (!I_dram_stalled_) {
// std::cout << "Icache Dram Request received!\n";
if (vortex_->I_dram_req_read) {
@ -100,135 +181,11 @@ void Simulator::ibus_driver() {
vortex_->I_dram_rsp_addr = 0;
}
// #ifdef ENABLE_DRAM_STALLS
// I_dram_stalled_ = false;
// if (0 == (total_cycles_ % DRAM_STALLS_MODULO)) {
// I_dram_stalled_ = true;
// } else
// if (I_dram_req_vec_.size() >= DRAM_RQ_SIZE) {
// I_dram_stalled_ = true;
// }
// #endif
// vortex_->dram_req_delay = I_dram_stalled_;
vortex_->I_dram_req_ready = ~I_dram_stalled_;
}
#endif
void Simulator::dbus_driver() {
// Iterate through each element, and get pop index
int dequeue_index = -1;
bool dequeue_valid = false;
for (int i = 0; i < dram_req_vec_.size(); i++) {
if (dram_req_vec_[i].cycles_left > 0) {
dram_req_vec_[i].cycles_left -= 1;
}
if ((dram_req_vec_[i].cycles_left == 0) && (!dequeue_valid)) {
dequeue_index = i;
dequeue_valid = true;
}
}
#ifdef USE_MULTICORE
if (!dram_stalled_) {
if (vortex_->dram_req_read) {
// Need to add an element
dram_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
dram_req.base_addr = vortex_->dram_req_addr;
dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES);
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = dram_req.base_addr + (i * 4);
unsigned data_rd;
ram_->getWord(curr_addr, &data_rd);
dram_req.data[i] = data_rd;
}
dram_req_vec_.push_back(dram_req);
}
if (vortex_->dram_req_write) {
unsigned base_addr = vortex_->dram_req_addr;
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = base_addr + (i * 4);
unsigned data_wr = vortex_->dram_req_data[i];
ram_->writeWord(curr_addr, &data_wr);
}
}
}
if (vortex_->dram_rsp_ready && dequeue_valid) {
vortex_->dram_rsp_valid = 1;
vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr;
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
}
free(dram_req_vec_[dequeue_index].data);
dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index);
} else {
vortex_->dram_rsp_valid = 0;
vortex_->dram_rsp_addr = 0;
}
#else
if (!dram_stalled_) {
if (vortex_->dram_req_read) {
// Need to add an element
dram_req_t dram_req;
dram_req.cycles_left = DRAM_LATENCY;
dram_req.base_addr = vortex_->dram_req_addr;
dram_req.data = (unsigned *)malloc(GLOBAL_BLOCK_SIZE_BYTES);
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = dram_req.base_addr + (i * 4);
unsigned data_rd;
ram_->getWord(curr_addr, &data_rd);
dram_req.data[i] = data_rd;
}
dram_req_vec_.push_back(dram_req);
}
if (vortex_->dram_req_write) {
unsigned base_addr = vortex_->dram_req_addr;
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
unsigned curr_addr = base_addr + (i * 4);
unsigned data_wr = vortex_->dram_req_data[i];
ram_->writeWord(curr_addr, &data_wr);
}
}
}
if (vortex_->dram_rsp_ready && dequeue_valid) {
vortex_->dram_rsp_valid = 1;
vortex_->dram_rsp_addr = dram_req_vec_[dequeue_index].base_addr;
for (int i = 0; i < (GLOBAL_BLOCK_SIZE_BYTES / 4); i++) {
vortex_->dram_rsp_data[i] = dram_req_vec_[dequeue_index].data[i];
}
free(dram_req_vec_[dequeue_index].data);
dram_req_vec_.erase(dram_req_vec_.begin() + dequeue_index);
} else {
vortex_->dram_rsp_valid = 0;
vortex_->dram_rsp_addr = 0;
}
#endif
#ifdef USE_MULTICORE
vortex_->dram_req_full = dram_stalled_;
#else
vortex_->dram_req_full = dram_stalled_;
#endif
}
void Simulator::io_handler() {
#ifdef USE_MULTICORE
bool io_valid = false;
@ -309,7 +266,6 @@ void Simulator::send_snoops(uint32_t mem_addr, uint32_t size) {
auto aligned_addr_start = GLOBAL_BLOCK_SIZE_BYTES * (mem_addr / GLOBAL_BLOCK_SIZE_BYTES);
auto aligned_addr_end = GLOBAL_BLOCK_SIZE_BYTES * ((mem_addr + size + GLOBAL_BLOCK_SIZE_BYTES - 1) / GLOBAL_BLOCK_SIZE_BYTES);
#ifdef USE_MULTICORE
// submit snoop requests for the needed blocks
vortex_->llc_snp_req_addr = aligned_addr_start;
vortex_->llc_snp_req_valid = false;
@ -325,37 +281,13 @@ void Simulator::send_snoops(uint32_t mem_addr, uint32_t size) {
vortex_->llc_snp_req_valid = true;
}
}
#else
// submit snoop requests for the needed blocks
vortex_->snp_req_addr = aligned_addr_start;
vortex_->snp_req_valid = false;
for (;;) {
this->step();
if (vortex_->snp_req_valid) {
vortex_->snp_req_valid = false;
if (vortex_->snp_req_addr >= aligned_addr_end)
break;
vortex_->snp_req_addr += GLOBAL_BLOCK_SIZE_BYTES;
}
if (!vortex_->snp_req_full) {
vortex_->snp_req_valid = true;
}
}
#endif
}
// Flushes the cache lines covering [mem_addr, mem_addr + size) by issuing
// snoop requests, then waits PIPELINE_FLUSH_LATENCY cycles.
//
// mem_addr: start address of the region to flush.
// size:     length of the region in bytes.
void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) {
  // total_cycles_ is a 64-bit counter; cast explicitly so the format
  // specifier matches the argument on every data model (%ld is only correct
  // where long is 64 bits wide).
  printf("[sim] total cycles: %llu\n",
         static_cast<unsigned long long>(this->total_cycles_));

  // Send snoop requests to the caches (a single pass), then wait.
  this->send_snoops(mem_addr, size);
  this->wait(PIPELINE_FLUSH_LATENCY);
}
bool Simulator::run() {

View file

@ -19,7 +19,7 @@
#include <ostream>
#include <vector>
#define ENABLE_DRAM_STALLS
//#define ENABLE_DRAM_STALLS
#define DRAM_LATENCY 200
#define DRAM_RQ_SIZE 16
#define DRAM_STALLS_MODULO 16
@ -55,7 +55,7 @@ private:
void send_snoops(uint32_t mem_addr, uint32_t size);
void wait(uint32_t cycles);
int64_t total_cycles_;
uint64_t total_cycles_;
bool dram_stalled_;
bool I_dram_stalled_;
std::vector<dram_req_t> dram_req_vec_;

View file

@ -12,7 +12,7 @@ int main(int argc, char **argv)
Verilated::commandArgs(argc, argv);
#define ALL_TESTS
//#define ALL_TESTS
#ifdef ALL_TESTS
bool passed = true;