diff --git a/rtl/Makefile b/rtl/Makefile index b0dcdf6f6..44fabb613 100644 --- a/rtl/Makefile +++ b/rtl/Makefile @@ -45,6 +45,10 @@ compdebug: echo "#define VCD_OUTPUT" > simulate/tb_debug.h verilator_bin_dbg $(COMP) -cc $(SINGLE_CORE) $(INCLUDE) $(EXE) $(LIB) -CFLAGS '-std=c++11 -DVL_DEBUG' $(WNO) $(DEB) +compdebugmulti: + echo "#define VCD_OUTPUT" > simulate/tb_debug.h + verilator_bin_dbg $(COMP) -cc $(MULTI_CORE) $(INCLUDE) $(MULTI_EXE) $(LIB) -CFLAGS '-std=c++11 -DVL_DEBUG' $(WNO) $(DEB) + RUNFILE: VERILATOR $(MAKECPP) @@ -57,6 +61,9 @@ w: VERILATORnoWarnings multicore: VERILATORMULTInoWarnings $(MAKEMULTICPP) +dmulticore: compdebugmulti + $(MAKEMULTICPP) + run: w (cd obj_dir && ./VVortex) diff --git a/rtl/VX_back_end.v b/rtl/VX_back_end.v index cbdd452e1..933e9a1d2 100644 --- a/rtl/VX_back_end.v +++ b/rtl/VX_back_end.v @@ -1,6 +1,10 @@ `include "VX_define.v" -module VX_back_end ( +module VX_back_end + #( + parameter CORE_ID = 0 + ) + ( input wire clk, input wire reset, input wire schedule_delay, @@ -112,7 +116,7 @@ VX_gpgpu_inst VX_gpgpu_inst( // .VX_csr_wb (VX_csr_wb) // ); -VX_csr_pipe VX_csr_pipe( +VX_csr_pipe #(.CORE_ID(CORE_ID)) VX_csr_pipe( .clk (clk), .reset (reset), .no_slot_csr (no_slot_csr), diff --git a/rtl/VX_cache/VX_bank.v b/rtl/VX_cache/VX_bank.v index 281f0ee83..92f3d81a2 100644 --- a/rtl/VX_cache/VX_bank.v +++ b/rtl/VX_cache/VX_bank.v @@ -1,5 +1,5 @@ `include "VX_cache_config.v" - +`include "VX_define.v" module VX_bank #( // Size of cache in bytes @@ -54,13 +54,13 @@ module VX_bank input wire delay_req, input wire [NUMBER_REQUESTS-1:0] bank_valids, input wire [NUMBER_REQUESTS-1:0][31:0] bank_addr, - input wire [NUMBER_REQUESTS-1:0][31:0] bank_writedata, + input wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] bank_writedata, input wire [4:0] bank_rd, - input wire [1:0] bank_wb, + input wire [NUMBER_REQUESTS-1:0][1:0] bank_wb, input wire [31:0] bank_pc, input wire [`NW_M1:0] bank_warp_num, - input wire [2:0] bank_mem_read, - input wire [2:0] 
bank_mem_write, + input wire [NUMBER_REQUESTS-1:0][2:0] bank_mem_read, + input wire [NUMBER_REQUESTS-1:0][2:0] bank_mem_write, output wire reqq_full, // Output Core WB @@ -70,8 +70,9 @@ module VX_bank output wire [4:0] bank_wb_rd, output wire [1:0] bank_wb_wb, output wire [`NW_M1:0] bank_wb_warp_num, - output wire [31:0] bank_wb_data, + output wire [`WORD_SIZE_RNG] bank_wb_data, output wire [31:0] bank_wb_pc, + output wire [31:0] bank_wb_address, // Dram Fill Requests output wire dram_fill_req, @@ -83,25 +84,18 @@ module VX_bank // Dram Fill Response input wire dram_fill_rsp, input wire [31:0] dram_fill_addr, - input wire[`BANK_LINE_SIZE_RNG][31:0] dram_fill_rsp_data, + input wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] dram_fill_rsp_data, output wire dram_fill_accept, // Dram WB Requests input wire dram_wb_queue_pop, output wire dram_wb_req, output wire[31:0] dram_wb_req_addr, - output wire[`BANK_LINE_SIZE_RNG][31:0] dram_wb_req_data, + output wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] dram_wb_req_data, // Snp Request input wire snp_req, - input wire[31:0] snp_req_addr, - - // Lower Level Cache Response - input wire llvq_pop, - output wire llvq_valid, - output wire[31:0] llvq_res_addr, - output wire[`BANK_LINE_SIZE_RNG][31:0] llvq_res_data, - output wire[`vx_clog2(NUMBER_REQUESTS)-1:0] llvq_res_tid + input wire[31:0] snp_req_addr ); @@ -132,12 +126,12 @@ module VX_bank wire dfpq_empty; wire dfpq_full; wire[31:0] dfpq_addr_st0; - wire[`BANK_LINE_SIZE_RNG][31:0] dfpq_filldata_st0; + wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] dfpq_filldata_st0; reg dfpq_hazard_st0; assign dram_fill_accept = !dfpq_full; - VX_generic_queue_ll #(.DATAW(32+(`BANK_LINE_SIZE_WORDS*32)), .SIZE(DFPQ_SIZE)) dfp_queue( + VX_generic_queue_ll #(.DATAW(32+(`BANK_LINE_SIZE_WORDS*`WORD_SIZE)), .SIZE(DFPQ_SIZE)) dfp_queue( .clk (clk), .reset (reset), .push (dram_fill_rsp), @@ -155,7 +149,7 @@ module VX_bank wire reqq_req_st0; wire[`vx_clog2(NUMBER_REQUESTS)-1:0] reqq_req_tid_st0; wire [31:0] 
reqq_req_addr_st0; - wire [31:0] reqq_req_writeword_st0; + wire [`WORD_SIZE_RNG] reqq_req_writeword_st0; wire [4:0] reqq_req_rd_st0; wire [1:0] reqq_req_wb_st0; wire [`NW_M1:0] reqq_req_warp_num_st0; @@ -221,7 +215,7 @@ module VX_bank wire mrvq_valid_st0; wire[`vx_clog2(NUMBER_REQUESTS)-1:0] mrvq_tid_st0; wire [31:0] mrvq_addr_st0; - wire [31:0] mrvq_writeword_st0; + wire [`WORD_SIZE_RNG] mrvq_writeword_st0; wire [4:0] mrvq_rd_st0; wire [1:0] mrvq_wb_st0; wire [31:0] miss_resrv_pc_st0; @@ -232,7 +226,7 @@ module VX_bank wire miss_add; wire[31:0] miss_add_addr; - wire[31:0] miss_add_data; + wire[`WORD_SIZE_RNG] miss_add_data; wire[`vx_clog2(NUMBER_REQUESTS)-1:0] miss_add_tid; wire[4:0] miss_add_rd; wire[1:0] miss_add_wb; @@ -311,8 +305,8 @@ module VX_bank assign dfpq_pop = !dfpq_empty && !stall_bank_pipe && !dfpq_hazard_st0; assign mrvq_pop = !dfpq_pop && mrvq_valid_st0 && !stall_bank_pipe && !mrvq_hazard_st0; - assign reqq_pop = !mrvq_pop && !reqq_empty && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !(reqq_hazard_st0 || (mrvq_valid_st0 && mrvq_hazard_st0)) && !is_fill_in_pipe; - assign snrq_pop = !reqq_pop && snrq_valid_st0 && !stall_bank_pipe && !snrq_hazard_st0; + assign reqq_pop = !mrvq_pop && !dfpq_pop && !reqq_empty && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !(reqq_hazard_st0 || (mrvq_valid_st0 && mrvq_hazard_st0)) && !is_fill_in_pipe; + assign snrq_pop = !reqq_pop && !reqq_pop && !mrvq_pop && !dfpq_pop && snrq_valid_st0 && !stall_bank_pipe && !snrq_hazard_st0; integer st1_cycle; @@ -338,8 +332,8 @@ module VX_bank wire qual_is_fill_st0; wire qual_valid_st0; wire [31:0] qual_addr_st0; - wire [31:0] qual_writeword_st0; - wire [`BANK_LINE_SIZE_RNG][31:0] qual_writedata_st0; + wire [`WORD_SIZE_RNG] qual_writeword_st0; + wire [`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] qual_writedata_st0; wire [`REQ_INST_META_SIZE-1:0] qual_inst_meta_st0; wire qual_going_to_write_st0; wire qual_is_snp; @@ -348,14 +342,21 @@ module VX_bank wire valid_st1 
[STAGE_1_CYCLES-1:0]; wire going_to_write_st1[STAGE_1_CYCLES-1:0]; wire [31:0] addr_st1 [STAGE_1_CYCLES-1:0]; - wire [31:0] writeword_st1 [STAGE_1_CYCLES-1:0]; + wire [`WORD_SIZE_RNG] writeword_st1 [STAGE_1_CYCLES-1:0]; wire [`REQ_INST_META_SIZE-1:0] inst_meta_st1 [STAGE_1_CYCLES-1:0]; wire is_fill_st1 [STAGE_1_CYCLES-1:0]; - wire [`BANK_LINE_SIZE_RNG][31:0] writedata_st1 [STAGE_1_CYCLES-1:0]; + wire [`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] writedata_st1[STAGE_1_CYCLES-1:0]; wire is_snp_st1 [STAGE_1_CYCLES-1:0]; wire [31:0] pc_st1 [STAGE_1_CYCLES-1:0]; assign qual_is_fill_st0 = dfpq_pop; + + // always @(*) begin + // if (qual_is_fill_st0 && (FUNC_ID == 3)) begin + // $display("WHAT THE FUCK FUNC_ID: %x", FUNC_ID); + // end + // end + assign qual_valid_st0 = dfpq_pop || mrvq_pop || reqq_pop || snrq_pop; assign qual_addr_st0 = dfpq_pop ? dfpq_addr_st0 : @@ -364,11 +365,7 @@ module VX_bank snrq_pop ? snrq_addr_st0 : 0; - assign qual_writeword_st0 = mrvq_pop ? mrvq_writeword_st0 : - reqq_pop ? reqq_req_writeword_st0 : - 0; - - assign qual_writedata_st0 = dfpq_pop ? dfpq_filldata_st0 : 0; + assign qual_writedata_st0 = dfpq_pop ? dfpq_filldata_st0 : 57; assign qual_inst_meta_st0 = mrvq_pop ? {mrvq_rd_st0 , mrvq_wb_st0 , mrvq_warp_num_st0 , mrvq_mem_read_st0 , mrvq_mem_write_st0 , mrvq_tid_st0 } : reqq_pop ? {reqq_req_rd_st0, reqq_req_wb_st0, reqq_req_warp_num_st0, reqq_req_mem_read_st0, reqq_req_mem_write_st0, reqq_req_tid_st0} : @@ -387,7 +384,11 @@ module VX_bank 32'h0; assign qual_is_snp = snrq_pop ? 1 : 0; - VX_generic_register #(.N( 1 + 1 + 1 + 32 + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*32) + 1 + 32)) s0_1_c0 ( + assign qual_writeword_st0 = mrvq_pop ? mrvq_writeword_st0 : + reqq_pop ? 
reqq_req_writeword_st0 : + 0; + + VX_generic_register #(.N( 1 + 1 + 1 + `WORD_SIZE + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*`WORD_SIZE) + 1 + 32)) s0_1_c0 ( .clk (clk), .reset(reset), .stall(stall_bank_pipe), @@ -399,7 +400,7 @@ module VX_bank genvar curr_stage; generate for (curr_stage = 1; curr_stage < STAGE_1_CYCLES; curr_stage = curr_stage + 1) begin - VX_generic_register #(.N( 1 + 1 + 1 + 32 + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*32) + 1 + 32)) s0_1_cc ( + VX_generic_register #(.N( 1 + 1 + 1 + `WORD_SIZE + 32 + `REQ_INST_META_SIZE + (`BANK_LINE_SIZE_WORDS*`WORD_SIZE) + 1 + 32)) s0_1_cc ( .clk (clk), .reset(reset), .stall(stall_bank_pipe), @@ -411,8 +412,8 @@ module VX_bank endgenerate - wire[31:0] readword_st1e; - wire[`BANK_LINE_SIZE_RNG][31:0] readdata_st1e; + wire[`WORD_SIZE_RNG] readword_st1e; + wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] readdata_st1e; wire[`TAG_SELECT_SIZE_RNG] readtag_st1e; wire miss_st1e; wire dirty_st1e; @@ -424,7 +425,7 @@ module VX_bank wire [`NW_M1:0] warp_num_st1e; wire [2:0] mem_read_st1e; wire [2:0] mem_write_st1e; - wire [`vx_clog2(NUMBER_REQUESTS)-1:0] tid_st1e; + wire [`vx_clog2(NUMBER_REQUESTS)-1:0] tid_st1e; wire fill_saw_dirty_st1e; wire is_snp_st1e; @@ -486,9 +487,9 @@ module VX_bank wire valid_st2; wire[31:0] addr_st2; - wire[31:0] writeword_st2; - wire[31:0] readword_st2; - wire[`BANK_LINE_SIZE_RNG][31:0] readdata_st2; + wire[`WORD_SIZE_RNG] writeword_st2; + wire[`WORD_SIZE_RNG] readword_st2; + wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] readdata_st2; wire miss_st2; wire dirty_st2; wire[`REQ_INST_META_SIZE-1:0] inst_meta_st2; @@ -498,18 +499,19 @@ module VX_bank wire is_snp_st2; wire [31:0] pc_st2; - VX_generic_register #(.N( 1 + 1 + 1 + 1 + 32 + 32 + 32 + (`BANK_LINE_SIZE_WORDS * 32) + 1 + 1 + `REQ_INST_META_SIZE + `TAG_SELECT_NUM_BITS + 32)) st_1e_2 ( + + VX_generic_register #(.N( 1+1+1+1+32+`WORD_SIZE+`WORD_SIZE+(`BANK_LINE_SIZE_WORDS * `WORD_SIZE) + `REQ_INST_META_SIZE + `TAG_SELECT_NUM_BITS + 32 
+ 2)) st_1e_2 ( .clk (clk), .reset(reset), .stall(stall_bank_pipe), .flush(0), - .in ({is_snp_st1e, fill_saw_dirty_st1e, is_fill_st1[STAGE_1_CYCLES-1], qual_valid_st1e_2, addr_st1[STAGE_1_CYCLES-1], writeword_st1[STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, pc_st1e, inst_meta_st1[STAGE_1_CYCLES-1]}), - .out ({is_snp_st2 , fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , pc_st2 , inst_meta_st2 }) + .in ({is_snp_st1e, fill_saw_dirty_st1e, is_fill_st1[STAGE_1_CYCLES-1] , qual_valid_st1e_2, addr_st1[STAGE_1_CYCLES-1], writeword_st1[STAGE_1_CYCLES-1], readword_st1e, readdata_st1e, readtag_st1e, miss_st1e, dirty_st1e, pc_st1e, inst_meta_st1[STAGE_1_CYCLES-1]}), + .out ({is_snp_st2 , fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , pc_st2 , inst_meta_st2 }) ); // Enqueue to miss reserv if it's a valid miss - assign miss_add = valid_st2 && miss_st2 && !stall_bank_pipe && !mrvq_full && !(dirty_st2 && dwbq_full); + assign miss_add = valid_st2 && miss_st2 && !mrvq_full && !((cwbq_push && cwbq_full) || (dwbq_push && dwbq_full) || (dram_fill_req && dram_fill_req_queue_full)); assign miss_add_pc = pc_st2; assign miss_add_addr = addr_st2; assign miss_add_data = writeword_st2; @@ -517,8 +519,8 @@ module VX_bank // Enqueue to CWB Queue - wire cwbq_push = (valid_st2 && !miss_st2) && !cwbq_full & !llvq_full; - wire [31:0] cwbq_data = readword_st2; + wire cwbq_push = (valid_st2 && !miss_st2) && !cwbq_full && !((FUNC_ID == `LLFUNC_ID) && (miss_add_wb == 0)); + wire [`WORD_SIZE_RNG] cwbq_data = readword_st2; wire [`vx_clog2(NUMBER_REQUESTS)-1:0] cwbq_tid = miss_add_tid; wire [4:0] cwbq_rd = miss_add_rd; wire [1:0] cwbq_wb = miss_add_wb; @@ -528,15 +530,15 @@ module VX_bank wire cwbq_full; wire cwbq_empty; assign bank_wb_valid = !cwbq_empty; - VX_generic_queue_ll 
#(.DATAW( `vx_clog2(NUMBER_REQUESTS) + 5 + 2 + (`NW_M1+1) + 32 + 32), .SIZE(CWBQ_SIZE)) cwb_queue( + VX_generic_queue_ll #(.DATAW( `vx_clog2(NUMBER_REQUESTS) + 5 + 2 + (`NW_M1+1) + `WORD_SIZE + 32 + 32), .SIZE(CWBQ_SIZE)) cwb_queue( .clk (clk), .reset (reset), .push (cwbq_push), - .in_data ({cwbq_tid, cwbq_rd, cwbq_wb, cwbq_warp_num, cwbq_data, cwbq_pc}), + .in_data ({cwbq_tid, cwbq_rd, cwbq_wb, cwbq_warp_num, cwbq_data, cwbq_pc, addr_st2}), .pop (bank_wb_pop), - .out_data({bank_wb_tid, bank_wb_rd, bank_wb_wb, bank_wb_warp_num, bank_wb_data, bank_wb_pc}), + .out_data({bank_wb_tid, bank_wb_rd, bank_wb_wb, bank_wb_warp_num, bank_wb_data, bank_wb_pc, bank_wb_address}), .empty (cwbq_empty), .full (cwbq_full) ); @@ -544,7 +546,7 @@ module VX_bank // Enqueue to DWB Queue wire dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && !dwbq_full && !(!fill_saw_dirty_st2 && mrvq_full); wire[31:0] dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_ADDR_END:0]} & `BASE_ADDR_MASK; - wire[`BANK_LINE_SIZE_RNG][31:0] dwbq_req_data = readdata_st2; + wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] dwbq_req_data = readdata_st2; wire dwbq_empty; wire dwbq_full; @@ -588,7 +590,7 @@ module VX_bank assign dram_fill_req_addr = addr_st2 & `BASE_ADDR_MASK; assign dram_wb_req = !dwbq_empty; - VX_generic_queue_ll #(.DATAW( 32 + (`BANK_LINE_SIZE_WORDS * 32)), .SIZE(DWBQ_SIZE)) dwb_queue( + VX_generic_queue_ll #(.DATAW( 32 + (`BANK_LINE_SIZE_WORDS * `WORD_SIZE)), .SIZE(DWBQ_SIZE)) dwb_queue( .clk (clk), .reset (reset), @@ -602,29 +604,8 @@ module VX_bank ); - // Lower Cache Hit - wire llvq_empty; - wire llvq_full; - wire llvq_push = valid_st2 && !miss_st2 && !llvq_full && !cwbq_full; - wire[`BANK_LINE_SIZE_RNG][31:0] llvq_push_data = readdata_st2; - wire[31:0] llvq_addr = addr_st2; - wire[`vx_clog2(NUMBER_REQUESTS)-1:0] llvq_tid = miss_add_tid; - assign llvq_valid = !llvq_empty; - - VX_generic_queue_ll #(.DATAW(`vx_clog2(NUMBER_REQUESTS) + 32 + (`BANK_LINE_SIZE_WORDS * 32)), 
.SIZE(LLVQ_SIZE)) llv_queue( - .clk (clk), - .reset (reset), - .push (llvq_push), - .in_data ({llvq_tid , llvq_addr , llvq_push_data}), - .pop (llvq_pop), - .out_data({llvq_res_tid, llvq_res_addr, llvq_res_data}), - .empty (llvq_empty), - .full (llvq_full) - ); - - - assign stall_bank_pipe = (cwbq_push && cwbq_full) || (llvq_push && llvq_full) || (dwbq_push && dwbq_full) || (miss_add && mrvq_full) || (dram_fill_req && dram_fill_req_queue_full); + assign stall_bank_pipe = (cwbq_push && cwbq_full) || (dwbq_push && dwbq_full) || (miss_add && mrvq_full) || (dram_fill_req && dram_fill_req_queue_full); endmodule diff --git a/rtl/VX_cache/VX_cache.v b/rtl/VX_cache/VX_cache.v index b9ef93737..3ca9b52ae 100644 --- a/rtl/VX_cache/VX_cache.v +++ b/rtl/VX_cache/VX_cache.v @@ -9,13 +9,13 @@ module VX_cache // Number of banks {1, 2, 4, 8,...} parameter NUMBER_BANKS = 8, // Size of a word in bytes - parameter WORD_SIZE_BYTES = 4, + parameter WORD_SIZE_BYTES = 16, // Number of Word requests per cycle {1, 2, 4, 8, ...} parameter NUMBER_REQUESTS = 2, // Number of cycles to complete stage 1 (read from memory) parameter STAGE_1_CYCLES = 2, // Function ID, {Dcache=0, Icache=1, Sharedmemory=2} - parameter FUNC_ID = 0, + parameter FUNC_ID = 3, // Queues feeding into banks Knobs {1, 2, 4, 8, ...} @@ -51,15 +51,15 @@ module VX_cache input wire reset, // Req Info - input wire [NUMBER_REQUESTS-1:0] core_req_valid, - input wire [NUMBER_REQUESTS-1:0][31:0] core_req_addr, - input wire [NUMBER_REQUESTS-1:0][31:0] core_req_writedata, - input wire[2:0] core_req_mem_read, - input wire[2:0] core_req_mem_write, + input wire [NUMBER_REQUESTS-1:0] core_req_valid, + input wire [NUMBER_REQUESTS-1:0][31:0] core_req_addr, + input wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] core_req_writedata, + input wire[NUMBER_REQUESTS-1:0][2:0] core_req_mem_read, + input wire[NUMBER_REQUESTS-1:0][2:0] core_req_mem_write, // Req meta input wire [4:0] core_req_rd, - input wire [1:0] core_req_wb, + input wire 
[NUMBER_REQUESTS-1:0][1:0] core_req_wb, input wire [`NW_M1:0] core_req_warp_num, input wire [31:0] core_req_pc, output wire delay_req, @@ -70,14 +70,15 @@ module VX_cache output wire [4:0] core_wb_req_rd, output wire [1:0] core_wb_req_wb, output wire [`NW_M1:0] core_wb_warp_num, - output wire [NUMBER_REQUESTS-1:0][31:0] core_wb_readdata, + output wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata, output wire [NUMBER_REQUESTS-1:0][31:0] core_wb_pc, + output wire [NUMBER_REQUESTS-1:0][31:0] core_wb_address, // Dram Fill Response input wire dram_fill_rsp, input wire [31:0] dram_fill_rsp_addr, - input wire [`BANK_LINE_SIZE_RNG][31:0] dram_fill_rsp_data, + input wire [`IBANK_LINE_SIZE_RNG][31:0] dram_fill_rsp_data, output wire dram_fill_accept, // Dram request @@ -86,20 +87,14 @@ module VX_cache output wire dram_req_read, output wire [31:0] dram_req_addr, output wire [31:0] dram_req_size, - output wire [`BANK_LINE_SIZE_RNG][31:0] dram_req_data, + output wire [`IBANK_LINE_SIZE_RNG][31:0] dram_req_data, output wire dram_req_because_of_wb, output wire dram_snp_full, // Snoop Req input wire snp_req, - input wire[31:0] snp_req_addr, - - // Lower Level Cache - input wire llvq_pop, - output wire[NUMBER_REQUESTS-1:0] llvq_valid, - output wire[NUMBER_REQUESTS-1:0][31:0] llvq_res_addr, - output wire[NUMBER_REQUESTS-1:0][`BANK_LINE_SIZE_RNG][31:0] llvq_res_data + input wire[31:0] snp_req_addr ); @@ -111,8 +106,9 @@ module VX_cache wire [NUMBER_BANKS-1:0][4:0] per_bank_wb_rd; wire [NUMBER_BANKS-1:0][1:0] per_bank_wb_wb; wire [NUMBER_BANKS-1:0][`NW_M1:0] per_bank_wb_warp_num; - wire [NUMBER_BANKS-1:0][31:0] per_bank_wb_data; + wire [NUMBER_BANKS-1:0][`WORD_SIZE_RNG] per_bank_wb_data; wire [NUMBER_BANKS-1:0][31:0] per_bank_wb_pc; + wire [NUMBER_BANKS-1:0][31:0] per_bank_wb_address; wire dfqq_full; @@ -124,54 +120,15 @@ module VX_cache wire[NUMBER_BANKS-1:0] per_bank_dram_wb_req; wire[NUMBER_BANKS-1:0] per_bank_dram_because_of_snp; wire[NUMBER_BANKS-1:0][31:0] 
per_bank_dram_wb_req_addr; - wire[NUMBER_BANKS-1:0][`BANK_LINE_SIZE_RNG][31:0] per_bank_dram_wb_req_data; + wire[NUMBER_BANKS-1:0][`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] per_bank_dram_wb_req_data; wire[NUMBER_BANKS-1:0] per_bank_reqq_full; - - wire[NUMBER_BANKS-1:0] per_bank_llvq_pop; - wire[NUMBER_BANKS-1:0] per_bank_llvq_valid; - wire[NUMBER_BANKS-1:0][31:0] per_bank_llvq_res_addr; - wire[NUMBER_BANKS-1:0][`BANK_LINE_SIZE_RNG][31:0] per_bank_llvq_res_data; - wire [NUMBER_BANKS-1:0][`vx_clog2(NUMBER_REQUESTS)-1:0] per_bank_llvq_res_tid; - assign delay_req = (|per_bank_reqq_full); assign dram_fill_accept = (NUMBER_BANKS == 1) ? per_bank_dram_fill_accept[0] : per_bank_dram_fill_accept[dram_fill_rsp_addr[`BANK_SELECT_ADDR_RNG]]; - - VX_dcache_llv_resp_bank_sel #( - .CACHE_SIZE_BYTES (CACHE_SIZE_BYTES), - .BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES), - .NUMBER_BANKS (NUMBER_BANKS), - .WORD_SIZE_BYTES (WORD_SIZE_BYTES), - .NUMBER_REQUESTS (NUMBER_REQUESTS), - .STAGE_1_CYCLES (STAGE_1_CYCLES), - .REQQ_SIZE (REQQ_SIZE), - .MRVQ_SIZE (MRVQ_SIZE), - .DFPQ_SIZE (DFPQ_SIZE), - .SNRQ_SIZE (SNRQ_SIZE), - .CWBQ_SIZE (CWBQ_SIZE), - .DWBQ_SIZE (DWBQ_SIZE), - .DFQQ_SIZE (DFQQ_SIZE), - .LLVQ_SIZE (LLVQ_SIZE), - .FILL_INVALIDAOR_SIZE (FILL_INVALIDAOR_SIZE), - .SIMULATED_DRAM_LATENCY_CYCLES(SIMULATED_DRAM_LATENCY_CYCLES) - ) - VX_dcache_llv_resp_bank_sel - ( - .per_bank_llvq_pop (per_bank_llvq_pop), - .per_bank_llvq_valid (per_bank_llvq_valid), - .per_bank_llvq_res_addr(per_bank_llvq_res_addr), - .per_bank_llvq_res_data(per_bank_llvq_res_data), - .per_bank_llvq_res_tid (per_bank_llvq_res_tid), - .llvq_pop (llvq_pop), - .llvq_valid (llvq_valid), - .llvq_res_addr (llvq_res_addr), - .llvq_res_data (llvq_res_data) - ); - VX_cache_dram_req_arb #( .CACHE_SIZE_BYTES (CACHE_SIZE_BYTES), .BANK_LINE_SIZE_BYTES (BANK_LINE_SIZE_BYTES), @@ -245,6 +202,7 @@ module VX_cache .WORD_SIZE_BYTES (WORD_SIZE_BYTES), .NUMBER_REQUESTS (NUMBER_REQUESTS), .STAGE_1_CYCLES (STAGE_1_CYCLES), + .FUNC_ID (FUNC_ID), 
.REQQ_SIZE (REQQ_SIZE), .MRVQ_SIZE (MRVQ_SIZE), .DFPQ_SIZE (DFPQ_SIZE), @@ -266,6 +224,7 @@ module VX_cache .per_bank_wb_warp_num(per_bank_wb_warp_num), .per_bank_wb_data (per_bank_wb_data), .per_bank_wb_pop (per_bank_wb_pop), + .per_bank_wb_address (per_bank_wb_address), .core_no_wb_slot (core_no_wb_slot), .core_wb_valid (core_wb_valid), @@ -273,6 +232,7 @@ module VX_cache .core_wb_req_wb (core_wb_req_wb), .core_wb_warp_num (core_wb_warp_num), .core_wb_readdata (core_wb_readdata), + .core_wb_address (core_wb_address), .core_wb_pc (core_wb_pc) ); @@ -281,12 +241,12 @@ module VX_cache for (curr_bank = 0; curr_bank < NUMBER_BANKS; curr_bank=curr_bank+1) begin wire [NUMBER_REQUESTS-1:0] curr_bank_valids; wire [NUMBER_REQUESTS-1:0][31:0] curr_bank_addr; - wire [NUMBER_REQUESTS-1:0][31:0] curr_bank_writedata; + wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] curr_bank_writedata; wire [4:0] curr_bank_rd; - wire [1:0] curr_bank_wb; + wire [NUMBER_REQUESTS-1:0][1:0] curr_bank_wb; wire [`NW_M1:0] curr_bank_warp_num; - wire [2:0] curr_bank_mem_read; - wire [2:0] curr_bank_mem_write; + wire [NUMBER_REQUESTS-1:0][2:0] curr_bank_mem_read; + wire [NUMBER_REQUESTS-1:0][2:0] curr_bank_mem_write; wire [31:0] curr_bank_pc; wire curr_bank_wb_pop; @@ -296,11 +256,12 @@ module VX_cache wire [4:0] curr_bank_wb_rd; wire [1:0] curr_bank_wb_wb; wire [`NW_M1:0] curr_bank_wb_warp_num; - wire [31:0] curr_bank_wb_data; + wire [`WORD_SIZE_RNG] curr_bank_wb_data; + wire [31:0] curr_bank_wb_address; wire curr_bank_dram_fill_rsp; wire [31:0] curr_bank_dram_fill_rsp_addr; - wire [`BANK_LINE_SIZE_RNG][31:0] curr_bank_dram_fill_rsp_data; + wire [`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] curr_bank_dram_fill_rsp_data; wire curr_bank_dram_fill_accept; wire curr_bank_dfqq_full; @@ -312,19 +273,13 @@ module VX_cache wire curr_bank_dram_wb_queue_pop; wire curr_bank_dram_wb_req; wire[31:0] curr_bank_dram_wb_req_addr; - wire[`BANK_LINE_SIZE_RNG][31:0] curr_bank_dram_wb_req_data; + 
wire[`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] curr_bank_dram_wb_req_data; wire curr_bank_snp_req; wire[31:0] curr_bank_snp_req_addr; wire curr_bank_reqq_full; - - wire curr_bank_llvq_pop; - wire curr_bank_llvq_valid; - wire[31:0] curr_bank_llvq_res_addr; - wire[`BANK_LINE_SIZE_RNG][31:0] curr_bank_llvq_res_data; - wire[`vx_clog2(NUMBER_REQUESTS)-1:0] curr_bank_llvq_res_tid; // Core Req @@ -348,6 +303,7 @@ module VX_cache assign per_bank_wb_warp_num[curr_bank] = curr_bank_wb_warp_num; assign per_bank_wb_data [curr_bank] = curr_bank_wb_data; assign per_bank_wb_pc [curr_bank] = curr_bank_wb_pc; + assign per_bank_wb_address [curr_bank] = curr_bank_wb_address; // Dram fill request assign curr_bank_dfqq_full = dfqq_full; @@ -370,14 +326,6 @@ module VX_cache // Snoop Request assign curr_bank_snp_req = snp_req && (snp_req_addr[`BANK_SELECT_ADDR_RNG] == curr_bank); assign curr_bank_snp_req_addr = snp_req_addr; - - - // LLVQ - assign curr_bank_llvq_pop = per_bank_llvq_pop[curr_bank]; - assign per_bank_llvq_valid[curr_bank] = curr_bank_llvq_valid; - assign per_bank_llvq_res_data[curr_bank] = curr_bank_llvq_res_data; - assign per_bank_llvq_res_addr[curr_bank] = curr_bank_llvq_res_addr; - assign per_bank_llvq_res_tid[curr_bank] = curr_bank_llvq_res_tid; VX_bank #( .CACHE_SIZE_BYTES (CACHE_SIZE_BYTES), @@ -424,6 +372,7 @@ module VX_cache .bank_wb_warp_num (curr_bank_wb_warp_num), .bank_wb_data (curr_bank_wb_data), .bank_wb_pc (curr_bank_wb_pc), + .bank_wb_address (curr_bank_wb_address), // Dram fill req .dram_fill_req (curr_bank_dram_fill_req), @@ -446,13 +395,8 @@ module VX_cache // Snoop Request .snp_req (curr_bank_snp_req), - .snp_req_addr (curr_bank_snp_req_addr), + .snp_req_addr (curr_bank_snp_req_addr) - .llvq_pop (curr_bank_llvq_pop), - .llvq_valid (curr_bank_llvq_valid), - .llvq_res_addr (curr_bank_llvq_res_addr), - .llvq_res_data (curr_bank_llvq_res_data), - .llvq_res_tid (curr_bank_llvq_res_tid) ); end diff --git a/rtl/VX_cache/VX_cache_config.v 
b/rtl/VX_cache/VX_cache_config.v index 68d80446c..360b5352f 100644 --- a/rtl/VX_cache/VX_cache_config.v +++ b/rtl/VX_cache/VX_cache_config.v @@ -4,12 +4,17 @@ `include "../VX_define.v" -// data tid rd wb warp_num read write -`define MRVQ_METADATA_SIZE (32 + $clog2(NUMBER_REQUESTS) + 5 + 2 + (`NW_M1 + 1) + 3 + 3) +// data tid rd wb warp_num read write -`define REQ_INST_META_SIZE (5 + 2 + (`NW_M1+1) + 3 + 3 + $clog2(NUMBER_REQUESTS)) -`define vx_clog2(value) $clog2(value) +`define vx_clog2(value) ((value == 1) ? 1 : $clog2(value)) + + +`define MRVQ_METADATA_SIZE (`WORD_SIZE + `vx_clog2(NUMBER_REQUESTS) + 5 + 2 + (`NW_M1 + 1) + 3 + 3) + +// 5 + 2 + 4 + 3 + 3 + 1 +`define REQ_INST_META_SIZE (5 + 2 + (`NW_M1+1) + 3 + 3 + `vx_clog2(NUMBER_REQUESTS)) + // `define vx_clog2_h(value, x) (value == (1 << x)) ? (x) // `define vx_clog2(value) (value == 0 ) ? 0 : \ @@ -46,6 +51,9 @@ // `vx_clog2_h(value, 31) : \ // 0 +`define WORD_SIZE (8*WORD_SIZE_BYTES) +`define WORD_SIZE_RNG (`WORD_SIZE)-1:0 + // 128 `define BANK_SIZE_BYTES CACHE_SIZE_BYTES/NUMBER_BANKS @@ -65,7 +73,7 @@ `define OFFSET_SIZE_RNG `OFFSET_SIZE_END:0 // 2 -`define WORD_SELECT_NUM_BITS ($clog2(`BANK_LINE_SIZE_WORDS)) +`define WORD_SELECT_NUM_BITS (`vx_clog2(`BANK_LINE_SIZE_WORDS)) // 2 `define WORD_SELECT_SIZE_END (`WORD_SELECT_NUM_BITS) // 2 @@ -77,7 +85,7 @@ `define WORD_SELECT_SIZE_RNG `WORD_SELECT_SIZE_END-1:0 // 3 -`define BANK_SELECT_NUM_BITS ($clog2(NUMBER_BANKS)) +`define BANK_SELECT_NUM_BITS (`vx_clog2(NUMBER_BANKS)) // 3 `define BANK_SELECT_SIZE_END (`BANK_SELECT_NUM_BITS) // 4 @@ -90,7 +98,7 @@ `define BANK_SELECT_SIZE_RNG `BANK_SELECT_SIZE_END-1:0 // 3 -`define LINE_SELECT_NUM_BITS ($clog2(`BANK_LINE_COUNT)) +`define LINE_SELECT_NUM_BITS (`vx_clog2(`BANK_LINE_COUNT)) // 3 `define LINE_SELECT_SIZE_END (`LINE_SELECT_NUM_BITS) // 7 diff --git a/rtl/VX_cache/VX_cache_core_req_bank_sel.v b/rtl/VX_cache/VX_cache_core_req_bank_sel.v index cf47d0630..e19531d19 100644 --- 
a/rtl/VX_cache/VX_cache_core_req_bank_sel.v +++ b/rtl/VX_cache/VX_cache_core_req_bank_sel.v @@ -15,6 +15,8 @@ module VX_cache_core_req_bank_sel parameter NUMBER_REQUESTS = 2, // Number of cycles to complete stage 1 (read from memory) parameter STAGE_1_CYCLES = 2, + // Function ID, {Dcache=0, Icache=1, Sharedmemory=2} + parameter FUNC_ID = 0, // Queues feeding into banks Knobs {1, 2, 4, 8, ...} diff --git a/rtl/VX_cache/VX_cache_dram_req_arb.v b/rtl/VX_cache/VX_cache_dram_req_arb.v index b2cfab9ac..119ea9adf 100644 --- a/rtl/VX_cache/VX_cache_dram_req_arb.v +++ b/rtl/VX_cache/VX_cache_dram_req_arb.v @@ -58,7 +58,7 @@ module VX_cache_dram_req_arb output wire[NUMBER_BANKS-1:0] per_bank_dram_wb_queue_pop, input wire[NUMBER_BANKS-1:0] per_bank_dram_wb_req, input wire[NUMBER_BANKS-1:0][31:0] per_bank_dram_wb_req_addr, - input wire[NUMBER_BANKS-1:0][`BANK_LINE_SIZE_RNG][31:0] per_bank_dram_wb_req_data, + input wire[NUMBER_BANKS-1:0][`BANK_LINE_SIZE_RNG][`WORD_SIZE-1:0] per_bank_dram_wb_req_data, input wire[NUMBER_BANKS-1:0] per_bank_dram_because_of_snp, // real Dram request @@ -67,7 +67,7 @@ module VX_cache_dram_req_arb output wire dram_req_read, output wire [31:0] dram_req_addr, output wire [31:0] dram_req_size, - output wire [`BANK_LINE_SIZE_RNG][31:0] dram_req_data, + output wire [`IBANK_LINE_SIZE_RNG][31:0] dram_req_data, output wire dram_req_because_of_wb ); @@ -109,7 +109,7 @@ module VX_cache_dram_req_arb assign dram_req_read = dfqq_req && !dwb_valid; assign dram_req_addr = (dwb_valid ? per_bank_dram_wb_req_addr[dwb_bank] : dfqq_req_addr) & `BASE_ADDR_MASK; assign dram_req_size = BANK_LINE_SIZE_BYTES; - assign dram_req_data = dwb_valid ? per_bank_dram_wb_req_data[dwb_bank] : 0; + assign {dram_req_data} = dwb_valid ? {per_bank_dram_wb_req_data[dwb_bank] }: 0; assign dram_req_because_of_wb = dwb_valid ? 
per_bank_dram_because_of_snp[dwb_bank] : 0; endmodule \ No newline at end of file diff --git a/rtl/VX_cache/VX_cache_miss_resrv.v b/rtl/VX_cache/VX_cache_miss_resrv.v index fcbd5ba7a..3da097456 100644 --- a/rtl/VX_cache/VX_cache_miss_resrv.v +++ b/rtl/VX_cache/VX_cache_miss_resrv.v @@ -52,7 +52,7 @@ module VX_cache_miss_resrv // Miss enqueue input wire miss_add, input wire[31:0] miss_add_addr, - input wire[31:0] miss_add_data, + input wire[`WORD_SIZE_RNG] miss_add_data, input wire[`vx_clog2(NUMBER_REQUESTS)-1:0] miss_add_tid, input wire[4:0] miss_add_rd, input wire[1:0] miss_add_wb, @@ -70,7 +70,7 @@ module VX_cache_miss_resrv input wire miss_resrv_pop, output wire miss_resrv_valid_st0, output wire[31:0] miss_resrv_addr_st0, - output wire[31:0] miss_resrv_data_st0, + output wire[`WORD_SIZE_RNG] miss_resrv_data_st0, output wire[`vx_clog2(NUMBER_REQUESTS)-1:0] miss_resrv_tid_st0, output wire[4:0] miss_resrv_rd_st0, output wire[1:0] miss_resrv_wb_st0, diff --git a/rtl/VX_cache/VX_cache_req_queue.v b/rtl/VX_cache/VX_cache_req_queue.v index d9c71294b..937fa7fb1 100644 --- a/rtl/VX_cache/VX_cache_req_queue.v +++ b/rtl/VX_cache/VX_cache_req_queue.v @@ -52,12 +52,12 @@ module VX_cache_req_queue input wire reqq_push, input wire [NUMBER_REQUESTS-1:0] bank_valids, input wire [NUMBER_REQUESTS-1:0][31:0] bank_addr, - input wire [NUMBER_REQUESTS-1:0][31:0] bank_writedata, + input wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] bank_writedata, input wire [4:0] bank_rd, - input wire [1:0] bank_wb, + input wire [NUMBER_REQUESTS-1:0][1:0] bank_wb, input wire [`NW_M1:0] bank_warp_num, - input wire [2:0] bank_mem_read, - input wire [2:0] bank_mem_write, + input wire [NUMBER_REQUESTS-1:0][2:0] bank_mem_read, + input wire [NUMBER_REQUESTS-1:0][2:0] bank_mem_write, input wire [31:0] bank_pc, // Dequeue Data @@ -65,7 +65,7 @@ module VX_cache_req_queue output wire reqq_req_st0, output wire [`vx_clog2(NUMBER_REQUESTS)-1:0] reqq_req_tid_st0, output wire [31:0] reqq_req_addr_st0, - output wire 
[31:0] reqq_req_writedata_st0, + output wire [`WORD_SIZE_RNG] reqq_req_writedata_st0, output wire [4:0] reqq_req_rd_st0, output wire [1:0] reqq_req_wb_st0, output wire [`NW_M1:0] reqq_req_warp_num_st0, @@ -80,34 +80,34 @@ module VX_cache_req_queue wire [NUMBER_REQUESTS-1:0] out_per_valids; wire [NUMBER_REQUESTS-1:0][31:0] out_per_addr; - wire [NUMBER_REQUESTS-1:0][31:0] out_per_writedata; + wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] out_per_writedata; wire [4:0] out_per_rd; - wire [1:0] out_per_wb; + wire [NUMBER_REQUESTS-1:0][1:0] out_per_wb; wire [`NW_M1:0] out_per_warp_num; - wire [2:0] out_per_mem_read; - wire [2:0] out_per_mem_write; + wire [NUMBER_REQUESTS-1:0][2:0] out_per_mem_read; + wire [NUMBER_REQUESTS-1:0][2:0] out_per_mem_write; wire [31:0] out_per_pc; reg [NUMBER_REQUESTS-1:0] use_per_valids; reg [NUMBER_REQUESTS-1:0][31:0] use_per_addr; - reg [NUMBER_REQUESTS-1:0][31:0] use_per_writedata; + reg [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] use_per_writedata; reg [4:0] use_per_rd; - reg [1:0] use_per_wb; + reg [NUMBER_REQUESTS-1:0][1:0] use_per_wb; reg [31:0] use_per_pc; reg [`NW_M1:0] use_per_warp_num; - reg [2:0] use_per_mem_read; - reg [2:0] use_per_mem_write; + reg [NUMBER_REQUESTS-1:0][2:0] use_per_mem_read; + reg [NUMBER_REQUESTS-1:0][2:0] use_per_mem_write; wire [NUMBER_REQUESTS-1:0] qual_valids; wire [NUMBER_REQUESTS-1:0][31:0] qual_addr; - wire [NUMBER_REQUESTS-1:0][31:0] qual_writedata; + wire [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] qual_writedata; wire [4:0] qual_rd; - wire [1:0] qual_wb; + wire [NUMBER_REQUESTS-1:0][1:0] qual_wb; wire [`NW_M1:0] qual_warp_num; - wire [2:0] qual_mem_read; - wire [2:0] qual_mem_write; + wire [NUMBER_REQUESTS-1:0][2:0] qual_mem_read; + wire [NUMBER_REQUESTS-1:0][2:0] qual_mem_write; wire [31:0] qual_pc; wire[NUMBER_REQUESTS-1:0] updated_valids; @@ -120,7 +120,7 @@ module VX_cache_req_queue wire push_qual = reqq_push && !reqq_full; wire pop_qual = reqq_pop && use_empty && !out_empty; - VX_generic_queue_ll #(.DATAW( 
(NUMBER_REQUESTS * (1+32+32)) + 5 + 2 + (`NW_M1+1) + 3 + 3 + 32 ), .SIZE(REQQ_SIZE)) reqq_queue( + VX_generic_queue_ll #(.DATAW( (NUMBER_REQUESTS * (1+32+`WORD_SIZE)) + 5 + (NUMBER_REQUESTS*2) + (`NW_M1+1) + (NUMBER_REQUESTS * (3 + 3)) + 32 ), .SIZE(REQQ_SIZE)) reqq_queue( .clk (clk), .reset (reset), .push (push_qual), @@ -158,10 +158,10 @@ module VX_cache_req_queue assign reqq_req_addr_st0 = qual_addr [qual_request_index]; assign reqq_req_writedata_st0 = qual_writedata[qual_request_index]; assign reqq_req_rd_st0 = qual_rd; - assign reqq_req_wb_st0 = qual_wb; + assign reqq_req_wb_st0 = qual_wb[qual_request_index]; assign reqq_req_warp_num_st0 = qual_warp_num; - assign reqq_req_mem_read_st0 = qual_mem_read; - assign reqq_req_mem_write_st0 = qual_mem_write; + assign reqq_req_mem_read_st0 = qual_mem_read [qual_request_index]; + assign reqq_req_mem_write_st0 = qual_mem_write[qual_request_index]; assign reqq_req_pc_st0 = qual_pc; assign updated_valids = qual_valids & (~(1 << qual_request_index)); diff --git a/rtl/VX_cache/VX_cache_wb_sel_merge.v b/rtl/VX_cache/VX_cache_wb_sel_merge.v index 99c5d8155..7199c483f 100644 --- a/rtl/VX_cache/VX_cache_wb_sel_merge.v +++ b/rtl/VX_cache/VX_cache_wb_sel_merge.v @@ -14,6 +14,8 @@ module VX_cache_wb_sel_merge parameter NUMBER_REQUESTS = 2, // Number of cycles to complete stage 1 (read from memory) parameter STAGE_1_CYCLES = 2, + // Function ID, {Dcache=0, Icache=1, Sharedmemory=2} + parameter FUNC_ID = 0, // Queues feeding into banks Knobs {1, 2, 4, 8, ...} @@ -52,19 +54,21 @@ module VX_cache_wb_sel_merge input wire [NUMBER_BANKS-1:0][4:0] per_bank_wb_rd, input wire [NUMBER_BANKS-1:0][1:0] per_bank_wb_wb, input wire [NUMBER_BANKS-1:0][`NW_M1:0] per_bank_wb_warp_num, - input wire [NUMBER_BANKS-1:0][31:0] per_bank_wb_data, + input wire [NUMBER_BANKS-1:0][`WORD_SIZE_RNG] per_bank_wb_data, input wire [NUMBER_BANKS-1:0][31:0] per_bank_wb_pc, + input wire [NUMBER_BANKS-1:0][31:0] per_bank_wb_address, output wire [NUMBER_BANKS-1:0] 
per_bank_wb_pop, // Core Writeback input wire core_no_wb_slot, output reg [NUMBER_REQUESTS-1:0] core_wb_valid, - output reg [NUMBER_REQUESTS-1:0][31:0] core_wb_readdata, + output reg [NUMBER_REQUESTS-1:0][`WORD_SIZE_RNG] core_wb_readdata, output reg [NUMBER_REQUESTS-1:0][31:0] core_wb_pc, output wire [4:0] core_wb_req_rd, output wire [1:0] core_wb_req_wb, - output wire [`NW_M1:0] core_wb_warp_num + output wire [`NW_M1:0] core_wb_warp_num, + output reg [NUMBER_REQUESTS-1:0][31:0] core_wb_address ); @@ -99,11 +103,13 @@ module VX_cache_wb_sel_merge core_wb_valid = 0; core_wb_readdata = 0; core_wb_pc = 0; + core_wb_address = 0; for (this_bank = 0; this_bank < NUMBER_BANKS; this_bank = this_bank + 1) begin - if (found_bank && (per_bank_wb_valid[this_bank]) && (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index]) && (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index])) begin + if (((FUNC_ID == `LLFUNC_ID) && found_bank && per_bank_wb_valid[this_bank]) || (found_bank && (per_bank_wb_valid[this_bank]) && (per_bank_wb_rd[this_bank] == per_bank_wb_rd[main_bank_index]) && (per_bank_wb_warp_num[this_bank] == per_bank_wb_warp_num[main_bank_index]))) begin core_wb_valid[per_bank_wb_tid[this_bank]] = 1; core_wb_readdata[per_bank_wb_tid[this_bank]] = per_bank_wb_data[this_bank]; core_wb_pc[per_bank_wb_tid[this_bank]] = per_bank_wb_pc[this_bank]; + core_wb_address[per_bank_wb_tid[this_bank]] = per_bank_wb_address[this_bank]; per_bank_wb_pop_unqual[this_bank] = 1; end else begin per_bank_wb_pop_unqual[this_bank] = 0; diff --git a/rtl/VX_cache/VX_tag_data_access.v b/rtl/VX_cache/VX_tag_data_access.v index d717e02ce..25f046928 100644 --- a/rtl/VX_cache/VX_tag_data_access.v +++ b/rtl/VX_cache/VX_tag_data_access.v @@ -58,13 +58,13 @@ module VX_tag_data_access input wire valid_req_st1e, input wire writefill_st1e, input wire[31:0] writeaddr_st1e, - input wire[31:0] writeword_st1e, - input wire[`BANK_LINE_SIZE_RNG][31:0] writedata_st1e, + input 
wire[`WORD_SIZE_RNG] writeword_st1e, + input wire[`DBANK_LINE_SIZE_RNG][31:0] writedata_st1e, input wire[2:0] mem_write_st1e, input wire[2:0] mem_read_st1e, - output wire[31:0] readword_st1e, - output wire[`BANK_LINE_SIZE_RNG][31:0] readdata_st1e, + output wire[`WORD_SIZE_RNG] readword_st1e, + output wire[`DBANK_LINE_SIZE_RNG][31:0] readdata_st1e, output wire[`TAG_SELECT_SIZE_RNG] readtag_st1e, output wire miss_st1e, output wire dirty_st1e, @@ -73,25 +73,25 @@ module VX_tag_data_access ); - reg[`BANK_LINE_SIZE_RNG][31:0] readdata_st[STAGE_1_CYCLES-2:0]; + reg[`DBANK_LINE_SIZE_RNG][31:0] readdata_st[STAGE_1_CYCLES-2:0]; reg read_valid_st1c[STAGE_1_CYCLES-2:0]; reg read_dirty_st1c[STAGE_1_CYCLES-2:0]; reg[`TAG_SELECT_SIZE_RNG] read_tag_st1c [STAGE_1_CYCLES-2:0]; - reg[`BANK_LINE_SIZE_RNG][31:0] read_data_st1c [STAGE_1_CYCLES-2:0]; + reg[`DBANK_LINE_SIZE_RNG][31:0] read_data_st1c [STAGE_1_CYCLES-2:0]; wire qual_read_valid_st1; wire qual_read_dirty_st1; wire[`TAG_SELECT_SIZE_RNG] qual_read_tag_st1; - wire[`BANK_LINE_SIZE_RNG][31:0] qual_read_data_st1; + wire[`DBANK_LINE_SIZE_RNG][31:0] qual_read_data_st1; wire use_read_valid_st1e; wire use_read_dirty_st1e; wire[`TAG_SELECT_SIZE_RNG] use_read_tag_st1e; - wire[`BANK_LINE_SIZE_RNG][31:0] use_read_data_st1e; - wire[`BANK_LINE_SIZE_RNG][3:0] use_write_enable; - wire[`BANK_LINE_SIZE_RNG][31:0] use_write_data; + wire[`DBANK_LINE_SIZE_RNG][31:0] use_read_data_st1e; + wire[`DBANK_LINE_SIZE_RNG][3:0] use_write_enable; + wire[`DBANK_LINE_SIZE_RNG][31:0] use_write_data; wire fill_sent; @@ -134,7 +134,7 @@ module VX_tag_data_access .fill_sent (fill_sent) ); - VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`BANK_LINE_SIZE_WORDS*32) )) s0_1_c0 ( + VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_SIZE_WORDS*32) )) s0_1_c0 ( .clk (clk), .reset(reset), .stall(stall), @@ -146,7 +146,7 @@ module VX_tag_data_access genvar curr_stage; generate for (curr_stage = 1; curr_stage < STAGE_1_CYCLES-2; curr_stage = 
curr_stage + 1) begin - VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`BANK_LINE_SIZE_WORDS*32) )) s0_1_cc ( + VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_SIZE_WORDS*32) )) s0_1_cc ( .clk (clk), .reset(reset), .stall(stall), @@ -163,7 +163,7 @@ module VX_tag_data_access assign use_read_tag_st1e = (FUNC_ID == `SFUNC_ID) ? writeaddr_st1e[`TAG_SELECT_ADDR_RNG] : read_tag_st1c [STAGE_1_CYCLES-2]; // Tag is always the same in SM genvar curr_w; - for (curr_w = 0; curr_w < `BANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[STAGE_1_CYCLES-2][curr_w][31:0]; + for (curr_w = 0; curr_w < `DBANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[STAGE_1_CYCLES-2][curr_w][31:0]; // assign use_read_data_st1e = read_data_st1c [STAGE_1_CYCLES-2]; /////////////////////// LOAD LOGIC /////////////////// @@ -202,14 +202,14 @@ module VX_tag_data_access wire[31:0] lw_data = (data_unQual); - wire[31:0] sw_data = writeword_st1e; + wire[31:0] sw_data = writeword_st1e[31:0]; wire[31:0] sb_data = b1 ? {{16{1'b0}}, writeword_st1e[7:0], { 8{1'b0}}} : b2 ? {{ 8{1'b0}}, writeword_st1e[7:0], {16{1'b0}}} : b3 ? {{ 0{1'b0}}, writeword_st1e[7:0], {24{1'b0}}} : - writeword_st1e; + writeword_st1e[31:0]; - wire[31:0] sh_data = b2 ? {writeword_st1e[15:0], {16{1'b0}}} : writeword_st1e; + wire[31:0] sh_data = b2 ? 
{writeword_st1e[15:0], {16{1'b0}}} : writeword_st1e[31:0]; @@ -236,20 +236,24 @@ module VX_tag_data_access wire should_write = (sw || sb || sh) && valid_req_st1e && use_read_valid_st1e && !miss_st1e; wire force_write = writefill_st1e && valid_req_st1e && (!use_read_valid_st1e || (use_read_valid_st1e && !miss_st1e)); - wire[`BANK_LINE_SIZE_RNG][3:0] we; - wire[`BANK_LINE_SIZE_RNG][31:0] data_write; + wire[`DBANK_LINE_SIZE_RNG][3:0] we; + wire[`DBANK_LINE_SIZE_RNG][31:0] data_write; genvar g; generate - for (g = 0; g < `BANK_LINE_SIZE_WORDS; g = g + 1) begin : write_enables - wire normal_write = (block_offset == g) && should_write && !writefill_st1e; + for (g = 0; g < `DBANK_LINE_SIZE_WORDS; g = g + 1) begin : write_enables + wire normal_write = (block_offset == g[`WORD_SELECT_SIZE_RNG]) && should_write && !writefill_st1e; assign we[g] = (force_write) ? 4'b1111 : + (normal_write && (FUNC_ID == `LLFUNC_ID)) ? 4'b1111 : (normal_write && sw) ? 4'b1111 : (normal_write && sb) ? sb_mask : (normal_write && sh) ? sh_mask : 4'b0000; - assign data_write[g] = force_write ? writedata_st1e[g] : use_write_dat; + if (!(FUNC_ID == `LLFUNC_ID)) assign data_write[g] = force_write ? writedata_st1e[g] : use_write_dat; + end + if ((FUNC_ID == `LLFUNC_ID)) begin + assign data_write = force_write ? 
writedata_st1e : writeword_st1e; end endgenerate @@ -257,8 +261,12 @@ module VX_tag_data_access assign use_write_data = data_write; /////////////////////// - - assign readword_st1e = data_Qual; + if (FUNC_ID == `LLFUNC_ID) begin + assign readword_st1e = read_data_st1c[STAGE_1_CYCLES-2]; + end else begin + assign readword_st1e = data_Qual; + end + assign miss_st1e = ((valid_req_st1e || is_snp_st1e) && !use_read_valid_st1e) || (valid_req_st1e && use_read_valid_st1e && !writefill_st1e && (writeaddr_st1e[`TAG_SELECT_ADDR_RNG] != use_read_tag_st1e)); assign dirty_st1e = valid_req_st1e && use_read_valid_st1e && use_read_dirty_st1e; assign readdata_st1e = use_read_data_st1e; diff --git a/rtl/VX_cache/VX_tag_data_structure.v b/rtl/VX_cache/VX_tag_data_structure.v index de8544c55..c7edb6a62 100644 --- a/rtl/VX_cache/VX_tag_data_structure.v +++ b/rtl/VX_cache/VX_tag_data_structure.v @@ -54,18 +54,18 @@ module VX_tag_data_structure output wire read_valid, output wire read_dirty, output wire[`TAG_SELECT_SIZE_RNG] read_tag, - output wire[`BANK_LINE_SIZE_RNG][31:0] read_data, + output wire[`DBANK_LINE_SIZE_RNG][31:0] read_data, input wire invalidate, - input wire[`BANK_LINE_SIZE_RNG][3:0] write_enable, + input wire[`DBANK_LINE_SIZE_RNG][3:0] write_enable, input wire write_fill, input wire[31:0] write_addr, - input wire[`BANK_LINE_SIZE_RNG][31:0] write_data, + input wire[`DBANK_LINE_SIZE_RNG][31:0] write_data, input wire fill_sent ); - reg[`BANK_LINE_SIZE_RNG][3:0][7:0] data [`BANK_LINE_COUNT-1:0]; + reg[`DBANK_LINE_SIZE_RNG][3:0][7:0] data [`BANK_LINE_COUNT-1:0]; reg[`TAG_SELECT_SIZE_RNG] tag [`BANK_LINE_COUNT-1:0]; reg valid[`BANK_LINE_COUNT-1:0]; reg dirty[`BANK_LINE_COUNT-1:0]; @@ -98,7 +98,7 @@ module VX_tag_data_structure valid[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0; end - for (f = 0; f < `BANK_LINE_SIZE_WORDS; f = f + 1) begin + for (f = 0; f < `DBANK_LINE_SIZE_WORDS; f = f + 1) begin if (write_enable[f][0]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][0] <= write_data[f][7 
:0 ]; if (write_enable[f][1]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][1] <= write_data[f][15:8 ]; if (write_enable[f][2]) data[write_addr[`LINE_SELECT_ADDR_RNG]][f][2] <= write_data[f][23:16]; diff --git a/rtl/VX_csr_data.v b/rtl/VX_csr_data.v index 6dc899a15..5fce2eb1a 100644 --- a/rtl/VX_csr_data.v +++ b/rtl/VX_csr_data.v @@ -17,6 +17,8 @@ module VX_csr_data ( ); + /* verilator lint_off WIDTH */ + // wire[`NT_M1:0][31:0] thread_ids; // wire[`NT_M1:0][31:0] warp_ids; @@ -83,4 +85,5 @@ module VX_csr_data ( read_instreth ? instret[63:32] : {{20{1'b0}}, csr[in_read_csr_address]}; + /* verilator lint_on WIDTH */ endmodule : VX_csr_data diff --git a/rtl/VX_csr_pipe.v b/rtl/VX_csr_pipe.v index e9194f537..a04f51de8 100644 --- a/rtl/VX_csr_pipe.v +++ b/rtl/VX_csr_pipe.v @@ -1,6 +1,10 @@ `include "VX_define.v" -module VX_csr_pipe ( +module VX_csr_pipe + #( + parameter CORE_ID = 0 + ) + ( input wire clk, // Clock input wire reset, input wire no_slot_csr, @@ -56,7 +60,7 @@ module VX_csr_pipe ( wire zero = 0; - VX_generic_register #(.N(`NT + `NW_M1 + 1 + 5 + 2 + 5 + 12 + 64)) csr_reg_s2 ( + VX_generic_register #(.N(32 + 32 + 12 + 1 + 2 + 5 + (`NW_M1+1) + `NT)) csr_reg_s2 ( .clk (clk), .reset(reset), .stall(no_slot_csr), @@ -70,6 +74,7 @@ module VX_csr_pipe ( wire[`NT_M1:0][31:0] thread_ids; wire[`NT_M1:0][31:0] warp_ids; + wire[`NT_M1:0][31:0] warp_idz; wire[`NT_M1:0][31:0] csr_vec_read_data_s2; genvar cur_t; @@ -80,8 +85,11 @@ module VX_csr_pipe ( genvar cur_tw; for (cur_tw = 0; cur_tw < `NT; cur_tw = cur_tw + 1) begin assign warp_ids[cur_tw] = {{(31-`NW_M1){1'b0}}, warp_num_s2}; + assign warp_idz[cur_tw] = (warp_num_s2 + (CORE_ID*`NW)); end + + genvar cur_v; for (cur_v = 0; cur_v < `NT; cur_v = cur_v + 1) begin assign csr_vec_read_data_s2[cur_v] = csr_read_data_s2; @@ -89,9 +97,11 @@ module VX_csr_pipe ( wire thread_select = csr_address_s2 == 12'h20; wire warp_select = csr_address_s2 == 12'h21; + wire warp_id_select = csr_address_s2 == 12'h22; - assign final_csr_data = 
thread_select ? thread_ids : - warp_select ? warp_ids : + assign final_csr_data = thread_select ? thread_ids : + warp_select ? warp_ids : + warp_id_select ? warp_idz : csr_vec_read_data_s2; diff --git a/rtl/VX_define.v b/rtl/VX_define.v index ad2c1a964..d1f236361 100644 --- a/rtl/VX_define.v +++ b/rtl/VX_define.v @@ -124,102 +124,17 @@ (x <= 1024) ? 10 : \ -199 -// `define PARAM - -// oooooo - -//Cache configurations -//Cache configurations - //Bytes -`define ICACHE_SIZE 4096 -`define ICACHE_WAYS 2 -//Bytes -`define ICACHE_BLOCK 64 -`define ICACHE_BANKS 4 -`define ICACHE_LOG_NUM_BANKS `CLOG2(`ICACHE_BANKS) - -`define ICACHE_NUM_WORDS_PER_BLOCK (`ICACHE_BLOCK / (`ICACHE_BANKS * 4)) -`define ICACHE_NUM_REQ 1 -`define ICACHE_LOG_NUM_REQ `CLOG2(`ICACHE_NUM_REQ) - - //set this to 1 if CACHE_WAYS is 1 -`define ICACHE_WAY_INDEX `CLOG2(`ICACHE_WAYS) -//`define ICACHE_WAY_INDEX 1 -`define ICACHE_BLOCK_PER_BANK (`ICACHE_BLOCK / `ICACHE_BANKS) - -// Offset -`define ICACHE_OFFSET_NB (`CLOG2(`ICACHE_NUM_WORDS_PER_BLOCK)) - -`define ICACHE_ADDR_OFFSET_ST (2+$clog2(`ICACHE_BANKS)) -`define ICACHE_ADDR_OFFSET_ED (`ICACHE_ADDR_OFFSET_ST+(`ICACHE_OFFSET_NB)-1) - - -`define ICACHE_ADDR_OFFSET_RNG `ICACHE_ADDR_OFFSET_ED:`ICACHE_ADDR_OFFSET_ST -`define ICACHE_OFFSET_SIZE_RNG (`CLOG2(`ICACHE_NUM_WORDS_PER_BLOCK)-1):0 -`define ICACHE_OFFSET_ST 0 -`define ICACHE_OFFSET_ED ($clog2(`ICACHE_NUM_WORDS_PER_BLOCK)-1) - -// Index -// `define ICACHE_NUM_IND (`ICACHE_SIZE / (`ICACHE_WAYS * `ICACHE_BLOCK_PER_BANK)) -`define ICACHE_NUM_IND (`ICACHE_SIZE / (`ICACHE_WAYS * `ICACHE_BLOCK)) -`define ICACHE_IND_NB ($clog2(`ICACHE_NUM_IND)) - -`define ICACHE_IND_ST (`ICACHE_ADDR_OFFSET_ED+1) -`define ICACHE_IND_ED (`ICACHE_IND_ST+`ICACHE_IND_NB-1) - -`define ICACHE_ADDR_IND_RNG `ICACHE_IND_ED:`ICACHE_IND_ST -`define ICACHE_IND_SIZE_RNG `ICACHE_IND_NB-1:0 - -`define ICACHE_IND_SIZE_START 0 -`define ICACHE_IND_SIZE_END `ICACHE_IND_NB-1 - - -// Tag -`define ICACHE_ADDR_TAG_RNG 31:(`ICACHE_IND_ED+1) -`define 
ICACHE_TAG_SIZE_RNG (32-(`ICACHE_IND_ED+1)-1):0 -`define ICACHE_TAG_SIZE_START 0 -`define ICACHE_TAG_SIZE_END (32-(`ICACHE_IND_ED+1)-1) -`define ICACHE_ADDR_TAG_START (`ICACHE_IND_ED+1) -`define ICACHE_ADDR_TAG_END 31 -`define ICACHE_MEM_REQ_ADDR_MASK (32'hffffffff - (`ICACHE_BLOCK-1)) - -/////// - -//`define SHARED_MEMORY_SIZE 4096 -`define SHARED_MEMORY_SIZE 8192 -`define SHARED_MEMORY_BANKS 4 -//`define SHARED_MEMORY_BYTES_PER_READ 16 -//`define SHARED_MEMORY_HEIGHT ((`SHARED_MEMORY_SIZE) / (`SHARED_MEMORY_BANKS * `SHARED_MEMORY_BYTES_PER_READ)) - -//`define SHARED_MEMORY_SIZE 16384 -//`define SHARED_MEMORY_BANKS 8 -`define SHARED_MEMORY_BYTES_PER_READ 16 -//`define SHARED_MEMORY_BITS_PER_BANK 3 -`define SHARED_MEMORY_BITS_PER_BANK `CLOG2(`SHARED_MEMORY_BANKS) -`define SHARED_MEMORY_NUM_REQ `NT -`define SHARED_MEMORY_WORDS_PER_READ (`SHARED_MEMORY_BYTES_PER_READ / 4) -`define SHARED_MEMORY_LOG_WORDS_PER_READ $clog2(`SHARED_MEMORY_WORDS_PER_READ) -`define SHARED_MEMORY_HEIGHT ((`SHARED_MEMORY_SIZE) / (`SHARED_MEMORY_BANKS * `SHARED_MEMORY_BYTES_PER_READ)) - -`define SHARED_MEMORY_BANK_OFFSET_ST (2) -`define SHARED_MEMORY_BANK_OFFSET_ED (2+$clog2(`SHARED_MEMORY_BANKS)-1) -`define SHARED_MEMORY_BLOCK_OFFSET_ST (`SHARED_MEMORY_BANK_OFFSET_ED + 1) -`define SHARED_MEMORY_BLOCK_OFFSET_ED (`SHARED_MEMORY_BLOCK_OFFSET_ST +`SHARED_MEMORY_LOG_WORDS_PER_READ-1) -`define SHARED_MEMORY_INDEX_OFFSET_ST (`SHARED_MEMORY_BLOCK_OFFSET_ED + 1) -`define SHARED_MEMORY_INDEX_OFFSET_ED (`SHARED_MEMORY_INDEX_OFFSET_ST + $clog2(`SHARED_MEMORY_HEIGHT)-1) - - - - - +`define NUMBER_CORES 2 +// `define SINGLE_CORE_BENCH 0 +`define GLOBAL_BLOCK_SIZE_BYTES 16 // ========================================= Dcache Configurable Knobs ========================================= // General Cache Knobs // Size of cache in bytes `define DCACHE_SIZE_BYTES 1024 // Size of line inside a bank in bytes - `define DBANK_LINE_SIZE_BYTES 16 + `define DBANK_LINE_SIZE_BYTES `GLOBAL_BLOCK_SIZE_BYTES // Number of 
banks {1, 2, 4, 8,...} `define DNUMBER_BANKS 8 // Size of a word in bytes @@ -270,7 +185,7 @@ // Size of cache in bytes `define ICACHE_SIZE_BYTES 1024 // Size of line inside a bank in bytes - `define IBANK_LINE_SIZE_BYTES 16 + `define IBANK_LINE_SIZE_BYTES `GLOBAL_BLOCK_SIZE_BYTES // Number of banks {1, 2, 4, 8,...} `define INUMBER_BANKS 8 // Size of a word in bytes @@ -314,19 +229,19 @@ // ========================================= Icache Configurable Knobs ========================================= -// ========================================= Icache Configurable Knobs ========================================= +// ========================================= SM Configurable Knobs ========================================= // General Cache Knobs // Size of cache in bytes `define SCACHE_SIZE_BYTES 1024 // Size of line inside a bank in bytes - `define SBANK_LINE_SIZE_BYTES 16 + `define SBANK_LINE_SIZE_BYTES `GLOBAL_BLOCK_SIZE_BYTES // Number of banks {1, 2, 4, 8,...} `define SNUMBER_BANKS 8 // Size of a word in bytes `define SWORD_SIZE_BYTES 4 // Number of Word requests per cycle {1, 2, 4, 8, ...} - `define SNUMBER_REQUESTS 1 + `define SNUMBER_REQUESTS `NT // Number of cycles to complete stage 1 (read from memory) `define SSTAGE_1_CYCLES 2 // Function ID @@ -362,7 +277,59 @@ // Dram knobs `define SSIMULATED_DRAM_LATENCY_CYCLES 10 -// ========================================= Icache Configurable Knobs ========================================= +// ========================================= SM Configurable Knobs ========================================= + + + +// ========================================= L2cache Configurable Knobs ========================================= + +// General Cache Knobs + // Size of cache in bytes + `define LLCACHE_SIZE_BYTES 1024 + // Size of line inside a bank in bytes + `define LLBANK_LINE_SIZE_BYTES `GLOBAL_BLOCK_SIZE_BYTES + // Number of banks {1, 2, 4, 8,...} + `define LLNUMBER_BANKS 8 + // Size of a word in bytes + `define LLWORD_SIZE_BYTES 
(`LLBANK_LINE_SIZE_BYTES) + // Number of Word requests per cycle {1, 2, 4, 8, ...} + `define LLNUMBER_REQUESTS (2*`NUMBER_CORES) + // Number of cycles to complete stage 1 (read from memory) + `define LLSTAGE_1_CYCLES 2 + // Function ID + `define LLFUNC_ID 3 + + // Bank Number of words in a line + `define LLBANK_LINE_SIZE_WORDS (`LLBANK_LINE_SIZE_BYTES / `LLWORD_SIZE_BYTES) + `define LLBANK_LINE_SIZE_RNG `LLBANK_LINE_SIZE_WORDS-1:0 +// Queues feeding into banks Knobs {1, 2, 4, 8, ...} + + // Core Request Queue Size + `define LLREQQ_SIZE (`NT*`NW*`NUMBER_CORES) + // Miss Reserv Queue Knob + `define LLMRVQ_SIZE `LLREQQ_SIZE + // Dram Fill Rsp Queue Size + `define LLDFPQ_SIZE 2 + // Snoop Req Queue + `define LLSNRQ_SIZE 8 + +// Queues for writebacks Knobs {1, 2, 4, 8, ...} + // Core Writeback Queue Size + `define LLCWBQ_SIZE `LLREQQ_SIZE + // Dram Writeback Queue Size + `define LLDWBQ_SIZE 4 + // Dram Fill Req Queue Size + `define LLDFQQ_SIZE `LLREQQ_SIZE + // Lower Level Cache Hit Queue Size + `define LLLLVQ_SIZE 0 + + // Fill Invalidator Size {Fill invalidator must be active} + `define LLFILL_INVALIDAOR_SIZE 16 + +// Dram knobs + `define LLSIMULATED_DRAM_LATENCY_CYCLES 10 + +// ========================================= L2cache Configurable Knobs ========================================= `endif diff --git a/rtl/VX_dmem_controller.v b/rtl/VX_dmem_controller.v index 29ba47dd8..4e1596b73 100644 --- a/rtl/VX_dmem_controller.v +++ b/rtl/VX_dmem_controller.v @@ -22,12 +22,12 @@ module VX_dmem_controller ( ); - VX_gpu_dcache_res_inter VX_dcache_rsp_smem(); - VX_gpu_dcache_req_inter VX_dcache_req_smem(); + VX_gpu_dcache_res_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_rsp_smem(); + VX_gpu_dcache_req_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_req_smem(); - VX_gpu_dcache_res_inter VX_dcache_rsp_dcache(); - VX_gpu_dcache_req_inter VX_dcache_req_dcache(); + VX_gpu_dcache_res_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_rsp_dcache(); + 
VX_gpu_dcache_req_inter #(.NUMBER_REQUESTS(`DNUMBER_REQUESTS)) VX_dcache_req_dcache(); wire to_shm = VX_dcache_req.core_req_addr[0][31:24] == 8'hFF; @@ -71,19 +71,11 @@ module VX_dmem_controller ( - - - wire Sllvq_pop; - wire[`DNUMBER_REQUESTS-1:0] Sllvq_valid; - wire[`DNUMBER_REQUESTS-1:0][31:0] Sllvq_res_addr; - wire[`DNUMBER_REQUESTS-1:0][`DBANK_LINE_SIZE_RNG][31:0] Sllvq_res_data; - - VX_gpu_dcache_dram_req_inter VX_gpu_smem_dram_req(); - VX_gpu_dcache_dram_res_inter VX_gpu_smem_dram_res(); + VX_gpu_dcache_dram_req_inter #(.BANK_LINE_SIZE_WORDS(`DBANK_LINE_SIZE_WORDS)) VX_gpu_smem_dram_req(); + VX_gpu_dcache_dram_res_inter #(.BANK_LINE_SIZE_WORDS(`DBANK_LINE_SIZE_WORDS)) VX_gpu_smem_dram_res(); - assign Sllvq_pop = 0; VX_cache #( .CACHE_SIZE_BYTES (`SCACHE_SIZE_BYTES), .BANK_LINE_SIZE_BYTES (`SBANK_LINE_SIZE_BYTES), @@ -132,6 +124,7 @@ module VX_dmem_controller ( .core_wb_warp_num (VX_dcache_rsp_smem.core_wb_warp_num), .core_wb_readdata (VX_dcache_rsp_smem.core_wb_readdata), .core_wb_pc (VX_dcache_rsp_smem.core_wb_pc), + .core_wb_address (), // DRAM response .dram_fill_rsp (VX_gpu_smem_dram_res.dram_fill_rsp), @@ -155,23 +148,9 @@ module VX_dmem_controller ( // Snoop Request .snp_req (0), - .snp_req_addr (0), - - // LLVQ stuff - .llvq_pop (Sllvq_pop), - .llvq_valid (Sllvq_valid), - .llvq_res_addr (Sllvq_res_addr), - .llvq_res_data (Sllvq_res_data) + .snp_req_addr (0) ); - - wire Dllvq_pop; - wire[`DNUMBER_REQUESTS-1:0] Dllvq_valid; - wire[`DNUMBER_REQUESTS-1:0][31:0] Dllvq_res_addr; - wire[`DNUMBER_REQUESTS-1:0][`DBANK_LINE_SIZE_RNG][31:0] Dllvq_res_data; - - - assign Dllvq_pop = 0; VX_cache #( .CACHE_SIZE_BYTES (`DCACHE_SIZE_BYTES), .BANK_LINE_SIZE_BYTES (`DBANK_LINE_SIZE_BYTES), @@ -220,6 +199,7 @@ module VX_dmem_controller ( .core_wb_warp_num (VX_dcache_rsp_dcache.core_wb_warp_num), .core_wb_readdata (VX_dcache_rsp_dcache.core_wb_readdata), .core_wb_pc (VX_dcache_rsp_dcache.core_wb_pc), + .core_wb_address (), // DRAM response .dram_fill_rsp 
(VX_gpu_dcache_dram_res.dram_fill_rsp), @@ -243,22 +223,11 @@ module VX_dmem_controller ( // Snoop Request .snp_req (0), - .snp_req_addr (0), - - // LLVQ stuff - .llvq_pop (Dllvq_pop), - .llvq_valid (Dllvq_valid), - .llvq_res_addr (Dllvq_res_addr), - .llvq_res_data (Dllvq_res_data) + .snp_req_addr (0) ); - wire Illvq_pop; - wire[`DNUMBER_REQUESTS-1:0] Illvq_valid; - wire[`DNUMBER_REQUESTS-1:0][31:0] Illvq_res_addr; - wire[`DNUMBER_REQUESTS-1:0][`DBANK_LINE_SIZE_RNG][31:0] Illvq_res_data; - assign Illvq_pop = 0; VX_cache #( .CACHE_SIZE_BYTES (`ICACHE_SIZE_BYTES), .BANK_LINE_SIZE_BYTES (`IBANK_LINE_SIZE_BYTES), @@ -307,6 +276,7 @@ module VX_dmem_controller ( .core_wb_warp_num (VX_icache_rsp.core_wb_warp_num), .core_wb_readdata (VX_icache_rsp.core_wb_readdata), .core_wb_pc (VX_icache_rsp.core_wb_pc), + .core_wb_address (), // DRAM response .dram_fill_rsp (VX_gpu_icache_dram_res.dram_fill_rsp), @@ -330,13 +300,7 @@ module VX_dmem_controller ( // Snoop Request .snp_req (0), - .snp_req_addr (0), - - // LLVQ stuff - .llvq_pop (Illvq_pop), - .llvq_valid (Illvq_valid), - .llvq_res_addr (Illvq_res_addr), - .llvq_res_data (Illvq_res_data) + .snp_req_addr (0) ); diff --git a/rtl/VX_generic_queue_ll.v b/rtl/VX_generic_queue_ll.v index 4ffe34e60..dfe7828bf 100644 --- a/rtl/VX_generic_queue_ll.v +++ b/rtl/VX_generic_queue_ll.v @@ -16,6 +16,8 @@ module VX_generic_queue_ll output wire full ); + /* verilator lint_off WIDTH */ + if (SIZE == 0) begin assign empty = 1; assign out_data = 0; @@ -117,6 +119,6 @@ module VX_generic_queue_ll end - + /* verilator lint_on WIDTH */ endmodule \ No newline at end of file diff --git a/rtl/VX_icache_stage.v b/rtl/VX_icache_stage.v index 373c11f7f..1c8975d06 100644 --- a/rtl/VX_icache_stage.v +++ b/rtl/VX_icache_stage.v @@ -25,7 +25,7 @@ module VX_icache_stage ( assign VX_icache_req.core_req_mem_read = `LW_MEM_READ; assign VX_icache_req.core_req_mem_write = `NO_MEM_WRITE; assign VX_icache_req.core_req_rd = 5'b0; - assign VX_icache_req.core_req_wb = 
2'b0; + assign VX_icache_req.core_req_wb = {1{2'b1}}; assign VX_icache_req.core_req_warp_num = fe_inst_meta_fi.warp_num; assign VX_icache_req.core_req_pc = fe_inst_meta_fi.inst_pc; @@ -33,7 +33,10 @@ module VX_icache_stage ( assign fe_inst_meta_id.instruction = VX_icache_rsp.core_wb_readdata[0][31:0]; assign fe_inst_meta_id.inst_pc = VX_icache_rsp.core_wb_pc[0]; assign fe_inst_meta_id.warp_num = VX_icache_rsp.core_wb_warp_num; + + /* verilator lint_off WIDTH */ assign fe_inst_meta_id.valid = VX_icache_rsp.core_wb_valid ? threads_active[VX_icache_rsp.core_wb_warp_num] : 0; + /* verilator lint_on WIDTH */ assign icache_stage_wid = fe_inst_meta_id.warp_num; assign icache_stage_valids = fe_inst_meta_id.valid & {`NT{!icache_stage_delay}}; @@ -50,7 +53,9 @@ module VX_icache_stage ( for (curr_w = 0; curr_w < `NW; curr_w=curr_w+1) threads_active[curr_w] <= 0; end else begin if (valid_inst && !icache_stage_delay) begin + /* verilator lint_off WIDTH */ threads_active[fe_inst_meta_fi.warp_num] <= fe_inst_meta_fi.valid; + /* verilator lint_on WIDTH */ end end end diff --git a/rtl/VX_lsu.v b/rtl/VX_lsu.v index c8b7aeebb..b962b738c 100644 --- a/rtl/VX_lsu.v +++ b/rtl/VX_lsu.v @@ -49,10 +49,10 @@ module VX_lsu ( assign VX_dcache_req.core_req_valid = use_valid; assign VX_dcache_req.core_req_addr = use_address; assign VX_dcache_req.core_req_writedata = use_store_data; - assign VX_dcache_req.core_req_mem_read = use_mem_read; - assign VX_dcache_req.core_req_mem_write = use_mem_write; + assign VX_dcache_req.core_req_mem_read = {`NT{use_mem_read}}; + assign VX_dcache_req.core_req_mem_write = {`NT{use_mem_write}}; assign VX_dcache_req.core_req_rd = use_rd; - assign VX_dcache_req.core_req_wb = use_wb; + assign VX_dcache_req.core_req_wb = {`NT{use_wb}}; assign VX_dcache_req.core_req_warp_num = use_warp_num; assign VX_dcache_req.core_req_pc = use_pc; diff --git a/rtl/VX_scheduler.v b/rtl/VX_scheduler.v index ed796e654..47e582a33 100644 --- a/rtl/VX_scheduler.v +++ b/rtl/VX_scheduler.v @@ 
-14,6 +14,7 @@ module VX_scheduler ( ); + /* verilator lint_off WIDTH */ reg[31:0] count_valid; assign is_empty = count_valid == 0; @@ -77,5 +78,6 @@ module VX_scheduler ( end end + /* verilator lint_on WIDTH */ endmodule \ No newline at end of file diff --git a/rtl/VX_warp_scheduler.v b/rtl/VX_warp_scheduler.v index 42014786a..a976bcf02 100644 --- a/rtl/VX_warp_scheduler.v +++ b/rtl/VX_warp_scheduler.v @@ -61,6 +61,7 @@ module VX_warp_scheduler ( ); + /* verilator lint_off WIDTH */ wire update_use_wspawn; wire update_visible_active; @@ -334,6 +335,6 @@ module VX_warp_scheduler ( wire ebreak = (warp_active == 0); assign out_ebreak = ebreak; - + /* verilator lint_on WIDTH */ endmodule \ No newline at end of file diff --git a/rtl/Vortex.v b/rtl/Vortex.v index 9fcf30282..862e70077 100644 --- a/rtl/Vortex.v +++ b/rtl/Vortex.v @@ -2,48 +2,92 @@ `include "VX_cache_config.v" module Vortex + #( + parameter CORE_ID = 0 + ) ( - input wire clk, - input wire reset, - input wire[31:0] icache_response_instruction, - output wire[31:0] icache_request_pc_address, - // IO - output wire io_valid, - output wire[31:0] io_data, - // DRAM Dcache Req - output wire dram_req, - output wire dram_req_write, - output wire dram_req_read, - output wire [31:0] dram_req_addr, - output wire [31:0] dram_req_size, - output wire [31:0] dram_req_data[`DBANK_LINE_SIZE_RNG], - output wire [31:0] dram_expected_lat, + `ifdef SINGLE_CORE_BENCH + input wire clk, + input wire reset, + // IO + output wire io_valid, + output wire[31:0] io_data, - // DRAM Dcache Res - output wire dram_fill_accept, - input wire dram_fill_rsp, - input wire [31:0] dram_fill_rsp_addr, - input wire [31:0] dram_fill_rsp_data[`DBANK_LINE_SIZE_RNG], + // DRAM Dcache Req + output wire dram_req, + output wire dram_req_write, + output wire dram_req_read, + output wire [31:0] dram_req_addr, + output wire [31:0] dram_req_size, + output wire [31:0] dram_req_data[`DBANK_LINE_SIZE_RNG], + output wire [31:0] dram_expected_lat, + + // DRAM Dcache 
Res + output wire dram_fill_accept, + input wire dram_fill_rsp, + input wire [31:0] dram_fill_rsp_addr, + input wire [31:0] dram_fill_rsp_data[`DBANK_LINE_SIZE_RNG], - // DRAM Icache Req - output wire I_dram_req, - output wire I_dram_req_write, - output wire I_dram_req_read, - output wire [31:0] I_dram_req_addr, - output wire [31:0] I_dram_req_size, - output wire [31:0] I_dram_req_data[`DBANK_LINE_SIZE_RNG], - output wire [31:0] I_dram_expected_lat, + // DRAM Icache Req + output wire I_dram_req, + output wire I_dram_req_write, + output wire I_dram_req_read, + output wire [31:0] I_dram_req_addr, + output wire [31:0] I_dram_req_size, + output wire [31:0] I_dram_req_data[`IBANK_LINE_SIZE_RNG], + output wire [31:0] I_dram_expected_lat, - // DRAM Icache Res - output wire I_dram_fill_accept, - input wire I_dram_fill_rsp, - input wire [31:0] I_dram_fill_rsp_addr, - input wire [31:0] I_dram_fill_rsp_data[`DBANK_LINE_SIZE_RNG], + // DRAM Icache Res + output wire I_dram_fill_accept, + input wire I_dram_fill_rsp, + input wire [31:0] I_dram_fill_rsp_addr, + input wire [31:0] I_dram_fill_rsp_data[`IBANK_LINE_SIZE_RNG], - output wire out_ebreak + output wire out_ebreak + `else + input wire clk, + input wire reset, + // IO + output wire io_valid, + output wire[31:0] io_data, + + // DRAM Dcache Req + output wire dram_req, + output wire dram_req_write, + output wire dram_req_read, + output wire [31:0] dram_req_addr, + output wire [31:0] dram_req_size, + output wire [`DBANK_LINE_SIZE_RNG][31:0] dram_req_data, + output wire [31:0] dram_expected_lat, + + // DRAM Dcache Res + output wire dram_fill_accept, + input wire dram_fill_rsp, + input wire [31:0] dram_fill_rsp_addr, + input wire [`DBANK_LINE_SIZE_RNG][31:0] dram_fill_rsp_data, + + + // DRAM Icache Req + output wire I_dram_req, + output wire I_dram_req_write, + output wire I_dram_req_read, + output wire [31:0] I_dram_req_addr, + output wire [31:0] I_dram_req_size, + output wire [`IBANK_LINE_SIZE_RNG][31:0] I_dram_req_data, + 
output wire [31:0] I_dram_expected_lat, + + // DRAM Icache Res + output wire I_dram_fill_accept, + input wire I_dram_fill_rsp, + input wire [31:0] I_dram_fill_rsp_addr, + input wire [`IBANK_LINE_SIZE_RNG][31:0] I_dram_fill_rsp_data, + + + output wire out_ebreak + `endif ); wire scheduler_empty; @@ -86,7 +130,7 @@ module Vortex end endgenerate - wire temp_io_valid = (!memory_delay) && (|VX_dcache_req.core_req_valid) && (VX_dcache_req.core_req_mem_write != `NO_MEM_WRITE) && (VX_dcache_req.core_req_addr[0] == 32'h00010000); + wire temp_io_valid = (!memory_delay) && (|VX_dcache_req.core_req_valid) && (VX_dcache_req.core_req_mem_write[0] != `NO_MEM_WRITE) && (VX_dcache_req.core_req_addr[0] == 32'h00010000); wire[31:0] temp_io_data = VX_dcache_req.core_req_writedata[0]; assign io_valid = temp_io_valid; assign io_data = temp_io_data; @@ -172,7 +216,7 @@ VX_scheduler schedule( .is_empty (scheduler_empty) ); -VX_back_end vx_back_end( +VX_back_end #(.CORE_ID(CORE_ID)) vx_back_end( .clk (clk), .reset (reset), .schedule_delay (schedule_delay), diff --git a/rtl/Vortex_SOC.v b/rtl/Vortex_SOC.v index 56f50f36f..2305e55c5 100644 --- a/rtl/Vortex_SOC.v +++ b/rtl/Vortex_SOC.v @@ -4,11 +4,11 @@ module Vortex_SOC ( input wire clk, input wire reset, - input wire[31:0] icache_response_instruction, - output wire[31:0] icache_request_pc_address, // IO - output wire io_valid, - output wire[31:0] io_data, + output wire io_valid[`NUMBER_CORES-1:0], + output wire[31:0] io_data [`NUMBER_CORES-1:0], + + output wire[31:0] number_cores, // DRAM Dcache Req output wire dram_req, @@ -26,61 +26,258 @@ module Vortex_SOC ( input wire [31:0] dram_fill_rsp_data[`DBANK_LINE_SIZE_RNG], - // DRAM Icache Req - output wire I_dram_req, - output wire I_dram_req_write, - output wire I_dram_req_read, - output wire [31:0] I_dram_req_addr, - output wire [31:0] I_dram_req_size, - output wire [31:0] I_dram_req_data[`DBANK_LINE_SIZE_RNG], - output wire [31:0] I_dram_expected_lat, - - // DRAM Icache Res - output wire 
I_dram_fill_accept, - input wire I_dram_fill_rsp, - input wire [31:0] I_dram_fill_rsp_addr, - input wire [31:0] I_dram_fill_rsp_data[`DBANK_LINE_SIZE_RNG], - - output wire out_ebreak ); - Vortex vortex_core( - .clk (clk), - .reset (reset), - .icache_response_instruction(icache_response_instruction), - .icache_request_pc_address (icache_request_pc_address), - .io_valid (io_valid), - .io_data (io_data), - .dram_req (dram_req), - .dram_req_write (dram_req_write), - .dram_req_read (dram_req_read), - .dram_req_addr (dram_req_addr), - .dram_req_size (dram_req_size), - .dram_req_data (dram_req_data), - .dram_expected_lat (dram_expected_lat), - .dram_fill_accept (dram_fill_accept), - .dram_fill_rsp (dram_fill_rsp), - .dram_fill_rsp_addr (dram_fill_rsp_addr), - .dram_fill_rsp_data (dram_fill_rsp_data), - .I_dram_req (I_dram_req), - .I_dram_req_write (I_dram_req_write), - .I_dram_req_read (I_dram_req_read), - .I_dram_req_addr (I_dram_req_addr), - .I_dram_req_size (I_dram_req_size), - .I_dram_req_data (I_dram_req_data), - .I_dram_expected_lat (I_dram_expected_lat), - .I_dram_fill_accept (I_dram_fill_accept), - .I_dram_fill_rsp (I_dram_fill_rsp), - .I_dram_fill_rsp_addr (I_dram_fill_rsp_addr), - .I_dram_fill_rsp_data (I_dram_fill_rsp_data), - .out_ebreak (out_ebreak) + assign number_cores = `NUMBER_CORES; + + // IO + wire per_core_io_valid[`NUMBER_CORES-1:0]; + wire[31:0] per_core_io_data[`NUMBER_CORES-1:0]; + + // DRAM Dcache Req + wire[`NUMBER_CORES-1:0] per_core_dram_req; + wire[`NUMBER_CORES-1:0] per_core_dram_req_write; + wire[`NUMBER_CORES-1:0] per_core_dram_req_read; + wire[`NUMBER_CORES-1:0] [31:0] per_core_dram_req_addr; + wire[`NUMBER_CORES-1:0] [31:0] per_core_dram_req_size; + wire[`NUMBER_CORES-1:0][`DBANK_LINE_SIZE_RNG][31:0] per_core_dram_req_data; + wire[`NUMBER_CORES-1:0] [31:0] per_core_dram_expected_lat; + + // DRAM Dcache Res + wire[`NUMBER_CORES-1:0] per_core_dram_fill_accept; + wire[`NUMBER_CORES-1:0] per_core_dram_fill_rsp; + wire[`NUMBER_CORES-1:0] 
[31:0] per_core_dram_fill_rsp_addr; + wire[`NUMBER_CORES-1:0][`DBANK_LINE_SIZE_RNG][31:0] per_core_dram_fill_rsp_data; + + + // DRAM Icache Req + wire[`NUMBER_CORES-1:0] per_core_I_dram_req; + wire[`NUMBER_CORES-1:0] per_core_I_dram_req_write; + wire[`NUMBER_CORES-1:0] per_core_I_dram_req_read; + wire[`NUMBER_CORES-1:0] [31:0] per_core_I_dram_req_addr; + wire[`NUMBER_CORES-1:0] [31:0] per_core_I_dram_req_size; + wire[`NUMBER_CORES-1:0][`IBANK_LINE_SIZE_RNG][31:0] per_core_I_dram_req_data; + wire[`NUMBER_CORES-1:0] [31:0] per_core_I_dram_expected_lat; + + // DRAM Icache Res + wire[`NUMBER_CORES-1:0] per_core_I_dram_fill_accept; + wire[`NUMBER_CORES-1:0] per_core_I_dram_fill_rsp; + wire[`NUMBER_CORES-1:0] [31:0] per_core_I_dram_fill_rsp_addr; + wire[`NUMBER_CORES-1:0][`IBANK_LINE_SIZE_RNG][31:0] per_core_I_dram_fill_rsp_data; + + // Out ebreak + wire[`NUMBER_CORES-1:0] per_core_out_ebreak; + + assign out_ebreak = (&per_core_out_ebreak); + + genvar curr_core; + generate + + for (curr_core = 0; curr_core < `NUMBER_CORES; curr_core=curr_core+1) begin + + wire [`IBANK_LINE_SIZE_RNG][31:0] curr_core_I_dram_req_data; + wire [`DBANK_LINE_SIZE_RNG][31:0] curr_core_dram_req_data ; + + assign io_valid[curr_core] = per_core_io_valid[curr_core]; + assign io_data [curr_core] = per_core_io_data [curr_core]; + Vortex #(.CORE_ID(curr_core)) vortex_core( + .clk (clk), + .reset (reset), + .io_valid (per_core_io_valid [curr_core]), + .io_data (per_core_io_data [curr_core]), + .dram_req (per_core_dram_req [curr_core]), + .dram_req_write (per_core_dram_req_write [curr_core]), + .dram_req_read (per_core_dram_req_read [curr_core]), + .dram_req_addr (per_core_dram_req_addr [curr_core]), + .dram_req_size (per_core_dram_req_size [curr_core]), + .dram_req_data (curr_core_dram_req_data ), + .dram_expected_lat (per_core_dram_expected_lat [curr_core]), + .dram_fill_accept (per_core_dram_fill_accept [curr_core]), + .dram_fill_rsp (per_core_dram_fill_rsp [curr_core]), + .dram_fill_rsp_addr 
(per_core_dram_fill_rsp_addr [curr_core]), + .dram_fill_rsp_data (per_core_dram_fill_rsp_data [curr_core]), + .I_dram_req (per_core_I_dram_req [curr_core]), + .I_dram_req_write (per_core_I_dram_req_write [curr_core]), + .I_dram_req_read (per_core_I_dram_req_read [curr_core]), + .I_dram_req_addr (per_core_I_dram_req_addr [curr_core]), + .I_dram_req_size (per_core_I_dram_req_size [curr_core]), + .I_dram_req_data (curr_core_I_dram_req_data ), + .I_dram_expected_lat (per_core_I_dram_expected_lat [curr_core]), + .I_dram_fill_accept (per_core_I_dram_fill_accept [curr_core]), + .I_dram_fill_rsp (per_core_I_dram_fill_rsp [curr_core]), + .I_dram_fill_rsp_addr (per_core_I_dram_fill_rsp_addr[curr_core]), + .I_dram_fill_rsp_data (per_core_I_dram_fill_rsp_data[curr_core]), + .out_ebreak (per_core_out_ebreak [curr_core]) + ); + + assign per_core_dram_req_data [curr_core] = curr_core_dram_req_data; + assign per_core_I_dram_req_data[curr_core] = curr_core_I_dram_req_data; + end + endgenerate + + + //////////////////// L2 Cache //////////////////// + wire[`LLNUMBER_REQUESTS-1:0] l2c_core_req; + wire[`LLNUMBER_REQUESTS-1:0][2:0] l2c_core_req_mem_write; + wire[`LLNUMBER_REQUESTS-1:0][2:0] l2c_core_req_mem_read; + wire[`LLNUMBER_REQUESTS-1:0][31:0] l2c_core_req_addr; + wire[`LLNUMBER_REQUESTS-1:0][`IBANK_LINE_SIZE_RNG][31:0] l2c_core_req_data; + wire[`LLNUMBER_REQUESTS-1:0][1:0] l2c_core_req_wb; + + wire l2c_core_accept; + + + wire[`LLNUMBER_REQUESTS-1:0] l2c_wb; + wire[`LLNUMBER_REQUESTS-1:0] [31:0] l2c_wb_addr; + wire[`LLNUMBER_REQUESTS-1:0][`IBANK_LINE_SIZE_RNG][31:0] l2c_wb_data; + + + wire[`DBANK_LINE_SIZE_RNG][31:0] dram_req_data_port; + wire[`DBANK_LINE_SIZE_RNG][31:0] dram_fill_rsp_data_port; + + genvar llb_index; + generate + for (llb_index = 0; llb_index < `DBANK_LINE_SIZE_WORDS; llb_index=llb_index+1) begin + assign dram_req_data [llb_index] = dram_req_data_port[llb_index]; + assign dram_fill_rsp_data_port[llb_index] = dram_fill_rsp_data[llb_index]; + end + endgenerate + + 
// genvar l2c_index; + // genvar l2c_bank_index; + // generate + // for (l2c_index = 0; l2c_index < `LLNUMBER_REQUESTS; l2c_index=l2c_index+1) begin + // assign l2c_wb [l2c_index] = l2c_wb_port [l2c_index]; + // assign l2c_wb_addr[l2c_index] = l2c_wb_addr_port[l2c_index]; + // for (l2c_bank_index = 0; l2c_bank_index < `LLNUMBER_REQUESTS; l2c_bank_index=l2c_bank_index+1) begin + // assign l2c_wb_data[l2c_index][l2c_bank_index] = l2c_wb_data_port[l2c_index][l2c_bank_index]; + // end + // end + // endgenerate + + + + // L2 request port mapping: even index = D-cache of core (index/2), odd index = I-cache of core (index/2) + genvar l2c_curr_core; + generate + for (l2c_curr_core = 0; l2c_curr_core < `LLNUMBER_REQUESTS; l2c_curr_core=l2c_curr_core+2) begin + // Core Request + assign l2c_core_req [l2c_curr_core] = per_core_dram_req [(l2c_curr_core/2)]; + assign l2c_core_req [l2c_curr_core+1] = per_core_I_dram_req[(l2c_curr_core/2)]; + + assign l2c_core_req_mem_write [l2c_curr_core] = per_core_dram_req_write[(l2c_curr_core/2)] ? `SW_MEM_WRITE : `NO_MEM_WRITE; + assign l2c_core_req_mem_write [l2c_curr_core+1] = `NO_MEM_WRITE; // I caches don't write + + assign l2c_core_req_mem_read [l2c_curr_core] = per_core_dram_req_read[(l2c_curr_core/2)] ? `LW_MEM_READ : `NO_MEM_READ; + assign l2c_core_req_mem_read [l2c_curr_core+1] = `LW_MEM_READ; // I caches only read + + assign l2c_core_req_wb [l2c_curr_core] = per_core_dram_req_read[(l2c_curr_core/2)] ? 
1 : 0; + assign l2c_core_req_wb [l2c_curr_core+1] = 1; // I caches don't write + + assign l2c_core_req_addr [l2c_curr_core] = per_core_dram_req_addr [(l2c_curr_core/2)]; + assign l2c_core_req_addr [l2c_curr_core+1] = per_core_I_dram_req_addr[(l2c_curr_core/2)]; + + assign l2c_core_req_data [l2c_curr_core] = per_core_dram_req_data [(l2c_curr_core/2)]; + assign l2c_core_req_data [l2c_curr_core+1] = per_core_I_dram_req_data[(l2c_curr_core/2)]; + + // L2 can't accept requests + assign per_core_dram_fill_accept [(l2c_curr_core/2)] = l2c_core_accept; + assign per_core_I_dram_fill_accept[(l2c_curr_core/2)] = l2c_core_accept; + + // Cache Fill Response + assign per_core_dram_fill_rsp [(l2c_curr_core/2)] = l2c_wb[l2c_curr_core]; + assign per_core_I_dram_fill_rsp [(l2c_curr_core/2)] = l2c_wb[l2c_curr_core+1]; + + assign per_core_dram_fill_rsp_data[(l2c_curr_core/2)] = l2c_wb_data[l2c_curr_core]; + assign per_core_I_dram_fill_rsp_data[(l2c_curr_core/2)] = l2c_wb_data[l2c_curr_core+1]; + + assign per_core_dram_fill_rsp_addr[(l2c_curr_core/2)] = l2c_wb_addr[l2c_curr_core]; + assign per_core_I_dram_fill_rsp_addr[(l2c_curr_core/2)] = l2c_wb_addr[l2c_curr_core+1]; + end + endgenerate + + wire dram_snp_full; + wire dram_req_because_of_wb; + VX_cache #( + .CACHE_SIZE_BYTES (`LLCACHE_SIZE_BYTES), + .BANK_LINE_SIZE_BYTES (`LLBANK_LINE_SIZE_BYTES), + .NUMBER_BANKS (`LLNUMBER_BANKS), + .WORD_SIZE_BYTES (`LLWORD_SIZE_BYTES), + .NUMBER_REQUESTS (`LLNUMBER_REQUESTS), + .STAGE_1_CYCLES (`LLSTAGE_1_CYCLES), + .FUNC_ID (`LLFUNC_ID), + .REQQ_SIZE (`LLREQQ_SIZE), + .MRVQ_SIZE (`LLMRVQ_SIZE), + .DFPQ_SIZE (`LLDFPQ_SIZE), + .SNRQ_SIZE (`LLSNRQ_SIZE), + .CWBQ_SIZE (`LLCWBQ_SIZE), + .DWBQ_SIZE (`LLDWBQ_SIZE), + .DFQQ_SIZE (`LLDFQQ_SIZE), + .LLVQ_SIZE (`LLLLVQ_SIZE), + .FILL_INVALIDAOR_SIZE (`LLFILL_INVALIDAOR_SIZE), + .SIMULATED_DRAM_LATENCY_CYCLES(`LLSIMULATED_DRAM_LATENCY_CYCLES) + ) + gpu_l2cache + ( + .clk (clk), + .reset (reset), + + // Core Req (DRAM Fills/WB) To L2 Request + .core_req_valid 
(l2c_core_req), + .core_req_addr (l2c_core_req_addr), + .core_req_writedata({l2c_core_req_data}), + .core_req_mem_read (l2c_core_req_mem_read), + .core_req_mem_write(l2c_core_req_mem_write), + .core_req_rd (0), + .core_req_wb (l2c_core_req_wb), + .core_req_warp_num (0), + .core_req_pc (0), + + // L2 can't accept Core Request + .delay_req (l2c_core_accept), + + // Core can't accept L2 Request + .core_no_wb_slot (0), + + // Core Writeback + .core_wb_valid (l2c_wb), + .core_wb_req_rd (), + .core_wb_req_wb (), + .core_wb_warp_num (), + .core_wb_readdata ({l2c_wb_data}), + .core_wb_address (l2c_wb_addr), + .core_wb_pc (), + + // L2 Cache DRAM Fill response + .dram_fill_rsp (dram_fill_rsp), + .dram_fill_rsp_addr(dram_fill_rsp_addr), + .dram_fill_rsp_data({dram_fill_rsp_data_port}), + + // L2 Cache can't accept Fill Response + .dram_fill_accept (dram_fill_accept), + + // L2 Cache DRAM Fill Request + .dram_req (dram_req), + .dram_req_write (dram_req_write), + .dram_req_read (dram_req_read), + .dram_req_addr (dram_req_addr), + .dram_req_size (dram_req_size), + .dram_req_data ({dram_req_data_port}), + + // Snoop Response + .dram_req_because_of_wb(dram_req_because_of_wb), + .dram_snp_full (dram_snp_full), + + // Snoop Request + .snp_req (0), + .snp_req_addr (0) ); - + //////////////////// L2 Cache //////////////////// diff --git a/rtl/interfaces/VX_gpu_dcache_dram_res_inter.v b/rtl/interfaces/VX_gpu_dcache_dram_res_inter.v index a6bd0ff87..95364b5fa 100644 --- a/rtl/interfaces/VX_gpu_dcache_dram_res_inter.v +++ b/rtl/interfaces/VX_gpu_dcache_dram_res_inter.v @@ -13,8 +13,8 @@ interface VX_gpu_dcache_dram_res_inter ) (); // DRAM Rsponse - wire dram_fill_rsp; - wire [31:0] dram_fill_rsp_addr; + wire dram_fill_rsp; + wire [31:0] dram_fill_rsp_addr; wire [BANK_LINE_SIZE_WORDS-1:0][31:0] dram_fill_rsp_data; endinterface diff --git a/rtl/interfaces/VX_gpu_dcache_req_inter.v b/rtl/interfaces/VX_gpu_dcache_req_inter.v index 83b507fd4..108db514c 100644 --- 
a/rtl/interfaces/VX_gpu_dcache_req_inter.v +++ b/rtl/interfaces/VX_gpu_dcache_req_inter.v @@ -16,10 +16,10 @@ interface VX_gpu_dcache_req_inter wire [NUMBER_REQUESTS-1:0] core_req_valid; wire [NUMBER_REQUESTS-1:0][31:0] core_req_addr; wire [NUMBER_REQUESTS-1:0][31:0] core_req_writedata; - wire [2:0] core_req_mem_read; - wire [2:0] core_req_mem_write; + wire [NUMBER_REQUESTS-1:0][2:0] core_req_mem_read; + wire [NUMBER_REQUESTS-1:0][2:0] core_req_mem_write; wire [4:0] core_req_rd; - wire [1:0] core_req_wb; + wire [NUMBER_REQUESTS-1:0][1:0] core_req_wb; wire [`NW_M1:0] core_req_warp_num; wire [31:0] core_req_pc; diff --git a/rtl/simulate/multi_test_bench.h b/rtl/simulate/multi_test_bench.h index 06850e5a9..1b26b4836 100644 --- a/rtl/simulate/multi_test_bench.h +++ b/rtl/simulate/multi_test_bench.h @@ -78,7 +78,6 @@ class Vortex int debug_debugAddr; double stats_sim_time; std::vector dram_req_vec; - std::vector I_dram_req_vec; #ifdef VCD_OUTPUT VerilatedVcdC *m_trace; #endif @@ -165,84 +164,6 @@ void Vortex::print_stats(bool cycle_test) bool Vortex::ibus_driver() { - - // Iterate through each element, and get pop index - int dequeue_index = -1; - bool dequeue_valid = false; - for (int i = 0; i < this->I_dram_req_vec.size(); i++) - { - if (this->I_dram_req_vec[i].cycles_left > 0) - { - this->I_dram_req_vec[i].cycles_left -= 1; - } - - if ((this->I_dram_req_vec[i].cycles_left == 0) && (!dequeue_valid)) - { - dequeue_index = i; - dequeue_valid = true; - } - } - - - if (vortex->I_dram_req) - { - // std::cout << "Icache Dram Request received!\n"; - if (vortex->I_dram_req_read) - { - // std::cout << "Icache Dram Request is read!\n"; - // Need to add an element - dram_req_t dram_req; - dram_req.cycles_left = vortex->I_dram_expected_lat; - dram_req.data_length = vortex->I_dram_req_size / 4; - dram_req.base_addr = vortex->I_dram_req_addr; - dram_req.data = (unsigned *) malloc(dram_req.data_length * sizeof(unsigned)); - - for (int i = 0; i < dram_req.data_length; i++) - { - 
unsigned curr_addr = dram_req.base_addr + (i*4); - unsigned data_rd; - ram.getWord(curr_addr, &data_rd); - dram_req.data[i] = data_rd; - } - // std::cout << "Fill Req -> Addr: " << std::hex << dram_req.base_addr << std::dec << "\n"; - this->I_dram_req_vec.push_back(dram_req); - } - - if (vortex->I_dram_req_write) - { - unsigned base_addr = vortex->I_dram_req_addr; - unsigned data_length = vortex->I_dram_req_size / 4; - - for (int i = 0; i < data_length; i++) - { - unsigned curr_addr = base_addr + (i*4); - unsigned data_wr = vortex->I_dram_req_data[i]; - ram.writeWord(curr_addr, &data_wr); - } - } - } - - if (vortex->I_dram_fill_accept && dequeue_valid) - { - // std::cout << "Icache Dram Response Sending...!\n"; - - vortex->I_dram_fill_rsp = 1; - vortex->I_dram_fill_rsp_addr = this->I_dram_req_vec[dequeue_index].base_addr; - // std::cout << "Fill Rsp -> Addr: " << std::hex << (this->I_dram_req_vec[dequeue_index].base_addr) << std::dec << "\n"; - - for (int i = 0; i < this->I_dram_req_vec[dequeue_index].data_length; i++) - { - vortex->I_dram_fill_rsp_data[i] = this->I_dram_req_vec[dequeue_index].data[i]; - } - free(this->I_dram_req_vec[dequeue_index].data); - - this->I_dram_req_vec.erase(this->I_dram_req_vec.begin() + dequeue_index); - } - else - { - vortex->I_dram_fill_rsp = 0; - vortex->I_dram_fill_rsp_addr = 0; - } return false; @@ -251,15 +172,18 @@ bool Vortex::ibus_driver() void Vortex::io_handler() { // std::cout << "Checking\n"; - if (vortex->io_valid) + for (int core = 0; core < vortex->number_cores; core++) // poll every core's IO port; index is not named c so the char below does not shadow it { - uint32_t data_write = (uint32_t) vortex->io_data; - // std::cout << "IO VALID!\n"; - char c = (char) data_write; - std::cerr << c; - // std::cout << c; + if (vortex->io_valid[core]) + { + uint32_t data_write = (uint32_t) vortex->io_data[core]; + // std::cout << "IO VALID!\n"; + char c = (char) data_write; + std::cerr << c; + // std::cout << c; - std::cout << std::flush; + std::cout << std::flush; + } } } diff --git a/runtime/intrinsics/vx_intrinsics.h 
b/runtime/intrinsics/vx_intrinsics.h index df85807a1..e7e097a2e 100644 --- a/runtime/intrinsics/vx_intrinsics.h +++ b/runtime/intrinsics/vx_intrinsics.h @@ -31,6 +31,8 @@ unsigned vx_threadID(void); // Get hardware warp ID unsigned vx_warpID(void); +unsigned vx_warpNum(void); + // Get Number cycles/Inst unsigned vx_getCycles(void); unsigned vx_getInst(void); diff --git a/runtime/intrinsics/vx_intrinsics.s b/runtime/intrinsics/vx_intrinsics.s index a99048401..5b328ee85 100644 --- a/runtime/intrinsics/vx_intrinsics.s +++ b/runtime/intrinsics/vx_intrinsics.s @@ -41,7 +41,11 @@ vx_join: vx_warpID: csrr a0, 0x21 # read warp IDs ret - +.type vx_warpNum, @function +.global vx_warpNum +vx_warpNum: + csrr a0, 0x22 # read CSR 0x22 into a0 (NOTE(review): presumably the warp number, distinct from the 0x21 warp ID -- confirm against the CSR map) + ret .type vx_threadID, @function .global vx_threadID