diff --git a/driver/rtlsim/vortex.cpp b/driver/rtlsim/vortex.cpp index 2167c47d7..5d4594b33 100644 --- a/driver/rtlsim/vortex.cpp +++ b/driver/rtlsim/vortex.cpp @@ -3,8 +3,7 @@ #include #include #include -#include -#include +#include #include #include @@ -59,20 +58,14 @@ private: class vx_device { public: - vx_device() - : is_done_(false) { - thread_ = new std::thread(__thread_proc__, this); + vx_device() { mem_allocation_ = vx_dev_caps(VX_CAPS_ALLOC_BASE_ADDR); + simulator_.attach_ram(&ram_); } - ~vx_device() { - if (thread_) { - mutex_.lock(); - is_done_ = true; - mutex_.unlock(); - - thread_->join(); - delete thread_; + ~vx_device() { + if (future_.valid()) { + future_.wait(); } } @@ -115,79 +108,48 @@ public: return 0; } - int flush_caches(size_t dev_maddr, size_t size) { - - mutex_.lock(); - simulator_.attach_ram(&ram_); - simulator_.flush_caches(dev_maddr, size); - simulator_.attach_ram(nullptr); - mutex_.unlock(); - - return 0; - } - int start() { - - mutex_.lock(); - simulator_.reset(); - simulator_.attach_ram(&ram_); - mutex_.unlock(); - + if (future_.valid()) { + future_.wait(); // ensure prior run completed + } + future_ = std::async(std::launch::async, [&]{ + simulator_.reset(); + while (simulator_.is_busy()) { + simulator_.step(); + } + }); return 0; } int wait(long long timeout) { + if (!future_.valid()) + return 0; auto timeout_sec = (timeout < 0) ? timeout : (timeout / 1000); + std::chrono::seconds wait_time(1); for (;;) { - mutex_.lock(); - bool is_busy = simulator_.is_busy(); - mutex_.unlock(); - - if (!is_busy || 0 == timeout_sec--) { - if (!is_busy) { - mutex_.lock(); - simulator_.attach_ram(nullptr); - mutex_.unlock(); - } + auto status = future_.wait_for(wait_time); // wait for 1 sec and check status + if (status == std::future_status::ready + || 0 == timeout_sec--) break; - } - - std::this_thread::sleep_for(std::chrono::seconds(1)); } return 0; } + int flush_caches(size_t dev_maddr, size_t size) { + if (future_.valid()) { + future_.wait(); // ensure prior run completed + } + simulator_.flush_caches(dev_maddr, size); + return 0; + } + + private: - void thread_proc() { - std::cout << "Device ready..." << std::endl; - - for (;;) { - mutex_.lock(); - bool is_done = is_done_; - mutex_.unlock(); - - if (is_done) - break; - - mutex_.lock(); - simulator_.step(); - mutex_.unlock(); - } - - std::cout << "Device shutdown..." << std::endl; - } - - static void __thread_proc__(vx_device* device) { - device->thread_proc(); - } - - bool is_done_; size_t mem_allocation_; RAM ram_; Simulator simulator_; - std::thread* thread_; - std::mutex mutex_; + std::future future_; }; /////////////////////////////////////////////////////////////////////////////// diff --git a/hw/rtl/VX_define.vh b/hw/rtl/VX_define.vh index ef8a09fff..f4e8f5376 100644 --- a/hw/rtl/VX_define.vh +++ b/hw/rtl/VX_define.vh @@ -29,11 +29,6 @@ if (!(cond)) $error(msg); \ endgenerate -`define UNUSED(x) \ - `IGNORE_WARNINGS_BEGIN \ - if (x != 0) begin end \ - `IGNORE_WARNINGS_END - `define CLOG2(x) $clog2(x) `define FLOG2(x) ($clog2(x) - (((1 << $clog2(x)) > x) ? 1 : 0)) `define LOG2UP(x) ((x > 1) ? $clog2(x) : 1) diff --git a/hw/rtl/VX_icache_stage.v b/hw/rtl/VX_icache_stage.v index 0c5698637..286094e53 100644 --- a/hw/rtl/VX_icache_stage.v +++ b/hw/rtl/VX_icache_stage.v @@ -64,10 +64,10 @@ module VX_icache_stage #( /*always_comb begin if (1'($time & 1) && icache_req_if.core_req_ready && icache_req_if.core_req_valid) begin - $display("*** %t: I%01d$ req: pc=%0h, warp=%0d", $time, CORE_ID, fe_inst_meta_fi.inst_pc, fe_inst_meta_fi.warp_num); + $display("*** %t: I%01d$ req: tag=%0h, pc=%0h, warp=%0d", $time, CORE_ID, icache_req_if.core_req_tag, fe_inst_meta_fi.inst_pc, fe_inst_meta_fi.warp_num); end if (1'($time & 1) && icache_rsp_if.core_rsp_ready && icache_rsp_if.core_rsp_valid) begin - $display("*** %t: I%01d$ rsp: pc=%0h, warp=%0d, instr=%0h", $time, CORE_ID, fe_inst_meta_id.inst_pc, fe_inst_meta_id.warp_num, fe_inst_meta_id.instruction); + $display("*** %t: I%01d$ rsp: tag=%0h, pc=%0h, warp=%0d, instr=%0h", $time, CORE_ID, icache_rsp_if.core_rsp_tag, fe_inst_meta_id.inst_pc, fe_inst_meta_id.warp_num, fe_inst_meta_id.instruction); end end*/ diff --git a/hw/rtl/VX_lsu_unit.v b/hw/rtl/VX_lsu_unit.v index 37c9ac1e9..f25b58262 100644 --- a/hw/rtl/VX_lsu_unit.v +++ b/hw/rtl/VX_lsu_unit.v @@ -64,10 +64,10 @@ module VX_lsu_unit #( /*always_comb begin if (1'($time & 1) && dcache_req_if.core_req_ready && (| dcache_req_if.core_req_valid)) begin - $display("*** %t: D%01d$ req: valid=%b, addr=%0h, r=%0d, w=%0d, pc=%0h, rd=%0d, warp=%0d, data=%0h", $time, CORE_ID, use_valid, use_address, use_mem_read, use_mem_write, use_pc, use_rd, use_warp_num, use_store_data); + $display("*** %t: D%01d$ req: valid=%b, addr=%0h, tag=%0h, r=%0d, w=%0d, pc=%0h, rd=%0d, warp=%0d, data=%0h", $time, CORE_ID, use_valid, use_address, dcache_req_if.core_req_tag, use_mem_read, use_mem_write, use_pc, use_rd, use_warp_num, use_store_data); end if (1'($time & 1) && dcache_rsp_if.core_rsp_ready && (| dcache_rsp_if.core_rsp_valid)) begin - $display("*** %t: D%01d$ rsp: valid=%b, pc=%0h, rd=%0d, warp=%0d, data=%0h", $time, CORE_ID, mem_wb_if.valid, mem_wb_if.pc, mem_wb_if.rd, mem_wb_if.warp_num, mem_wb_if.data); + $display("*** %t: D%01d$ rsp: valid=%b, tag=%0h, pc=%0h, rd=%0d, warp=%0d, data=%0h", $time, CORE_ID, mem_wb_if.valid, dcache_rsp_if.core_rsp_tag, mem_wb_if.pc, mem_wb_if.rd, mem_wb_if.warp_num, mem_wb_if.data); end end*/ diff --git a/hw/rtl/Vortex_Socket.v b/hw/rtl/Vortex_Socket.v index 2fafaeb6b..2001e191e 100644 --- a/hw/rtl/Vortex_Socket.v +++ b/hw/rtl/Vortex_Socket.v @@ -330,7 +330,7 @@ module Vortex_Socket ( /*always_comb begin if (1'($time & 1) && (dram_req_read || dram_req_write) && dram_req_ready) begin - $display("*** %t: DRAM req: w=%b addr=%0h, tag=%0h, data=%0h", $time, dram_req_write, dram_req_addr, dram_req_tag, dram_req_data); + $display("*** %t: DRAM req: w=%b addr=%0h, tag=%0h, data=%0h", $time, dram_req_write, {dram_req_addr, `CLOG2(`GLOBAL_BLOCK_SIZE)'(0)}, dram_req_tag, dram_req_data); end if (1'($time & 1) && dram_rsp_valid && dram_rsp_ready) begin $display("*** %t: DRAM rsp: tag=%0h, data=%0h", $time, dram_rsp_tag, dram_rsp_data); diff --git a/hw/rtl/cache/VX_bank.v b/hw/rtl/cache/VX_bank.v index 29386a1c6..ba9c5492a 100644 --- a/hw/rtl/cache/VX_bank.v +++ b/hw/rtl/cache/VX_bank.v @@ -407,10 +407,9 @@ module VX_bank #( .out ({is_snp_st2 , snrq_tag_st2, fill_saw_dirty_st2 , is_fill_st2 , valid_st2 , addr_st2 , wsel_st2, writeword_st2 , readword_st2 , readdata_st2 , readtag_st2 , miss_st2 , dirty_st2 , inst_meta_st2 }) ); - wire should_flush; - wire dwbq_push; - wire cwbq_full; + wire dwbq_push; + wire dwbq_empty; wire dwbq_full; wire srpq_full; wire invalidate_fill; @@ -420,7 +419,6 @@ module VX_bank #( && !is_snp_st2 && miss_st2 && !mrvq_full - && !(should_flush && dwbq_push) && !((is_snp_st2 && valid_st2 && srpq_full) || ((valid_st2 && !miss_st2) && cwbq_full) || (((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && dwbq_full) @@ -443,7 +441,7 @@ module VX_bank #( .clk (clk), .reset (reset), // Enqueue - .miss_add (miss_add), // Need to do all + .miss_add (miss_add), .miss_add_addr (miss_add_addr), .miss_add_wsel (miss_add_wsel), .miss_add_data (miss_add_data), @@ -505,30 +503,16 @@ module VX_bank #( .full (cwbq_full) ); - assign should_flush = valid_st2 - && (miss_add_mem_write != `BYTE_EN_NO) - && !is_snp_st2 - && !is_fill_st2; - // Enqueue to DWB Queue - assign dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2 || should_flush) + assign dwbq_push = ((valid_st2 && miss_st2 && dirty_st2) || fill_saw_dirty_st2) && !dwbq_full && !((is_snp_st2 && valid_st2 && srpq_full) || ((valid_st2 && !miss_st2) && cwbq_full) || (valid_st2 && miss_st2 && mrvq_full) || (valid_st2 && miss_st2 && !invalidate_fill && ~dram_fill_req_ready)); - wire[`LINE_ADDR_WIDTH-1:0] dwbq_req_addr; - wire[`BANK_LINE_WIDTH-1:0] dwbq_req_data; - wire dwbq_empty; - - if (SNOOP_FORWARDING) begin - assign dwbq_req_data = (should_flush && dwbq_push) ? writeword_st2 : readdata_st2; - assign dwbq_req_addr = (should_flush && dwbq_push) ? addr_st2 : {readtag_st2, addr_st2[`LINE_SELECT_BITS-1:0]}; - end else begin - assign dwbq_req_data = readdata_st2; - assign dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_BITS-1:0]}; - end + wire [`BANK_LINE_WIDTH-1:0] dwbq_req_data = readdata_st2; + wire [`LINE_ADDR_WIDTH-1:0] dwbq_req_addr = {readtag_st2, addr_st2[`LINE_SELECT_BITS-1:0]}; wire possible_fill = valid_st2 && miss_st2 && dram_fill_req_ready && ~is_snp_st2; wire [`LINE_ADDR_WIDTH-1:0] fill_invalidator_addr = addr_st2; @@ -544,7 +528,7 @@ module VX_bank #( .success_fill (is_fill_st2), .fill_addr (fill_invalidator_addr), .invalidate_fill (invalidate_fill) - ); + ); // Enqueue in dram_fill_req assign dram_fill_req_valid = possible_fill && !invalidate_fill; diff --git a/hw/rtl/cache/VX_tag_data_access.v b/hw/rtl/cache/VX_tag_data_access.v index 51330171d..bfcec08db 100644 --- a/hw/rtl/cache/VX_tag_data_access.v +++ b/hw/rtl/cache/VX_tag_data_access.v @@ -72,6 +72,9 @@ module VX_tag_data_access #( && ((valid_req_st1e && !use_read_valid_st1e) || (valid_req_st1e && use_read_valid_st1e && !tags_match)); + wire[`TAG_SELECT_BITS-1:0] writetag_st1e = writeaddr_st1e[`TAG_LINE_ADDR_RNG]; + wire[`LINE_SELECT_BITS-1:0] writeladdr_st1e = writeaddr_st1e[`LINE_SELECT_BITS-1:0]; + VX_tag_data_structure #( .CACHE_SIZE (CACHE_SIZE), .BANK_LINE_SIZE (BANK_LINE_SIZE), @@ -91,8 +94,8 @@ module VX_tag_data_access #( .invalidate (invalidate_line), .write_enable(use_write_enable), .write_fill (real_writefill), - .write_addr (writeaddr_st1e[`LINE_SELECT_BITS-1:0]), - .tag_index (writeaddr_st1e[`TAG_LINE_ADDR_RNG]), + .write_addr (writeladdr_st1e), + .tag_index (writetag_st1e), .write_data (use_write_data), .fill_sent (fill_sent) ); @@ -125,30 +128,28 @@ module VX_tag_data_access #( assign use_read_valid_st1e = read_valid_st1c[STAGE_1_CYCLES-1] || ~DRAM_ENABLE; // If shared memory, always valid assign use_read_dirty_st1e = read_dirty_st1c[STAGE_1_CYCLES-1] && DRAM_ENABLE; // Dirty only applies in Dcache - assign use_read_tag_st1e = DRAM_ENABLE ? read_tag_st1c[STAGE_1_CYCLES-1] : writeaddr_st1e[`TAG_LINE_ADDR_RNG]; // Tag is always the same in SM - - for (i = 0; i < `BANK_LINE_WORDS; i++) begin - assign use_read_data_st1e[i * `WORD_WIDTH +: `WORD_WIDTH] = read_data_st1c[STAGE_1_CYCLES-1][i * `WORD_WIDTH +: `WORD_WIDTH]; - end + assign use_read_tag_st1e = DRAM_ENABLE ? read_tag_st1c[STAGE_1_CYCLES-1] : writetag_st1e; // Tag is always the same in SM + assign use_read_data_st1e = read_data_st1c[STAGE_1_CYCLES-1]; wire force_write = real_writefill; + wire should_write; wire [`BANK_LINE_WORDS-1:0][3:0] we; wire [`BANK_LINE_WIDTH-1:0] data_write; if (WORD_SIZE == BANK_LINE_SIZE) begin - wire should_write = ((mem_write_st1e != `BYTE_EN_NO)) - && valid_req_st1e - && use_read_valid_st1e - && !miss_st1e - && !is_snp_st1e; + assign should_write = ((mem_write_st1e != `BYTE_EN_NO)) + && valid_req_st1e + && use_read_valid_st1e + && !miss_st1e + && !is_snp_st1e; for (i = 0; i < `BANK_LINE_WORDS; i++) begin assign we[i] = (force_write || (should_write && !real_writefill)) ? 4'b1111 : 4'b0000; end - assign readword_st1e = read_data_st1c[STAGE_1_CYCLES-1]; + assign readword_st1e = use_read_data_st1e; assign data_write = force_write ? writedata_st1e : writeword_st1e; end else begin @@ -174,13 +175,13 @@ module VX_tag_data_access #( wire [3:0] sb_mask = (b0 ? 4'b0001 : (b1 ? 4'b0010 : (b2 ? 4'b0100 : 4'b1000))); wire [3:0] sh_mask = (b0 ? 4'b0011 : 4'b1100); - wire should_write = (sw || sb || sh) - && valid_req_st1e - && use_read_valid_st1e - && !miss_st1e - && !is_snp_st1e; + assign should_write = (sw || sb || sh) + && valid_req_st1e + && use_read_valid_st1e + && !miss_st1e + && !is_snp_st1e; - wire[`WORD_WIDTH-1:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-1][block_offset * 32 +: 32]; + wire[`WORD_WIDTH-1:0] data_unmod = use_read_data_st1e[block_offset * 32 +: 32]; wire[`WORD_WIDTH-1:0] data_unQual = (b0 || lw) ? (data_unmod) : b1 ? (data_unmod >> 8) : b2 ? (data_unmod >> 16) : @@ -200,7 +201,7 @@ module VX_tag_data_access #( assign readword_st1e = data_Qual; for (i = 0; i < `BANK_LINE_WORDS; i++) begin - wire normal_write = (block_offset == i[`WORD_SELECT_BITS-1:0]) && should_write && !real_writefill; + wire normal_write = (block_offset == `WORD_SELECT_BITS'(i)) && should_write && !real_writefill; assign we[i] = (force_write) ? 4'b1111 : (normal_write && sw) ? 4'b1111 : @@ -226,7 +227,7 @@ module VX_tag_data_access #( assign use_write_data = data_write; // use "case equality" to handle uninitialized tag when block entry is not valid - assign tags_match = ((writeaddr_st1e[`TAG_LINE_ADDR_RNG] == use_read_tag_st1e) === 1'b1); + assign tags_match = ((writetag_st1e == use_read_tag_st1e) === 1'b1); wire snoop_hit = valid_req_st1e && is_snp_st1e && use_read_valid_st1e && tags_match && use_read_dirty_st1e; wire req_invalid = valid_req_st1e && !is_snp_st1e && !use_read_valid_st1e && !writefill_st1e; diff --git a/hw/rtl/cache/VX_tag_data_structure.v b/hw/rtl/cache/VX_tag_data_structure.v index 474f8d386..491aa9862 100644 --- a/hw/rtl/cache/VX_tag_data_structure.v +++ b/hw/rtl/cache/VX_tag_data_structure.v @@ -39,7 +39,7 @@ module VX_tag_data_structure #( assign read_tag = tag [read_addr]; assign read_data = data [read_addr]; - wire going_to_write = (| write_enable); + wire going_to_write = (| write_enable); integer i; always @(posedge clk) begin diff --git a/hw/simulate/simulator.cpp b/hw/simulate/simulator.cpp index 8573c49d2..9cc669d93 100644 --- a/hw/simulate/simulator.cpp +++ b/hw/simulate/simulator.cpp @@ -2,13 +2,13 @@ #include #include -uint64_t time_stamp = 0; +uint64_t timestamp = 0; double sc_time_stamp() { - return time_stamp; + return timestamp; } -Simulator::Simulator() { +Simulator::Simulator() { ram_ = nullptr; vortex_ = new VVortex_Socket(); @@ -28,18 +28,23 @@ Simulator::~Simulator() { } void Simulator::attach_ram(RAM* ram) { +#ifndef NDEBUG + std::cout << timestamp << ": [sim] attach_ram" << std::endl; +#endif ram_ = ram; dram_rsp_vec_.clear(); } void Simulator::print_stats(std::ostream& out) { out << std::left; - out << std::setw(24) << "# of total cycles:" << std::dec << time_stamp/2 << std::endl; + out << std::setw(24) << "# of total cycles:" << std::dec << timestamp/2 << std::endl; } void Simulator::dbus_driver() { - if (ram_ == nullptr) + if (ram_ == nullptr) { + vortex_->dram_req_ready = false; return; + } // handle DRAM response cycle int dequeue_index = -1; @@ -70,7 +75,7 @@ void Simulator::dbus_driver() { // handle DRAM stalls bool dram_stalled = false; #ifdef ENABLE_DRAM_STALLS - if (0 == ((time_stamp/2) % DRAM_STALLS_MODULO)) { + if (0 == ((timestamp/2) % DRAM_STALLS_MODULO)) { dram_stalled = true; } else if (dram_rsp_vec_.size() >= DRAM_RQ_SIZE) { @@ -114,12 +119,15 @@ void Simulator::io_driver() { && vortex_->io_req_addr == IO_BUS_ADDR_COUT) { uint32_t data_write = (uint32_t)vortex_->io_req_data; char c = (char)data_write; - std::cerr << c; + std::cout << c; } vortex_->io_req_ready = true; } -void Simulator::reset() { +void Simulator::reset() { +#ifndef NDEBUG + std::cout << timestamp << ": [sim] reset()" << std::endl; +#endif vortex_->reset = 1; this->step(); vortex_->reset = 0; @@ -141,9 +149,9 @@ void Simulator::step() { void Simulator::eval() { vortex_->eval(); #ifdef VCD_OUTPUT - trace_->dump(time_stamp); + trace_->dump(timestamp); #endif - ++time_stamp; + ++timestamp; } void Simulator::wait(uint32_t cycles) { @@ -157,6 +165,9 @@ bool Simulator::is_busy() { } void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) { +#ifndef NDEBUG + std::cout << timestamp << ": [sim] flush_caches()" << std::endl; +#endif // align address to LLC block boundaries auto aligned_addr_start = mem_addr / GLOBAL_BLOCK_SIZE; auto aligned_addr_end = (mem_addr + size + GLOBAL_BLOCK_SIZE - 1) / GLOBAL_BLOCK_SIZE; @@ -186,6 +197,10 @@ void Simulator::flush_caches(uint32_t mem_addr, uint32_t size) { } bool Simulator::run() { +#ifndef NDEBUG + std::cout << timestamp << ": [sim] run()" << std::endl; +#endif + // reset the device this->reset(); diff --git a/hw/simulate/simulator.h b/hw/simulate/simulator.h index 92057fe36..6df673322 100644 --- a/hw/simulate/simulator.h +++ b/hw/simulate/simulator.h @@ -52,6 +52,7 @@ private: RAM *ram_; VVortex_Socket *vortex_; + bool enable_; #ifdef VCD_OUTPUT VerilatedVcdC *trace_; #endif diff --git a/runtime/tests/simple/vx_simple_main.elf b/runtime/tests/simple/vx_simple_main.elf index 50626c8a5..136d74fd3 100755 Binary files a/runtime/tests/simple/vx_simple_main.elf and b/runtime/tests/simple/vx_simple_main.elf differ