mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
cumulative fixes, RTL uuid trace, texture unit fixes, simx timing fixes
This commit is contained in:
parent
b995843a5b
commit
41d7e6c63a
79 changed files with 2148 additions and 1372 deletions
|
@ -124,7 +124,17 @@ CONFIGS="-DNUM_CLUSTERS=$CLUSTERS -DNUM_CORES=$CORES -DNUM_WARPS=$WARPS -DNUM_TH
|
|||
|
||||
echo "CONFIGS=$CONFIGS"
|
||||
|
||||
make -C $DRIVER_PATH clean
|
||||
if [ -f "blackbox.cache" ]
|
||||
then
|
||||
LAST_CONFIGS=`cat blackbox.cache`
|
||||
fi
|
||||
|
||||
if [ "$CONFIGS+$DEBUG+$SCOPE" != "$LAST_CONFIGS" ];
|
||||
then
|
||||
make -C $DRIVER_PATH clean
|
||||
fi
|
||||
|
||||
echo "$CONFIGS+$DEBUG+$SCOPE" > blackbox.cache
|
||||
|
||||
status=0
|
||||
|
||||
|
|
|
@ -27,8 +27,11 @@ tex()
|
|||
echo "begin texture tests..."
|
||||
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=vlsim --app=tex --args="-isoccer.png -osoccer_result.png -g0"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-isoccer.png -osoccer_result.png -g0"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-irainbow.png -orainbow_result.png -g2"
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=rtlsim --app=tex --args="-itoad.png -otoad_result.png -g1" --perf
|
||||
CONFIGS="-DEXT_TEX_ENABLE=1" ./ci/blackbox.sh --driver=simx --app=tex --args="-itoad.png -otoad_result.png -g1" --perf
|
||||
|
||||
echo "coverage texture done!"
|
||||
}
|
||||
|
@ -58,7 +61,9 @@ debug()
|
|||
echo "begin debugging tests..."
|
||||
|
||||
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --perf --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=vlsim --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=simx --cores=2 --clusters=2 --l2cache --debug --app=demo --args="-n1"
|
||||
./ci/blackbox.sh --driver=vlsim --cores=1 --scope --app=basic --args="-t0 -n1"
|
||||
|
||||
echo "debugging tests done!"
|
||||
|
@ -73,9 +78,13 @@ CONFIGS=-DEXT_M_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_e
|
|||
|
||||
# disabling F extension
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_mf_ext --perf
|
||||
CONFIGS=-DEXT_F_DISABLE ./ci/blackbox.sh --driver=simx --cores=1 --app=no_mf_ext --perf
|
||||
|
||||
# disable shared memory
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=no_smem --perf
|
||||
CONFIGS=-DSM_ENABLE=0 ./ci/blackbox.sh --driver=simx --cores=1 --app=no_smem --perf
|
||||
|
||||
# using Default FPU core
|
||||
FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
||||
|
|
|
@ -114,11 +114,13 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t csr_stalls = 0;
|
||||
uint64_t alu_stalls = 0;
|
||||
uint64_t gpu_stalls = 0;
|
||||
// PERF: decode
|
||||
uint64_t loads = 0;
|
||||
uint64_t stores = 0;
|
||||
uint64_t branches = 0;
|
||||
// PERF: Icache
|
||||
uint64_t icache_reads = 0;
|
||||
uint64_t icache_read_misses = 0;
|
||||
uint64_t icache_pipe_stalls = 0;
|
||||
uint64_t icache_rsp_stalls = 0;
|
||||
// PERF: Dcache
|
||||
uint64_t dcache_reads = 0;
|
||||
uint64_t dcache_writes = 0;
|
||||
|
@ -126,17 +128,19 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t dcache_write_misses = 0;
|
||||
uint64_t dcache_bank_stalls = 0;
|
||||
uint64_t dcache_mshr_stalls = 0;
|
||||
uint64_t dcache_pipe_stalls = 0;
|
||||
uint64_t dcache_rsp_stalls = 0;
|
||||
// PERF: SMEM
|
||||
// PERF: shared memory
|
||||
uint64_t smem_reads = 0;
|
||||
uint64_t smem_writes = 0;
|
||||
uint64_t smem_bank_stalls = 0;
|
||||
// PERF: memory
|
||||
uint64_t mem_reads = 0;
|
||||
uint64_t mem_writes = 0;
|
||||
uint64_t mem_stalls = 0;
|
||||
uint64_t mem_lat = 0;
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// PERF: texunit
|
||||
uint64_t tex_mem_reads = 0;
|
||||
uint64_t tex_mem_lat = 0;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
uint64_t num_cores;
|
||||
|
@ -196,6 +200,20 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
if (num_cores > 1) fprintf(stream, "PERF: core%d: gpu unit stalls=%ld\n", core_id, gpu_stalls_per_core);
|
||||
gpu_stalls += gpu_stalls_per_core;
|
||||
|
||||
// PERF: decode
|
||||
// loads
|
||||
uint64_t loads_per_core = get_csr_64(staging_ptr, CSR_MPM_LOADS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: loads=%ld\n", core_id, loads_per_core);
|
||||
loads += loads_per_core;
|
||||
// stores
|
||||
uint64_t stores_per_core = get_csr_64(staging_ptr, CSR_MPM_STORES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: stores=%ld\n", core_id, stores_per_core);
|
||||
stores += stores_per_core;
|
||||
// branches
|
||||
uint64_t branches_per_core = get_csr_64(staging_ptr, CSR_MPM_BRANCHES);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: branches=%ld\n", core_id, branches_per_core);
|
||||
branches += branches_per_core;
|
||||
|
||||
// PERF: Icache
|
||||
// total reads
|
||||
uint64_t icache_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_READS);
|
||||
|
@ -204,16 +222,8 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
// read misses
|
||||
uint64_t icache_miss_r_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_MISS_R);
|
||||
int icache_read_hit_ratio = (int)((1.0 - (double(icache_miss_r_per_core) / double(icache_reads_per_core))) * 100);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache read misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache misses=%ld (hit ratio=%d%%)\n", core_id, icache_miss_r_per_core, icache_read_hit_ratio);
|
||||
icache_read_misses += icache_miss_r_per_core;
|
||||
// pipeline stalls
|
||||
uint64_t icache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_PIPE_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache pipeline stalls=%ld\n", core_id, icache_pipe_st_per_core);
|
||||
icache_pipe_stalls += icache_pipe_st_per_core;
|
||||
// response stalls
|
||||
uint64_t icache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_ICACHE_CRSP_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: icache reponse stalls=%ld\n", core_id, icache_crsp_st_per_core);
|
||||
icache_rsp_stalls += icache_crsp_st_per_core;
|
||||
|
||||
// PERF: Dcache
|
||||
// total reads
|
||||
|
@ -243,14 +253,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
uint64_t dcache_mshr_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_MSHR_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache mshr stalls=%ld\n", core_id, dcache_mshr_st_per_core);
|
||||
dcache_mshr_stalls += dcache_mshr_st_per_core;
|
||||
// pipeline stalls
|
||||
uint64_t dcache_pipe_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_PIPE_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache pipeline stalls=%ld\n", core_id, dcache_pipe_st_per_core);
|
||||
dcache_pipe_stalls += dcache_pipe_st_per_core;
|
||||
// response stalls
|
||||
uint64_t dcache_crsp_st_per_core = get_csr_64(staging_ptr, CSR_MPM_DCACHE_CRSP_ST);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: dcache reponse stalls=%ld\n", core_id, dcache_crsp_st_per_core);
|
||||
dcache_rsp_stalls += dcache_crsp_st_per_core;
|
||||
|
||||
// PERF: SMEM
|
||||
// total reads
|
||||
|
@ -270,17 +272,26 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
// PERF: memory
|
||||
uint64_t mem_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_READS);
|
||||
uint64_t mem_writes_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_WRITES);
|
||||
uint64_t mem_stalls_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_ST);
|
||||
uint64_t mem_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_MEM_LAT);
|
||||
int mem_utilization = (int)((double(mem_reads_per_core + mem_writes_per_core) / double(mem_reads_per_core + mem_writes_per_core + mem_stalls_per_core)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat_per_core) / double(mem_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory requests=%ld (reads=%ld, writes=%ld)\n", core_id, (mem_reads_per_core + mem_writes_per_core), mem_reads_per_core, mem_writes_per_core);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory stalls=%ld (utilization=%d%%)\n", core_id, mem_stalls_per_core, mem_utilization);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory average latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: memory latency=%d cycles\n", core_id, mem_avg_lat);
|
||||
mem_reads += mem_reads_per_core;
|
||||
mem_writes += mem_writes_per_core;
|
||||
mem_stalls += mem_stalls_per_core;
|
||||
mem_lat += mem_lat_per_core;
|
||||
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// total reads
|
||||
uint64_t tex_reads_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_READS);
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory reads=%ld\n", core_id, tex_reads_per_core);
|
||||
tex_mem_reads += tex_reads_per_core;
|
||||
|
||||
// read latency
|
||||
uint64_t tex_lat_per_core = get_csr_64(staging_ptr, CSR_MPM_TEX_LAT);
|
||||
int tex_avg_lat = (int)(double(tex_lat_per_core) / double(tex_reads_per_core));
|
||||
if (num_cores > 1) fprintf(stream, "PERF: core%d: tex memory latency=%d cycles\n", core_id, tex_avg_lat);
|
||||
tex_mem_lat += tex_lat_per_core;
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -293,7 +304,6 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
int dcache_write_hit_ratio = (int)((1.0 - (double(dcache_write_misses) / double(dcache_writes))) * 100);
|
||||
int dcache_bank_utilization = (int)((double(dcache_reads + dcache_writes) / double(dcache_reads + dcache_writes + dcache_bank_stalls)) * 100);
|
||||
int smem_bank_utilization = (int)((double(smem_reads + smem_writes) / double(smem_reads + smem_writes + smem_bank_stalls)) * 100);
|
||||
int mem_utilization = (int)((double(mem_reads + mem_writes) / double(mem_reads + mem_writes + mem_stalls)) * 100);
|
||||
int mem_avg_lat = (int)(double(mem_lat) / double(mem_reads));
|
||||
fprintf(stream, "PERF: ibuffer stalls=%ld\n", ibuffer_stalls);
|
||||
fprintf(stream, "PERF: scoreboard stalls=%ld\n", scoreboard_stalls);
|
||||
|
@ -302,24 +312,27 @@ extern int vx_dump_perf(vx_device_h device, FILE* stream) {
|
|||
fprintf(stream, "PERF: csr unit stalls=%ld\n", csr_stalls);
|
||||
fprintf(stream, "PERF: fpu unit stalls=%ld\n", fpu_stalls);
|
||||
fprintf(stream, "PERF: gpu unit stalls=%ld\n", gpu_stalls);
|
||||
fprintf(stream, "PERF: loads=%ld\n", loads);
|
||||
fprintf(stream, "PERF: stores=%ld\n", stores);
|
||||
fprintf(stream, "PERF: branches=%ld\n", branches);
|
||||
fprintf(stream, "PERF: icache reads=%ld\n", icache_reads);
|
||||
fprintf(stream, "PERF: icache read misses=%ld (hit ratio=%d%%)\n", icache_read_misses, icache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: icache pipeline stalls=%ld\n", icache_pipe_stalls);
|
||||
fprintf(stream, "PERF: icache reponse stalls=%ld\n", icache_rsp_stalls);
|
||||
fprintf(stream, "PERF: dcache reads=%ld\n", dcache_reads);
|
||||
fprintf(stream, "PERF: dcache writes=%ld\n", dcache_writes);
|
||||
fprintf(stream, "PERF: dcache read misses=%ld (hit ratio=%d%%)\n", dcache_read_misses, dcache_read_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache write misses=%ld (hit ratio=%d%%)\n", dcache_write_misses, dcache_write_hit_ratio);
|
||||
fprintf(stream, "PERF: dcache bank stalls=%ld (utilization=%d%%)\n", dcache_bank_stalls, dcache_bank_utilization);
|
||||
fprintf(stream, "PERF: dcache mshr stalls=%ld\n", dcache_mshr_stalls);
|
||||
fprintf(stream, "PERF: dcache pipeline stalls=%ld\n", dcache_pipe_stalls);
|
||||
fprintf(stream, "PERF: dcache reponse stalls=%ld\n", dcache_rsp_stalls);
|
||||
fprintf(stream, "PERF: smem reads=%ld\n", smem_reads);
|
||||
fprintf(stream, "PERF: smem writes=%ld\n", smem_writes);
|
||||
fprintf(stream, "PERF: smem bank stalls=%ld (utilization=%d%%)\n", smem_bank_stalls, smem_bank_utilization);
|
||||
fprintf(stream, "PERF: memory requests=%ld (reads=%ld, writes=%ld)\n", (mem_reads + mem_writes), mem_reads, mem_writes);
|
||||
fprintf(stream, "PERF: memory stalls=%ld (utilization=%d%%)\n", mem_stalls, mem_utilization);
|
||||
fprintf(stream, "PERF: memory average latency=%d cycles\n", mem_avg_lat);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
int tex_avg_lat = (int)(double(tex_mem_lat) / double(tex_mem_reads));
|
||||
fprintf(stream, "PERF: tex memory reads=%ld\n", tex_mem_reads);
|
||||
fprintf(stream, "PERF: tex memory latency=%d cycles\n", tex_avg_lat);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// release allocated resources
|
||||
|
|
|
@ -96,6 +96,7 @@ module VX_alu_unit #(
|
|||
wire alu_ready_in;
|
||||
wire alu_valid_out;
|
||||
wire alu_ready_out;
|
||||
wire [63:0] alu_uuid;
|
||||
wire [`NW_BITS-1:0] alu_wid;
|
||||
wire [`NUM_THREADS-1:0] alu_tmask;
|
||||
wire [31:0] alu_PC;
|
||||
|
@ -112,14 +113,14 @@ module VX_alu_unit #(
|
|||
assign alu_ready_in = alu_ready_out || ~alu_valid_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `INST_BR_BITS + 1 + 1 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (alu_ready_in),
|
||||
.data_in ({alu_valid_in, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
|
||||
.data_out ({alu_valid_out, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
|
||||
.data_in ({alu_valid_in, alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.rd, alu_req_if.wb, alu_jal_result, is_br_op, br_op, is_less, is_equal, br_dest}),
|
||||
.data_out ({alu_valid_out, alu_uuid, alu_wid, alu_tmask, alu_PC, alu_rd, alu_wb, alu_data, is_br_op_r, br_op_r, is_less_r, is_equal_r, br_dest_r})
|
||||
);
|
||||
|
||||
`UNUSED_VAR (br_op_r)
|
||||
|
@ -138,6 +139,7 @@ module VX_alu_unit #(
|
|||
wire mul_ready_in;
|
||||
wire mul_valid_out;
|
||||
wire mul_ready_out;
|
||||
wire [63:0] mul_uuid;
|
||||
wire [`NW_BITS-1:0] mul_wid;
|
||||
wire [`NUM_THREADS-1:0] mul_tmask;
|
||||
wire [31:0] mul_PC;
|
||||
|
@ -153,6 +155,7 @@ module VX_alu_unit #(
|
|||
|
||||
// Inputs
|
||||
.alu_op (mul_op),
|
||||
.uuid_in (alu_req_if.uuid),
|
||||
.wid_in (alu_req_if.wid),
|
||||
.tmask_in (alu_req_if.tmask),
|
||||
.PC_in (alu_req_if.PC),
|
||||
|
@ -163,6 +166,7 @@ module VX_alu_unit #(
|
|||
|
||||
// Outputs
|
||||
.wid_out (mul_wid),
|
||||
.uuid_out (mul_uuid),
|
||||
.tmask_out (mul_tmask),
|
||||
.PC_out (mul_PC),
|
||||
.rd_out (mul_rd),
|
||||
|
@ -184,6 +188,7 @@ module VX_alu_unit #(
|
|||
assign mul_valid_in = alu_req_if.valid && is_mul_op;
|
||||
|
||||
assign alu_commit_if.valid = alu_valid_out || mul_valid_out;
|
||||
assign alu_commit_if.uuid = alu_valid_out ? alu_uuid : mul_uuid;
|
||||
assign alu_commit_if.wid = alu_valid_out ? alu_wid : mul_wid;
|
||||
assign alu_commit_if.tmask = alu_valid_out ? alu_tmask : mul_tmask;
|
||||
assign alu_commit_if.PC = alu_valid_out ? alu_PC : mul_PC;
|
||||
|
@ -201,6 +206,7 @@ module VX_alu_unit #(
|
|||
assign alu_valid_in = alu_req_if.valid;
|
||||
|
||||
assign alu_commit_if.valid = alu_valid_out;
|
||||
assign alu_commit_if.uuid = alu_uuid;
|
||||
assign alu_commit_if.wid = alu_wid;
|
||||
assign alu_commit_if.tmask = alu_tmask;
|
||||
assign alu_commit_if.PC = alu_PC;
|
||||
|
@ -220,8 +226,8 @@ module VX_alu_unit #(
|
|||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (branch_ctl_if.valid) begin
|
||||
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h\n",
|
||||
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest);
|
||||
dpi_trace("%d: core%0d-branch: wid=%0d, PC=%0h, taken=%b, dest=%0h (#%0d)\n",
|
||||
$time, CORE_ID, branch_ctl_if.wid, alu_commit_if.PC, branch_ctl_if.taken, branch_ctl_if.dest, alu_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -40,27 +40,35 @@ module VX_commit #(
|
|||
`endif
|
||||
|| gpu_commit_fire;
|
||||
|
||||
wire [`NUM_THREADS-1:0] commit_tmask;
|
||||
assign commit_tmask = alu_commit_fire ? alu_commit_if.tmask:
|
||||
ld_commit_fire ? ld_commit_if.tmask:
|
||||
st_commit_fire ? st_commit_if.tmask:
|
||||
csr_commit_fire ? csr_commit_if.tmask:
|
||||
`ifdef EXT_F_ENABLE
|
||||
fpu_commit_fire ? fpu_commit_if.tmask:
|
||||
`endif
|
||||
/*gpu_commit_fire ?*/ gpu_commit_if.tmask;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [(6*`NUM_THREADS)-1:0] commit_tmask;
|
||||
`else
|
||||
wire [(5*`NUM_THREADS)-1:0] commit_tmask;
|
||||
`endif
|
||||
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] commit_cnt;
|
||||
`POP_COUNT(commit_cnt, commit_tmask);
|
||||
wire [$clog2($bits(commit_tmask)+1)-1:0] commit_size;
|
||||
|
||||
assign commit_tmask = {
|
||||
{`NUM_THREADS{alu_commit_fire}} & alu_commit_if.tmask,
|
||||
{`NUM_THREADS{ld_commit_fire}} & ld_commit_if.tmask,
|
||||
{`NUM_THREADS{st_commit_fire}} & st_commit_if.tmask,
|
||||
{`NUM_THREADS{csr_commit_fire}} & csr_commit_if.tmask,
|
||||
`ifdef EXT_F_ENABLE
|
||||
{`NUM_THREADS{fpu_commit_fire}} & fpu_commit_if.tmask,
|
||||
`endif
|
||||
{`NUM_THREADS{gpu_commit_fire}} & gpu_commit_if.tmask
|
||||
};
|
||||
|
||||
`POP_COUNT(commit_size, commit_tmask);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + $clog2(`NUM_THREADS+1)),
|
||||
.DATAW (1 + $bits(commit_size)),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (1'b1),
|
||||
.data_in ({commit_fire, commit_cnt}),
|
||||
.data_in ({commit_fire, commit_size}),
|
||||
.data_out ({cmt_to_csr_if.valid, cmt_to_csr_if.commit_size})
|
||||
);
|
||||
|
||||
|
@ -90,32 +98,32 @@ module VX_commit #(
|
|||
if (alu_commit_if.valid && alu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.wb, alu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(alu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", alu_commit_if.uuid);
|
||||
end
|
||||
if (ld_commit_if.valid && ld_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.wb, ld_commit_if.rd);
|
||||
`TRACE_ARRAY1D(ld_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", ld_commit_if.uuid);
|
||||
end
|
||||
if (st_commit_if.valid && st_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd);
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d (#%0d)\n", $time, CORE_ID, st_commit_if.wid, st_commit_if.PC, st_commit_if.tmask, st_commit_if.wb, st_commit_if.rd, st_commit_if.uuid);
|
||||
end
|
||||
if (csr_commit_if.valid && csr_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.wb, csr_commit_if.rd);
|
||||
`TRACE_ARRAY1D(csr_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", csr_commit_if.uuid);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.wb, fpu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(fpu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", fpu_commit_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
|
||||
dpi_trace("%d: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.wb, gpu_commit_if.rd);
|
||||
`TRACE_ARRAY1D(gpu_commit_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", gpu_commit_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -171,48 +171,50 @@
|
|||
`define CSR_MPM_FPU_ST_H 12'hB88
|
||||
`define CSR_MPM_GPU_ST 12'hB09
|
||||
`define CSR_MPM_GPU_ST_H 12'hB89
|
||||
// PERF: decode
|
||||
`define CSR_MPM_LOADS 12'hB0A
|
||||
`define CSR_MPM_LOADS_H 12'hB8A
|
||||
`define CSR_MPM_STORES 12'hB0B
|
||||
`define CSR_MPM_STORES_H 12'hB8B
|
||||
`define CSR_MPM_BRANCHES 12'hB0C
|
||||
`define CSR_MPM_BRANCHES_H 12'hB8C
|
||||
// PERF: icache
|
||||
`define CSR_MPM_ICACHE_READS 12'hB0A // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB8A
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB0B // total misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8B
|
||||
`define CSR_MPM_ICACHE_PIPE_ST 12'hB0C // pipeline stalls
|
||||
`define CSR_MPM_ICACHE_PIPE_ST_H 12'hB8C
|
||||
`define CSR_MPM_ICACHE_CRSP_ST 12'hB0D // core response stalls
|
||||
`define CSR_MPM_ICACHE_CRSP_ST_H 12'hB8D
|
||||
`define CSR_MPM_ICACHE_READS 12'hB0D // total reads
|
||||
`define CSR_MPM_ICACHE_READS_H 12'hB8D
|
||||
`define CSR_MPM_ICACHE_MISS_R 12'hB0E // read misses
|
||||
`define CSR_MPM_ICACHE_MISS_R_H 12'hB8E
|
||||
// PERF: dcache
|
||||
`define CSR_MPM_DCACHE_READS 12'hB0E // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB8E
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB0F // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB8F
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB10 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB90
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB11 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB91
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB12 // bank conflicts stalls
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB92
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB13 // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB93
|
||||
`define CSR_MPM_DCACHE_PIPE_ST 12'hB14 // pipeline stalls
|
||||
`define CSR_MPM_DCACHE_PIPE_ST_H 12'hB94
|
||||
`define CSR_MPM_DCACHE_CRSP_ST 12'hB15 // core response stalls
|
||||
`define CSR_MPM_DCACHE_CRSP_ST_H 12'hB95
|
||||
`define CSR_MPM_DCACHE_READS 12'hB0F // total reads
|
||||
`define CSR_MPM_DCACHE_READS_H 12'hB8F
|
||||
`define CSR_MPM_DCACHE_WRITES 12'hB10 // total writes
|
||||
`define CSR_MPM_DCACHE_WRITES_H 12'hB90
|
||||
`define CSR_MPM_DCACHE_MISS_R 12'hB11 // read misses
|
||||
`define CSR_MPM_DCACHE_MISS_R_H 12'hB91
|
||||
`define CSR_MPM_DCACHE_MISS_W 12'hB12 // write misses
|
||||
`define CSR_MPM_DCACHE_MISS_W_H 12'hB92
|
||||
`define CSR_MPM_DCACHE_BANK_ST 12'hB13 // bank conflicts
|
||||
`define CSR_MPM_DCACHE_BANK_ST_H 12'hB93
|
||||
`define CSR_MPM_DCACHE_MSHR_ST 12'hB14 // MSHR stalls
|
||||
`define CSR_MPM_DCACHE_MSHR_ST_H 12'hB94
|
||||
// PERF: smem
|
||||
`define CSR_MPM_SMEM_READS 12'hB16 // total reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB96
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB17 // total writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB97
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB18 // bank conflicts stalls
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB98
|
||||
`define CSR_MPM_SMEM_READS 12'hB15 // total reads
|
||||
`define CSR_MPM_SMEM_READS_H 12'hB95
|
||||
`define CSR_MPM_SMEM_WRITES 12'hB16 // total writes
|
||||
`define CSR_MPM_SMEM_WRITES_H 12'hB96
|
||||
`define CSR_MPM_SMEM_BANK_ST 12'hB17 // bank conflicts
|
||||
`define CSR_MPM_SMEM_BANK_ST_H 12'hB97
|
||||
// PERF: memory
|
||||
`define CSR_MPM_MEM_READS 12'hB19 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB99
|
||||
`define CSR_MPM_MEM_WRITES 12'hB1A // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB9A
|
||||
`define CSR_MPM_MEM_ST 12'hB1B // memory request stalls
|
||||
`define CSR_MPM_MEM_ST_H 12'hB9B
|
||||
`define CSR_MPM_MEM_LAT 12'hB1C // memory latency (total)
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9C
|
||||
`define CSR_MPM_MEM_READS 12'hB18 // memory reads
|
||||
`define CSR_MPM_MEM_READS_H 12'hB98
|
||||
`define CSR_MPM_MEM_WRITES 12'hB19 // memory writes
|
||||
`define CSR_MPM_MEM_WRITES_H 12'hB99
|
||||
`define CSR_MPM_MEM_LAT 12'hB1A // memory latency
|
||||
`define CSR_MPM_MEM_LAT_H 12'hB9A
|
||||
// PERF: texunit
|
||||
`define CSR_MPM_TEX_READS 12'hB1B // texture accesses
|
||||
`define CSR_MPM_TEX_READS_H 12'hB9B
|
||||
`define CSR_MPM_TEX_LAT 12'hB1C // texture latency
|
||||
`define CSR_MPM_TEX_LAT_H 12'hB9C
|
||||
|
||||
// Machine Information Registers
|
||||
`define CSR_MVENDORID 12'hF11
|
||||
|
@ -254,12 +256,22 @@
|
|||
`define TEX_STATE_WRAPU 5
|
||||
`define TEX_STATE_WRAPV 6
|
||||
`define TEX_STATE_MIPOFF(lod) (7+(lod))
|
||||
`define NUM_TEX_STATES (`TEX_STATE_MIPOFF(`TEX_LOD_MAX)+1)
|
||||
|
||||
`define NUM_TEX_STATES (7+`TEX_LOD_MAX)
|
||||
`define CSR_TEX_UNIT 12'hFD0
|
||||
|
||||
`define CSR_TEX(unit,state) (12'hFD0 + ((unit) * `NUM_TEX_STATES) + (state))
|
||||
`define CSR_TEX_UNIT(csr) (((csr) - 12'hFD0) / `NUM_TEX_STATES)
|
||||
`define CSR_TEX_STATE(csr) (((csr) - 12'hFD0) % `NUM_TEX_STATES)
|
||||
`define CSR_TEX_STATE_BEGIN 12'hFD1
|
||||
`define CSR_TEX_ADDR (`CSR_TEX_STATE_BEGIN+`TEX_STATE_ADDR)
|
||||
`define CSR_TEX_WIDTH (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WIDTH)
|
||||
`define CSR_TEX_HEIGHT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_HEIGHT)
|
||||
`define CSR_TEX_FORMAT (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FORMAT)
|
||||
`define CSR_TEX_FILTER (`CSR_TEX_STATE_BEGIN+`TEX_STATE_FILTER)
|
||||
`define CSR_TEX_WRAPU (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPU)
|
||||
`define CSR_TEX_WRAPV (`CSR_TEX_STATE_BEGIN+`TEX_STATE_WRAPV)
|
||||
`define CSR_TEX_MIPOFF(lod) (`CSR_TEX_STATE_BEGIN+`TEX_STATE_MIPOFF(lod))
|
||||
`define CSR_TEX_STATE_END (`CSR_TEX_STATE_BEGIN + `NUM_TEX_STATES)
|
||||
|
||||
`define CSR_TEX_STATE(addr) ((addr) - `CSR_TEX_STATE_BEGIN)
|
||||
|
||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
|
@ -7,6 +7,9 @@ module VX_csr_data #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_perf_tex_if.slave perf_tex_if,
|
||||
`endif
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
@ -22,11 +25,13 @@ module VX_csr_data #(
|
|||
`endif
|
||||
|
||||
input wire read_enable,
|
||||
input wire [63:0] read_uuid,
|
||||
input wire[`CSR_ADDR_BITS-1:0] read_addr,
|
||||
input wire[`NW_BITS-1:0] read_wid,
|
||||
output wire[31:0] read_data,
|
||||
|
||||
input wire write_enable,
|
||||
input wire [63:0] write_uuid,
|
||||
input wire[`CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire[`NW_BITS-1:0] write_wid,
|
||||
input wire[31:0] write_data,
|
||||
|
@ -56,7 +61,7 @@ module VX_csr_data #(
|
|||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_to_csr_if.write_enable) begin
|
||||
fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0] <= fcsr[fpu_to_csr_if.write_wid][`FFLAGS_BITS-1:0]
|
||||
| fpu_to_csr_if.write_fflags;
|
||||
| fpu_to_csr_if.write_fflags;
|
||||
end
|
||||
`endif
|
||||
if (write_enable) begin
|
||||
|
@ -75,11 +80,12 @@ module VX_csr_data #(
|
|||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
default: begin
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ASSERT(write_addr >= `CSR_TEX(0,0)
|
||||
&& write_addr < `CSR_TEX(`NUM_TEX_UNITS, 0),
|
||||
("%t: invalid CSR write address: %0h", $time, write_addr));
|
||||
`ASSERT((write_addr == `CSR_TEX_UNIT)
|
||||
|| (write_addr >= `CSR_TEX_STATE_BEGIN
|
||||
&& write_addr < `CSR_TEX_STATE_END),
|
||||
("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
|
||||
`else
|
||||
`ASSERT(~write_enable, ("%t: invalid CSR write address: %0h", $time, write_addr));
|
||||
`ASSERT(~write_enable, ("%t: *** invalid CSR write address: %0h (#%0d)", $time, write_addr, write_uuid));
|
||||
`endif
|
||||
end
|
||||
endcase
|
||||
|
@ -152,20 +158,28 @@ module VX_csr_data #(
|
|||
`CSR_MPM_LSU_ST_H : read_data_r = 32'(perf_pipeline_if.lsu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_CSR_ST : read_data_r = perf_pipeline_if.csr_stalls[31:0];
|
||||
`CSR_MPM_CSR_ST_H : read_data_r = 32'(perf_pipeline_if.csr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_MPM_FPU_ST : read_data_r = perf_pipeline_if.fpu_stalls[31:0];
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = 32'(perf_pipeline_if.fpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`else
|
||||
`CSR_MPM_FPU_ST : read_data_r = '0;
|
||||
`CSR_MPM_FPU_ST_H : read_data_r = '0;
|
||||
`endif
|
||||
`CSR_MPM_GPU_ST : read_data_r = perf_pipeline_if.gpu_stalls[31:0];
|
||||
`CSR_MPM_GPU_ST_H : read_data_r = 32'(perf_pipeline_if.gpu_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: decode
|
||||
`CSR_MPM_LOADS : read_data_r = perf_pipeline_if.loads[31:0];
|
||||
`CSR_MPM_LOADS_H : read_data_r = 32'(perf_pipeline_if.loads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_STORES : read_data_r = perf_pipeline_if.stores[31:0];
|
||||
`CSR_MPM_STORES_H : read_data_r = 32'(perf_pipeline_if.stores[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_BRANCHES : read_data_r = perf_pipeline_if.branches[31:0];
|
||||
`CSR_MPM_BRANCHES_H : read_data_r = 32'(perf_pipeline_if.branches[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: icache
|
||||
`CSR_MPM_ICACHE_READS : read_data_r = perf_memsys_if.icache_reads[31:0];
|
||||
`CSR_MPM_ICACHE_READS_H : read_data_r = 32'(perf_memsys_if.icache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_MISS_R : read_data_r = perf_memsys_if.icache_read_misses[31:0];
|
||||
`CSR_MPM_ICACHE_MISS_R_H : read_data_r = 32'(perf_memsys_if.icache_read_misses[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_PIPE_ST : read_data_r = perf_memsys_if.icache_pipe_stalls[31:0];
|
||||
`CSR_MPM_ICACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.icache_pipe_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_ICACHE_CRSP_ST : read_data_r = perf_memsys_if.icache_crsp_stalls[31:0];
|
||||
`CSR_MPM_ICACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.icache_crsp_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: dcache
|
||||
// PERF: dcache
|
||||
`CSR_MPM_DCACHE_READS : read_data_r = perf_memsys_if.dcache_reads[31:0];
|
||||
`CSR_MPM_DCACHE_READS_H : read_data_r = 32'(perf_memsys_if.dcache_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_WRITES : read_data_r = perf_memsys_if.dcache_writes[31:0];
|
||||
|
@ -178,26 +192,27 @@ module VX_csr_data #(
|
|||
`CSR_MPM_DCACHE_BANK_ST_H : read_data_r = 32'(perf_memsys_if.dcache_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_MSHR_ST : read_data_r = perf_memsys_if.dcache_mshr_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_MSHR_ST_H : read_data_r = 32'(perf_memsys_if.dcache_mshr_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_PIPE_ST : read_data_r = perf_memsys_if.dcache_pipe_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_PIPE_ST_H : read_data_r = 32'(perf_memsys_if.dcache_pipe_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_DCACHE_CRSP_ST : read_data_r = perf_memsys_if.dcache_crsp_stalls[31:0];
|
||||
`CSR_MPM_DCACHE_CRSP_ST_H : read_data_r = 32'(perf_memsys_if.dcache_crsp_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: smem
|
||||
// PERF: smem
|
||||
`CSR_MPM_SMEM_READS : read_data_r = perf_memsys_if.smem_reads[31:0];
|
||||
`CSR_MPM_SMEM_READS_H : read_data_r = 32'(perf_memsys_if.smem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_WRITES : read_data_r = perf_memsys_if.smem_writes[31:0];
|
||||
`CSR_MPM_SMEM_WRITES_H : read_data_r = 32'(perf_memsys_if.smem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_SMEM_BANK_ST : read_data_r = perf_memsys_if.smem_bank_stalls[31:0];
|
||||
`CSR_MPM_SMEM_BANK_ST_H : read_data_r = 32'(perf_memsys_if.smem_bank_stalls[`PERF_CTR_BITS-1:32]);
|
||||
// PERF: MEM
|
||||
// PERF: memory
|
||||
`CSR_MPM_MEM_READS : read_data_r = perf_memsys_if.mem_reads[31:0];
|
||||
`CSR_MPM_MEM_READS_H : read_data_r = 32'(perf_memsys_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_WRITES : read_data_r = perf_memsys_if.mem_writes[31:0];
|
||||
`CSR_MPM_MEM_WRITES_H : read_data_r = 32'(perf_memsys_if.mem_writes[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_ST : read_data_r = perf_memsys_if.mem_stalls[31:0];
|
||||
`CSR_MPM_MEM_ST_H : read_data_r = 32'(perf_memsys_if.mem_stalls[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_MEM_LAT : read_data_r = perf_memsys_if.mem_latency[31:0];
|
||||
`CSR_MPM_MEM_LAT_H : read_data_r = 32'(perf_memsys_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
// PERF: texunit
|
||||
`CSR_MPM_TEX_READS : read_data_r = perf_tex_if.mem_reads[31:0];
|
||||
`CSR_MPM_TEX_READS_H : read_data_r = 32'(perf_tex_if.mem_reads[`PERF_CTR_BITS-1:32]);
|
||||
`CSR_MPM_TEX_LAT : read_data_r = perf_tex_if.mem_latency[31:0];
|
||||
`CSR_MPM_TEX_LAT_H : read_data_r = 32'(perf_tex_if.mem_latency[`PERF_CTR_BITS-1:32]);
|
||||
`endif
|
||||
// PERF: reserved
|
||||
`CSR_MPM_RESERVED : read_data_r = '0;
|
||||
`CSR_MPM_RESERVED_H : read_data_r = '0;
|
||||
|
@ -227,7 +242,9 @@ module VX_csr_data #(
|
|||
read_addr_valid_r = 1;
|
||||
end else
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
if (read_addr >= `CSR_TEX(0,0) && read_addr < `CSR_TEX(`NUM_TEX_UNITS,0)) begin
|
||||
if ((read_addr == `CSR_TEX_UNIT)
|
||||
|| (read_addr >= `CSR_TEX_STATE_BEGIN
|
||||
&& read_addr < `CSR_TEX_STATE_END)) begin
|
||||
read_addr_valid_r = 1;
|
||||
end else
|
||||
`endif
|
||||
|
@ -236,7 +253,7 @@ module VX_csr_data #(
|
|||
endcase
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("invalid CSR read address: %0h", read_addr))
|
||||
`RUNTIME_ASSERT(~read_enable || read_addr_valid_r, ("%t: *** invalid CSR read address: %0h (#%0d)", $time, read_addr, read_uuid))
|
||||
|
||||
assign read_data = read_data_r;
|
||||
|
||||
|
|
|
@ -7,6 +7,9 @@ module VX_csr_unit #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_perf_tex_if.slave perf_tex_if,
|
||||
`endif
|
||||
VX_perf_memsys_if.slave perf_memsys_if,
|
||||
VX_perf_pipeline_if.slave perf_pipeline_if,
|
||||
`endif
|
||||
|
@ -29,7 +32,8 @@ module VX_csr_unit #(
|
|||
);
|
||||
wire csr_we_s1;
|
||||
wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
|
||||
wire [31:0] csr_read_data, csr_read_data_s1;
|
||||
wire [31:0] csr_read_data;
|
||||
wire [31:0] csr_read_data_s1;
|
||||
wire [31:0] csr_updated_data_s1;
|
||||
|
||||
wire write_enable = csr_commit_if.valid && csr_we_s1;
|
||||
|
@ -42,8 +46,11 @@ module VX_csr_unit #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if (perf_pipeline_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fetch_to_csr_if(fetch_to_csr_if),
|
||||
|
@ -54,10 +61,12 @@ module VX_csr_unit #(
|
|||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.read_enable (csr_req_if.valid),
|
||||
.read_uuid (csr_req_if.uuid),
|
||||
.read_addr (csr_req_if.addr),
|
||||
.read_wid (csr_req_if.wid),
|
||||
.read_data (csr_read_data),
|
||||
.write_enable (write_enable),
|
||||
.write_uuid (csr_commit_if.uuid),
|
||||
.write_addr (csr_addr_s1),
|
||||
.write_wid (csr_commit_if.wid),
|
||||
.write_data (csr_updated_data_s1),
|
||||
|
@ -101,14 +110,14 @@ module VX_csr_unit #(
|
|||
wire stall_out = ~csr_commit_if.ready && csr_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 32 + 32),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({csr_req_valid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
|
||||
.data_out ({csr_commit_if.valid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
|
||||
.data_in ({csr_req_valid, csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.rd, csr_req_if.wb, csr_we_s0_unqual, csr_req_if.addr, csr_read_data_qual, csr_updated_data}),
|
||||
.data_out ({csr_commit_if.valid, csr_commit_if.uuid, csr_commit_if.wid, csr_commit_if.tmask, csr_commit_if.PC, csr_commit_if.rd, csr_commit_if.wb, csr_we_s1, csr_addr_s1, csr_read_data_s1, csr_updated_data_s1})
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
|
|
|
@ -20,6 +20,10 @@ module VX_decode #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.decode perf_decode_if,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
VX_ifetch_rsp_if.slave ifetch_rsp_if,
|
||||
|
||||
|
@ -57,7 +61,6 @@ module VX_decode #(
|
|||
wire [11:0] s_imm = {func7, rd};
|
||||
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
|
||||
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
|
||||
wire [11:0] jalr_imm = {func7, rs2};
|
||||
|
||||
`UNUSED_VAR (rs3)
|
||||
|
||||
|
@ -169,7 +172,7 @@ module VX_decode #(
|
|||
use_rd = 1;
|
||||
use_imm = 1;
|
||||
is_wstall = 1;
|
||||
imm = {{20{jalr_imm[11]}}, jalr_imm};
|
||||
imm = {{20{u_12[11]}}, u_12};
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
|
@ -192,7 +195,7 @@ module VX_decode #(
|
|||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`INST_F: begin
|
||||
`INST_FENCE: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_mod = `INST_MOD_BITS'(1);
|
||||
end
|
||||
|
@ -411,6 +414,7 @@ module VX_decode #(
|
|||
wire wb = use_rd && (| rd_r);
|
||||
|
||||
assign decode_if.valid = ifetch_rsp_if.valid;
|
||||
assign decode_if.uuid = ifetch_rsp_if.uuid;
|
||||
assign decode_if.wid = ifetch_rsp_if.wid;
|
||||
assign decode_if.tmask = ifetch_rsp_if.tmask;
|
||||
assign decode_if.PC = ifetch_rsp_if.PC;
|
||||
|
@ -439,6 +443,42 @@ module VX_decode #(
|
|||
|
||||
assign ifetch_rsp_if.ready = decode_if.ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_loads_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_stores_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_branches_per_cycle;
|
||||
|
||||
wire [`NUM_THREADS-1:0] perf_loads_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_stores_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_LSU && `INST_LSU_IS_MEM(decode_if.op_mod) && ~decode_if.wb}};
|
||||
wire [`NUM_THREADS-1:0] perf_branches_per_mask = decode_if.tmask & {`NUM_THREADS{decode_if.ex_type == `EX_ALU && `INST_ALU_IS_BR(decode_if.op_mod)}};
|
||||
|
||||
`POP_COUNT(perf_loads_per_cycle, perf_loads_per_mask);
|
||||
`POP_COUNT(perf_stores_per_cycle, perf_stores_per_mask);
|
||||
`POP_COUNT(perf_branches_per_cycle, perf_branches_per_mask);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_branches;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_loads <= 0;
|
||||
perf_stores <= 0;
|
||||
perf_branches <= 0;
|
||||
end else begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_loads_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_stores_per_cycle);
|
||||
perf_branches <= perf_branches + `PERF_CTR_BITS'(perf_branches_per_cycle);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_decode_if.loads = perf_loads;
|
||||
assign perf_decode_if.stores = perf_stores;
|
||||
assign perf_decode_if.branches = perf_branches;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
if (decode_if.valid && decode_if.ready) begin
|
||||
|
@ -446,7 +486,8 @@ module VX_decode #(
|
|||
trace_ex_type(decode_if.ex_type);
|
||||
dpi_trace(", op=");
|
||||
trace_ex_op(decode_if.ex_type, decode_if.op_type, decode_if.op_mod);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b\n", decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm);
|
||||
dpi_trace(", mod=%0d, tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b (#%0d)\n",
|
||||
decode_if.op_mod, decode_if.tmask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.use_PC, decode_if.use_imm, decode_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -55,7 +55,7 @@
|
|||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_F 7'b0001111 // Fence instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
|
@ -155,6 +155,7 @@
|
|||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(x) x[2:0]
|
||||
`define INST_LSU_WSIZE(x) x[1:0]
|
||||
`define INST_LSU_IS_MEM(x) (3'h0 == x)
|
||||
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
|
||||
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
|
||||
|
||||
|
|
|
@ -42,15 +42,15 @@ module VX_dispatch (
|
|||
wire [`INST_ALU_BITS-1:0] alu_op_type = `INST_ALU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
|
||||
.DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_ALU_BITS + `INST_MOD_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS + (2 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) alu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (alu_req_valid),
|
||||
.ready_in (alu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
|
||||
.data_out ({alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, alu_op_type, ibuffer_if.op_mod, ibuffer_if.imm, ibuffer_if.use_PC, ibuffer_if.use_imm, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
|
||||
.data_out ({alu_req_if.uuid, alu_req_if.wid, alu_req_if.tmask, alu_req_if.PC, alu_req_if.next_PC, alu_req_if.op_type, alu_req_if.op_mod, alu_req_if.imm, alu_req_if.use_PC, alu_req_if.use_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, alu_req_if.rs1_data, alu_req_if.rs2_data}),
|
||||
.valid_out (alu_req_if.valid),
|
||||
.ready_out (alu_req_if.ready)
|
||||
);
|
||||
|
@ -63,15 +63,15 @@ module VX_dispatch (
|
|||
wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
.DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
.OUT_REG (1)
|
||||
) lsu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (lsu_req_valid),
|
||||
.ready_in (lsu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
|
||||
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
|
||||
.data_out ({lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
|
||||
.valid_out (lsu_req_if.valid),
|
||||
.ready_out (lsu_req_if.ready)
|
||||
);
|
||||
|
@ -85,15 +85,15 @@ module VX_dispatch (
|
|||
wire [31:0] csr_rs1_data = gpr_rsp_if.rs1_data[tid];
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
|
||||
.DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1 + `NRI_BITS + 32),
|
||||
.OUT_REG (1)
|
||||
) csr_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (csr_req_valid),
|
||||
.ready_in (csr_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
|
||||
.data_out ({csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, csr_op_type, csr_addr, ibuffer_if.rd, ibuffer_if.wb, ibuffer_if.use_imm, csr_imm, csr_rs1_data}),
|
||||
.data_out ({csr_req_if.uuid, csr_req_if.wid, csr_req_if.tmask, csr_req_if.PC, csr_req_if.op_type, csr_req_if.addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.use_imm, csr_req_if.imm, csr_req_if.rs1_data}),
|
||||
.valid_out (csr_req_if.valid),
|
||||
.ready_out (csr_req_if.ready)
|
||||
);
|
||||
|
@ -105,15 +105,15 @@ module VX_dispatch (
|
|||
wire [`INST_FPU_BITS-1:0] fpu_op_type = `INST_FPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
|
||||
.DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `INST_FPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) fpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (fpu_req_valid),
|
||||
.ready_in (fpu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, fpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.op_type, fpu_req_if.op_mod, fpu_req_if.rd, fpu_req_if.wb, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
|
||||
.valid_out (fpu_req_if.valid),
|
||||
.ready_out (fpu_req_if.ready)
|
||||
);
|
||||
|
@ -127,15 +127,15 @@ module VX_dispatch (
|
|||
wire [`INST_GPU_BITS-1:0] gpu_op_type = `INST_GPU_BITS'(ibuffer_if.op_type);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
|
||||
.DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + 32 + `INST_GPU_BITS + `INST_MOD_BITS + `NR_BITS + 1 + `NT_BITS + (3 * `NUM_THREADS * 32)),
|
||||
.OUT_REG (1)
|
||||
) gpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpu_req_valid),
|
||||
.ready_in (gpu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.data_in ({ibuffer_if.uuid, ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, gpu_op_type, ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.uuid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.valid_out (gpu_req_if.valid),
|
||||
.ready_out (gpu_req_if.ready)
|
||||
);
|
||||
|
|
|
@ -75,6 +75,10 @@ module VX_execute #(
|
|||
|
||||
VX_tex_csr_if tex_csr_if();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if perf_tex_if();
|
||||
`endif
|
||||
|
||||
VX_cache_arb #(
|
||||
.NUM_REQS (2),
|
||||
.LANES (`NUM_THREADS),
|
||||
|
@ -165,6 +169,9 @@ module VX_execute #(
|
|||
.clk (clk),
|
||||
.reset (csr_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if(perf_pipeline_if),
|
||||
`endif
|
||||
|
@ -209,6 +216,9 @@ module VX_execute #(
|
|||
.reset (gpu_reset),
|
||||
.gpu_req_if (gpu_req_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.dcache_req_if (tex_dcache_req_if),
|
||||
.dcache_rsp_if (tex_dcache_rsp_if),
|
||||
|
|
|
@ -22,6 +22,7 @@ module VX_fpu_unit #(
|
|||
wire valid_out;
|
||||
wire ready_out;
|
||||
|
||||
wire [63:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
|
@ -39,7 +40,7 @@ module VX_fpu_unit #(
|
|||
wire fpuq_pop = valid_out && ready_out;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
|
||||
.DATAW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
|
||||
.SIZE (`FPUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
|
@ -48,8 +49,8 @@ module VX_fpu_unit #(
|
|||
.write_addr (tag_in),
|
||||
.read_addr (tag_out),
|
||||
.release_addr (tag_out),
|
||||
.write_data ({fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}),
|
||||
.read_data ({rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
|
||||
.write_data ({fpu_req_if.uuid, fpu_req_if.wid, fpu_req_if.tmask, fpu_req_if.PC, fpu_req_if.rd, fpu_req_if.wb}),
|
||||
.read_data ({rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb}),
|
||||
.release_slot (fpuq_pop),
|
||||
.full (fpuq_full),
|
||||
`UNUSED_PIN (empty)
|
||||
|
@ -180,14 +181,14 @@ module VX_fpu_unit #(
|
|||
wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + `FFLAGS_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({valid_out, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}),
|
||||
.data_out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r})
|
||||
.data_in ({valid_out, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, result, has_fflags, rsp_fflags}),
|
||||
.data_out ({fpu_commit_if.valid, fpu_commit_if.uuid, fpu_commit_if.wid, fpu_commit_if.tmask, fpu_commit_if.PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, has_fflags_r, fflags_r})
|
||||
);
|
||||
|
||||
assign fpu_commit_if.eop = 1'b1;
|
||||
|
|
|
@ -12,6 +12,10 @@ module VX_gpu_unit #(
|
|||
VX_gpu_req_if.slave gpu_req_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if.master perf_tex_if,
|
||||
`endif
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
VX_tex_csr_if.slave tex_csr_if,
|
||||
|
@ -28,12 +32,13 @@ module VX_gpu_unit #(
|
|||
localparam WCTL_DATAW = `GPU_TMC_BITS + `GPU_WSPAWN_BITS + `GPU_SPLIT_BITS + `GPU_BARRIER_BITS;
|
||||
localparam RSP_DATAW = `MAX(`NUM_THREADS * 32, WCTL_DATAW);
|
||||
|
||||
wire rsp_valid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
wire rsp_valid;
|
||||
wire [63:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
|
||||
wire [RSP_DATAW-1:0] rsp_data, rsp_data_r;
|
||||
|
||||
|
@ -112,6 +117,7 @@ module VX_gpu_unit #(
|
|||
wire is_tex = (gpu_req_if.op_type == `INST_GPU_TEX);
|
||||
|
||||
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
|
||||
assign tex_req_if.uuid = gpu_req_if.uuid;
|
||||
assign tex_req_if.wid = gpu_req_if.wid;
|
||||
assign tex_req_if.tmask = gpu_req_if.tmask;
|
||||
assign tex_req_if.PC = gpu_req_if.PC;
|
||||
|
@ -128,6 +134,9 @@ module VX_gpu_unit #(
|
|||
) tex_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_tex_if (perf_tex_if),
|
||||
`endif
|
||||
.tex_req_if (tex_req_if),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_rsp_if (tex_rsp_if),
|
||||
|
@ -143,6 +152,7 @@ module VX_gpu_unit #(
|
|||
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
|
||||
|
||||
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
|
||||
assign rsp_uuid = tex_rsp_if.valid ? tex_rsp_if.uuid : gpu_req_if.uuid;
|
||||
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
|
||||
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
|
||||
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
|
||||
|
@ -161,6 +171,7 @@ module VX_gpu_unit #(
|
|||
assign is_warp_ctl = 1;
|
||||
|
||||
assign rsp_valid = gpu_req_if.valid;
|
||||
assign rsp_uuid = gpu_req_if.uuid;
|
||||
assign rsp_wid = gpu_req_if.wid;
|
||||
assign rsp_tmask = gpu_req_if.tmask;
|
||||
assign rsp_PC = gpu_req_if.PC;
|
||||
|
@ -176,14 +187,14 @@ module VX_gpu_unit #(
|
|||
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + RSP_DATAW + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
|
||||
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.uuid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, rsp_data_r, is_warp_ctl_r})
|
||||
);
|
||||
|
||||
assign gpu_commit_if.data = rsp_data_r[(`NUM_THREADS * 32)-1:0];
|
||||
|
@ -200,7 +211,7 @@ module VX_gpu_unit #(
|
|||
assign gpu_req_if.ready = ~stall_in;
|
||||
|
||||
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_uuid, gpu_commit_if.uuid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wspawn, warp_ctl_if.wspawn.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_split, warp_ctl_if.split.valid);
|
||||
|
|
|
@ -15,7 +15,7 @@ module VX_ibuffer #(
|
|||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
|
||||
localparam DATAW = 64 + `NUM_THREADS + 32 + `EX_BITS + `INST_OP_BITS + `INST_FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1;
|
||||
localparam ADDRW = $clog2(`IBUF_SIZE+1);
|
||||
localparam NWARPSW = $clog2(`NUM_WARPS+1);
|
||||
|
||||
|
@ -168,7 +168,8 @@ module VX_ibuffer #(
|
|||
|
||||
assign decode_if.ready = ~q_full[decode_if.wid];
|
||||
|
||||
assign q_data_in = {decode_if.tmask,
|
||||
assign q_data_in = {decode_if.uuid,
|
||||
decode_if.tmask,
|
||||
decode_if.PC,
|
||||
decode_if.ex_type,
|
||||
decode_if.op_type,
|
||||
|
@ -184,7 +185,8 @@ module VX_ibuffer #(
|
|||
|
||||
assign ibuffer_if.valid = deq_valid;
|
||||
assign ibuffer_if.wid = deq_wid;
|
||||
assign {ibuffer_if.tmask,
|
||||
assign {ibuffer_if.uuid,
|
||||
ibuffer_if.tmask,
|
||||
ibuffer_if.PC,
|
||||
ibuffer_if.ex_type,
|
||||
ibuffer_if.op_type,
|
||||
|
|
|
@ -25,35 +25,36 @@ module VX_icache_stage #(
|
|||
localparam OUT_REG = 0;
|
||||
|
||||
reg [`DBG_CACHE_REQ_IDW-1:0] req_id;
|
||||
wire [`DBG_CACHE_REQ_IDW-1:0] rsp_req_id;
|
||||
wire [`DBG_CACHE_REQ_IDW-1:0] rsp_id;
|
||||
wire [`NW_BITS-1:0] req_tag, rsp_tag;
|
||||
|
||||
`UNUSED_VAR (rsp_req_id)
|
||||
`UNUSED_VAR (rsp_id)
|
||||
|
||||
wire icache_req_fire = icache_req_if.valid && icache_req_if.ready;
|
||||
|
||||
assign req_tag = ifetch_req_if.wid;
|
||||
assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];
|
||||
assign rsp_req_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW];
|
||||
assign req_tag = ifetch_req_if.wid;
|
||||
assign rsp_tag = icache_rsp_if.tag[`NW_BITS-1:0];
|
||||
assign rsp_id = icache_rsp_if.tag[`NW_BITS +: `DBG_CACHE_REQ_IDW];
|
||||
|
||||
wire [63:0] rsp_uuid;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (32 + `NUM_THREADS),
|
||||
.DATAW (32 + `NUM_THREADS + 64),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.LUTRAM (1)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
.wren (icache_req_fire),
|
||||
.waddr (req_tag),
|
||||
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask}),
|
||||
.wdata ({ifetch_req_if.PC, ifetch_req_if.tmask, ifetch_req_if.uuid}),
|
||||
.raddr (rsp_tag),
|
||||
.rdata ({rsp_PC, rsp_tmask})
|
||||
.rdata ({rsp_PC, rsp_tmask, rsp_uuid})
|
||||
);
|
||||
|
||||
`RUNTIME_ASSERT((!ifetch_req_if.valid || ifetch_req_if.PC >= `STARTUP_ADDR),
|
||||
("invalid PC=%0h, wid=%0d, tmask=%b", ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask))
|
||||
("%t: *** invalid PC=%0h, wid=%0d, tmask=%b (#%0d)", $time, ifetch_req_if.PC, ifetch_req_if.wid, ifetch_req_if.tmask, ifetch_req_if.uuid))
|
||||
|
||||
// Icache Request
|
||||
assign icache_req_if.valid = ifetch_req_if.valid;
|
||||
|
@ -78,35 +79,37 @@ module VX_icache_stage #(
|
|||
wire stall_out = ~ifetch_rsp_if.ready && (0 == OUT_REG && ifetch_rsp_if.valid);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32),
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + 32 + 64),
|
||||
.RESETW (1),
|
||||
.DEPTH (OUT_REG)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data}),
|
||||
.data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data})
|
||||
.data_in ({icache_rsp_if.valid, rsp_wid, rsp_tmask, rsp_PC, icache_rsp_if.data, rsp_uuid}),
|
||||
.data_out ({ifetch_rsp_if.valid, ifetch_rsp_if.wid, ifetch_rsp_if.tmask, ifetch_rsp_if.PC, ifetch_rsp_if.data, ifetch_rsp_if.uuid})
|
||||
);
|
||||
|
||||
// Can accept new response?
|
||||
assign icache_rsp_if.ready = ~stall_out;
|
||||
|
||||
`SCOPE_ASSIGN (icache_req_fire, icache_req_fire);
|
||||
`SCOPE_ASSIGN (icache_req_wid, ifetch_req_if.wid);
|
||||
`SCOPE_ASSIGN (icache_req_uuid, ifetch_req_if.uuid);
|
||||
`SCOPE_ASSIGN (icache_req_addr, {icache_req_if.addr, 2'b0});
|
||||
`SCOPE_ASSIGN (icache_req_tag, req_tag);
|
||||
|
||||
`SCOPE_ASSIGN (icache_rsp_fire, icache_rsp_if.valid && icache_rsp_if.ready);
|
||||
`SCOPE_ASSIGN (icache_rsp_uuid, rsp_uuid);
|
||||
`SCOPE_ASSIGN (icache_rsp_data, icache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (icache_rsp_tag, rsp_tag);
|
||||
|
||||
`ifdef DBG_TRACE_CORE_ICACHE
|
||||
always @(posedge clk) begin
|
||||
if (icache_req_fire) begin
|
||||
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id);
|
||||
dpi_trace("%d: I$%0d req: wid=%0d, PC=%0h, req_id=%0h (#%0d)\n", $time, CORE_ID, ifetch_req_if.wid, ifetch_req_if.PC, req_id, ifetch_req_if.uuid);
|
||||
end
|
||||
if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
|
||||
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_req_id, ifetch_rsp_if.data);
|
||||
dpi_trace("%d: I$%0d rsp: wid=%0d, PC=%0h, req_id=%0h, data=%0h (#%0d)\n", $time, CORE_ID, ifetch_rsp_if.wid, ifetch_rsp_if.PC, rsp_id, ifetch_rsp_if.data, ifetch_rsp_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -9,7 +9,7 @@ module VX_issue #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_pipeline_if.master perf_pipeline_if,
|
||||
VX_perf_pipeline_if.issue perf_issue_if,
|
||||
`endif
|
||||
|
||||
VX_decode_if.slave decode_if,
|
||||
|
@ -38,6 +38,7 @@ module VX_issue #(
|
|||
|
||||
// scoreboard writeback interface
|
||||
assign sboard_wb_if.valid = writeback_if.valid;
|
||||
assign sboard_wb_if.uuid = writeback_if.uuid;
|
||||
assign sboard_wb_if.wid = writeback_if.wid;
|
||||
assign sboard_wb_if.PC = writeback_if.PC;
|
||||
assign sboard_wb_if.rd = writeback_if.rd;
|
||||
|
@ -45,6 +46,7 @@ module VX_issue #(
|
|||
|
||||
// scoreboard interface
|
||||
assign scoreboard_if.valid = ibuffer_if.valid && dispatch_if.ready;
|
||||
assign scoreboard_if.uuid = ibuffer_if.uuid;
|
||||
assign scoreboard_if.wid = ibuffer_if.wid;
|
||||
assign scoreboard_if.PC = ibuffer_if.PC;
|
||||
assign scoreboard_if.wb = ibuffer_if.wb;
|
||||
|
@ -57,6 +59,7 @@ module VX_issue #(
|
|||
|
||||
// dispatch interface
|
||||
assign dispatch_if.valid = ibuffer_if.valid && scoreboard_if.ready;
|
||||
assign dispatch_if.uuid = ibuffer_if.uuid;
|
||||
assign dispatch_if.wid = ibuffer_if.wid;
|
||||
assign dispatch_if.tmask = ibuffer_if.tmask;
|
||||
assign dispatch_if.PC = ibuffer_if.PC;
|
||||
|
@ -121,9 +124,8 @@ module VX_issue #(
|
|||
);
|
||||
|
||||
`SCOPE_ASSIGN (issue_fire, ibuffer_if.valid && ibuffer_if.ready);
|
||||
`SCOPE_ASSIGN (issue_wid, ibuffer_if.wid);
|
||||
`SCOPE_ASSIGN (issue_uuid, ibuffer_if.uuid);
|
||||
`SCOPE_ASSIGN (issue_tmask, ibuffer_if.tmask);
|
||||
`SCOPE_ASSIGN (issue_pc, ibuffer_if.PC);
|
||||
`SCOPE_ASSIGN (issue_ex_type, ibuffer_if.ex_type);
|
||||
`SCOPE_ASSIGN (issue_op_type, ibuffer_if.op_type);
|
||||
`SCOPE_ASSIGN (issue_op_mod, ibuffer_if.op_mod);
|
||||
|
@ -140,10 +142,9 @@ module VX_issue #(
|
|||
`SCOPE_ASSIGN (gpr_rs1, gpr_rsp_if.rs1_data);
|
||||
`SCOPE_ASSIGN (gpr_rs2, gpr_rsp_if.rs2_data);
|
||||
`SCOPE_ASSIGN (gpr_rs3, gpr_rsp_if.rs3_data);
|
||||
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
|
||||
`SCOPE_ASSIGN (writeback_valid, writeback_if.valid);
|
||||
`SCOPE_ASSIGN (writeback_uuid, writeback_if.uuid);
|
||||
`SCOPE_ASSIGN (writeback_tmask, writeback_if.tmask);
|
||||
`SCOPE_ASSIGN (writeback_wid, writeback_if.wid);
|
||||
`SCOPE_ASSIGN (writeback_pc, writeback_if.PC);
|
||||
`SCOPE_ASSIGN (writeback_rd, writeback_if.rd);
|
||||
`SCOPE_ASSIGN (writeback_data, writeback_if.data);
|
||||
`SCOPE_ASSIGN (writeback_eop, writeback_if.eop);
|
||||
|
@ -171,40 +172,35 @@ module VX_issue #(
|
|||
perf_fpu_stalls <= 0;
|
||||
`endif
|
||||
end else begin
|
||||
if (decode_if.valid & !decode_if.ready) begin
|
||||
if (decode_if.valid & ~decode_if.ready) begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (scoreboard_if.valid & !scoreboard_if.ready) begin
|
||||
if (scoreboard_if.valid & ~scoreboard_if.ready) begin
|
||||
perf_scb_stalls <= perf_scb_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (alu_req_if.valid & !alu_req_if.ready) begin
|
||||
perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
|
||||
if (dispatch_if.valid & ~dispatch_if.ready) begin
|
||||
case (dispatch_if.ex_type)
|
||||
`EX_ALU: perf_alu_stalls <= perf_alu_stalls + `PERF_CTR_BITS'd1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`EX_FPU: perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
|
||||
`endif
|
||||
`EX_LSU: perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
|
||||
`EX_CSR: perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
|
||||
//`EX_GPU:
|
||||
default: perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
|
||||
endcase
|
||||
end
|
||||
if (lsu_req_if.valid & !lsu_req_if.ready) begin
|
||||
perf_lsu_stalls <= perf_lsu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (csr_req_if.valid & !csr_req_if.ready) begin
|
||||
perf_csr_stalls <= perf_csr_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (gpu_req_if.valid & !gpu_req_if.ready) begin
|
||||
perf_gpu_stalls <= perf_gpu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_req_if.valid & !fpu_req_if.ready) begin
|
||||
perf_fpu_stalls <= perf_fpu_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
`endif
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_pipeline_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_pipeline_if.scb_stalls = perf_scb_stalls;
|
||||
assign perf_pipeline_if.alu_stalls = perf_alu_stalls;
|
||||
assign perf_pipeline_if.lsu_stalls = perf_lsu_stalls;
|
||||
assign perf_pipeline_if.csr_stalls = perf_csr_stalls;
|
||||
assign perf_pipeline_if.gpu_stalls = perf_gpu_stalls;
|
||||
assign perf_issue_if.ibf_stalls = perf_ibf_stalls;
|
||||
assign perf_issue_if.scb_stalls = perf_scb_stalls;
|
||||
assign perf_issue_if.alu_stalls = perf_alu_stalls;
|
||||
assign perf_issue_if.lsu_stalls = perf_lsu_stalls;
|
||||
assign perf_issue_if.csr_stalls = perf_csr_stalls;
|
||||
assign perf_issue_if.gpu_stalls = perf_gpu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign perf_pipeline_if.fpu_stalls = perf_fpu_stalls;
|
||||
assign perf_issue_if.fpu_stalls = perf_fpu_stalls;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
|
@ -216,7 +212,7 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(alu_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace(", rs2_data=");
|
||||
`TRACE_ARRAY1D(alu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", alu_req_if.uuid);
|
||||
end
|
||||
if (lsu_req_if.valid && lsu_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rd=%0d, offset=%0h, addr=",
|
||||
|
@ -224,13 +220,13 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(lsu_req_if.base_addr, `NUM_THREADS);
|
||||
dpi_trace(", data=");
|
||||
`TRACE_ARRAY1D(lsu_req_if.store_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", lsu_req_if.uuid);
|
||||
end
|
||||
if (csr_req_if.valid && csr_req_if.ready) begin
|
||||
dpi_trace("%d: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, rd=%0d, addr=%0h, rs1_data=",
|
||||
$time, CORE_ID, csr_req_if.wid, csr_req_if.PC, csr_req_if.tmask, csr_req_if.rd, csr_req_if.addr);
|
||||
`TRACE_ARRAY1D(csr_req_if.rs1_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", csr_req_if.uuid);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
if (fpu_req_if.valid && fpu_req_if.ready) begin
|
||||
|
@ -241,7 +237,7 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(fpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(fpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", fpu_req_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (gpu_req_if.valid && gpu_req_if.ready) begin
|
||||
|
@ -252,7 +248,7 @@ module VX_issue #(
|
|||
`TRACE_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
|
||||
dpi_trace(", rs3_data=");
|
||||
`TRACE_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
dpi_trace(" (#%0d)\n", gpu_req_if.uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -21,7 +21,6 @@ module VX_lsu_unit #(
|
|||
);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = 32 - MEM_ASHIFT;
|
||||
|
||||
localparam REQ_ASHIFT = `CLOG2(`DCACHE_WORD_SIZE);
|
||||
|
||||
`STATIC_ASSERT(0 == (`IO_BASE_ADDR % MEM_ASHIFT), ("invalid parameter"))
|
||||
|
@ -29,6 +28,7 @@ module VX_lsu_unit #(
|
|||
`STATIC_ASSERT(`SMEM_SIZE == `MEM_BLOCK_SIZE * (`SMEM_SIZE / `MEM_BLOCK_SIZE), ("invalid parameter"))
|
||||
|
||||
wire req_valid;
|
||||
wire [63:0] req_uuid;
|
||||
wire [`NUM_THREADS-1:0] req_tmask;
|
||||
wire [`NUM_THREADS-1:0][31:0] req_addr;
|
||||
wire [`INST_LSU_BITS-1:0] req_type;
|
||||
|
@ -54,16 +54,16 @@ module VX_lsu_unit #(
|
|||
for (genvar i = 0; i < (`NUM_THREADS-1); i++) begin
|
||||
assign addr_matches[i] = (lsu_req_if.base_addr[i+1] == lsu_req_if.base_addr[0]) || ~lsu_req_if.tmask[i+1];
|
||||
end
|
||||
|
||||
wire lsu_is_dup = lsu_req_if.tmask[0] && (& addr_matches);
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; i++) begin
|
||||
// is non-cacheable address
|
||||
wire is_addr_nc = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'(`IO_BASE_ADDR >> MEM_ASHIFT));
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
// is shared memory address
|
||||
wire is_addr_sm = (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] >= MEM_ADDRW'((`SMEM_BASE_ADDR - `SMEM_SIZE) >> MEM_ASHIFT))
|
||||
& (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT));
|
||||
& (full_addr[i][MEM_ASHIFT +: MEM_ADDRW] < MEM_ADDRW'(`SMEM_BASE_ADDR >> MEM_ASHIFT));
|
||||
assign lsu_addr_type[i] = {is_addr_nc, is_addr_sm};
|
||||
end else begin
|
||||
assign lsu_addr_type[i] = is_addr_nc;
|
||||
|
@ -81,19 +81,20 @@ module VX_lsu_unit #(
|
|||
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + 1 + 1 + 64 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * `CACHE_ADDR_TYPE_BITS) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) req_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_in),
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.uuid, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_uuid, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
);
|
||||
|
||||
// Can accept new request?
|
||||
assign lsu_req_if.ready = ~stall_in && ~fence_wait;
|
||||
|
||||
wire [63:0] rsp_uuid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [31:0] rsp_pc;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
|
@ -146,7 +147,7 @@ module VX_lsu_unit #(
|
|||
wire req_wb2 = req_wb && ~req_is_prefetch;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
.DATAW (64 + `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
.SIZE (`LSUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
|
@ -154,8 +155,8 @@ module VX_lsu_unit #(
|
|||
.write_addr (mbuf_waddr),
|
||||
.acquire_slot (mbuf_push),
|
||||
.read_addr (mbuf_raddr),
|
||||
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.write_data ({req_uuid, req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_uuid, rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.release_addr (mbuf_raddr),
|
||||
.release_slot (mbuf_pop),
|
||||
.full (mbuf_full),
|
||||
|
@ -259,6 +260,7 @@ module VX_lsu_unit #(
|
|||
wire is_store_rsp = req_valid && ~req_wb && dcache_req_ready;
|
||||
|
||||
assign st_commit_if.valid = is_store_rsp;
|
||||
assign st_commit_if.uuid = req_uuid;
|
||||
assign st_commit_if.wid = req_wid;
|
||||
assign st_commit_if.tmask = req_tmask;
|
||||
assign st_commit_if.PC = req_pc;
|
||||
|
@ -295,14 +297,14 @@ module VX_lsu_unit #(
|
|||
wire load_rsp_stall = ~ld_commit_if.ready && ld_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
|
||||
.RESETW (1)
|
||||
) rsp_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!load_rsp_stall),
|
||||
.data_in ({dcache_rsp_if.valid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_out ({ld_commit_if.valid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
|
||||
.data_in ({dcache_rsp_if.valid, rsp_uuid, rsp_wid, rsp_tmask_qual, rsp_pc, rsp_rd, rsp_wb, rsp_data, mbuf_pop}),
|
||||
.data_out ({ld_commit_if.valid, ld_commit_if.uuid, ld_commit_if.wid, ld_commit_if.tmask, ld_commit_if.PC, ld_commit_if.rd, ld_commit_if.wb, ld_commit_if.data, ld_commit_if.eop})
|
||||
);
|
||||
|
||||
// Can accept new cache response?
|
||||
|
@ -310,19 +312,19 @@ module VX_lsu_unit #(
|
|||
|
||||
// scope registration
|
||||
`SCOPE_ASSIGN (dcache_req_fire, dcache_req_fire);
|
||||
`SCOPE_ASSIGN (dcache_req_wid, req_wid);
|
||||
`SCOPE_ASSIGN (dcache_req_pc, req_pc);
|
||||
`SCOPE_ASSIGN (dcache_req_uuid, req_uuid);
|
||||
`SCOPE_ASSIGN (dcache_req_addr, req_addr);
|
||||
`SCOPE_ASSIGN (dcache_req_rw, ~req_wb);
|
||||
`SCOPE_ASSIGN (dcache_req_byteen,dcache_req_if.byteen);
|
||||
`SCOPE_ASSIGN (dcache_req_data, dcache_req_if.data);
|
||||
`SCOPE_ASSIGN (dcache_req_tag, req_tag);
|
||||
`SCOPE_ASSIGN (dcache_rsp_fire, dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_fire}});
|
||||
`SCOPE_ASSIGN (dcache_rsp_uuid, rsp_uuid);
|
||||
`SCOPE_ASSIGN (dcache_rsp_data, dcache_rsp_if.data);
|
||||
`SCOPE_ASSIGN (dcache_rsp_tag, mbuf_raddr);
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 1)-1:0] pending_reqs;
|
||||
reg [`LSUQ_SIZE-1:0][(`NW_BITS + 32 + `NR_BITS + 64 + 64 + 1)-1:0] pending_reqs;
|
||||
wire [63:0] delay_timeout = 10000 * (1 ** (`L2_ENABLE + `L3_ENABLE));
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -330,7 +332,7 @@ module VX_lsu_unit #(
|
|||
pending_reqs <= '0;
|
||||
end begin
|
||||
if (mbuf_push) begin
|
||||
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, $time, 1'b1};
|
||||
pending_reqs[mbuf_waddr] <= {req_wid, req_pc, req_rd, req_uuid, $time, 1'b1};
|
||||
end
|
||||
if (mbuf_pop) begin
|
||||
pending_reqs[mbuf_raddr] <= '0;
|
||||
|
@ -340,8 +342,11 @@ module VX_lsu_unit #(
|
|||
for (integer i = 0; i < `LSUQ_SIZE; ++i) begin
|
||||
if (pending_reqs[i][0]) begin
|
||||
`ASSERT(($time - pending_reqs[i][1 +: 64]) < delay_timeout,
|
||||
("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d",
|
||||
$time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+32+`NR_BITS +: `NW_BITS], pending_reqs[i][1+64+`NR_BITS +: 32], pending_reqs[i][1+64 +: `NR_BITS]));
|
||||
("%t: *** D$%0d response timeout: remaining=%b, wid=%0d, PC=%0h, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, rsp_rem_mask[i], pending_reqs[i][1+64+64+32+`NR_BITS +: `NW_BITS],
|
||||
pending_reqs[i][1+64+64+`NR_BITS +: 32],
|
||||
pending_reqs[i][1+64+64 +: `NR_BITS],
|
||||
pending_reqs[i][1+64 +: 64]));
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -360,20 +365,20 @@ module VX_lsu_unit #(
|
|||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
dpi_trace(", data=");
|
||||
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
|
||||
dpi_trace(", req_id=%0h\n", req_id);
|
||||
dpi_trace(", (#%0d)\n", req_uuid);
|
||||
end else begin
|
||||
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire, req_id);
|
||||
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
|
||||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
dpi_trace(", rd=%0d, is_dup=%b\n", req_rd, req_is_dup);
|
||||
dpi_trace(", rd=%0d, is_dup=%b (#%0d)\n", req_rd, req_is_dup, req_uuid);
|
||||
end
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, req_id=%0h, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, rsp_req_id, mbuf_raddr, rsp_rd);
|
||||
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
|
||||
dpi_trace(", is_dup=%b\n", rsp_is_dup);
|
||||
dpi_trace(", is_dup=%b (#%0d)\n", rsp_is_dup, rsp_uuid);
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
|
|
@ -358,19 +358,17 @@ module VX_mem_unit # (
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
`UNUSED_VAR (perf_dcache_if.mem_stalls)
|
||||
`UNUSED_VAR (perf_dcache_if.crsp_stalls)
|
||||
|
||||
assign perf_memsys_if.icache_reads = perf_icache_if.reads;
|
||||
assign perf_memsys_if.icache_read_misses = perf_icache_if.read_misses;
|
||||
assign perf_memsys_if.icache_pipe_stalls = perf_icache_if.pipe_stalls;
|
||||
assign perf_memsys_if.icache_crsp_stalls = perf_icache_if.crsp_stalls;
|
||||
|
||||
assign perf_memsys_if.dcache_reads = perf_dcache_if.reads;
|
||||
assign perf_memsys_if.dcache_writes = perf_dcache_if.writes;
|
||||
assign perf_memsys_if.dcache_read_misses = perf_dcache_if.read_misses;
|
||||
assign perf_memsys_if.dcache_write_misses= perf_dcache_if.write_misses;
|
||||
assign perf_memsys_if.dcache_bank_stalls = perf_dcache_if.bank_stalls;
|
||||
assign perf_memsys_if.dcache_mshr_stalls = perf_dcache_if.mshr_stalls;
|
||||
assign perf_memsys_if.dcache_pipe_stalls = perf_dcache_if.pipe_stalls;
|
||||
assign perf_memsys_if.dcache_crsp_stalls = perf_dcache_if.crsp_stalls;
|
||||
|
||||
if (`SM_ENABLE) begin
|
||||
assign perf_memsys_if.smem_reads = perf_smem_if.reads;
|
||||
|
@ -382,47 +380,41 @@ end else begin
|
|||
assign perf_memsys_if.smem_bank_stalls = 0;
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat_per_cycle;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_lat_per_cycle <= 0;
|
||||
perf_mem_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_mem_lat_per_cycle <= perf_mem_lat_per_cycle +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && !mem_req_if.rw && mem_req_if.ready))));
|
||||
perf_mem_pending_reads <= perf_mem_pending_reads +
|
||||
`PERF_CTR_BITS'($signed(2'((mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) && !(mem_rsp_if.valid && mem_rsp_if.ready)) -
|
||||
2'((mem_rsp_if.valid && mem_rsp_if.ready) && !(mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw))));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_writes <= 0;
|
||||
perf_mem_lat <= 0;
|
||||
perf_mem_stalls <= 0;
|
||||
end else begin
|
||||
if (mem_req_if.valid && mem_req_if.ready && !mem_req_if.rw) begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (mem_req_if.valid && mem_req_if.ready && mem_req_if.rw) begin
|
||||
perf_mem_writes <= perf_mem_writes + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
if (mem_req_if.valid && !mem_req_if.ready) begin
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'd1;
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_lat_per_cycle;
|
||||
end
|
||||
perf_mem_lat <= perf_mem_lat + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_memsys_if.mem_reads = perf_mem_reads;
|
||||
assign perf_memsys_if.mem_writes = perf_mem_writes;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
assign perf_memsys_if.mem_stalls = perf_mem_stalls;
|
||||
assign perf_memsys_if.mem_latency = perf_mem_lat;
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -6,6 +6,7 @@ module VX_muldiv (
|
|||
|
||||
// Inputs
|
||||
input wire [`INST_MUL_BITS-1:0] alu_op,
|
||||
input wire [63:0] uuid_in,
|
||||
input wire [`NW_BITS-1:0] wid_in,
|
||||
input wire [`NUM_THREADS-1:0] tmask_in,
|
||||
input wire [31:0] PC_in,
|
||||
|
@ -15,6 +16,7 @@ module VX_muldiv (
|
|||
input wire [`NUM_THREADS-1:0][31:0] alu_in2,
|
||||
|
||||
// Outputs
|
||||
output wire [63:0] uuid_out,
|
||||
output wire [`NW_BITS-1:0] wid_out,
|
||||
output wire [`NUM_THREADS-1:0] tmask_out,
|
||||
output wire [31:0] PC_out,
|
||||
|
@ -32,6 +34,7 @@ module VX_muldiv (
|
|||
wire is_div_op = `INST_MUL_IS_DIV(alu_op);
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] mul_result;
|
||||
wire [63:0] mul_uuid_out;
|
||||
wire [`NW_BITS-1:0] mul_wid_out;
|
||||
wire [`NUM_THREADS-1:0] mul_tmask_out;
|
||||
wire [31:0] mul_PC_out;
|
||||
|
@ -63,15 +66,15 @@ module VX_muldiv (
|
|||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (mul_ready_in),
|
||||
.data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}),
|
||||
.data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
|
||||
.data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, mul_result_tmp}),
|
||||
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
|
||||
);
|
||||
|
||||
`else
|
||||
|
@ -103,15 +106,15 @@ module VX_muldiv (
|
|||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) mul_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (mul_ready_in),
|
||||
.data_in ({mul_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}),
|
||||
.data_out ({mul_valid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
|
||||
.data_in ({mul_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in}),
|
||||
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
|
||||
);
|
||||
|
||||
`endif
|
||||
|
@ -119,6 +122,7 @@ module VX_muldiv (
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] div_result;
|
||||
wire [63:0] div_uuid_out;
|
||||
wire [`NW_BITS-1:0] div_wid_out;
|
||||
wire [`NUM_THREADS-1:0] div_tmask_out;
|
||||
wire [31:0] div_PC_out;
|
||||
|
@ -147,15 +151,15 @@ module VX_muldiv (
|
|||
end
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DEPTH (`LATENCY_IMUL),
|
||||
.RESETW (1)
|
||||
) div_shift_reg (
|
||||
.clk(clk),
|
||||
.reset (reset),
|
||||
.enable (div_ready_in),
|
||||
.data_in ({div_valid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}),
|
||||
.data_out ({div_valid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
|
||||
.data_in ({div_valid_in, uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, div_result_tmp}),
|
||||
.data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
|
||||
);
|
||||
|
||||
assign div_ready_in = div_ready_out || ~div_valid_out;
|
||||
|
@ -171,21 +175,21 @@ module VX_muldiv (
|
|||
.WIDTHQ (32),
|
||||
.WIDTHR (32),
|
||||
.LANES (`NUM_THREADS),
|
||||
.TAGW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
|
||||
.TAGW (64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
|
||||
) divide (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (div_valid_in),
|
||||
.ready_in (div_ready_in),
|
||||
.signed_mode(is_signed_div),
|
||||
.tag_in ({wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
|
||||
.tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
|
||||
.numer (alu_in1),
|
||||
.denom (alu_in2),
|
||||
.quotient (div_result_tmp),
|
||||
.remainder (rem_result_tmp),
|
||||
.ready_out (div_ready_out),
|
||||
.valid_out (div_valid_out),
|
||||
.tag_out ({div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
|
||||
.tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
|
||||
);
|
||||
|
||||
assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;
|
||||
|
@ -195,6 +199,7 @@ module VX_muldiv (
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire rsp_valid = mul_valid_out || div_valid_out;
|
||||
wire [63:0] rsp_uuid = mul_valid_out ? mul_uuid_out : div_uuid_out;
|
||||
wire [`NW_BITS-1:0] rsp_wid = mul_valid_out ? mul_wid_out : div_wid_out;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out;
|
||||
wire [31:0] rsp_PC = mul_valid_out ? mul_PC_out : div_PC_out;
|
||||
|
@ -205,14 +210,14 @@ module VX_muldiv (
|
|||
assign stall_out = ~ready_out && valid_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + 64 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
|
||||
.data_out ({valid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
|
||||
.data_in ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
|
||||
.data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
|
||||
);
|
||||
|
||||
// can accept new request?
|
||||
|
|
|
@ -165,6 +165,9 @@ module VX_pipeline #(
|
|||
) decode (
|
||||
.clk (clk),
|
||||
.reset (decode_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_decode_if (perf_pipeline_if.decode),
|
||||
`endif
|
||||
.ifetch_rsp_if (ifetch_rsp_if),
|
||||
.decode_if (decode_if),
|
||||
.wstall_if (wstall_if),
|
||||
|
@ -180,7 +183,7 @@ module VX_pipeline #(
|
|||
.reset (issue_reset),
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_pipeline_if (perf_pipeline_if),
|
||||
.perf_issue_if (perf_pipeline_if.issue),
|
||||
`endif
|
||||
|
||||
.decode_if (decode_if),
|
||||
|
|
|
@ -60,22 +60,22 @@ module VX_scoreboard #(
|
|||
end else begin
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b\n",
|
||||
dpi_trace("%d: *** core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)\n",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3);
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid);
|
||||
end
|
||||
`endif
|
||||
if (release_reg) begin
|
||||
`ASSERT(inuse_regs[writeback_if.wid][writeback_if.rd] != 0,
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d",
|
||||
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd));
|
||||
("%t: *** core%0d: invalid writeback register: wid=%0d, PC=%0h, rd=%0d (#%0d)",
|
||||
$time, CORE_ID, writeback_if.wid, writeback_if.PC, writeback_if.rd,writeback_if.uuid));
|
||||
end
|
||||
if (ibuffer_if.valid && ~ibuffer_if.ready) begin
|
||||
deadlock_ctr <= deadlock_ctr + 1;
|
||||
`ASSERT(deadlock_ctr < deadlock_timeout,
|
||||
("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b",
|
||||
("%t: *** core%0d-deadlock: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b (#%0d)",
|
||||
$time, CORE_ID, ibuffer_if.wid, ibuffer_if.PC, ibuffer_if.rd, ibuffer_if.wb,
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3));
|
||||
deq_inuse_rd, deq_inuse_rs1, deq_inuse_rs2, deq_inuse_rs3, ibuffer_if.uuid));
|
||||
end else if (ibuffer_if.valid && ibuffer_if.ready) begin
|
||||
deadlock_ctr <= 0;
|
||||
end
|
||||
|
|
|
@ -35,9 +35,9 @@ task trace_ex_op (
|
|||
`INST_BR_JALR: dpi_trace("JALR");
|
||||
`INST_BR_ECALL: dpi_trace("ECALL");
|
||||
`INST_BR_EBREAK:dpi_trace("EBREAK");
|
||||
`INST_BR_MRET: dpi_trace("MRET");
|
||||
`INST_BR_URET: dpi_trace("URET");
|
||||
`INST_BR_SRET: dpi_trace("SRET");
|
||||
`INST_BR_DRET: dpi_trace("DRET");
|
||||
`INST_BR_MRET: dpi_trace("MRET");
|
||||
default: dpi_trace("?");
|
||||
endcase
|
||||
end else if (`INST_ALU_IS_MUL(op_mod)) begin
|
||||
|
|
|
@ -46,6 +46,8 @@ module VX_warp_sched #(
|
|||
wire schedule_valid;
|
||||
wire warp_scheduled;
|
||||
|
||||
reg [63:0] issued_instrs;
|
||||
|
||||
wire ifetch_req_fire = ifetch_req_if.valid && ifetch_req_if.ready;
|
||||
|
||||
wire tmc_active = (warp_ctl_if.tmc.tmask != 0);
|
||||
|
@ -62,12 +64,13 @@ module VX_warp_sched #(
|
|||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
barrier_masks <= 0;
|
||||
use_wspawn <= 0;
|
||||
stalled_warps <= 0;
|
||||
barrier_masks <= '0;
|
||||
use_wspawn <= '0;
|
||||
stalled_warps <= '0;
|
||||
warp_pcs <= '0;
|
||||
active_warps <= '0;
|
||||
thread_masks <= '0;
|
||||
issued_instrs <= '0;
|
||||
|
||||
// activate first warp
|
||||
warp_pcs[0] <= `STARTUP_ADDR;
|
||||
|
@ -117,6 +120,8 @@ module VX_warp_sched #(
|
|||
if (use_wspawn[schedule_wid]) begin
|
||||
thread_masks[schedule_wid] <= 1;
|
||||
end
|
||||
|
||||
issued_instrs <= issued_instrs + 1;
|
||||
end
|
||||
|
||||
if (ifetch_req_fire) begin
|
||||
|
@ -223,20 +228,23 @@ module VX_warp_sched #(
|
|||
|
||||
assign warp_scheduled = schedule_valid && ~stall_out;
|
||||
|
||||
wire [63:0] instr_uuid = (issued_instrs * `NUM_CORES * `NUM_CLUSTERS) + 64'(CORE_ID);
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NUM_THREADS + 32 + `NW_BITS),
|
||||
.DATAW (1 + 64 + `NUM_THREADS + 32 + `NW_BITS),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_out),
|
||||
.data_in ({schedule_valid, schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({ifetch_req_if.valid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
|
||||
.data_in ({schedule_valid, instr_uuid, schedule_tmask, schedule_pc, schedule_wid}),
|
||||
.data_out ({ifetch_req_if.valid, ifetch_req_if.uuid, ifetch_req_if.tmask, ifetch_req_if.PC, ifetch_req_if.wid})
|
||||
);
|
||||
|
||||
assign busy = (active_warps != 0);
|
||||
|
||||
`SCOPE_ASSIGN (wsched_scheduled, warp_scheduled);
|
||||
`SCOPE_ASSIGN (wsched_schedule_uuid, instr_uuid);
|
||||
`SCOPE_ASSIGN (wsched_active_warps, active_warps);
|
||||
`SCOPE_ASSIGN (wsched_stalled_warps, stalled_warps);
|
||||
`SCOPE_ASSIGN (wsched_schedule_wid, schedule_wid);
|
||||
|
|
|
@ -23,17 +23,9 @@ module VX_writeback #(
|
|||
|
||||
localparam DATAW = `NW_BITS + 32 + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32) + 1;
|
||||
`ifdef EXT_F_ENABLE
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
localparam NUM_RSPS = 5;
|
||||
`else
|
||||
localparam NUM_RSPS = 4;
|
||||
`endif
|
||||
`else
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
localparam NUM_RSPS = 4;
|
||||
`else
|
||||
localparam NUM_RSPS = 3;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
wire wb_valid;
|
||||
|
@ -50,9 +42,7 @@ module VX_writeback #(
|
|||
wire stall;
|
||||
|
||||
assign rsp_valid = {
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
gpu_commit_if.valid && gpu_commit_if.wb,
|
||||
`endif
|
||||
csr_commit_if.valid && csr_commit_if.wb,
|
||||
alu_commit_if.valid && alu_commit_if.wb,
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -62,9 +52,7 @@ module VX_writeback #(
|
|||
};
|
||||
|
||||
assign rsp_data = {
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
{gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop},
|
||||
`endif
|
||||
{csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop},
|
||||
{alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop},
|
||||
`ifdef EXT_F_ENABLE
|
||||
|
@ -88,28 +76,17 @@ module VX_writeback #(
|
|||
.ready_out (~stall)
|
||||
);
|
||||
|
||||
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
|
||||
assign ld_commit_if.ready = rsp_ready[0] || ~ld_commit_if.wb;
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
|
||||
assign alu_commit_if.ready = rsp_ready[2] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[3] || ~csr_commit_if.wb;
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
`else
|
||||
assign alu_commit_if.ready = rsp_ready[1] || ~alu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
`else
|
||||
assign gpu_commit_if.ready = rsp_ready[3] || ~gpu_commit_if.wb;
|
||||
`endif
|
||||
`else
|
||||
assign gpu_commit_if.ready = 1;
|
||||
`endif
|
||||
|
||||
assign stall = ~writeback_if.ready && writeback_if.valid;
|
||||
|
||||
|
|
|
@ -124,7 +124,8 @@ module VX_to_mem #(
|
|||
end
|
||||
end
|
||||
assign mem_rsp_tag_in_w = (rsp_ctr != 0) ? mem_rsp_tag_in_r : mem_rsp_tag_in;
|
||||
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in), ("out-of-order memory reponse! cur=%d, expected=%d", mem_rsp_tag_in_w, mem_rsp_tag_in))
|
||||
`RUNTIME_ASSERT(!mem_rsp_in_fire || (mem_rsp_tag_in_w == mem_rsp_tag_in),
|
||||
("%t: *** out-of-order memory reponse! cur=%d, expected=%d", $time, mem_rsp_tag_in_w, mem_rsp_tag_in))
|
||||
|
||||
wire [SRC_ADDR_WIDTH+D-1:0] mem_req_addr_in_qual = {mem_req_addr_in, req_ctr};
|
||||
|
||||
|
|
2
hw/rtl/cache/VX_bank.sv
vendored
2
hw/rtl/cache/VX_bank.sv
vendored
|
@ -48,7 +48,6 @@ module VX_bank #(
|
|||
output wire perf_read_misses,
|
||||
output wire perf_write_misses,
|
||||
output wire perf_mshr_stalls,
|
||||
output wire perf_pipe_stalls,
|
||||
`endif
|
||||
|
||||
// Core Request
|
||||
|
@ -470,7 +469,6 @@ module VX_bank #(
|
|||
`ifdef PERF_ENABLE
|
||||
assign perf_read_misses = do_read_st1 && miss_st1;
|
||||
assign perf_write_misses = do_write_st1 && miss_st1;
|
||||
assign perf_pipe_stalls = crsq_stall || mreq_alm_full || mshr_alm_full;
|
||||
assign perf_mshr_stalls = mshr_alm_full;
|
||||
`endif
|
||||
|
||||
|
|
244
hw/rtl/cache/VX_cache.sv
vendored
244
hw/rtl/cache/VX_cache.sv
vendored
|
@ -102,7 +102,6 @@ module VX_cache #(
|
|||
wire [NUM_BANKS-1:0] perf_read_miss_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_write_miss_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_mshr_stall_per_bank;
|
||||
wire [NUM_BANKS-1:0] perf_pipe_stall_per_bank;
|
||||
`endif
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -219,37 +218,37 @@ module VX_cache #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
// Core request
|
||||
wire [NUM_REQS-1:0] core_req_valid_nc;
|
||||
wire [NUM_REQS-1:0] core_req_rw_nc;
|
||||
wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_nc;
|
||||
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_nc;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_nc;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_nc;
|
||||
wire [NUM_REQS-1:0] core_req_ready_nc;
|
||||
wire [NUM_REQS-1:0] core_req_valid_c;
|
||||
wire [NUM_REQS-1:0] core_req_rw_c;
|
||||
wire [NUM_REQS-1:0][`WORD_ADDR_WIDTH-1:0] core_req_addr_c;
|
||||
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen_c;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_req_data_c;
|
||||
wire [NUM_REQS-1:0][CORE_TAG_X_WIDTH-1:0] core_req_tag_c;
|
||||
wire [NUM_REQS-1:0] core_req_ready_c;
|
||||
|
||||
// Core response
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_nc;
|
||||
wire [NUM_REQS-1:0] core_rsp_tmask_nc;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_nc;
|
||||
wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_nc;
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_nc;
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_valid_c;
|
||||
wire [NUM_REQS-1:0] core_rsp_tmask_c;
|
||||
wire [NUM_REQS-1:0][`WORD_WIDTH-1:0] core_rsp_data_c;
|
||||
wire [`CORE_RSP_TAGS-1:0][CORE_TAG_X_WIDTH-1:0] core_rsp_tag_c;
|
||||
wire [`CORE_RSP_TAGS-1:0] core_rsp_ready_c;
|
||||
|
||||
// Memory request
|
||||
wire mem_req_valid_nc;
|
||||
wire mem_req_rw_nc;
|
||||
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_nc;
|
||||
wire [NUM_PORTS-1:0] mem_req_pmask_nc;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_nc;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_nc;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_nc;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_nc;
|
||||
wire mem_req_ready_nc;
|
||||
wire mem_req_valid_c;
|
||||
wire mem_req_rw_c;
|
||||
wire [`MEM_ADDR_WIDTH-1:0] mem_req_addr_c;
|
||||
wire [NUM_PORTS-1:0] mem_req_pmask_c;
|
||||
wire [NUM_PORTS-1:0][WORD_SIZE-1:0] mem_req_byteen_c;
|
||||
wire [NUM_PORTS-1:0][WORD_SELECT_BITS-1:0] mem_req_wsel_c;
|
||||
wire [NUM_PORTS-1:0][`WORD_WIDTH-1:0] mem_req_data_c;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_req_tag_c;
|
||||
wire mem_req_ready_c;
|
||||
|
||||
// Memory response
|
||||
wire mem_rsp_valid_nc;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_nc;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_nc;
|
||||
wire mem_rsp_ready_nc;
|
||||
wire mem_rsp_valid_c;
|
||||
wire [`CACHE_LINE_WIDTH-1:0] mem_rsp_data_c;
|
||||
wire [MEM_TAG_IN_WIDTH-1:0] mem_rsp_tag_c;
|
||||
wire mem_rsp_ready_c;
|
||||
|
||||
if (NC_ENABLE) begin
|
||||
VX_nc_bypass #(
|
||||
|
@ -280,20 +279,20 @@ module VX_cache #(
|
|||
.core_req_ready_in (core_req_ready),
|
||||
|
||||
// Core request out
|
||||
.core_req_valid_out (core_req_valid_nc),
|
||||
.core_req_rw_out (core_req_rw_nc),
|
||||
.core_req_byteen_out(core_req_byteen_nc),
|
||||
.core_req_addr_out (core_req_addr_nc),
|
||||
.core_req_data_out (core_req_data_nc),
|
||||
.core_req_tag_out (core_req_tag_nc),
|
||||
.core_req_ready_out (core_req_ready_nc),
|
||||
.core_req_valid_out (core_req_valid_c),
|
||||
.core_req_rw_out (core_req_rw_c),
|
||||
.core_req_byteen_out(core_req_byteen_c),
|
||||
.core_req_addr_out (core_req_addr_c),
|
||||
.core_req_data_out (core_req_data_c),
|
||||
.core_req_tag_out (core_req_tag_c),
|
||||
.core_req_ready_out (core_req_ready_c),
|
||||
|
||||
// Core response in
|
||||
.core_rsp_valid_in (core_rsp_valid_nc),
|
||||
.core_rsp_tmask_in (core_rsp_tmask_nc),
|
||||
.core_rsp_data_in (core_rsp_data_nc),
|
||||
.core_rsp_tag_in (core_rsp_tag_nc),
|
||||
.core_rsp_ready_in (core_rsp_ready_nc),
|
||||
.core_rsp_valid_in (core_rsp_valid_c),
|
||||
.core_rsp_tmask_in (core_rsp_tmask_c),
|
||||
.core_rsp_data_in (core_rsp_data_c),
|
||||
.core_rsp_tag_in (core_rsp_tag_c),
|
||||
.core_rsp_ready_in (core_rsp_ready_c),
|
||||
|
||||
// Core response out
|
||||
.core_rsp_valid_out (core_rsp_valid_sb),
|
||||
|
@ -303,15 +302,15 @@ module VX_cache #(
|
|||
.core_rsp_ready_out (core_rsp_ready_sb),
|
||||
|
||||
// Memory request in
|
||||
.mem_req_valid_in (mem_req_valid_nc),
|
||||
.mem_req_rw_in (mem_req_rw_nc),
|
||||
.mem_req_addr_in (mem_req_addr_nc),
|
||||
.mem_req_pmask_in (mem_req_pmask_nc),
|
||||
.mem_req_byteen_in (mem_req_byteen_nc),
|
||||
.mem_req_wsel_in (mem_req_wsel_nc),
|
||||
.mem_req_data_in (mem_req_data_nc),
|
||||
.mem_req_tag_in (mem_req_tag_nc),
|
||||
.mem_req_ready_in (mem_req_ready_nc),
|
||||
.mem_req_valid_in (mem_req_valid_c),
|
||||
.mem_req_rw_in (mem_req_rw_c),
|
||||
.mem_req_addr_in (mem_req_addr_c),
|
||||
.mem_req_pmask_in (mem_req_pmask_c),
|
||||
.mem_req_byteen_in (mem_req_byteen_c),
|
||||
.mem_req_wsel_in (mem_req_wsel_c),
|
||||
.mem_req_data_in (mem_req_data_c),
|
||||
.mem_req_tag_in (mem_req_tag_c),
|
||||
.mem_req_ready_in (mem_req_ready_c),
|
||||
|
||||
// Memory request out
|
||||
.mem_req_valid_out (mem_req_valid_sb),
|
||||
|
@ -331,40 +330,40 @@ module VX_cache #(
|
|||
.mem_rsp_ready_in (mem_rsp_ready),
|
||||
|
||||
// Memory response out
|
||||
.mem_rsp_valid_out (mem_rsp_valid_nc),
|
||||
.mem_rsp_data_out (mem_rsp_data_nc),
|
||||
.mem_rsp_tag_out (mem_rsp_tag_nc),
|
||||
.mem_rsp_ready_out (mem_rsp_ready_nc)
|
||||
.mem_rsp_valid_out (mem_rsp_valid_c),
|
||||
.mem_rsp_data_out (mem_rsp_data_c),
|
||||
.mem_rsp_tag_out (mem_rsp_tag_c),
|
||||
.mem_rsp_ready_out (mem_rsp_ready_c)
|
||||
);
|
||||
end else begin
|
||||
assign core_req_valid_nc = core_req_valid;
|
||||
assign core_req_rw_nc = core_req_rw;
|
||||
assign core_req_addr_nc = core_req_addr;
|
||||
assign core_req_byteen_nc = core_req_byteen;
|
||||
assign core_req_data_nc = core_req_data;
|
||||
assign core_req_tag_nc = core_req_tag;
|
||||
assign core_req_ready = core_req_ready_nc;
|
||||
assign core_req_valid_c = core_req_valid;
|
||||
assign core_req_rw_c = core_req_rw;
|
||||
assign core_req_addr_c = core_req_addr;
|
||||
assign core_req_byteen_c = core_req_byteen;
|
||||
assign core_req_data_c = core_req_data;
|
||||
assign core_req_tag_c = core_req_tag;
|
||||
assign core_req_ready = core_req_ready_c;
|
||||
|
||||
assign core_rsp_valid_sb = core_rsp_valid_nc;
|
||||
assign core_rsp_tmask_sb = core_rsp_tmask_nc;
|
||||
assign core_rsp_data_sb = core_rsp_data_nc;
|
||||
assign core_rsp_tag_sb = core_rsp_tag_nc;
|
||||
assign core_rsp_ready_nc = core_rsp_ready_sb;
|
||||
assign core_rsp_valid_sb = core_rsp_valid_c;
|
||||
assign core_rsp_tmask_sb = core_rsp_tmask_c;
|
||||
assign core_rsp_data_sb = core_rsp_data_c;
|
||||
assign core_rsp_tag_sb = core_rsp_tag_c;
|
||||
assign core_rsp_ready_c = core_rsp_ready_sb;
|
||||
|
||||
assign mem_req_valid_sb = mem_req_valid_nc;
|
||||
assign mem_req_addr_sb = mem_req_addr_nc;
|
||||
assign mem_req_rw_p = mem_req_rw_nc;
|
||||
assign mem_req_pmask_p = mem_req_pmask_nc;
|
||||
assign mem_req_byteen_p = mem_req_byteen_nc;
|
||||
assign mem_req_wsel_p = mem_req_wsel_nc;
|
||||
assign mem_req_data_p = mem_req_data_nc;
|
||||
assign mem_req_tag_sb = mem_req_tag_nc;
|
||||
assign mem_req_ready_nc = mem_req_ready_sb;
|
||||
assign mem_req_valid_sb = mem_req_valid_c;
|
||||
assign mem_req_addr_sb = mem_req_addr_c;
|
||||
assign mem_req_rw_p = mem_req_rw_c;
|
||||
assign mem_req_pmask_p = mem_req_pmask_c;
|
||||
assign mem_req_byteen_p = mem_req_byteen_c;
|
||||
assign mem_req_wsel_p = mem_req_wsel_c;
|
||||
assign mem_req_data_p = mem_req_data_c;
|
||||
assign mem_req_tag_sb = mem_req_tag_c;
|
||||
assign mem_req_ready_c = mem_req_ready_sb;
|
||||
|
||||
assign mem_rsp_valid_nc = mem_rsp_valid;
|
||||
assign mem_rsp_data_nc = mem_rsp_data;
|
||||
assign mem_rsp_tag_nc = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_ready_nc;
|
||||
assign mem_rsp_valid_c = mem_rsp_valid;
|
||||
assign mem_rsp_data_c = mem_rsp_data;
|
||||
assign mem_rsp_tag_c = mem_rsp_tag;
|
||||
assign mem_rsp_ready = mem_rsp_ready_c;
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
@ -383,15 +382,15 @@ module VX_cache #(
|
|||
) mem_rsp_queue (
|
||||
.clk (clk),
|
||||
.reset (mrsq_reset),
|
||||
.ready_in (mem_rsp_ready_nc),
|
||||
.valid_in (mem_rsp_valid_nc),
|
||||
.data_in ({mem_rsp_tag_nc, mem_rsp_data_nc}),
|
||||
.ready_in (mem_rsp_ready_c),
|
||||
.valid_in (mem_rsp_valid_c),
|
||||
.data_in ({mem_rsp_tag_c, mem_rsp_data_c}),
|
||||
.data_out ({mem_rsp_tag_qual, mem_rsp_data_qual}),
|
||||
.ready_out (mrsq_out_ready),
|
||||
.valid_out (mrsq_out_valid)
|
||||
);
|
||||
|
||||
`UNUSED_VAR (mem_rsp_tag_nc)
|
||||
`UNUSED_VAR (mem_rsp_tag_c)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -464,13 +463,13 @@ module VX_cache #(
|
|||
`ifdef PERF_ENABLE
|
||||
.bank_stalls(perf_cache_if.bank_stalls),
|
||||
`endif
|
||||
.core_req_valid (core_req_valid_nc),
|
||||
.core_req_rw (core_req_rw_nc),
|
||||
.core_req_addr (core_req_addr_nc),
|
||||
.core_req_byteen (core_req_byteen_nc),
|
||||
.core_req_data (core_req_data_nc),
|
||||
.core_req_tag (core_req_tag_nc),
|
||||
.core_req_ready (core_req_ready_nc),
|
||||
.core_req_valid (core_req_valid_c),
|
||||
.core_req_rw (core_req_rw_c),
|
||||
.core_req_addr (core_req_addr_c),
|
||||
.core_req_byteen (core_req_byteen_c),
|
||||
.core_req_data (core_req_data_c),
|
||||
.core_req_tag (core_req_tag_c),
|
||||
.core_req_ready (core_req_ready_c),
|
||||
.per_bank_core_req_valid (per_bank_core_req_valid),
|
||||
.per_bank_core_req_pmask (per_bank_core_req_pmask),
|
||||
.per_bank_core_req_rw (per_bank_core_req_rw),
|
||||
|
@ -592,7 +591,6 @@ module VX_cache #(
|
|||
.perf_read_misses (perf_read_miss_per_bank[i]),
|
||||
.perf_write_misses (perf_write_miss_per_bank[i]),
|
||||
.perf_mshr_stalls (perf_mshr_stall_per_bank[i]),
|
||||
.perf_pipe_stalls (perf_pipe_stall_per_bank[i]),
|
||||
`endif
|
||||
|
||||
// Core request
|
||||
|
@ -655,11 +653,11 @@ module VX_cache #(
|
|||
.per_bank_core_rsp_tag (per_bank_core_rsp_tag),
|
||||
.per_bank_core_rsp_tid (per_bank_core_rsp_tid),
|
||||
.per_bank_core_rsp_ready (per_bank_core_rsp_ready),
|
||||
.core_rsp_valid (core_rsp_valid_nc),
|
||||
.core_rsp_tmask (core_rsp_tmask_nc),
|
||||
.core_rsp_tag (core_rsp_tag_nc),
|
||||
.core_rsp_data (core_rsp_data_nc),
|
||||
.core_rsp_ready (core_rsp_ready_nc)
|
||||
.core_rsp_valid (core_rsp_valid_c),
|
||||
.core_rsp_tmask (core_rsp_tmask_c),
|
||||
.core_rsp_tag (core_rsp_tag_c),
|
||||
.core_rsp_data (core_rsp_data_c),
|
||||
.core_rsp_ready (core_rsp_ready_c)
|
||||
);
|
||||
|
||||
wire [NUM_BANKS-1:0][(`MEM_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1 + NUM_PORTS * (1 + WORD_SIZE + WORD_SELECT_BITS + `WORD_WIDTH))-1:0] data_in;
|
||||
|
@ -681,15 +679,15 @@ module VX_cache #(
|
|||
.valid_in (per_bank_mem_req_valid),
|
||||
.data_in (data_in),
|
||||
.ready_in (per_bank_mem_req_ready),
|
||||
.valid_out (mem_req_valid_nc),
|
||||
.data_out ({mem_req_addr_nc, mem_req_id, mem_req_rw_nc, mem_req_pmask_nc, mem_req_byteen_nc, mem_req_wsel_nc, mem_req_data_nc}),
|
||||
.ready_out (mem_req_ready_nc)
|
||||
.valid_out (mem_req_valid_c),
|
||||
.data_out ({mem_req_addr_c, mem_req_id, mem_req_rw_c, mem_req_pmask_c, mem_req_byteen_c, mem_req_wsel_c, mem_req_data_c}),
|
||||
.ready_out (mem_req_ready_c)
|
||||
);
|
||||
|
||||
if (NUM_BANKS == 1) begin
|
||||
assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'(mem_req_id);
|
||||
assign mem_req_tag_c = MEM_TAG_IN_WIDTH'(mem_req_id);
|
||||
end else begin
|
||||
assign mem_req_tag_nc = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_nc), mem_req_id});
|
||||
assign mem_req_tag_c = MEM_TAG_IN_WIDTH'({`MEM_ADDR_TO_BANK_ID(mem_req_addr_c), mem_req_id});
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
@ -697,12 +695,21 @@ module VX_cache #(
|
|||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid_c & core_req_ready_c & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid_c & core_req_ready_c & core_req_rw;
|
||||
|
||||
// per cycle: read misses, write misses, mshr stalls, core response stalls
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);
|
||||
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
|
||||
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
|
||||
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
|
||||
|
||||
if (CORE_TAG_ID_BITS != 0) begin
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}};
|
||||
|
@ -712,23 +719,14 @@ module VX_cache #(
|
|||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
|
||||
end
|
||||
|
||||
// per cycle: read misses, write misses, mshr stalls, pipeline stalls
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_read_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_write_miss_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_mshr_stall_per_cycle;
|
||||
wire [$clog2(NUM_BANKS+1)-1:0] perf_pipe_stall_per_cycle;
|
||||
|
||||
`POP_COUNT(perf_read_miss_per_cycle, perf_read_miss_per_bank);
|
||||
`POP_COUNT(perf_write_miss_per_cycle, perf_write_miss_per_bank);
|
||||
`POP_COUNT(perf_mshr_stall_per_cycle, perf_mshr_stall_per_bank);
|
||||
`POP_COUNT(perf_pipe_stall_per_cycle, perf_pipe_stall_per_bank);
|
||||
wire perf_mem_stall_per_cycle = mem_req_valid & ~mem_req_ready;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_read_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pipe_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -738,16 +736,16 @@ module VX_cache #(
|
|||
perf_read_misses <= 0;
|
||||
perf_write_misses <= 0;
|
||||
perf_mshr_stalls <= 0;
|
||||
perf_pipe_stalls <= 0;
|
||||
perf_mem_stalls <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses+ `PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_pipe_stalls <= perf_pipe_stalls + `PERF_CTR_BITS'(perf_pipe_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -756,7 +754,7 @@ module VX_cache #(
|
|||
assign perf_cache_if.read_misses = perf_read_misses;
|
||||
assign perf_cache_if.write_misses = perf_write_misses;
|
||||
assign perf_cache_if.mshr_stalls = perf_mshr_stalls;
|
||||
assign perf_cache_if.pipe_stalls = perf_pipe_stalls;
|
||||
assign perf_cache_if.mem_stalls = perf_mem_stalls;
|
||||
assign perf_cache_if.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
|
|
25
hw/rtl/cache/VX_shared_mem.sv
vendored
25
hw/rtl/cache/VX_shared_mem.sv
vendored
|
@ -335,21 +335,13 @@ module VX_shared_mem #(
|
|||
// per cycle: core_reads, core_writes
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_reads_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_core_writes_per_cycle;
|
||||
wire [$clog2(NUM_REQS+1)-1:0] perf_crsp_stall_per_cycle;
|
||||
|
||||
wire [NUM_REQS-1:0] perf_core_reads_per_mask = core_req_valid & core_req_ready & ~core_req_rw;
|
||||
wire [NUM_REQS-1:0] perf_core_writes_per_mask = core_req_valid & core_req_ready & core_req_rw;
|
||||
|
||||
`POP_COUNT(perf_core_reads_per_cycle, perf_core_reads_per_mask);
|
||||
`POP_COUNT(perf_core_writes_per_cycle, perf_core_writes_per_mask);
|
||||
|
||||
if (CORE_TAG_ID_BITS != 0) begin
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_tmask & {NUM_REQS{core_rsp_valid && ~core_rsp_ready}};
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
|
||||
end else begin
|
||||
wire [NUM_REQS-1:0] perf_crsp_stall_per_mask = core_rsp_valid & ~core_rsp_ready;
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_mask);
|
||||
end
|
||||
wire perf_crsp_stall_per_cycle = core_rsp_valid & ~core_rsp_ready;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
|
@ -357,13 +349,13 @@ module VX_shared_mem #(
|
|||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_core_reads <= 0;
|
||||
perf_core_writes <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
perf_core_reads <= 0;
|
||||
perf_core_writes <= 0;
|
||||
perf_crsp_stalls <= 0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -371,7 +363,8 @@ module VX_shared_mem #(
|
|||
assign perf_cache_if.writes = perf_core_writes;
|
||||
assign perf_cache_if.read_misses = '0;
|
||||
assign perf_cache_if.write_misses = '0;
|
||||
assign perf_cache_if.pipe_stalls = '0;
|
||||
assign perf_cache_if.mshr_stalls = '0;
|
||||
assign perf_cache_if.mem_stalls = '0;
|
||||
assign perf_cache_if.crsp_stalls = perf_crsp_stalls;
|
||||
`endif
|
||||
|
||||
|
|
|
@ -5,7 +5,8 @@
|
|||
|
||||
interface VX_alu_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -24,6 +25,7 @@ interface VX_alu_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -43,6 +45,7 @@ interface VX_alu_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -5,9 +5,12 @@
|
|||
|
||||
interface VX_cmt_to_csr_if ();
|
||||
|
||||
wire valid;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] commit_size;
|
||||
|
||||
wire valid;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [$clog2(6*`NUM_THREADS+1)-1:0] commit_size;
|
||||
`else
|
||||
wire [$clog2(5*`NUM_THREADS+1)-1:0] commit_size;
|
||||
`endif
|
||||
modport master (
|
||||
output valid,
|
||||
output commit_size
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_commit_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -17,6 +18,7 @@ interface VX_commit_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -29,6 +31,7 @@ interface VX_commit_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_csr_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -20,6 +21,7 @@ interface VX_csr_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -35,6 +37,7 @@ interface VX_csr_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_decode_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -23,7 +24,8 @@ interface VX_decode_if ();
|
|||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -42,7 +44,8 @@ interface VX_decode_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_fpu_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -20,6 +21,7 @@ interface VX_fpu_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -35,6 +37,7 @@ interface VX_fpu_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
interface VX_gpu_req_if();
|
||||
|
||||
wire valid;
|
||||
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -19,11 +19,11 @@ interface VX_gpu_req_if();
|
|||
wire [`NUM_THREADS-1:0][31:0] rs3_data;
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire wb;
|
||||
|
||||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -41,6 +41,7 @@ interface VX_gpu_req_if();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_ibuffer_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -31,6 +32,7 @@ interface VX_ibuffer_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -55,6 +57,7 @@ interface VX_ibuffer_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -5,14 +5,16 @@
|
|||
|
||||
interface VX_ifetch_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [31:0] PC;
|
||||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output valid,
|
||||
output uuid,
|
||||
output tmask,
|
||||
output wid,
|
||||
output PC,
|
||||
|
@ -20,7 +22,8 @@ interface VX_ifetch_req_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input valid,
|
||||
input uuid,
|
||||
input tmask,
|
||||
input wid,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_ifetch_rsp_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [31:0] PC;
|
||||
|
@ -13,7 +14,8 @@ interface VX_ifetch_rsp_if ();
|
|||
wire ready;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
output valid,
|
||||
output uuid,
|
||||
output tmask,
|
||||
output wid,
|
||||
output PC,
|
||||
|
@ -22,7 +24,8 @@ interface VX_ifetch_rsp_if ();
|
|||
);
|
||||
|
||||
modport slave (
|
||||
input valid,
|
||||
input valid,
|
||||
input uuid,
|
||||
input tmask,
|
||||
input wid,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_lsu_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -21,6 +22,7 @@ interface VX_lsu_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -37,6 +39,7 @@ interface VX_lsu_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -11,7 +11,7 @@ interface VX_perf_cache_if ();
|
|||
wire [`PERF_CTR_BITS-1:0] write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mshr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] crsp_stalls;
|
||||
|
||||
modport master (
|
||||
|
@ -21,7 +21,7 @@ interface VX_perf_cache_if ();
|
|||
output write_misses,
|
||||
output bank_stalls,
|
||||
output mshr_stalls,
|
||||
output pipe_stalls,
|
||||
output mem_stalls,
|
||||
output crsp_stalls
|
||||
);
|
||||
|
||||
|
@ -32,7 +32,7 @@ interface VX_perf_cache_if ();
|
|||
input write_misses,
|
||||
input bank_stalls,
|
||||
input mshr_stalls,
|
||||
input pipe_stalls,
|
||||
input mem_stalls,
|
||||
input crsp_stalls
|
||||
);
|
||||
|
||||
|
|
|
@ -7,68 +7,50 @@ interface VX_perf_memsys_if ();
|
|||
|
||||
wire [`PERF_CTR_BITS-1:0] icache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] icache_crsp_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_read_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_write_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_bank_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_mshr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_pipe_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] dcache_crsp_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] smem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] smem_bank_stalls;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] mem_reads;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_writes;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] mem_latency;
|
||||
|
||||
modport master (
|
||||
output icache_reads,
|
||||
output icache_read_misses,
|
||||
output icache_pipe_stalls,
|
||||
output icache_crsp_stalls,
|
||||
output dcache_reads,
|
||||
output dcache_writes,
|
||||
output dcache_writes,
|
||||
output dcache_read_misses,
|
||||
output dcache_write_misses,
|
||||
output dcache_bank_stalls,
|
||||
output dcache_mshr_stalls,
|
||||
output dcache_pipe_stalls,
|
||||
output dcache_crsp_stalls,
|
||||
output smem_reads,
|
||||
output smem_writes,
|
||||
output smem_bank_stalls,
|
||||
output mem_reads,
|
||||
output mem_writes,
|
||||
output mem_stalls,
|
||||
output mem_latency
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input icache_reads,
|
||||
input icache_read_misses,
|
||||
input icache_pipe_stalls,
|
||||
input icache_crsp_stalls,
|
||||
input dcache_reads,
|
||||
input dcache_writes,
|
||||
input dcache_writes,
|
||||
input dcache_read_misses,
|
||||
input dcache_write_misses,
|
||||
input dcache_bank_stalls,
|
||||
input dcache_mshr_stalls,
|
||||
input dcache_pipe_stalls,
|
||||
input dcache_crsp_stalls,
|
||||
input smem_reads,
|
||||
input smem_writes,
|
||||
input smem_bank_stalls,
|
||||
input mem_reads,
|
||||
input mem_writes,
|
||||
input mem_stalls,
|
||||
input mem_latency
|
||||
);
|
||||
|
||||
|
|
|
@ -4,18 +4,27 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
interface VX_perf_pipeline_if ();
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] csr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] alu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] loads;
|
||||
wire [`PERF_CTR_BITS-1:0] stores;
|
||||
wire [`PERF_CTR_BITS-1:0] branches;
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] ibf_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] scb_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] lsu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] csr_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] alu_stalls;
|
||||
`ifdef EXT_F_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] fpu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] fpu_stalls;
|
||||
`endif
|
||||
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
|
||||
wire [`PERF_CTR_BITS-1:0] gpu_stalls;
|
||||
|
||||
modport master (
|
||||
modport decode (
|
||||
output loads,
|
||||
output stores,
|
||||
output branches
|
||||
);
|
||||
|
||||
modport issue (
|
||||
output ibf_stalls,
|
||||
output scb_stalls,
|
||||
output lsu_stalls,
|
||||
|
@ -25,9 +34,12 @@ interface VX_perf_pipeline_if ();
|
|||
output fpu_stalls,
|
||||
`endif
|
||||
output gpu_stalls
|
||||
);
|
||||
);
|
||||
|
||||
modport slave (
|
||||
input loads,
|
||||
input stores,
|
||||
input branches,
|
||||
input ibf_stalls,
|
||||
input scb_stalls,
|
||||
input lsu_stalls,
|
||||
|
|
23
hw/rtl/interfaces/VX_perf_tex_if.sv
Normal file
23
hw/rtl/interfaces/VX_perf_tex_if.sv
Normal file
|
@ -0,0 +1,23 @@
|
|||
`ifndef VX_PERF_TEX_IF
`define VX_PERF_TEX_IF

`include "VX_define.vh"

// Bundle of texture-unit performance counters.
// The `master` side drives the counters; the `slave` side only observes them.
interface VX_perf_tex_if ();

    // Each counter is `PERF_CTR_BITS wide.
    wire [`PERF_CTR_BITS-1:0] mem_reads;    // memory read counter
    wire [`PERF_CTR_BITS-1:0] mem_latency;  // memory latency counter

    // Producer view: both counters are outputs.
    modport master (
        output mem_reads,
        output mem_latency
    );

    // Consumer view: both counters are inputs.
    modport slave (
        input mem_reads,
        input mem_latency
    );

endinterface

`endif
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_tex_req_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -20,6 +21,7 @@ interface VX_tex_req_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -33,6 +35,7 @@ interface VX_tex_req_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_tex_rsp_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [31:0] PC;
|
||||
|
@ -16,6 +17,7 @@ interface VX_tex_rsp_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output wid,
|
||||
output tmask,
|
||||
output PC,
|
||||
|
@ -27,6 +29,7 @@ interface VX_tex_rsp_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input wid,
|
||||
input tmask,
|
||||
input PC,
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
interface VX_writeback_if ();
|
||||
|
||||
wire valid;
|
||||
wire [63:0] uuid;
|
||||
wire [`NUM_THREADS-1:0] tmask;
|
||||
wire [`NW_BITS-1:0] wid;
|
||||
wire [31:0] PC;
|
||||
|
@ -16,6 +17,7 @@ interface VX_writeback_if ();
|
|||
|
||||
modport master (
|
||||
output valid,
|
||||
output uuid,
|
||||
output tmask,
|
||||
output wid,
|
||||
output PC,
|
||||
|
@ -27,6 +29,7 @@ interface VX_writeback_if ();
|
|||
|
||||
modport slave (
|
||||
input valid,
|
||||
input uuid,
|
||||
input tmask,
|
||||
input wid,
|
||||
input PC,
|
||||
|
|
|
@ -125,7 +125,7 @@ module VX_axi_adapter #(
|
|||
|
||||
// AXI write response channel
|
||||
`UNUSED_VAR (m_axi_bid);
|
||||
`RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("AXI response error"));
|
||||
`RUNTIME_ASSERT(~m_axi_bvalid || m_axi_bresp == 0, ("%t: *** AXI response error", $time));
|
||||
assign m_axi_bready = 1'b1;
|
||||
|
||||
// AXI read request channel
|
||||
|
@ -144,7 +144,7 @@ module VX_axi_adapter #(
|
|||
assign mem_rsp_valid = m_axi_rvalid;
|
||||
assign mem_rsp_tag = m_axi_rid;
|
||||
assign mem_rsp_data = m_axi_rdata;
|
||||
`RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("AXI response error"));
|
||||
`RUNTIME_ASSERT(~m_axi_rvalid || m_axi_rresp == 0, ("%t: *** AXI response error", $time));
|
||||
`UNUSED_VAR (m_axi_rlast);
|
||||
assign m_axi_rready = mem_rsp_ready;
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ module VX_index_queue #(
|
|||
assign enqueue = push;
|
||||
assign dequeue = !empty && !valid[rd_a]; // auto-remove when head is invalid
|
||||
|
||||
`RUNTIME_ASSERT(!push || !full, ("invalid inputs"));
|
||||
`RUNTIME_ASSERT(!push || !full, ("%t: *** invalid inputs", $time));
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
|
|
@ -4,12 +4,17 @@
|
|||
module VX_popcount #(
|
||||
parameter MODEL = 1,
|
||||
parameter N = 1,
|
||||
parameter LOGN = $clog2(N),
|
||||
parameter M = LOGN+1
|
||||
parameter M = $clog2(N+1)
|
||||
) (
|
||||
input wire [N-1:0] in_i,
|
||||
output wire [M-1:0] cnt_o
|
||||
);
|
||||
`ifndef SYNTHESIS
|
||||
assign cnt_o = $countones(in_i);
|
||||
`else
|
||||
`ifdef QUARTUS
|
||||
assign cnt_o = $countones(in_i);
|
||||
`else
|
||||
if (N == 1) begin
|
||||
|
||||
assign cnt_o = in_i;
|
||||
|
@ -53,6 +58,8 @@ module VX_popcount #(
|
|||
assign cnt_o = cnt_r;
|
||||
|
||||
end
|
||||
`endif
|
||||
`endif
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
|
@ -30,7 +30,7 @@ module VX_skid_buffer #(
|
|||
|
||||
end else if (NOBACKPRESSURE) begin
|
||||
|
||||
`RUNTIME_ASSERT(ready_out, ("ready_out should always be asserted"))
|
||||
`RUNTIME_ASSERT(ready_out, ("%t: *** ready_out should always be asserted", $time))
|
||||
|
||||
wire stall = valid_out && ~ready_out;
|
||||
|
||||
|
|
|
@ -6,6 +6,11 @@ module VX_tex_unit #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// PERF
|
||||
`ifdef PERF_ENABLE
|
||||
VX_perf_tex_if.master perf_tex_if,
|
||||
`endif
|
||||
|
||||
// Texture unit <-> Memory Unit
|
||||
VX_dcache_req_if.master dcache_req_if,
|
||||
VX_dcache_rsp_if.slave dcache_rsp_if,
|
||||
|
@ -18,10 +23,11 @@ module VX_tex_unit #(
|
|||
VX_tex_rsp_if.master tex_rsp_if
|
||||
);
|
||||
|
||||
localparam REQ_INFOW_S = `NR_BITS + 1 + `NW_BITS + 32;
|
||||
localparam REQ_INFOW_S = 64 + `NR_BITS + 1 + `NW_BITS + 32;
|
||||
localparam REQ_INFOW_A = `TEX_FORMAT_BITS + REQ_INFOW_S;
|
||||
localparam REQ_INFOW_M = (2 * `NUM_THREADS * `TEX_BLEND_FRAC) + REQ_INFOW_A;
|
||||
|
||||
reg [$clog2(`NUM_TEX_UNITS)-1:0] csr_tex_unit;
|
||||
reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][`TEX_LOD_MAX+1-1:0];
|
||||
reg [1:0][`TEX_LOD_BITS-1:0] tex_logdims [`NUM_TEX_UNITS-1:0];
|
||||
reg [1:0][`TEX_WRAP_BITS-1:0] tex_wraps [`NUM_TEX_UNITS-1:0];
|
||||
|
@ -29,57 +35,60 @@ module VX_tex_unit #(
|
|||
reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0];
|
||||
reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0];
|
||||
|
||||
// CSRs programming
|
||||
// CSRs programming
|
||||
|
||||
reg [`NUM_TEX_UNITS-1:0] csrs_dirty;
|
||||
reg csrs_dirty [`NUM_TEX_UNITS-1:0];
|
||||
`UNUSED_VAR (csrs_dirty)
|
||||
|
||||
for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
|
||||
always @(posedge clk) begin
|
||||
if (tex_csr_if.write_enable) begin
|
||||
case (tex_csr_if.write_addr)
|
||||
`CSR_TEX(i, `TEX_STATE_ADDR) : begin
|
||||
tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX(i, `TEX_STATE_FORMAT) : begin
|
||||
tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX(i, `TEX_STATE_WRAPU) : begin
|
||||
tex_wraps[i][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX(i, `TEX_STATE_WRAPV) : begin
|
||||
tex_wraps[i][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX(i, `TEX_STATE_FILTER) : begin
|
||||
tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX(i, `TEX_STATE_WIDTH) : begin
|
||||
tex_logdims[i][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
`CSR_TEX(i, `TEX_STATE_HEIGHT) : begin
|
||||
tex_logdims[i][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
default: begin
|
||||
for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
if (tex_csr_if.write_addr == `CSR_ADDR_BITS'(`CSR_TEX(i, `TEX_STATE_MIPOFF(j)))) begin
|
||||
`IGNORE_WARNINGS_END
|
||||
tex_mipoff[i][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
|
||||
csrs_dirty[i] <= 1;
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
if (tex_csr_if.write_enable) begin
|
||||
case (tex_csr_if.write_addr)
|
||||
`CSR_TEX_UNIT: begin
|
||||
csr_tex_unit <= tex_csr_if.write_data[$clog2(`NUM_TEX_UNITS)-1:0];
|
||||
end
|
||||
`CSR_TEX_ADDR: begin
|
||||
tex_baddr[csr_tex_unit] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
`CSR_TEX_FORMAT: begin
|
||||
tex_format[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
`CSR_TEX_WRAPU: begin
|
||||
tex_wraps[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
`CSR_TEX_WRAPV: begin
|
||||
tex_wraps[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_WRAP_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
`CSR_TEX_FILTER: begin
|
||||
tex_filter[csr_tex_unit] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
`CSR_TEX_WIDTH: begin
|
||||
tex_logdims[csr_tex_unit][0] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
`CSR_TEX_HEIGHT: begin
|
||||
tex_logdims[csr_tex_unit][1] <= tex_csr_if.write_data[`TEX_LOD_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
default: begin
|
||||
for (integer j = 0; j <= `TEX_LOD_MAX; ++j) begin
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
if (tex_csr_if.write_addr == `CSR_TEX_MIPOFF(j)) begin
|
||||
`IGNORE_WARNINGS_END
|
||||
tex_mipoff[csr_tex_unit][j] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
|
||||
csrs_dirty[csr_tex_unit] <= 1;
|
||||
end
|
||||
end
|
||||
endcase
|
||||
end
|
||||
if (reset || (tex_req_if.valid && tex_req_if.ready)) begin
|
||||
csrs_dirty[i] <= '0;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
if (reset || (tex_req_if.valid && tex_req_if.ready)) begin
|
||||
for (integer i = 0; i < `NUM_TEX_UNITS; ++i) begin
|
||||
csrs_dirty[i] <= 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -125,7 +134,7 @@ module VX_tex_unit #(
|
|||
.req_baseaddr(tex_baddr[tex_req_if.unit]),
|
||||
.req_mipoff (sel_mipoff),
|
||||
.req_logdims(sel_logdims),
|
||||
.req_info ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
|
||||
.req_info ({tex_format[tex_req_if.unit], tex_req_if.uuid, tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
|
||||
.req_ready (tex_req_if.ready),
|
||||
|
||||
.rsp_valid (mem_req_valid),
|
||||
|
@ -204,9 +213,47 @@ module VX_tex_unit #(
|
|||
.rsp_valid (tex_rsp_if.valid),
|
||||
.rsp_tmask (tex_rsp_if.tmask),
|
||||
.rsp_data (tex_rsp_if.data),
|
||||
.rsp_info ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
|
||||
.rsp_info ({tex_rsp_if.uuid, tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
|
||||
.rsp_ready (tex_rsp_if.ready)
|
||||
);
|
||||
);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_req_per_cycle;
|
||||
wire [$clog2(`NUM_THREADS+1)-1:0] perf_mem_rsp_per_cycle;
|
||||
|
||||
wire [`NUM_THREADS-1:0] perf_mem_req_per_mask = dcache_req_if.valid & dcache_req_if.ready;
|
||||
wire [`NUM_THREADS-1:0] perf_mem_rsp_per_mask = dcache_rsp_if.tmask & {`NUM_THREADS{dcache_rsp_if.valid & dcache_rsp_if.ready}};
|
||||
|
||||
`POP_COUNT(perf_mem_req_per_cycle, perf_mem_req_per_mask);
|
||||
`POP_COUNT(perf_mem_rsp_per_cycle, perf_mem_rsp_per_mask);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_pending_reads;
|
||||
wire [$clog2(`NUM_THREADS+1)+1-1:0] perf_pending_reads_cycle = perf_mem_req_per_cycle - perf_mem_rsp_per_cycle;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_pending_reads <= 0;
|
||||
end else begin
|
||||
perf_pending_reads <= perf_pending_reads + `PERF_CTR_BITS'($signed(perf_pending_reads_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_latency;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_reads <= 0;
|
||||
perf_mem_latency <= 0;
|
||||
end else begin
|
||||
perf_mem_reads <= perf_mem_reads + `PERF_CTR_BITS'(perf_mem_req_per_cycle);
|
||||
perf_mem_latency <= perf_mem_latency + `PERF_CTR_BITS'(perf_pending_reads);
|
||||
end
|
||||
end
|
||||
|
||||
assign perf_tex_if.mem_reads = perf_mem_reads;
|
||||
assign perf_tex_if.mem_latency = perf_mem_latency;
|
||||
`endif
|
||||
|
||||
`ifdef DBG_TRACE_TEX
|
||||
always @(posedge clk) begin
|
||||
|
|
|
@ -123,9 +123,9 @@
|
|||
"!cci_pending_writes_full": 1,
|
||||
"?afu_mem_req_fire": 1,
|
||||
"afu_mem_req_addr": 26,
|
||||
"afu_mem_req_tag": 27,
|
||||
"afu_mem_req_tag": "`VX_MEM_TAG_WIDTH+1",
|
||||
"?afu_mem_rsp_fire": 1,
|
||||
"afu_mem_rsp_tag": 27
|
||||
"afu_mem_rsp_tag": "`VX_MEM_TAG_WIDTH+1"
|
||||
},
|
||||
"afu/vortex": {
|
||||
"!reset": 1,
|
||||
|
@ -140,49 +140,29 @@
|
|||
"mem_rsp_tag":"`VX_MEM_TAG_WIDTH",
|
||||
"busy": 1
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/fetch/icache_stage": {
|
||||
"?icache_req_fire": 1,
|
||||
"icache_req_wid":"`NW_BITS",
|
||||
"icache_req_addr": 32,
|
||||
"icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS",
|
||||
"?icache_rsp_fire": 1,
|
||||
"icache_rsp_data": 32,
|
||||
"icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS"
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/fetch/warp_sched": {
|
||||
"?wsched_scheduled": 1,
|
||||
"wsched_schedule_uuid": 64,
|
||||
"wsched_active_warps": "`NUM_WARPS",
|
||||
"wsched_stalled_warps": "`NUM_WARPS",
|
||||
"wsched_schedule_tmask": "`NUM_THREADS",
|
||||
"wsched_schedule_wid": "`NW_BITS",
|
||||
"wsched_schedule_pc": "32"
|
||||
"wsched_schedule_pc": 32
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/gpu_unit": {
|
||||
"?gpu_rsp_valid": 1,
|
||||
"gpu_rsp_wid": "`NW_BITS",
|
||||
"gpu_rsp_tmc": 1,
|
||||
"gpu_rsp_wspawn": 1,
|
||||
"gpu_rsp_split": 1,
|
||||
"gpu_rsp_barrier": 1
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/lsu_unit": {
|
||||
"?dcache_req_fire":"`NUM_THREADS",
|
||||
"dcache_req_wid":"`NW_BITS",
|
||||
"dcache_req_pc": 32,
|
||||
"dcache_req_addr":"`NUM_THREADS * 32",
|
||||
"dcache_req_rw": 1,
|
||||
"dcache_req_byteen":"`NUM_THREADS * 4",
|
||||
"dcache_req_data": "`NUM_THREADS * 32",
|
||||
"dcache_req_tag":"`LSUQ_ADDR_BITS",
|
||||
"?dcache_rsp_fire":"`NUM_THREADS",
|
||||
"dcache_rsp_data":"`NUM_THREADS * 32",
|
||||
"dcache_rsp_tag":"`LSUQ_ADDR_BITS"
|
||||
"afu/vortex/cluster/core/pipeline/fetch/icache_stage": {
|
||||
"?icache_req_fire": 1,
|
||||
"icache_req_uuid": 64,
|
||||
"icache_req_addr": 32,
|
||||
"icache_req_tag":"`ICACHE_CORE_TAG_ID_BITS",
|
||||
"?icache_rsp_fire": 1,
|
||||
"icache_rsp_uuid": 64,
|
||||
"icache_rsp_data": 32,
|
||||
"icache_rsp_tag":"`ICACHE_CORE_TAG_ID_BITS"
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/issue": {
|
||||
"?issue_fire": 1,
|
||||
"issue_wid":"`NW_BITS",
|
||||
"issue_tmask":"`NUM_THREADS",
|
||||
"issue_pc": 32,
|
||||
"issue_uuid": 64,
|
||||
"issue_tmask":"`NUM_THREADS",
|
||||
"issue_ex_type":"`EX_BITS",
|
||||
"issue_op_type":"`INST_OP_BITS",
|
||||
"issue_op_mod":"`INST_MOD_BITS",
|
||||
|
@ -198,15 +178,35 @@
|
|||
"gpr_rs2":"`NUM_THREADS * 32",
|
||||
"gpr_rs3":"`NUM_THREADS * 32",
|
||||
"?writeback_valid": 1,
|
||||
"writeback_wid":"`NW_BITS",
|
||||
"writeback_pc": 32,
|
||||
"writeback_uuid": 64,
|
||||
"writeback_tmask":"`NUM_THREADS",
|
||||
"writeback_rd":"`NR_BITS",
|
||||
"writeback_data":"`NUM_THREADS * 32",
|
||||
"writeback_eop": 1,
|
||||
"!scoreboard_delay": 1,
|
||||
"!dispatch_delay": 1
|
||||
},
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/lsu_unit": {
|
||||
"?dcache_req_fire":"`NUM_THREADS",
|
||||
"dcache_req_uuid": 64,
|
||||
"dcache_req_addr":"`NUM_THREADS * 32",
|
||||
"dcache_req_rw": 1,
|
||||
"dcache_req_byteen":"`NUM_THREADS * 4",
|
||||
"dcache_req_data":"`NUM_THREADS * 32",
|
||||
"dcache_req_tag":"`LSUQ_ADDR_BITS",
|
||||
"?dcache_rsp_fire":"`NUM_THREADS",
|
||||
"dcache_rsp_uuid": 64,
|
||||
"dcache_rsp_data":"`NUM_THREADS * 32",
|
||||
"dcache_rsp_tag":"`LSUQ_ADDR_BITS"
|
||||
},
|
||||
"afu/vortex/cluster/core/pipeline/execute/gpu_unit": {
|
||||
"?gpu_rsp_valid": 1,
|
||||
"gpu_rsp_uuid": 64,
|
||||
"gpu_rsp_tmc": 1,
|
||||
"gpu_rsp_wspawn": 1,
|
||||
"gpu_rsp_split": 1,
|
||||
"gpu_rsp_barrier": 1
|
||||
},
|
||||
"afu/vortex/l3cache/bank, afu/vortex/cluster/l2cache/bank, afu/vortex/cluster/core/mem_unit/dcache/bank, afu/vortex/cluster/core/mem_unit/icache/bank": {
|
||||
"?valid_st0": 1,
|
||||
"?valid_st1": 1,
|
||||
|
|
|
@ -42,15 +42,9 @@ _start:
|
|||
.type _exit, @function
|
||||
.global _exit
|
||||
_exit:
|
||||
beqz a0, label_exit_next
|
||||
mv gp, a0
|
||||
ecall;
|
||||
|
||||
label_exit_next:
|
||||
# dump performance CSRs
|
||||
call vx_perf_dump
|
||||
|
||||
# disable all threads in current warp
|
||||
mv s0, a0
|
||||
call vx_perf_dump
|
||||
mv gp, s0
|
||||
li a0, 0
|
||||
.insn s 0x6b, 0, x0, 0(a0) # tmc a0
|
||||
|
||||
|
|
47
sim/common/mempool.h
Normal file
47
sim/common/mempool.h
Normal file
|
@ -0,0 +1,47 @@
|
|||
#pragma once

#include <cstdint>
#include <new>
#include <stack>
#include <utility>

/// Recycling pool of raw storage blocks, each sizeof(T) bytes.
///
/// The pool hands out *uninitialized* memory: callers are responsible for
/// constructing and destroying the T object that lives in a block (typically
/// by routing a class-specific operator new / operator delete through it).
/// Up to `max_size` released blocks are cached for reuse; any further
/// released block is returned to the global heap immediately.
///
/// NOTE(review): blocks come from plain ::operator new(size), so over-aligned
/// T is not supported - confirm no user requires extended alignment.
template <typename T>
class MemoryPool {
public:
  /// @param max_size maximum number of released blocks kept for reuse.
  MemoryPool(uint32_t max_size) : max_size_(max_size) {}

  /// Takes over the cached blocks and the capacity limit of `other`.
  /// `other` is left with an empty cache, so its destructor frees nothing.
  MemoryPool(MemoryPool&& other)
    : free_list_(std::move(other.free_list_))
    , max_size_(other.max_size_) // was left uninitialized before
  {
    // Make the moved-from state explicit so a block can never be freed twice.
    other.free_list_ = std::stack<void*>();
  }

  // The pool owns raw heap blocks; copying would lead to double frees.
  MemoryPool(const MemoryPool&) = delete;
  MemoryPool& operator=(const MemoryPool&) = delete;

  /// Releases every cached block back to the global heap.
  ~MemoryPool() {
    this->flush();
  }

  /// Returns a block of sizeof(T) uninitialized bytes, reusing the most
  /// recently released block when one is cached.
  /// Propagates std::bad_alloc from ::operator new when the heap is exhausted.
  void* allocate() {
    if (free_list_.empty()) {
      return ::operator new(sizeof(T));
    }
    void* mem = free_list_.top();
    free_list_.pop();
    return mem;
  }

  /// Gives a block obtained from allocate() back to the pool.
  /// The block is cached while fewer than `max_size` blocks are held,
  /// otherwise it is freed. Passing nullptr is a no-op.
  void deallocate(void* object) {
    if (object == nullptr) {
      // Never cache a null block: allocate() would hand it out later.
      return;
    }
    if (free_list_.size() < max_size_) {
      free_list_.push(object);
    } else {
      ::operator delete(object);
    }
  }

  /// Frees all cached blocks; blocks currently handed out are unaffected.
  void flush() {
    while (!free_list_.empty()) {
      ::operator delete(free_list_.top());
      free_list_.pop();
    }
  }

private:
  std::stack<void*> free_list_; // released blocks available for reuse (LIFO)
  uint32_t max_size_;           // upper bound on free_list_.size()
};
|
|
@ -7,6 +7,7 @@
|
|||
#include <list>
|
||||
#include <queue>
|
||||
#include <assert.h>
|
||||
#include "mempool.h"
|
||||
|
||||
class SimObjectBase;
|
||||
|
||||
|
@ -20,37 +21,14 @@ public:
|
|||
return module_;
|
||||
}
|
||||
|
||||
SimPortBase* peer() const {
|
||||
return peer_;
|
||||
}
|
||||
|
||||
bool connected() const {
|
||||
return (peer_ != nullptr);
|
||||
}
|
||||
|
||||
protected:
|
||||
SimPortBase(SimObjectBase* module)
|
||||
: module_(module)
|
||||
, peer_(nullptr)
|
||||
{}
|
||||
|
||||
void connect(SimPortBase* peer) {
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = peer;
|
||||
}
|
||||
|
||||
void disconnect() {
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = nullptr;
|
||||
}
|
||||
|
||||
SimPortBase& operator=(const SimPortBase&) = delete;
|
||||
|
||||
SimObjectBase* module_;
|
||||
SimPortBase* peer_;
|
||||
|
||||
template <typename U> friend class SlavePort;
|
||||
template <typename U> friend class MasterPort;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -58,72 +36,92 @@ protected:
|
|||
template <typename Pkt>
|
||||
class SimPort : public SimPortBase {
|
||||
public:
|
||||
void send(const Pkt& pkt, uint64_t delay) const;
|
||||
typedef std::function<void (const Pkt&, uint64_t)> TxCallback;
|
||||
|
||||
SimPort(SimObjectBase* module)
|
||||
: SimPortBase(module)
|
||||
, peer_(nullptr)
|
||||
, tx_cb_(nullptr)
|
||||
{}
|
||||
|
||||
void send(const Pkt& pkt, uint64_t delay = 1) const;
|
||||
|
||||
void bind(SimPort<Pkt>* peer) {
|
||||
this->connect(peer);
|
||||
assert(peer_ == nullptr);
|
||||
peer_ = peer;
|
||||
}
|
||||
|
||||
// Detaches this port from its peer. Safe to call on an unbound port.
// The previous precondition (peer_ == nullptr) was copied from bind() and
// made unbinding a bound port abort in debug builds.
void unbind() {
  peer_ = nullptr;
}
|
||||
|
||||
bool connected() const {
|
||||
return (peer_ != nullptr);
|
||||
}
|
||||
|
||||
SimPort* peer() const {
|
||||
return peer_;
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return queue_.empty();
|
||||
}
|
||||
|
||||
const Pkt& top() const {
|
||||
// Read-only access to the oldest queued packet.
// queue_ stores timed_pkt_t entries, so the payload is the .pkt member
// (same as the non-const overload).
const Pkt& front() const {
  return queue_.front().pkt;
}
|
||||
|
||||
Pkt& top() {
|
||||
return queue_.front();
|
||||
Pkt& front() {
|
||||
return queue_.front().pkt;
|
||||
}
|
||||
|
||||
void pop() {
|
||||
// Read-only access to the most recently queued packet.
// queue_ stores timed_pkt_t entries, so the payload is the .pkt member
// (same as the non-const overload).
const Pkt& back() const {
  return queue_.back().pkt;
}
|
||||
|
||||
Pkt& back() {
|
||||
return queue_.back().pkt;
|
||||
}
|
||||
|
||||
uint64_t pop() {
|
||||
auto cycle = queue_.front().cycle;
|
||||
queue_.pop();
|
||||
}
|
||||
return cycle;
|
||||
}
|
||||
|
||||
void tx_callback(const TxCallback& callback) {
|
||||
tx_cb_ = callback;
|
||||
}
|
||||
|
||||
protected:
|
||||
SimPort(SimObjectBase* module)
|
||||
: SimPortBase(module)
|
||||
{}
|
||||
struct timed_pkt_t {
|
||||
Pkt pkt;
|
||||
uint64_t cycle;
|
||||
};
|
||||
|
||||
void push(const Pkt& data) {
|
||||
queue_.push(data);
|
||||
std::queue<timed_pkt_t> queue_;
|
||||
SimPort* peer_;
|
||||
TxCallback tx_cb_;
|
||||
|
||||
void push(const Pkt& data, uint64_t cycle) {
|
||||
if (tx_cb_) {
|
||||
tx_cb_(data, cycle);
|
||||
}
|
||||
if (peer_) {
|
||||
peer_->push(data, cycle);
|
||||
} else {
|
||||
queue_.push({data, cycle});
|
||||
}
|
||||
}
|
||||
|
||||
SimPort& operator=(const SimPort&) = delete;
|
||||
|
||||
std::queue<Pkt> queue_;
|
||||
|
||||
template <typename U> friend class SimPortEvent;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class SlavePort : public SimPort<Pkt> {
|
||||
public:
|
||||
SlavePort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
protected:
|
||||
SlavePort& operator=(const SlavePort&) = delete;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename Pkt>
|
||||
class MasterPort : public SimPort<Pkt> {
|
||||
public:
|
||||
MasterPort(SimObjectBase* module) : SimPort<Pkt>(module) {}
|
||||
|
||||
protected:
|
||||
MasterPort& operator=(const MasterPort&) = delete;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class SimEventBase {
|
||||
public:
|
||||
typedef std::shared_ptr<SimEventBase> Ptr;
|
||||
|
@ -132,14 +130,14 @@ public:
|
|||
|
||||
virtual void fire() const = 0;
|
||||
|
||||
bool step() {
|
||||
return (0 == --delay_);
|
||||
uint64_t time() const {
|
||||
return time_;
|
||||
}
|
||||
|
||||
protected:
|
||||
SimEventBase(uint64_t delay) : delay_(delay) {}
|
||||
SimEventBase(uint64_t time) : time_(time) {}
|
||||
|
||||
uint64_t delay_;
|
||||
uint64_t time_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -147,26 +145,34 @@ protected:
|
|||
template <typename Pkt>
|
||||
class SimCallEvent : public SimEventBase {
|
||||
public:
|
||||
typedef std::function<void (const Pkt&)> Func;
|
||||
|
||||
template <typename... Args>
|
||||
static Ptr Create(const Func& func, const Pkt& pkt, uint64_t delay) {
|
||||
return std::make_shared<SimCallEvent>(func, pkt, delay);
|
||||
}
|
||||
|
||||
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t delay)
|
||||
: SimEventBase(delay)
|
||||
, func_(func)
|
||||
, pkt_(pkt)
|
||||
{}
|
||||
|
||||
void fire() const override {
|
||||
func_(pkt_);
|
||||
}
|
||||
|
||||
protected:
|
||||
typedef std::function<void (const Pkt&)> Func;
|
||||
|
||||
SimCallEvent(const Func& func, const Pkt& pkt, uint64_t time)
|
||||
: SimEventBase(time)
|
||||
, func_(func)
|
||||
, pkt_(pkt)
|
||||
{}
|
||||
|
||||
void* operator new(size_t /*size*/) {
|
||||
return allocator().allocate();
|
||||
}
|
||||
|
||||
void operator delete(void* ptr) {
|
||||
allocator().deallocate(ptr);
|
||||
}
|
||||
|
||||
protected:
|
||||
Func func_;
|
||||
Pkt pkt_;
|
||||
Pkt pkt_;
|
||||
|
||||
static MemoryPool<SimCallEvent<Pkt>>& allocator() {
|
||||
static MemoryPool<SimCallEvent<Pkt>> instance(64);
|
||||
return instance;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -174,23 +180,32 @@ protected:
|
|||
template <typename Pkt>
|
||||
class SimPortEvent : public SimEventBase {
|
||||
public:
|
||||
static Ptr Create(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
|
||||
return std::make_shared<SimPortEvent>(port, pkt, delay);
|
||||
void fire() const override {
|
||||
const_cast<SimPort<Pkt>*>(port_)->push(pkt_, time_);
|
||||
}
|
||||
|
||||
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay)
|
||||
: SimEventBase(delay)
|
||||
SimPortEvent(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t time)
|
||||
: SimEventBase(time)
|
||||
, port_(port)
|
||||
, pkt_(pkt)
|
||||
{}
|
||||
|
||||
void fire() const override {
|
||||
const_cast<SimPort<Pkt>*>(port_)->push(pkt_);
|
||||
|
||||
void* operator new(size_t /*size*/) {
|
||||
return allocator().allocate();
|
||||
}
|
||||
|
||||
private:
|
||||
void operator delete(void* ptr) {
|
||||
allocator().deallocate(ptr);
|
||||
}
|
||||
|
||||
protected:
|
||||
const SimPort<Pkt>* port_;
|
||||
Pkt pkt_;
|
||||
|
||||
static MemoryPool<SimPortEvent<Pkt>>& allocator() {
|
||||
static MemoryPool<SimPortEvent<Pkt>> instance(64);
|
||||
return instance;
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -203,24 +218,17 @@ public:
|
|||
|
||||
virtual ~SimObjectBase() {}
|
||||
|
||||
template <typename T, typename Pkt>
|
||||
void schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay);
|
||||
|
||||
const std::string& name() const {
|
||||
return name_;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
|
||||
SimObjectBase(const SimContext& ctx, const char* name);
|
||||
protected:
|
||||
|
||||
SimObjectBase(const SimContext& ctx, const char* name);
|
||||
|
||||
private:
|
||||
std::string name_;
|
||||
|
||||
friend class SimPlatform;
|
||||
friend class SimPortBase;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -228,14 +236,16 @@ private:
|
|||
template <typename Impl>
|
||||
class SimObject : public SimObjectBase {
|
||||
public:
|
||||
typedef std::shared_ptr<Impl> Ptr;
|
||||
typedef std::shared_ptr<Impl> Ptr;
|
||||
|
||||
template <typename... Args>
|
||||
static Ptr Create(Args&&... args);
|
||||
|
||||
protected:
|
||||
|
||||
SimObject(const SimContext& ctx, const char* name) : SimObjectBase(ctx, name) {}
|
||||
SimObject(const SimContext& ctx, const char* name)
|
||||
: SimObjectBase(ctx, name)
|
||||
{}
|
||||
|
||||
void step(uint64_t cycle) override {
|
||||
this->impl().step(cycle);
|
||||
|
@ -255,8 +265,8 @@ private:
|
|||
class SimContext {
|
||||
private:
|
||||
SimContext() {}
|
||||
template <typename Impl> template <typename... Args>
|
||||
friend typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args);
|
||||
|
||||
friend class SimPlatform;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -281,25 +291,19 @@ public:
|
|||
instance().clear();
|
||||
}
|
||||
|
||||
void register_object(const SimObjectBase::Ptr& obj) {
|
||||
template <typename Impl, typename... Args>
|
||||
typename SimObject<Impl>::Ptr CreateObject(Args&&... args) {
|
||||
auto obj = std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
|
||||
objects_.push_back(obj);
|
||||
return obj;
|
||||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void schedule(const typename SimCallEvent<Pkt>::Func& callback,
|
||||
void schedule(const typename SimCallEvent<Pkt>::Func& callback,
|
||||
const Pkt& pkt,
|
||||
uint64_t delay) {
|
||||
auto evt = SimCallEvent<Pkt>::Create(callback, pkt, delay);
|
||||
assert(delay != 0);
|
||||
events_.emplace_back(evt);
|
||||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void schedule(const SimPort<Pkt>* port,
|
||||
const Pkt& pkt,
|
||||
uint64_t delay) {
|
||||
auto evt = SimPortEvent<Pkt>::Create(port, pkt, delay);
|
||||
assert(delay != 0);
|
||||
auto evt = std::make_shared<SimCallEvent<Pkt>>(callback, pkt, cycles_ + delay);
|
||||
events_.emplace_back(evt);
|
||||
}
|
||||
|
||||
|
@ -309,7 +313,7 @@ public:
|
|||
auto evt_it_end = events_.end();
|
||||
while (evt_it != evt_it_end) {
|
||||
auto& event = *evt_it;
|
||||
if (event->step()) {
|
||||
if (cycles_ >= event->time()) {
|
||||
event->fire();
|
||||
evt_it = events_.erase(evt_it);
|
||||
} else {
|
||||
|
@ -341,9 +345,19 @@ private:
|
|||
events_.clear();
|
||||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void schedule(const SimPort<Pkt>* port, const Pkt& pkt, uint64_t delay) {
|
||||
assert(delay != 0);
|
||||
auto evt = SimEventBase::Ptr(new SimPortEvent<Pkt>(port, pkt, cycles_ + delay));
|
||||
events_.emplace_back(evt);
|
||||
}
|
||||
|
||||
std::vector<SimObjectBase::Ptr> objects_;
|
||||
std::list<SimEventBase::Ptr> events_;
|
||||
uint64_t cycles_;
|
||||
|
||||
template <typename U> friend class SimPort;
|
||||
friend class SimObjectBase;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -355,22 +369,14 @@ inline SimObjectBase::SimObjectBase(const SimContext&, const char* name)
|
|||
template <typename Impl>
|
||||
template <typename... Args>
|
||||
typename SimObject<Impl>::Ptr SimObject<Impl>::Create(Args&&... args) {
|
||||
auto obj = std::make_shared<Impl>(SimContext{}, std::forward<Args>(args)...);
|
||||
SimPlatform::instance().register_object(obj);
|
||||
return obj;
|
||||
return SimPlatform::instance().CreateObject<Impl>(std::forward<Args>(args)...);
|
||||
}
|
||||
|
||||
template <typename Pkt>
|
||||
void SimPort<Pkt>::send(const Pkt& pkt, uint64_t delay) const {
|
||||
if (peer_) {
|
||||
if (peer_ && !tx_cb_) {
|
||||
reinterpret_cast<const SimPort<Pkt>*>(peer_)->send(pkt, delay);
|
||||
} else {
|
||||
SimPlatform::instance().schedule(this, pkt, delay);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T, typename Pkt>
|
||||
void SimObjectBase::schedule(T *obj, void (T::*entry)(const Pkt&), const Pkt& pkt, uint64_t delay) {
|
||||
auto callback = std::bind(entry, obj, std::placeholders::_1);
|
||||
SimPlatform::instance().schedule(callback, pkt, delay);
|
||||
}
|
|
@ -11,20 +11,20 @@ namespace vortex {
|
|||
|
||||
class ArchDef {
|
||||
private:
|
||||
int num_cores_;
|
||||
int num_warps_;
|
||||
int num_threads_;
|
||||
int wsize_;
|
||||
int vsize_;
|
||||
int num_regs_;
|
||||
int num_csrs_;
|
||||
int num_barriers_;
|
||||
uint16_t num_cores_;
|
||||
uint16_t num_warps_;
|
||||
uint16_t num_threads_;
|
||||
uint16_t wsize_;
|
||||
uint16_t vsize_;
|
||||
uint16_t num_regs_;
|
||||
uint16_t num_csrs_;
|
||||
uint16_t num_barriers_;
|
||||
|
||||
public:
|
||||
ArchDef(const std::string& /*arch*/,
|
||||
int num_cores,
|
||||
int num_warps,
|
||||
int num_threads)
|
||||
uint16_t num_cores,
|
||||
uint16_t num_warps,
|
||||
uint16_t num_threads)
|
||||
: num_cores_(num_cores)
|
||||
, num_warps_(num_warps)
|
||||
, num_threads_(num_threads)
|
||||
|
@ -35,35 +35,35 @@ public:
|
|||
, num_barriers_(NUM_BARRIERS)
|
||||
{}
|
||||
|
||||
int wsize() const {
|
||||
uint16_t wsize() const {
|
||||
return wsize_;
|
||||
}
|
||||
|
||||
int vsize() const {
|
||||
uint16_t vsize() const {
|
||||
return vsize_;
|
||||
}
|
||||
|
||||
int num_regs() const {
|
||||
uint16_t num_regs() const {
|
||||
return num_regs_;
|
||||
}
|
||||
|
||||
int num_csrs() const {
|
||||
uint16_t num_csrs() const {
|
||||
return num_csrs_;
|
||||
}
|
||||
|
||||
int num_barriers() const {
|
||||
uint16_t num_barriers() const {
|
||||
return num_barriers_;
|
||||
}
|
||||
|
||||
int num_threads() const {
|
||||
uint16_t num_threads() const {
|
||||
return num_threads_;
|
||||
}
|
||||
|
||||
int num_warps() const {
|
||||
uint16_t num_warps() const {
|
||||
return num_warps_;
|
||||
}
|
||||
|
||||
int num_cores() const {
|
||||
uint16_t num_cores() const {
|
||||
return num_cores_;
|
||||
}
|
||||
};
|
||||
|
|
|
@ -35,7 +35,7 @@ public:
|
|||
CommandLineArg(l, ht), arg_(x) {}
|
||||
|
||||
int read(int argc, char **argv) {
|
||||
__unused(argc);
|
||||
__unused (argc);
|
||||
std::istringstream iss(argv[1]);
|
||||
iss >> arg_;
|
||||
return 1;
|
||||
|
@ -53,7 +53,7 @@ public:
|
|||
CommandLineArg(l, ht), arg_(x) { arg_ = false; }
|
||||
|
||||
int read(int argc, char **argv) {
|
||||
__unused(argc, argv);
|
||||
__unused (argc, argv);
|
||||
arg_ = true;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -27,7 +27,7 @@ struct params_t {
|
|||
uint32_t tag_select_addr_start;
|
||||
uint32_t tag_select_addr_end;
|
||||
|
||||
params_t(const CacheConfig& config) {
|
||||
params_t(const Cache::Config& config) {
|
||||
uint32_t bank_bits = log2ceil(config.num_banks);
|
||||
uint32_t offset_bits = config.B - config.W;
|
||||
uint32_t log2_bank_size = config.C - bank_bits;
|
||||
|
@ -214,7 +214,7 @@ struct bank_t {
|
|||
std::vector<set_t> sets;
|
||||
MSHR mshr;
|
||||
|
||||
bank_t(const CacheConfig& config,
|
||||
bank_t(const Cache::Config& config,
|
||||
const params_t& params)
|
||||
: sets(params.sets_per_bank, params.blocks_per_set)
|
||||
, mshr(config.mshr_size)
|
||||
|
@ -226,22 +226,30 @@ struct bank_t {
|
|||
class Cache::Impl {
|
||||
private:
|
||||
Cache* const simobject_;
|
||||
CacheConfig config_;
|
||||
Config config_;
|
||||
params_t params_;
|
||||
std::vector<bank_t> banks_;
|
||||
Switch<MemReq, MemRsp>::Ptr mem_switch_;
|
||||
Switch<MemReq, MemRsp>::Ptr bypass_switch_;
|
||||
std::vector<MasterPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SlavePort<MemRsp>> mem_rsp_ports_;
|
||||
std::vector<SimPort<MemReq>> mem_req_ports_;
|
||||
std::vector<SimPort<MemRsp>> mem_rsp_ports_;
|
||||
PerfStats perf_stats_;
|
||||
uint64_t pending_read_reqs_;
|
||||
uint64_t pending_write_reqs_;
|
||||
uint64_t pending_fill_reqs_;
|
||||
uint32_t flush_cycles_;
|
||||
|
||||
public:
|
||||
Impl(Cache* simobject, const CacheConfig& config)
|
||||
Impl(Cache* simobject, const Config& config)
|
||||
: simobject_(simobject)
|
||||
, config_(config)
|
||||
, params_(config)
|
||||
, banks_(config.num_banks, {config, params_})
|
||||
, mem_req_ports_(config.num_banks, simobject)
|
||||
, mem_rsp_ports_(config.num_banks, simobject)
|
||||
, pending_read_reqs_(0)
|
||||
, pending_write_reqs_(0)
|
||||
, pending_fill_reqs_(0)
|
||||
{
|
||||
bypass_switch_ = Switch<MemReq, MemRsp>::Create("bypass_arb", ArbiterType::Priority, 2);
|
||||
bypass_switch_->ReqOut.bind(&simobject->MemReqPort);
|
||||
|
@ -259,13 +267,29 @@ public:
|
|||
mem_req_ports_.at(0).bind(&bypass_switch_->ReqIn.at(0));
|
||||
bypass_switch_->RspOut.at(0).bind(&mem_rsp_ports_.at(0));
|
||||
}
|
||||
|
||||
// calculate tag flush cycles
|
||||
flush_cycles_ = params_.sets_per_bank * params_.blocks_per_set;
|
||||
}
|
||||
|
||||
// Returns a read-only reference to the performance counters held by this Impl.
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
void step(uint64_t cycle) {
|
||||
// wait on flush cycles
|
||||
if (flush_cycles_ != 0) {
|
||||
--flush_cycles_;
|
||||
return;
|
||||
}
|
||||
|
||||
// calculate memory latency
|
||||
perf_stats_.mem_latency += pending_fill_reqs_;
|
||||
|
||||
// handle bypass responses
|
||||
auto& bypass_port = bypass_switch_->RspOut.at(1);
|
||||
if (!bypass_port.empty()) {
|
||||
auto& mem_rsp = bypass_port.top();
|
||||
auto& mem_rsp = bypass_port.front();
|
||||
uint32_t req_id = mem_rsp.tag & ((1 << params_.log2_num_inputs)-1);
|
||||
uint64_t tag = mem_rsp.tag >> params_.log2_num_inputs;
|
||||
MemRsp core_rsp(tag);
|
||||
|
@ -287,7 +311,7 @@ public:
|
|||
for (uint32_t bank_id = 0, n = config_.num_banks; bank_id < n; ++bank_id) {
|
||||
auto& mem_rsp_port = mem_rsp_ports_.at(bank_id);
|
||||
if (!mem_rsp_port.empty()) {
|
||||
auto& mem_rsp = mem_rsp_port.top();
|
||||
auto& mem_rsp = mem_rsp_port.front();
|
||||
this->processMemoryFill(bank_id, mem_rsp.tag);
|
||||
pending_fill_req.at(bank_id) = true;
|
||||
mem_rsp_port.pop();
|
||||
|
@ -300,7 +324,7 @@ public:
|
|||
if (core_req_port.empty())
|
||||
continue;
|
||||
|
||||
auto& core_req = core_req_port.top();
|
||||
auto& core_req = core_req_port.front();
|
||||
|
||||
// check cache bypassing
|
||||
if (core_req.is_io) {
|
||||
|
@ -345,7 +369,7 @@ public:
|
|||
// check MSHR capacity if read or writeback
|
||||
if ((!core_req.write || !config_.write_through)
|
||||
&& bank.mshr.full()) {
|
||||
// stall
|
||||
++perf_stats_.mshr_stalls;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
@ -356,7 +380,7 @@ public:
|
|||
|| pipeline_req.set_id != set_id
|
||||
|| pipeline_req.tag != tag
|
||||
|| pipeline_req.infos[port_id].valid) {
|
||||
// stall
|
||||
++perf_stats_.bank_stalls;
|
||||
continue;
|
||||
}
|
||||
// update pending request infos
|
||||
|
@ -365,8 +389,15 @@ public:
|
|||
// schedule new request
|
||||
pipeline_req = bank_req;
|
||||
}
|
||||
|
||||
if (core_req.write)
|
||||
++perf_stats_.writes;
|
||||
else
|
||||
++perf_stats_.reads;
|
||||
|
||||
// remove request
|
||||
core_req_port.pop();
|
||||
auto time = core_req_port.pop();
|
||||
perf_stats_.pipeline_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
// process active request
|
||||
|
@ -393,6 +424,7 @@ public:
|
|||
auto& block = set.blocks.at(entry.block_id);
|
||||
block.valid = true;
|
||||
block.tag = entry.tag;
|
||||
--pending_fill_reqs_;
|
||||
}
|
||||
|
||||
void processBankRequest(const std::vector<bank_req_t>& pipeline_reqs) {
|
||||
|
@ -438,7 +470,7 @@ public:
|
|||
|
||||
if (hit) {
|
||||
//
|
||||
// MISS handling
|
||||
// Hit handling
|
||||
//
|
||||
if (pipeline_req.write) {
|
||||
// handle write hit
|
||||
|
@ -462,8 +494,13 @@ public:
|
|||
}
|
||||
} else {
|
||||
//
|
||||
// MISS handling
|
||||
//
|
||||
// Miss handling
|
||||
//
|
||||
if (pipeline_req.write)
|
||||
++perf_stats_.write_misses;
|
||||
else
|
||||
++perf_stats_.read_misses;
|
||||
|
||||
if (!found_free_block && !config_.write_through) {
|
||||
// write back dirty block
|
||||
auto& repl_block = set.blocks.at(repl_block_id);
|
||||
|
@ -472,6 +509,7 @@ public:
|
|||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, repl_block.tag);
|
||||
mem_req.write = true;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
++perf_stats_.evictions;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -500,9 +538,10 @@ public:
|
|||
if (pending == -1) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = params_.mem_addr(bank_id, pipeline_req.set_id, pipeline_req.tag);
|
||||
mem_req.write = pipeline_req.write;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = mshr_id;
|
||||
mem_req_ports_.at(bank_id).send(mem_req, 1);
|
||||
++pending_fill_reqs_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -513,7 +552,7 @@ public:
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
Cache::Cache(const SimContext& ctx, const char* name, const CacheConfig& config)
|
||||
Cache::Cache(const SimContext& ctx, const char* name, const Config& config)
|
||||
: SimObject<Cache>(ctx, name)
|
||||
, CoreReqPorts(config.num_inputs, this)
|
||||
, CoreRspPorts(config.num_inputs, this)
|
||||
|
@ -528,4 +567,8 @@ Cache::~Cache() {
|
|||
|
||||
// Steps the cache by delegating to the private implementation object;
// `cycle` is passed through unchanged.
void Cache::step(uint64_t cycle) {
|
||||
impl_->step(cycle);
|
||||
}
|
||||
|
||||
// Returns a read-only reference to the performance counters kept by the
// private implementation object.
const Cache::PerfStats& Cache::perf_stats() const {
|
||||
return impl_->perf_stats();
|
||||
}
|
|
@ -5,33 +5,58 @@
|
|||
|
||||
namespace vortex {
|
||||
|
||||
struct CacheConfig {
|
||||
uint8_t C; // log2 cache size
|
||||
uint8_t B; // log2 block size
|
||||
uint8_t W; // log2 word size
|
||||
uint8_t A; // log2 associativity
|
||||
uint8_t addr_width; // word address bits
|
||||
uint8_t num_banks; // number of banks
|
||||
uint8_t ports_per_bank; // number of ports per bank
|
||||
uint8_t num_inputs; // number of inputs
|
||||
bool write_through; // is write-through
|
||||
bool write_reponse; // enable write response
|
||||
uint16_t victim_size; // victim cache size
|
||||
uint16_t mshr_size; // MSHR buffer size
|
||||
uint8_t latency; // pipeline latency
|
||||
};
|
||||
|
||||
class Cache : public SimObject<Cache> {
|
||||
class Cache : public SimObject<Cache> {
|
||||
public:
|
||||
Cache(const SimContext& ctx, const char* name, const CacheConfig& config);
|
||||
// Construction-time parameters of a Cache (passed to the Cache constructor).
// NOTE: Core builds its caches with positional brace-initialization
// (Cache::Config{ ... }), so the declaration order of these fields is part
// of the interface - do not reorder them.
struct Config {
|
||||
uint8_t C; // log2 cache size
|
||||
uint8_t B; // log2 block size
|
||||
uint8_t W; // log2 word size
|
||||
uint8_t A; // log2 associativity
|
||||
uint8_t addr_width; // word address bits
|
||||
uint8_t num_banks; // number of banks
|
||||
uint8_t ports_per_bank; // number of ports per bank
|
||||
uint8_t num_inputs; // number of inputs
|
||||
bool write_through; // is write-through
|
||||
bool write_reponse; // enable write response
|
||||
uint16_t victim_size; // victim cache size
|
||||
uint16_t mshr_size; // MSHR buffer size
|
||||
uint8_t latency; // pipeline latency
|
||||
};
|
||||
// Performance counters accumulated by the cache implementation.
// All counters start at zero (see the constructor below).
struct PerfStats {
|
||||
uint64_t reads; // core read requests removed from a core request port
|
||||
uint64_t writes; // core write requests removed from a core request port
|
||||
uint64_t read_misses; // read requests that took the miss path
|
||||
uint64_t write_misses; // write requests that took the miss path
|
||||
uint64_t evictions; // replaced blocks written back to memory
|
||||
uint64_t pipeline_stalls; // sum of (current cycle - value returned by the port's pop()) per removed request
|
||||
uint64_t bank_stalls; // core requests stalled by the bank's in-flight pipeline request
|
||||
uint64_t mshr_stalls; // core requests stalled because the bank's MSHR was full
|
||||
uint64_t mem_latency; // outstanding fill requests, summed on every step
|
||||
|
||||
// zero-initialize every counter
PerfStats()
|
||||
: reads(0)
|
||||
, writes(0)
|
||||
, read_misses(0)
|
||||
, write_misses(0)
|
||||
, evictions(0)
|
||||
, pipeline_stalls(0)
|
||||
, bank_stalls(0)
|
||||
, mshr_stalls(0)
|
||||
, mem_latency(0)
|
||||
{}
|
||||
};
|
||||
|
||||
std::vector<SimPort<MemReq>> CoreReqPorts;
|
||||
std::vector<SimPort<MemRsp>> CoreRspPorts;
|
||||
SimPort<MemReq> MemReqPort;
|
||||
SimPort<MemRsp> MemRspPort;
|
||||
|
||||
Cache(const SimContext& ctx, const char* name, const Config& config);
|
||||
~Cache();
|
||||
|
||||
void step(uint64_t cycle);
|
||||
|
||||
std::vector<SlavePort<MemReq>> CoreReqPorts;
|
||||
std::vector<MasterPort<MemRsp>> CoreRspPorts;
|
||||
MasterPort<MemReq> MemReqPort;
|
||||
SlavePort<MemRsp> MemRspPort;
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
|
|
|
@ -3,14 +3,14 @@
|
|||
#include "types.h"
|
||||
|
||||
#ifndef MEM_LATENCY
|
||||
#define MEM_LATENCY 18
|
||||
#define MEM_LATENCY 24
|
||||
#endif
|
||||
|
||||
namespace vortex {
|
||||
|
||||
struct Constants {
|
||||
enum Constants {
|
||||
|
||||
static constexpr uint32_t SMEM_DELAY = 1 + SM_ENABLE;
|
||||
SMEM_BANK_OFFSET = log2ceil(sizeof(Word)) + log2ceil(STACK_SIZE / sizeof(Word)),
|
||||
|
||||
};
|
||||
|
||||
|
|
|
@ -9,16 +9,18 @@
|
|||
#include "decode.h"
|
||||
#include "core.h"
|
||||
#include "debug.h"
|
||||
#include "constants.h"
|
||||
|
||||
using namespace vortex;
|
||||
|
||||
Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
||||
: SimObject(ctx, "Core")
|
||||
, MemRspPort(this)
|
||||
, MemReqPort(this)
|
||||
, id_(id)
|
||||
, arch_(arch)
|
||||
, decoder_(arch)
|
||||
, mmu_(0, arch.wsize(), true)
|
||||
, shared_mem_(4096)
|
||||
, tex_units_(NUM_TEX_UNITS, this)
|
||||
, warps_(arch.num_warps())
|
||||
, barriers_(arch.num_barriers(), 0)
|
||||
|
@ -27,7 +29,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
, ibuffers_(arch.num_warps(), IBUF_SIZE)
|
||||
, scoreboard_(arch_)
|
||||
, exe_units_((int)ExeType::MAX)
|
||||
, icache_(Cache::Create("Icache", CacheConfig{
|
||||
, icache_(Cache::Create("Icache", Cache::Config{
|
||||
log2ceil(ICACHE_SIZE), // C
|
||||
log2ceil(L1_BLOCK_SIZE),// B
|
||||
2, // W
|
||||
|
@ -42,7 +44,7 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
NUM_WARPS, // mshr
|
||||
2, // pipeline latency
|
||||
}))
|
||||
, dcache_(Cache::Create("Dcache", CacheConfig{
|
||||
, dcache_(Cache::Create("Dcache", Cache::Config{
|
||||
log2ceil(DCACHE_SIZE), // C
|
||||
log2ceil(L1_BLOCK_SIZE),// B
|
||||
2, // W
|
||||
|
@ -55,37 +57,41 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
false, // write response
|
||||
0, // victim size
|
||||
DCACHE_MSHR_SIZE, // mshr
|
||||
2, // pipeline latency
|
||||
4, // pipeline latency
|
||||
}))
|
||||
, shared_mem_(SharedMem::Create("sharedmem", SharedMem::Config{
|
||||
arch.num_threads(),
|
||||
arch.num_threads(),
|
||||
Constants::SMEM_BANK_OFFSET,
|
||||
1,
|
||||
false
|
||||
}))
|
||||
, l1_mem_switch_(Switch<MemReq, MemRsp>::Create("l1_arb", ArbiterType::Priority, 2))
|
||||
, dcache_switch_(arch.num_threads())
|
||||
, fetch_stage_("fetch")
|
||||
, decode_stage_("decode")
|
||||
, issue_stage_("issue")
|
||||
, execute_stage_("execute")
|
||||
, commit_stage_("writeback")
|
||||
, fetch_latch_("fetch")
|
||||
, decode_latch_("decode")
|
||||
, pending_icache_(arch_.num_warps())
|
||||
, active_warps_(1)
|
||||
, stalled_warps_(0)
|
||||
, last_schedule_wid_(0)
|
||||
, issued_instrs_(0)
|
||||
, committed_instrs_(0)
|
||||
, csr_tex_unit_(0)
|
||||
, ecall_(false)
|
||||
, ebreak_(false)
|
||||
, stats_insts_(0)
|
||||
, MemRspPort(this)
|
||||
, MemReqPort(this)
|
||||
, perf_mem_pending_reads_(0)
|
||||
{
|
||||
for (int i = 0; i < arch_.num_warps(); ++i) {
|
||||
warps_.at(i) = std::make_shared<Warp>(this, i);
|
||||
}
|
||||
|
||||
// register execute units
|
||||
exe_units_.at((int)ExeType::NOP) = std::make_shared<NopUnit>(this);
|
||||
exe_units_.at((int)ExeType::ALU) = std::make_shared<AluUnit>(this);
|
||||
exe_units_.at((int)ExeType::LSU) = std::make_shared<LsuUnit>(this);
|
||||
exe_units_.at((int)ExeType::CSR) = std::make_shared<CsrUnit>(this);
|
||||
exe_units_.at((int)ExeType::FPU) = std::make_shared<FpuUnit>(this);
|
||||
exe_units_.at((int)ExeType::GPU) = std::make_shared<GpuUnit>(this);
|
||||
exe_units_.at((int)ExeType::NOP) = SimPlatform::instance().CreateObject<NopUnit>(this);
|
||||
exe_units_.at((int)ExeType::ALU) = SimPlatform::instance().CreateObject<AluUnit>(this);
|
||||
exe_units_.at((int)ExeType::LSU) = SimPlatform::instance().CreateObject<LsuUnit>(this);
|
||||
exe_units_.at((int)ExeType::CSR) = SimPlatform::instance().CreateObject<CsrUnit>(this);
|
||||
exe_units_.at((int)ExeType::FPU) = SimPlatform::instance().CreateObject<FpuUnit>(this);
|
||||
exe_units_.at((int)ExeType::GPU) = SimPlatform::instance().CreateObject<GpuUnit>(this);
|
||||
|
||||
// connect l1 switch
|
||||
icache_->MemReqPort.bind(&l1_mem_switch_->ReqIn[0]);
|
||||
|
@ -109,6 +115,18 @@ Core::Core(const SimContext& ctx, const ArchDef &arch, Word id)
|
|||
|
||||
// activate warp0
|
||||
warps_.at(0)->setTmask(0, true);
|
||||
|
||||
// memory perf callbacks
|
||||
MemReqPort.tx_callback([&](const MemReq& req, uint64_t cycle){
|
||||
__unused (cycle);
|
||||
perf_stats_.mem_reads += !req.write;
|
||||
perf_stats_.mem_writes += req.write;
|
||||
perf_mem_pending_reads_ += !req.write;
|
||||
});
|
||||
MemRspPort.tx_callback([&](const MemRsp&, uint64_t cycle){
|
||||
__unused (cycle);
|
||||
--perf_mem_pending_reads_;
|
||||
});
|
||||
}
|
||||
|
||||
Core::~Core() {
|
||||
|
@ -128,23 +146,26 @@ void Core::attach_ram(RAM* ram) {
|
|||
void Core::step(uint64_t cycle) {
|
||||
this->commit(cycle);
|
||||
this->execute(cycle);
|
||||
this->issue(cycle);
|
||||
this->decode(cycle);
|
||||
this->fetch(cycle);
|
||||
this->schedule(cycle);
|
||||
|
||||
// update perf counter
|
||||
perf_stats_.mem_latency += perf_mem_pending_reads_;
|
||||
|
||||
DPN(2, std::flush);
|
||||
}
|
||||
|
||||
void Core::warp_scheduler(uint64_t cycle) {
|
||||
void Core::schedule(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
bool foundSchedule = false;
|
||||
int scheduled_warp = last_schedule_wid_;
|
||||
|
||||
// round robin scheduling
|
||||
for (size_t wid = 0; wid < warps_.size(); ++wid) {
|
||||
scheduled_warp = (scheduled_warp + 1) % warps_.size();
|
||||
bool warp_active = warps_.at(scheduled_warp)->active();
|
||||
for (size_t wid = 0, nw = arch_.num_warps(); wid < nw; ++wid) {
|
||||
scheduled_warp = (scheduled_warp + 1) % nw;
|
||||
bool warp_active = active_warps_.test(scheduled_warp);
|
||||
bool warp_stalled = stalled_warps_.test(scheduled_warp);
|
||||
if (warp_active && !warp_stalled) {
|
||||
last_schedule_wid_ = scheduled_warp;
|
||||
|
@ -159,85 +180,91 @@ void Core::warp_scheduler(uint64_t cycle) {
|
|||
// suspend warp until decode
|
||||
stalled_warps_.set(scheduled_warp);
|
||||
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
stats_insts_ += warp->getActiveThreads();
|
||||
|
||||
auto trace = new pipeline_trace_t((issued_instrs_++ * arch_.num_cores()) + id_, arch_);
|
||||
auto& warp = warps_.at(scheduled_warp);
|
||||
|
||||
uint64_t uuid = (issued_instrs_++ * arch_.num_cores()) + id_;
|
||||
|
||||
auto trace = new pipeline_trace_t(uuid, arch_);
|
||||
|
||||
warp->eval(trace);
|
||||
|
||||
DT(3, cycle, "pipeline-schedule: " << *trace);
|
||||
|
||||
// advance to fetch stage
|
||||
fetch_stage_.push(trace);
|
||||
fetch_latch_.push(trace);
|
||||
}
|
||||
|
||||
void Core::fetch(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// handle icache response
|
||||
auto& icache_rsp_port = icache_->CoreRspPorts.at(0);
|
||||
if (!icache_rsp_port.empty()){
|
||||
auto& mem_rsp = icache_rsp_port.top();
|
||||
auto& mem_rsp = icache_rsp_port.front();
|
||||
auto trace = pending_icache_.at(mem_rsp.tag);
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->icache_latency);
|
||||
trace->icache_latency = latency;
|
||||
decode_stage_.push(trace);
|
||||
decode_latch_.push(trace);
|
||||
DT(3, cycle, "icache-rsp: addr=" << std::hex << trace->PC << ", tag=" << mem_rsp.tag << ", " << *trace);
|
||||
pending_icache_.release(mem_rsp.tag);
|
||||
icache_rsp_port.pop();
|
||||
}
|
||||
|
||||
// send icache request
|
||||
if (!fetch_stage_.empty()) {
|
||||
auto trace = fetch_stage_.top();
|
||||
trace->icache_latency = SimPlatform::instance().cycles();
|
||||
if (!fetch_latch_.empty()) {
|
||||
auto trace = fetch_latch_.front();
|
||||
MemReq mem_req;
|
||||
mem_req.addr = trace->PC;
|
||||
mem_req.write = false;
|
||||
mem_req.tag = pending_icache_.allocate(trace);
|
||||
icache_->CoreReqPorts.at(0).send(mem_req, 1);
|
||||
DT(3, cycle, "icache-req: addr=" << std::hex << mem_req.addr << ", tag=" << mem_req.tag << ", " << *trace);
|
||||
fetch_stage_.pop();
|
||||
}
|
||||
|
||||
// schedule next warp
|
||||
this->warp_scheduler(cycle);
|
||||
fetch_latch_.pop();
|
||||
}
|
||||
}
|
||||
|
||||
void Core::decode(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
if (decode_stage_.empty())
|
||||
if (decode_latch_.empty())
|
||||
return;
|
||||
|
||||
auto trace = decode_stage_.top();
|
||||
auto trace = decode_latch_.front();
|
||||
|
||||
// check ibuffer capacity
|
||||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (ibuffer.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << *trace);
|
||||
}
|
||||
++perf_stats_.ibuf_stalls;
|
||||
return;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
// release warp
|
||||
if (!trace->fetch_stall) {
|
||||
stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
|
||||
// update perf counters
|
||||
uint32_t active_threads = trace->tmask.count();
|
||||
if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::LOAD)
|
||||
perf_stats_.loads += active_threads;
|
||||
if (trace->exe_type == ExeType::LSU && trace->lsu.type == LsuType::STORE)
|
||||
perf_stats_.stores += active_threads;
|
||||
if (trace->exe_type == ExeType::ALU && trace->alu.type == AluType::BRANCH)
|
||||
perf_stats_.branches += active_threads;
|
||||
|
||||
DT(3, cycle, "pipeline-decode: " << *trace);
|
||||
|
||||
// advance to issue stage
|
||||
issue_stage_.push(trace);
|
||||
decode_stage_.pop();
|
||||
|
||||
// insert to ibuffer
|
||||
ibuffer.push(trace);
|
||||
|
||||
decode_latch_.pop();
|
||||
}
|
||||
|
||||
void Core::issue(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
if (!issue_stage_.empty()) {
|
||||
// insert to ibuffer
|
||||
auto trace = issue_stage_.top();
|
||||
auto& ibuffer = ibuffers_.at(trace->wid);
|
||||
if (!trace->check_stalled(ibuffer.full())) {
|
||||
DT(3, cycle, "*** ibuffer-stall: " << *trace);
|
||||
}
|
||||
if (!ibuffer.full()) {
|
||||
ibuffer.push(trace);
|
||||
issue_stage_.pop();
|
||||
}
|
||||
}
|
||||
void Core::execute(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// issue ibuffer instructions
|
||||
for (auto& ibuffer : ibuffers_) {
|
||||
|
@ -247,180 +274,102 @@ void Core::issue(uint64_t cycle) {
|
|||
auto trace = ibuffer.top();
|
||||
|
||||
// check scoreboard
|
||||
if (!trace->check_stalled(scoreboard_.in_use(trace))) {
|
||||
DTH(3, cycle, "*** scoreboard-stall: dependents={");
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
|
||||
auto& use = uses.at(i);
|
||||
__unused(use);
|
||||
if (i) DTN(3, ", ");
|
||||
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
||||
if (scoreboard_.in_use(trace)) {
|
||||
if (!trace->suspend()) {
|
||||
DTH(3, cycle, "*** scoreboard-stall: dependents={");
|
||||
auto uses = scoreboard_.get_uses(trace);
|
||||
for (uint32_t i = 0, n = uses.size(); i < n; ++i) {
|
||||
auto& use = uses.at(i);
|
||||
__unused (use);
|
||||
if (i) DTN(3, ", ");
|
||||
DTN(3, use.type << use.reg << "(#" << use.owner << ")");
|
||||
}
|
||||
DTN(3, "}, " << *trace << std::endl);
|
||||
}
|
||||
DTN(3, "}, " << *trace << std::endl);
|
||||
}
|
||||
if (scoreboard_.in_use(trace))
|
||||
++perf_stats_.scrb_stalls;
|
||||
continue;
|
||||
|
||||
DT(3, cycle, "pipeline-issue: " << *trace);
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.reserve(trace);
|
||||
|
||||
// advance to execute stage
|
||||
execute_stage_.push(trace);
|
||||
DT(3, cycle, "pipeline-issue: " << *trace);
|
||||
|
||||
// push to execute units
|
||||
auto& exe_unit = exe_units_.at((int)trace->exe_type);
|
||||
exe_unit->Input.send(trace, 1);
|
||||
|
||||
ibuffer.pop();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void Core::execute(uint64_t cycle) {
|
||||
// process stage inputs
|
||||
if (!execute_stage_.empty()) {
|
||||
auto trace = execute_stage_.top();
|
||||
auto& exe_unit = exe_units_.at((int)trace->exe_type);
|
||||
exe_unit->push(trace);
|
||||
DT(3, cycle, "pipeline-execute: " << *trace);
|
||||
execute_stage_.pop();
|
||||
}
|
||||
|
||||
// advance execute units
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
exe_unit->step(cycle);
|
||||
}
|
||||
|
||||
// commit completed instructions
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
if (!exe_unit->empty()) {
|
||||
auto trace = exe_unit->top();
|
||||
if (trace->fetch_stall) {
|
||||
stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
// advance to commit stage
|
||||
commit_stage_.push(trace);
|
||||
exe_unit->pop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void Core::commit(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
if (commit_stage_.empty())
|
||||
return;
|
||||
// commit completed instructions
|
||||
bool wb = false;
|
||||
for (auto& exe_unit : exe_units_) {
|
||||
if (!exe_unit->Output.empty()) {
|
||||
auto trace = exe_unit->Output.front();
|
||||
|
||||
auto trace = commit_stage_.top();
|
||||
// allow only one commit that updates registers
|
||||
if (trace->wb && wb)
|
||||
continue;
|
||||
wb |= trace->wb;
|
||||
|
||||
DT(3, cycle, "pipeline-commit: " << *trace);
|
||||
// advance to commit stage
|
||||
DT(3, cycle, "pipeline-commit: " << *trace);
|
||||
|
||||
// update scoreboard
|
||||
scoreboard_.release(trace);
|
||||
// update scoreboard
|
||||
scoreboard_.release(trace);
|
||||
|
||||
assert(committed_instrs_ <= issued_instrs_);
|
||||
++committed_instrs_;
|
||||
assert(committed_instrs_ <= issued_instrs_);
|
||||
++committed_instrs_;
|
||||
|
||||
commit_stage_.pop();
|
||||
perf_stats_.instrs += trace->tmask.count();
|
||||
|
||||
// delete the trace
|
||||
delete trace;
|
||||
}
|
||||
// delete the trace
|
||||
delete trace;
|
||||
|
||||
bool Core::running() const {
|
||||
bool is_running = (committed_instrs_ != issued_instrs_);
|
||||
return is_running;
|
||||
}
|
||||
|
||||
Word Core::get_csr(Addr addr, int tid, int wid) {
|
||||
if (addr == CSR_FFLAGS) {
|
||||
return fcsrs_.at(wid) & 0x1F;
|
||||
} else if (addr == CSR_FRM) {
|
||||
return (fcsrs_.at(wid) >> 5);
|
||||
} else if (addr == CSR_FCSR) {
|
||||
return fcsrs_.at(wid);
|
||||
} else if (addr == CSR_WTID) {
|
||||
// Warp threadID
|
||||
return tid;
|
||||
} else if (addr == CSR_LTID) {
|
||||
// Core threadID
|
||||
return tid + (wid * arch_.num_threads());
|
||||
} else if (addr == CSR_GTID) {
|
||||
// Processor threadID
|
||||
return tid + (wid * arch_.num_threads()) +
|
||||
(arch_.num_threads() * arch_.num_warps() * id_);
|
||||
} else if (addr == CSR_LWID) {
|
||||
// Core warpID
|
||||
return wid;
|
||||
} else if (addr == CSR_GWID) {
|
||||
// Processor warpID
|
||||
return wid + (arch_.num_warps() * id_);
|
||||
} else if (addr == CSR_GCID) {
|
||||
// Processor coreID
|
||||
return id_;
|
||||
} else if (addr == CSR_TMASK) {
|
||||
// Warp thread mask
|
||||
return warps_.at(wid)->getTmask();
|
||||
} else if (addr == CSR_NT) {
|
||||
// Number of threads per warp
|
||||
return arch_.num_threads();
|
||||
} else if (addr == CSR_NW) {
|
||||
// Number of warps per core
|
||||
return arch_.num_warps();
|
||||
} else if (addr == CSR_NC) {
|
||||
// Number of cores
|
||||
return arch_.num_cores();
|
||||
} else if (addr == CSR_MINSTRET) {
|
||||
// NumInsts
|
||||
return stats_insts_;
|
||||
} else if (addr == CSR_MINSTRET_H) {
|
||||
// NumInsts
|
||||
return (Word)(stats_insts_ >> 32);
|
||||
} else if (addr == CSR_MCYCLE) {
|
||||
// NumCycles
|
||||
return (Word)SimPlatform::instance().cycles();
|
||||
} else if (addr == CSR_MCYCLE_H) {
|
||||
// NumCycles
|
||||
return (Word)(SimPlatform::instance().cycles() >> 32);
|
||||
} else {
|
||||
if (addr >= CSR_TEX(0,0)
|
||||
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
|
||||
uint32_t unit = CSR_TEX_UNIT(addr);
|
||||
uint32_t state = CSR_TEX_STATE(addr);
|
||||
return tex_units_.at(unit).get_state(state);
|
||||
exe_unit->Output.pop();
|
||||
}
|
||||
return csrs_.at(addr);
|
||||
}
|
||||
}
|
||||
|
||||
void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
|
||||
if (addr == CSR_FFLAGS) {
|
||||
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
|
||||
} else if (addr == CSR_FRM) {
|
||||
fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | (value << 5);
|
||||
} else if (addr == CSR_FCSR) {
|
||||
fcsrs_.at(wid) = value & 0xff;
|
||||
} else {
|
||||
if (addr >= CSR_TEX(0,0)
|
||||
&& addr < CSR_TEX(NUM_TEX_UNITS,0)) {
|
||||
uint32_t unit = CSR_TEX_UNIT(addr);
|
||||
uint32_t state = CSR_TEX_STATE(addr);
|
||||
tex_units_.at(unit).set_state(state, value);
|
||||
return;
|
||||
}
|
||||
csrs_.at(addr) = value;
|
||||
// Activates additional warps so that they start executing at `nextPC`.
//
// num_warps - requested number of active warps; clamped to the number of
//             warps the architecture provides.
// nextPC    - program counter assigned to every newly activated warp.
//
// Returns a mask with bit 0 set plus one bit for each warp activated here
// (warps 1 .. active_warps-1). Warp 0 itself is not modified.
WarpMask Core::wspawn(int num_warps, int nextPC) {
  WarpMask ret(1);
  int active_warps = std::min<int>(num_warps, arch_.num_warps());
  DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << nextPC);
  for (int i = 1; i < active_warps; ++i) {
    // bind by reference: no need to copy the container element per iteration
    auto& warp = warps_.at(i);
    warp->setPC(nextPC);
    // enable thread 0 of the spawned warp
    warp->setTmask(0, true);
    ret.set(i);
  }
  // return the local directly; wrapping it in std::move would block copy elision
  return ret;
}
|
||||
|
||||
void Core::barrier(int bar_id, int count, int warp_id) {
|
||||
WarpMask Core::barrier(int bar_id, int count, int warp_id) {
|
||||
WarpMask ret(0);
|
||||
auto& barrier = barriers_.at(bar_id);
|
||||
barrier.set(warp_id);
|
||||
if (barrier.count() < (size_t)count)
|
||||
return;
|
||||
if (barrier.count() < (size_t)count) {
|
||||
warps_.at(warp_id)->suspend();
|
||||
DP(3, "*** Suspend warp #" << warp_id << " at barrier #" << bar_id);
|
||||
return std::move(ret);
|
||||
}
|
||||
for (int i = 0; i < arch_.num_warps(); ++i) {
|
||||
if (barrier.test(i)) {
|
||||
DP(3, "*** Resume warp #" << i << " at barrier #" << bar_id);
|
||||
warps_.at(i)->activate();
|
||||
ret.set(i);
|
||||
}
|
||||
}
|
||||
barrier.reset();
|
||||
return std::move(ret);
|
||||
}
|
||||
|
||||
Word Core::icache_read(Addr addr, Size size) {
|
||||
|
@ -430,35 +379,21 @@ Word Core::icache_read(Addr addr, Size size) {
|
|||
}
|
||||
|
||||
Word Core::dcache_read(Addr addr, Size size) {
|
||||
Word data = 0;
|
||||
if (SM_ENABLE) {
|
||||
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
|
||||
&& ((addr + 3) < SMEM_BASE_ADDR)) {
|
||||
shared_mem_.read(&data, addr & (SMEM_SIZE-1), size);
|
||||
return data;
|
||||
}
|
||||
}
|
||||
Word data;
|
||||
mmu_.read(&data, addr, size, 0);
|
||||
return data;
|
||||
}
|
||||
|
||||
void Core::dcache_write(Addr addr, Word data, Size size) {
|
||||
if (SM_ENABLE) {
|
||||
if ((addr >= (SMEM_BASE_ADDR - SMEM_SIZE))
|
||||
&& ((addr + 3) < SMEM_BASE_ADDR)) {
|
||||
shared_mem_.write(&data, addr & (SMEM_SIZE-1), size);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (addr >= IO_COUT_ADDR
|
||||
&& addr <= (IO_COUT_ADDR + IO_COUT_SIZE - 1)) {
|
||||
this->writeToStdOut(addr, data);
|
||||
return;
|
||||
} else {
|
||||
mmu_.write(&data, addr, size, 0);
|
||||
}
|
||||
mmu_.write(&data, addr, size, 0);
|
||||
}
|
||||
|
||||
Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<uint64_t>* mem_addrs) {
|
||||
Word Core::tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<mem_addr_size_t>* mem_addrs) {
|
||||
return tex_units_.at(unit).read(u, v, lod, mem_addrs);
|
||||
}
|
||||
|
||||
|
@ -473,6 +408,228 @@ void Core::writeToStdOut(Addr addr, Word data) {
|
|||
}
|
||||
}
|
||||
|
||||
Word Core::get_csr(Addr addr, int tid, int wid) {
|
||||
switch (addr) {
|
||||
case CSR_SATP:
|
||||
case CSR_PMPCFG0:
|
||||
case CSR_PMPADDR0:
|
||||
case CSR_MSTATUS:
|
||||
case CSR_MISA:
|
||||
case CSR_MEDELEG:
|
||||
case CSR_MIDELEG:
|
||||
case CSR_MIE:
|
||||
case CSR_MTVEC:
|
||||
case CSR_MEPC:
|
||||
return 0;
|
||||
|
||||
case CSR_FFLAGS:
|
||||
return fcsrs_.at(wid) & 0x1F;
|
||||
case CSR_FRM:
|
||||
return (fcsrs_.at(wid) >> 5);
|
||||
case CSR_FCSR:
|
||||
return fcsrs_.at(wid);
|
||||
case CSR_WTID:
|
||||
// Warp threadID
|
||||
return tid;
|
||||
case CSR_LTID:
|
||||
// Core threadID
|
||||
return tid + (wid * arch_.num_threads());
|
||||
case CSR_GTID:
|
||||
// Processor threadID
|
||||
return tid + (wid * arch_.num_threads()) +
|
||||
(arch_.num_threads() * arch_.num_warps() * id_);
|
||||
case CSR_LWID:
|
||||
// Core warpID
|
||||
return wid;
|
||||
case CSR_GWID:
|
||||
// Processor warpID
|
||||
return wid + (arch_.num_warps() * id_);
|
||||
case CSR_GCID:
|
||||
// Processor coreID
|
||||
return id_;
|
||||
case CSR_TMASK:
|
||||
// Warp thread mask
|
||||
return warps_.at(wid)->getTmask();
|
||||
case CSR_NT:
|
||||
// Number of threads per warp
|
||||
return arch_.num_threads();
|
||||
case CSR_NW:
|
||||
// Number of warps per core
|
||||
return arch_.num_warps();
|
||||
case CSR_NC:
|
||||
// Number of cores
|
||||
return arch_.num_cores();
|
||||
case CSR_MINSTRET:
|
||||
// NumInsts
|
||||
return perf_stats_.instrs & 0xffffffff;
|
||||
case CSR_MINSTRET_H:
|
||||
// NumInsts
|
||||
return (Word)(perf_stats_.instrs >> 32);
|
||||
case CSR_MCYCLE:
|
||||
// NumCycles
|
||||
return (Word)SimPlatform::instance().cycles();
|
||||
case CSR_MCYCLE_H:
|
||||
// NumCycles
|
||||
return (Word)(SimPlatform::instance().cycles() >> 32);
|
||||
case CSR_MPM_IBUF_ST:
|
||||
return perf_stats_.ibuf_stalls & 0xffffffff;
|
||||
case CSR_MPM_IBUF_ST_H:
|
||||
return perf_stats_.ibuf_stalls >> 32;
|
||||
case CSR_MPM_SCRB_ST:
|
||||
return perf_stats_.scrb_stalls & 0xffffffff;
|
||||
case CSR_MPM_SCRB_ST_H:
|
||||
return perf_stats_.scrb_stalls >> 32;
|
||||
case CSR_MPM_ALU_ST:
|
||||
return perf_stats_.alu_stalls & 0xffffffff;
|
||||
case CSR_MPM_ALU_ST_H:
|
||||
return perf_stats_.alu_stalls >> 32;
|
||||
case CSR_MPM_LSU_ST:
|
||||
return perf_stats_.lsu_stalls & 0xffffffff;
|
||||
case CSR_MPM_LSU_ST_H:
|
||||
return perf_stats_.lsu_stalls >> 32;
|
||||
case CSR_MPM_CSR_ST:
|
||||
return perf_stats_.csr_stalls & 0xffffffff;
|
||||
case CSR_MPM_CSR_ST_H:
|
||||
return perf_stats_.csr_stalls >> 32;
|
||||
case CSR_MPM_FPU_ST:
|
||||
return perf_stats_.fpu_stalls & 0xffffffff;
|
||||
case CSR_MPM_FPU_ST_H:
|
||||
return perf_stats_.fpu_stalls >> 32;
|
||||
case CSR_MPM_GPU_ST:
|
||||
return perf_stats_.gpu_stalls & 0xffffffff;
|
||||
case CSR_MPM_GPU_ST_H:
|
||||
return perf_stats_.gpu_stalls >> 32;
|
||||
|
||||
case CSR_MPM_LOADS:
|
||||
return perf_stats_.loads & 0xffffffff;
|
||||
case CSR_MPM_LOADS_H:
|
||||
return perf_stats_.loads >> 32;
|
||||
case CSR_MPM_STORES:
|
||||
return perf_stats_.stores & 0xffffffff;
|
||||
case CSR_MPM_STORES_H:
|
||||
return perf_stats_.stores >> 32;
|
||||
case CSR_MPM_BRANCHES:
|
||||
return perf_stats_.branches & 0xffffffff;
|
||||
case CSR_MPM_BRANCHES_H:
|
||||
return perf_stats_.branches >> 32;
|
||||
|
||||
case CSR_MPM_ICACHE_READS:
|
||||
return icache_->perf_stats().reads & 0xffffffff;
|
||||
case CSR_MPM_ICACHE_READS_H:
|
||||
return icache_->perf_stats().reads >> 32;
|
||||
case CSR_MPM_ICACHE_MISS_R:
|
||||
return icache_->perf_stats().read_misses & 0xffffffff;
|
||||
case CSR_MPM_ICACHE_MISS_R_H:
|
||||
return icache_->perf_stats().read_misses >> 32;
|
||||
|
||||
case CSR_MPM_DCACHE_READS:
|
||||
return dcache_->perf_stats().reads & 0xffffffff;
|
||||
case CSR_MPM_DCACHE_READS_H:
|
||||
return dcache_->perf_stats().reads >> 32;
|
||||
case CSR_MPM_DCACHE_WRITES:
|
||||
return dcache_->perf_stats().writes & 0xffffffff;
|
||||
case CSR_MPM_DCACHE_WRITES_H:
|
||||
return dcache_->perf_stats().writes >> 32;
|
||||
case CSR_MPM_DCACHE_MISS_R:
|
||||
return dcache_->perf_stats().read_misses & 0xffffffff;
|
||||
case CSR_MPM_DCACHE_MISS_R_H:
|
||||
return dcache_->perf_stats().read_misses >> 32;
|
||||
case CSR_MPM_DCACHE_MISS_W:
|
||||
return dcache_->perf_stats().write_misses & 0xffffffff;
|
||||
case CSR_MPM_DCACHE_MISS_W_H:
|
||||
return dcache_->perf_stats().write_misses >> 32;
|
||||
case CSR_MPM_DCACHE_BANK_ST:
|
||||
return dcache_->perf_stats().bank_stalls & 0xffffffff;
|
||||
case CSR_MPM_DCACHE_BANK_ST_H:
|
||||
return dcache_->perf_stats().bank_stalls >> 32;
|
||||
case CSR_MPM_DCACHE_MSHR_ST:
|
||||
return dcache_->perf_stats().mshr_stalls & 0xffffffff;
|
||||
case CSR_MPM_DCACHE_MSHR_ST_H:
|
||||
return dcache_->perf_stats().mshr_stalls >> 32;
|
||||
|
||||
case CSR_MPM_SMEM_READS:
|
||||
return shared_mem_->perf_stats().reads & 0xffffffff;
|
||||
case CSR_MPM_SMEM_READS_H:
|
||||
return shared_mem_->perf_stats().reads >> 32;
|
||||
case CSR_MPM_SMEM_WRITES:
|
||||
return shared_mem_->perf_stats().writes & 0xffffffff;
|
||||
case CSR_MPM_SMEM_WRITES_H:
|
||||
return shared_mem_->perf_stats().writes >> 32;
|
||||
case CSR_MPM_SMEM_BANK_ST:
|
||||
return shared_mem_->perf_stats().bank_stalls & 0xffffffff;
|
||||
case CSR_MPM_SMEM_BANK_ST_H:
|
||||
return shared_mem_->perf_stats().bank_stalls >> 32;
|
||||
|
||||
case CSR_MPM_MEM_READS:
|
||||
return perf_stats_.mem_reads & 0xffffffff;
|
||||
case CSR_MPM_MEM_READS_H:
|
||||
return perf_stats_.mem_reads >> 32;
|
||||
case CSR_MPM_MEM_WRITES:
|
||||
return perf_stats_.mem_writes & 0xffffffff;
|
||||
case CSR_MPM_MEM_WRITES_H:
|
||||
return perf_stats_.mem_writes >> 32;
|
||||
case CSR_MPM_MEM_LAT:
|
||||
return perf_stats_.mem_latency & 0xffffffff;
|
||||
case CSR_MPM_MEM_LAT_H:
|
||||
return perf_stats_.mem_latency >> 32;
|
||||
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
case CSR_MPM_TEX_READS:
|
||||
return perf_stats_.tex_reads & 0xffffffff;
|
||||
case CSR_MPM_TEX_READS_H:
|
||||
return perf_stats_.tex_reads >> 32;
|
||||
case CSR_MPM_TEX_LAT:
|
||||
return perf_stats_.tex_latency & 0xffffffff;
|
||||
case CSR_MPM_TEX_LAT_H:
|
||||
return perf_stats_.tex_latency >> 32;
|
||||
#endif
|
||||
default:
|
||||
if ((addr >= CSR_MPM_BASE && addr < (CSR_MPM_BASE + 32))
|
||||
|| (addr >= CSR_MPM_BASE_H && addr < (CSR_MPM_BASE_H + 32))) {
|
||||
// user-defined MPM CSRs
|
||||
} else
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
if (addr == CSR_TEX_UNIT) {
|
||||
return csr_tex_unit_;
|
||||
} else
|
||||
if (addr >= CSR_TEX_STATE_BEGIN
|
||||
&& addr < CSR_TEX_STATE_END) {
|
||||
uint32_t state = CSR_TEX_STATE(addr);
|
||||
return tex_units_.at(csr_tex_unit_).get_state(state);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
std::cout << std::hex << "Error: invalid CSR read addr=0x" << addr << std::endl;
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Writes `value` to the CSR at `addr` on behalf of warp `wid` (the thread id
// is unused).
//
// - CSR_FFLAGS replaces bits [4:0] of the warp's fcsr.
// - CSR_FRM replaces bits [7:5] of the warp's fcsr; only the low 3 bits of
//   `value` are used.
// - CSR_FCSR replaces the whole 8-bit fcsr.
// - With EXT_TEX_ENABLE, CSR_TEX_UNIT selects the texture unit that later
//   texture-state CSR accesses target, and addresses in
//   [CSR_TEX_STATE_BEGIN, CSR_TEX_STATE_END) update that unit's state.
// - Any other address is stored in csrs_.
void Core::set_csr(Addr addr, Word value, int /*tid*/, int wid) {
  if (addr == CSR_FFLAGS) {
    fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0x1F) | (value & 0x1F);
  } else if (addr == CSR_FRM) {
    // Mask to the 3-bit rounding-mode field before shifting so that stray
    // upper bits of `value` cannot leak above bit 7 of fcsr.
    fcsrs_.at(wid) = (fcsrs_.at(wid) & ~0xE0) | ((value & 0x7) << 5);
  } else if (addr == CSR_FCSR) {
    fcsrs_.at(wid) = value & 0xff;
  } else
#ifdef EXT_TEX_ENABLE
  if (addr == CSR_TEX_UNIT) {
    csr_tex_unit_ = value;
  } else
  if (addr >= CSR_TEX_STATE_BEGIN
   && addr < CSR_TEX_STATE_END) {
    uint32_t state = CSR_TEX_STATE(addr);
    tex_units_.at(csr_tex_unit_).set_state(state, value);
  } else
#endif
  {
    csrs_.at(addr) = value;
  }
}
|
||||
|
||||
// Latches the ECALL flag; check_exit() reports true once it is set.
void Core::trigger_ecall() {
  ecall_ = true;
}
|
||||
|
@ -483,4 +640,9 @@ void Core::trigger_ebreak() {
|
|||
|
||||
bool Core::check_exit() const {
|
||||
return ebreak_ || ecall_;
|
||||
}
|
||||
|
||||
bool Core::running() const {
|
||||
bool is_running = (committed_instrs_ != issued_instrs_);
|
||||
return is_running;
|
||||
}
|
|
@ -17,6 +17,7 @@
|
|||
#include "warp.h"
|
||||
#include "pipeline.h"
|
||||
#include "cache.h"
|
||||
#include "sharedmem.h"
|
||||
#include "ibuffer.h"
|
||||
#include "scoreboard.h"
|
||||
#include "exeunit.h"
|
||||
|
@ -26,6 +27,47 @@ namespace vortex {
|
|||
|
||||
class Core : public SimObject<Core> {
|
||||
public:
|
||||
// Per-core performance counters. Every counter starts at zero.
struct PerfStats {
  // retired instruction count
  uint64_t instrs = 0;

  // stall counters
  uint64_t ibuf_stalls = 0;
  uint64_t scrb_stalls = 0;
  uint64_t alu_stalls = 0;
  uint64_t lsu_stalls = 0;
  uint64_t csr_stalls = 0;
  uint64_t fpu_stalls = 0;
  uint64_t gpu_stalls = 0;

  // instruction mix
  uint64_t loads = 0;
  uint64_t stores = 0;
  uint64_t branches = 0;

  // memory traffic
  uint64_t mem_reads = 0;
  uint64_t mem_writes = 0;
  uint64_t mem_latency = 0;

  // texture traffic
  uint64_t tex_reads = 0;
  uint64_t tex_latency = 0;

  PerfStats() = default;
};
|
||||
|
||||
SimPort<MemRsp> MemRspPort;
|
||||
SimPort<MemReq> MemReqPort;
|
||||
|
||||
Core(const SimContext& ctx, const ArchDef &arch, Word id);
|
||||
~Core();
|
||||
|
||||
|
@ -51,8 +93,8 @@ public:
|
|||
return arch_;
|
||||
}
|
||||
|
||||
unsigned long stats_insts() const {
|
||||
return stats_insts_;
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
Word getIRegValue(int reg) const {
|
||||
|
@ -63,7 +105,9 @@ public:
|
|||
|
||||
void set_csr(Addr addr, Word value, int tid, int wid);
|
||||
|
||||
void barrier(int bar_id, int count, int warp_id);
|
||||
WarpMask wspawn(int num_warps, int nextPC);
|
||||
|
||||
WarpMask barrier(int bar_id, int count, int warp_id);
|
||||
|
||||
Word icache_read(Addr, Size);
|
||||
|
||||
|
@ -71,7 +115,7 @@ public:
|
|||
|
||||
void dcache_write(Addr, Word, Size);
|
||||
|
||||
// Samples texture unit `unit` at coordinates (u, v) with level-of-detail
// `lod`. Parameter names follow the order used by the out-of-line definition
// (u, v, lod); `mem_addrs` is forwarded to the texture unit.
Word tex_read(uint32_t unit, Word u, Word v, Word lod, std::vector<mem_addr_size_t>* mem_addrs);
|
||||
|
||||
void trigger_ecall();
|
||||
|
||||
|
@ -81,21 +125,18 @@ public:
|
|||
|
||||
private:
|
||||
|
||||
void schedule(uint64_t cycle);
|
||||
void fetch(uint64_t cycle);
|
||||
void decode(uint64_t cycle);
|
||||
void issue(uint64_t cycle);
|
||||
void execute(uint64_t cycle);
|
||||
void commit(uint64_t cycle);
|
||||
|
||||
void warp_scheduler(uint64_t cycle);
|
||||
|
||||
|
||||
void writeToStdOut(Addr addr, Word data);
|
||||
|
||||
Word id_;
|
||||
const ArchDef arch_;
|
||||
const Decoder decoder_;
|
||||
MemoryUnit mmu_;
|
||||
RAM shared_mem_;
|
||||
std::vector<TexUnit> tex_units_;
|
||||
|
||||
std::vector<std::shared_ptr<Warp>> warps_;
|
||||
|
@ -107,33 +148,33 @@ private:
|
|||
std::vector<ExeUnit::Ptr> exe_units_;
|
||||
Cache::Ptr icache_;
|
||||
Cache::Ptr dcache_;
|
||||
SharedMem::Ptr shared_mem_;
|
||||
Switch<MemReq, MemRsp>::Ptr l1_mem_switch_;
|
||||
std::vector<Switch<MemReq, MemRsp>::Ptr> dcache_switch_;
|
||||
|
||||
PipelineStage fetch_stage_;
|
||||
PipelineStage decode_stage_;
|
||||
PipelineStage issue_stage_;
|
||||
PipelineStage execute_stage_;
|
||||
PipelineStage commit_stage_;
|
||||
PipelineLatch fetch_latch_;
|
||||
PipelineLatch decode_latch_;
|
||||
|
||||
HashTable<pipeline_trace_t*> pending_icache_;
|
||||
WarpMask stalled_warps_;
|
||||
WarpMask active_warps_;
|
||||
WarpMask stalled_warps_;
|
||||
uint32_t last_schedule_wid_;
|
||||
uint32_t issued_instrs_;
|
||||
uint32_t committed_instrs_;
|
||||
uint64_t issued_instrs_;
|
||||
uint64_t committed_instrs_;
|
||||
uint32_t csr_tex_unit_;
|
||||
bool ecall_;
|
||||
bool ebreak_;
|
||||
|
||||
std::unordered_map<int, std::stringstream> print_bufs_;
|
||||
|
||||
uint64_t stats_insts_;
|
||||
PerfStats perf_stats_;
|
||||
uint64_t perf_mem_pending_reads_;
|
||||
|
||||
friend class LsuUnit;
|
||||
friend class AluUnit;
|
||||
friend class CsrUnit;
|
||||
friend class FpuUnit;
|
||||
friend class GpuUnit;
|
||||
|
||||
public:
|
||||
SlavePort<MemRsp> MemRspPort;
|
||||
MasterPort<MemReq> MemReqPort;
|
||||
};
|
||||
|
||||
} // namespace vortex
|
|
@ -359,14 +359,28 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
|||
instr->setDestReg(rd);
|
||||
}
|
||||
instr->setFunc3(func3);
|
||||
instr->setFunc7(func7);
|
||||
if ((func3 == 5) && (op != L_INST) && (op != Opcode::FL)) {
|
||||
instr->setImm(sext32(rs2, 5));
|
||||
} else {
|
||||
instr->setFunc7(func7);
|
||||
switch (op) {
|
||||
case Opcode::SYS_INST:
|
||||
case Opcode::FENCE:
|
||||
// uint12
|
||||
instr->setImm(code >> shift_rs2_);
|
||||
break;
|
||||
case Opcode::I_INST:
|
||||
if (func3 == 0x1 || func3 == 0x5) {
|
||||
// int5
|
||||
instr->setImm(sext32(rs2, 5));
|
||||
} else {
|
||||
// int12
|
||||
instr->setImm(sext32(code >> shift_rs2_, 12));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
// int12
|
||||
instr->setImm(sext32(code >> shift_rs2_, 12));
|
||||
break;
|
||||
}
|
||||
} break;
|
||||
|
||||
case InstType::S_TYPE: {
|
||||
instr->setSrcReg(rs1);
|
||||
if (op == Opcode::FS) {
|
||||
|
@ -375,8 +389,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
|||
instr->setSrcReg(rs2);
|
||||
}
|
||||
instr->setFunc3(func3);
|
||||
Word imeed = (func7 << reg_s_) | rd;
|
||||
instr->setImm(sext32(imeed, 12));
|
||||
Word imm = (func7 << reg_s_) | rd;
|
||||
instr->setImm(sext32(imm, 12));
|
||||
} break;
|
||||
|
||||
case InstType::B_TYPE: {
|
||||
|
@ -387,8 +401,8 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
|||
Word bits_4_1 = rd >> 1;
|
||||
Word bit_10_5 = func7 & 0x3f;
|
||||
Word bit_12 = func7 >> 6;
|
||||
Word imeed = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
|
||||
instr->setImm(sext32(imeed, 13));
|
||||
Word imm = (bits_4_1 << 1) | (bit_10_5 << 5) | (bit_11 << 11) | (bit_12 << 12);
|
||||
instr->setImm(sext32(imm, 13));
|
||||
} break;
|
||||
|
||||
case InstType::U_TYPE:
|
||||
|
@ -403,11 +417,11 @@ std::shared_ptr<Instr> Decoder::decode(Word code) const {
|
|||
Word bit_11 = (unordered >> 8) & 0x1;
|
||||
Word bits_10_1 = (unordered >> 9) & 0x3ff;
|
||||
Word bit_20 = (unordered >> 19) & 0x1;
|
||||
Word imeed = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
|
||||
Word imm = 0 | (bits_10_1 << 1) | (bit_11 << 11) | (bits_19_12 << 12) | (bit_20 << 20);
|
||||
if (bit_20) {
|
||||
imeed |= ~j_imm_mask_;
|
||||
imm |= ~j_imm_mask_;
|
||||
}
|
||||
instr->setImm(imeed);
|
||||
instr->setImm(imm);
|
||||
} break;
|
||||
|
||||
case InstType::V_TYPE:
|
||||
|
|
|
@ -428,7 +428,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
Word memAddr = ((rsdata[t][0] + immsrc) & 0xFFFFFFFC); // word aligned
|
||||
Word shift_by = ((rsdata[t][0] + immsrc) & 0x00000003) * 8;
|
||||
Word data_read = core_->dcache_read(memAddr, 4);
|
||||
trace->mem_addrs.at(t).push_back(memAddr);
|
||||
trace->mem_addrs.at(t).push_back({memAddr, 4});
|
||||
DP(4, "LOAD MEM: ADDRESS=0x" << std::hex << memAddr << ", DATA=0x" << data_read);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
|
@ -491,7 +491,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word memAddr = rsdata[t][0] + immsrc;
|
||||
trace->mem_addrs.at(t).push_back(memAddr);
|
||||
trace->mem_addrs.at(t).push_back({memAddr, (1u << func3)});
|
||||
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << memAddr);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
|
@ -528,14 +528,14 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
}
|
||||
break;
|
||||
case SYS_INST:
|
||||
trace->exe_type = ExeType::CSR;
|
||||
for (int t = 0; t < num_threads; ++t) {
|
||||
if (!tmask_.test(t))
|
||||
continue;
|
||||
Word csr_addr = immsrc & 0x00000FFF;
|
||||
Word csr_value = core_->get_csr(csr_addr, t, id_);
|
||||
switch (func3) {
|
||||
case 0:
|
||||
Word csr_addr = immsrc;
|
||||
Word csr_value;
|
||||
if (func3 == 0) {
|
||||
trace->exe_type = ExeType::ALU;
|
||||
trace->fetch_stall = true;
|
||||
switch (csr_addr) {
|
||||
case 0: // ECALL
|
||||
core_->trigger_ecall();
|
||||
|
@ -549,56 +549,59 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
// CSRRW
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, rsdata[t][0], t, id_);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 2:
|
||||
// CSRRS
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 3:
|
||||
// CSRRC
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 5:
|
||||
// CSRRWI
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, rsrc0, t, id_);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 6:
|
||||
// CSRRSI
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value | rsrc0, t, id_);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 7:
|
||||
// CSRRCI
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_);
|
||||
rd_write = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
trace->exe_type = ExeType::CSR;
|
||||
csr_value = core_->get_csr(csr_addr, t, id_);
|
||||
switch (func3) {
|
||||
case 1:
|
||||
// CSRRW
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, rsdata[t][0], t, id_);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 2:
|
||||
// CSRRS
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value | rsdata[t][0], t, id_);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 3:
|
||||
// CSRRC
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value & ~rsdata[t][0], t, id_);
|
||||
trace->used_iregs.set(rsrc0);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 5:
|
||||
// CSRRWI
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, rsrc0, t, id_);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 6:
|
||||
// CSRRSI;
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value | rsrc0, t, id_);
|
||||
rd_write = true;
|
||||
break;
|
||||
case 7:
|
||||
// CSRRCI
|
||||
rddata[t] = csr_value;
|
||||
core_->set_csr(csr_addr, csr_value & ~rsrc0, t, id_);
|
||||
rd_write = true;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
case FENCE:
|
||||
trace->exe_type = ExeType::LSU;
|
||||
trace->lsu.type = LsuType::FENCE;
|
||||
trace->fetch_stall = true;
|
||||
break;
|
||||
case FCI:
|
||||
trace->exe_type = ExeType::FPU;
|
||||
|
@ -797,6 +800,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
DPN(3, std::endl);
|
||||
|
||||
active_ = tmask_.any();
|
||||
trace->gpu.active_warps.reset();
|
||||
trace->gpu.active_warps.set(id_, active_);
|
||||
} break;
|
||||
case 1: {
|
||||
// WSPAWN
|
||||
|
@ -805,13 +810,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
int active_warps = std::min<int>(rsdata.at(ts)[0], core_->arch().num_warps());
|
||||
DP(3, "*** Activate " << (active_warps-1) << " warps at PC: " << std::hex << rsdata.at(ts)[1]);
|
||||
for (int i = 1; i < active_warps; ++i) {
|
||||
Warp &newWarp = core_->warp(i);
|
||||
newWarp.setPC(rsdata[ts][1]);
|
||||
newWarp.setTmask(0, true);
|
||||
}
|
||||
trace->gpu.active_warps = core_->wspawn(rsdata.at(ts)[0], rsdata.at(ts)[1]);
|
||||
} break;
|
||||
case 2: {
|
||||
// SPLIT
|
||||
|
@ -877,9 +876,8 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
|
|||
trace->gpu.type = GpuType::BAR;
|
||||
trace->used_iregs.set(rsrc0);
|
||||
trace->used_iregs.set(rsrc1);
|
||||
trace->fetch_stall = true;
|
||||
active_ = false;
|
||||
core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
|
||||
trace->fetch_stall = true;
|
||||
trace->gpu.active_warps = core_->barrier(rsdata[ts][0], rsdata[ts][1], id_);
|
||||
} break;
|
||||
case 5: {
|
||||
// PREFETCH
|
||||
|
|
|
@ -10,64 +10,78 @@
|
|||
|
||||
using namespace vortex;
|
||||
|
||||
NopUnit::NopUnit(Core*) : ExeUnit("NOP") {}
|
||||
NopUnit::NopUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "NOP") {}
|
||||
|
||||
// Forwards the trace at the head of the input queue to the output port with a
// delay of 1, then removes it from the input queue. Does nothing when the
// input queue is empty.
void NopUnit::step(uint64_t /*cycle*/) {
  if (!Input.empty()) {
    auto head = Input.front();
    Output.send(head, 1);
    Input.pop();
  }
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
LsuUnit::LsuUnit(Core* core)
|
||||
: ExeUnit("LSU")
|
||||
, core_(core)
|
||||
LsuUnit::LsuUnit(const SimContext& ctx, Core* core)
|
||||
: ExeUnit(ctx, core, "LSU")
|
||||
, num_threads_(core->arch().num_threads())
|
||||
, pending_dcache_(LSUQ_SIZE)
|
||||
, fence_lock_(false)
|
||||
{}
|
||||
|
||||
void LsuUnit::step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
|
||||
// handle dcache response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(0);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.top();
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "dcache-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
|
||||
trace->dcache_latency = latency;
|
||||
this->schedule_output(trace, 1);
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
}
|
||||
|
||||
// handle shared memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& smem_rsp_port = core_->shared_mem_->Outputs.at(t);
|
||||
if (smem_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = smem_rsp_port.front();
|
||||
auto& entry = pending_dcache_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "smem-rsp: tag=" << mem_rsp.tag << ", type=" << trace->lsu.type
|
||||
<< ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_dcache_.release(mem_rsp.tag);
|
||||
}
|
||||
smem_rsp_port.pop();
|
||||
}
|
||||
|
||||
if (fence_lock_) {
|
||||
// wait for all pending memory operations to complete
|
||||
if (!pending_dcache_.empty())
|
||||
return;
|
||||
this->schedule_output(fence_state_, 1);
|
||||
Output.send(fence_state_, 1);
|
||||
fence_lock_ = false;
|
||||
DT(3, cycle, "fence-unlock: " << fence_state_);
|
||||
}
|
||||
|
||||
// check input queue
|
||||
if (inputs_.empty())
|
||||
if (Input.empty())
|
||||
return;
|
||||
|
||||
auto trace = inputs_.top();
|
||||
auto trace = Input.front();
|
||||
|
||||
if (trace->lsu.type == LsuType::FENCE) {
|
||||
// schedule fence lock
|
||||
|
@ -75,179 +89,188 @@ void LsuUnit::step(uint64_t cycle) {
|
|||
fence_lock_ = true;
|
||||
DT(3, cycle, "fence-lock: " << *trace);
|
||||
// remove input
|
||||
inputs_.pop();
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (cycle - time);
|
||||
return;
|
||||
}
|
||||
|
||||
// check pending queue capacity
|
||||
if (!trace->check_stalled(pending_dcache_.full())) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
|
||||
}
|
||||
if (pending_dcache_.full())
|
||||
// check pending queue capacity
|
||||
if (pending_dcache_.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** lsu-queue-stall: " << *trace);
|
||||
}
|
||||
return;
|
||||
|
||||
// send memory request
|
||||
|
||||
bool has_shared_memory = false;
|
||||
bool mem_rsp_pending = false;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
bool is_write = (trace->lsu.type == LsuType::STORE);
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
// duplicates detection
|
||||
bool is_dup = false;
|
||||
if (trace->tmask.test(0)) {
|
||||
uint64_t addr_mask = sizeof(Word)-1;
|
||||
Word addr0 = trace->mem_addrs.at(0).at(0).addr & ~addr_mask;
|
||||
uint32_t matches = 1;
|
||||
for (uint32_t t = 1; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
auto mem_addr = trace->mem_addrs.at(t).at(0).addr & ~addr_mask;
|
||||
matches += (addr0 == mem_addr);
|
||||
}
|
||||
is_dup = (matches == trace->tmask.count());
|
||||
}
|
||||
|
||||
uint32_t valid_addrs = 0;
|
||||
if (is_dup) {
|
||||
valid_addrs = 1;
|
||||
} else {
|
||||
for (auto& mem_addr : trace->mem_addrs) {
|
||||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
}
|
||||
|
||||
trace->dcache_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_dcache_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
if (!trace->tmask.test(t))
|
||||
continue;
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
|
||||
auto mem_addr = trace->mem_addrs.at(t).at(0);
|
||||
auto type = get_addr_type(mem_addr.addr, mem_addr.size);
|
||||
|
||||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(0);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
// check shared memory address
|
||||
if (SM_ENABLE) {
|
||||
if ((mem_addr >= (SMEM_BASE_ADDR-SMEM_SIZE))
|
||||
&& (mem_addr < SMEM_BASE_ADDR)) {
|
||||
DT(3, cycle, "smem-access: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
|
||||
has_shared_memory = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
bool is_io = (mem_addr >= IO_BASE_ADDR);
|
||||
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.tag = tag;
|
||||
mem_req.is_io = is_io;
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << is_io << ", "<< trace);
|
||||
// do not wait on writes
|
||||
mem_rsp_pending = !is_write;
|
||||
}
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = is_write;
|
||||
mem_req.tag = tag;
|
||||
mem_req.is_io = (type == AddrType::IO);
|
||||
|
||||
if (type == AddrType::Shared) {
|
||||
core_->shared_mem_->Inputs.at(t).send(mem_req, 2);
|
||||
DT(3, cycle, "smem-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", " << *trace);
|
||||
} else {
|
||||
dcache_req_port.send(mem_req, 2);
|
||||
DT(3, cycle, "dcache-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", type=" << trace->lsu.type << ", tid=" << t << ", io=" << mem_req.is_io << ", " << *trace);
|
||||
}
|
||||
|
||||
if (is_dup)
|
||||
break;
|
||||
}
|
||||
|
||||
// do not wait
|
||||
if (!mem_rsp_pending) {
|
||||
// do not wait on writes
|
||||
if (is_write) {
|
||||
pending_dcache_.release(tag);
|
||||
uint32_t delay = 1;
|
||||
if (has_shared_memory) {
|
||||
// all threads accessed shared memory
|
||||
delay += Constants::SMEM_DELAY;
|
||||
}
|
||||
this->schedule_output(trace, delay);
|
||||
Output.send(trace, 1);
|
||||
}
|
||||
|
||||
// remove input
|
||||
inputs_.pop();
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.lsu_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
AluUnit::AluUnit(Core*) : ExeUnit("ALU") {}
|
||||
AluUnit::AluUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "ALU") {}
|
||||
|
||||
void AluUnit::step(uint64_t /*cycle*/) {
|
||||
if (inputs_.empty())
|
||||
void AluUnit::step(uint64_t cycle) {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = inputs_.top();
|
||||
auto trace = Input.front();
|
||||
switch (trace->alu.type) {
|
||||
case AluType::ARITH:
|
||||
case AluType::BRANCH:
|
||||
case AluType::CMOV:
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
Output.send(trace, 1);
|
||||
break;
|
||||
case AluType::IMUL:
|
||||
this->schedule_output(trace, LATENCY_IMUL);
|
||||
inputs_.pop();
|
||||
Output.send(trace, LATENCY_IMUL+1);
|
||||
break;
|
||||
case AluType::IDIV:
|
||||
this->schedule_output(trace, XLEN);
|
||||
inputs_.pop();
|
||||
Output.send(trace, XLEN+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->alu.type << ", " << *trace);
|
||||
if (trace->fetch_stall) {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.alu_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
CsrUnit::CsrUnit(Core*) : ExeUnit("CSR") {}
|
||||
CsrUnit::CsrUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "CSR") {}
|
||||
|
||||
void CsrUnit::step(uint64_t /*cycle*/) {
|
||||
if (inputs_.empty())
|
||||
void CsrUnit::step(uint64_t cycle) {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = inputs_.top();
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
auto trace = Input.front();
|
||||
Output.send(trace, 1);
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.csr_stalls += (cycle - time);
|
||||
DT(3, cycle, "pipeline-execute: op=CSR, " << *trace);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
FpuUnit::FpuUnit(Core*) : ExeUnit("FPU") {}
|
||||
FpuUnit::FpuUnit(const SimContext& ctx, Core* core) : ExeUnit(ctx, core, "FPU") {}
|
||||
|
||||
void FpuUnit::step(uint64_t /*cycle*/) {
|
||||
if (inputs_.empty())
|
||||
void FpuUnit::step(uint64_t cycle) {
|
||||
if (Input.empty())
|
||||
return;
|
||||
auto trace = inputs_.top();
|
||||
auto trace = Input.front();
|
||||
switch (trace->fpu.type) {
|
||||
case FpuType::FNCP:
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
Output.send(trace, 2);
|
||||
break;
|
||||
case FpuType::FMA:
|
||||
this->schedule_output(trace, LATENCY_FMA);
|
||||
inputs_.pop();
|
||||
Output.send(trace, LATENCY_FMA+1);
|
||||
break;
|
||||
case FpuType::FDIV:
|
||||
this->schedule_output(trace, LATENCY_FDIV);
|
||||
inputs_.pop();
|
||||
Output.send(trace, LATENCY_FDIV+1);
|
||||
break;
|
||||
case FpuType::FSQRT:
|
||||
this->schedule_output(trace, LATENCY_FSQRT);
|
||||
inputs_.pop();
|
||||
Output.send(trace, LATENCY_FSQRT+1);
|
||||
break;
|
||||
case FpuType::FCVT:
|
||||
this->schedule_output(trace, LATENCY_FCVT);
|
||||
inputs_.pop();
|
||||
Output.send(trace, LATENCY_FCVT+1);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->fpu.type << ", " << *trace);
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (cycle - time);
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
GpuUnit::GpuUnit(Core* core)
|
||||
: ExeUnit("GPU")
|
||||
, core_(core)
|
||||
GpuUnit::GpuUnit(const SimContext& ctx, Core* core)
|
||||
: ExeUnit(ctx, core, "GPU")
|
||||
, num_threads_(core->arch().num_threads())
|
||||
, pending_tex_reqs_(TEXQ_SIZE)
|
||||
{}
|
||||
|
||||
void GpuUnit::step(uint64_t cycle) {
|
||||
__unused (cycle);
|
||||
#ifdef EXT_TEX_ENABLE
|
||||
// handle memory response
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
auto& dcache_rsp_port = core_->dcache_switch_.at(t)->RspOut.at(1);
|
||||
if (dcache_rsp_port.empty())
|
||||
continue;
|
||||
auto& mem_rsp = dcache_rsp_port.top();
|
||||
auto& mem_rsp = dcache_rsp_port.front();
|
||||
auto& entry = pending_tex_reqs_.at(mem_rsp.tag);
|
||||
auto trace = entry.first;
|
||||
DT(3, cycle, "tex-rsp: tag=" << mem_rsp.tag << ", tid=" << t << ", " << *trace);
|
||||
assert(entry.second);
|
||||
--entry.second; // track remaining blocks
|
||||
if (0 == entry.second) {
|
||||
auto latency = (SimPlatform::instance().cycles() - trace->dcache_latency);
|
||||
trace->dcache_latency = latency;
|
||||
this->schedule_output(trace, 1);
|
||||
if (0 == entry.second) {
|
||||
Output.send(trace, 1);
|
||||
pending_tex_reqs_.release(mem_rsp.tag);
|
||||
}
|
||||
dcache_rsp_port.pop();
|
||||
|
@ -255,38 +278,67 @@ void GpuUnit::step(uint64_t cycle) {
|
|||
#endif
|
||||
|
||||
// check input queue
|
||||
if (inputs_.empty())
|
||||
if (Input.empty())
|
||||
return;
|
||||
|
||||
auto trace = inputs_.top();
|
||||
auto trace = Input.front();
|
||||
|
||||
bool issued = false;
|
||||
|
||||
switch (trace->gpu.type) {
|
||||
case GpuType::TMC:
|
||||
Output.send(trace, 1);
|
||||
core_->active_warps_.set(trace->wid, trace->gpu.active_warps.test(trace->wid));
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::WSPAWN:
|
||||
Output.send(trace, 1);
|
||||
core_->active_warps_ = trace->gpu.active_warps;
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::SPLIT:
|
||||
case GpuType::JOIN:
|
||||
case GpuType::BAR:
|
||||
this->schedule_output(trace, 1);
|
||||
inputs_.pop();
|
||||
Output.send(trace, 1);
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::TEX: {
|
||||
case GpuType::BAR:
|
||||
Output.send(trace, 1);
|
||||
if (trace->gpu.active_warps != 0)
|
||||
core_->active_warps_ |= trace->gpu.active_warps;
|
||||
else
|
||||
core_->active_warps_.reset(trace->wid);
|
||||
issued = true;
|
||||
break;
|
||||
case GpuType::TEX:
|
||||
if (this->processTexRequest(cycle, trace))
|
||||
inputs_.pop();
|
||||
} break;
|
||||
issued = true;
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
||||
if (issued) {
|
||||
DT(3, cycle, "pipeline-execute: op=" << trace->gpu.type << ", " << *trace);
|
||||
if (trace->fetch_stall) {
|
||||
core_->stalled_warps_.reset(trace->wid);
|
||||
}
|
||||
auto time = Input.pop();
|
||||
core_->perf_stats_.fpu_stalls += (cycle - time);
|
||||
}
|
||||
}
|
||||
|
||||
bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
||||
__unused (cycle);
|
||||
|
||||
// check pending queue capacity
|
||||
if (!trace->check_stalled(pending_tex_reqs_.full())) {
|
||||
DT(3, cycle, "*** tex-queue-stall: " << *trace);
|
||||
}
|
||||
if (pending_tex_reqs_.full())
|
||||
// check pending queue capacity
|
||||
if (pending_tex_reqs_.full()) {
|
||||
if (!trace->suspend()) {
|
||||
DT(3, cycle, "*** tex-queue-stall: " << *trace);
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
trace->resume();
|
||||
}
|
||||
|
||||
// send memory request
|
||||
|
||||
|
@ -295,7 +347,6 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
|||
valid_addrs += mem_addr.size();
|
||||
}
|
||||
|
||||
trace->tex_latency = SimPlatform::instance().cycles();
|
||||
auto tag = pending_tex_reqs_.allocate({trace, valid_addrs});
|
||||
|
||||
for (uint32_t t = 0; t < num_threads_; ++t) {
|
||||
|
@ -305,12 +356,14 @@ bool GpuUnit::processTexRequest(uint64_t cycle, pipeline_trace_t* trace) {
|
|||
auto& dcache_req_port = core_->dcache_switch_.at(t)->ReqIn.at(1);
|
||||
for (auto mem_addr : trace->mem_addrs.at(t)) {
|
||||
MemReq mem_req;
|
||||
mem_req.addr = mem_addr;
|
||||
mem_req.addr = mem_addr.addr;
|
||||
mem_req.write = (trace->lsu.type == LsuType::STORE);
|
||||
mem_req.tag = tag;
|
||||
dcache_req_port.send(mem_req, 1);
|
||||
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr << ", tag=" << tag
|
||||
dcache_req_port.send(mem_req, 3);
|
||||
DT(3, cycle, "tex-req: addr=" << std::hex << mem_addr.addr << ", tag=" << tag
|
||||
<< ", tid=" << t << ", "<< trace);
|
||||
++ core_->perf_stats_.tex_reads;
|
||||
++ core_->perf_stats_.tex_latency += pending_tex_reqs_.size();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -8,56 +8,29 @@ namespace vortex {
|
|||
|
||||
class Core;
|
||||
|
||||
class ExeUnit {
|
||||
protected:
|
||||
const char* name_;
|
||||
Queue<pipeline_trace_t*> inputs_;
|
||||
Queue<pipeline_trace_t*> outputs_;
|
||||
class ExeUnit : public SimObject<ExeUnit> {
|
||||
public:
|
||||
SimPort<pipeline_trace_t*> Input;
|
||||
SimPort<pipeline_trace_t*> Output;
|
||||
|
||||
void schedule_output(pipeline_trace_t* trace, uint32_t delay) {
|
||||
if (delay > 1) {
|
||||
SimPlatform::instance().schedule(
|
||||
[&](pipeline_trace_t* req) {
|
||||
outputs_.push(req);
|
||||
},
|
||||
trace,
|
||||
(delay - 1)
|
||||
);
|
||||
} else {
|
||||
outputs_.push(trace);
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
typedef std::shared_ptr<ExeUnit> Ptr;
|
||||
|
||||
ExeUnit(const char* name) : name_(name) {}
|
||||
ExeUnit(const SimContext& ctx, Core* core, const char* name)
|
||||
: SimObject<ExeUnit>(ctx, name)
|
||||
, Input(this)
|
||||
, Output(this)
|
||||
, core_(core)
|
||||
{}
|
||||
|
||||
virtual ~ExeUnit() {}
|
||||
|
||||
void push(pipeline_trace_t* trace) {
|
||||
inputs_.push(trace);
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
return outputs_.empty();
|
||||
}
|
||||
|
||||
pipeline_trace_t* top() const {
|
||||
return outputs_.top();
|
||||
}
|
||||
|
||||
void pop() {
|
||||
outputs_.pop();
|
||||
}
|
||||
|
||||
virtual void step(uint64_t cycle) = 0;
|
||||
protected:
|
||||
Core* core_;
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class NopUnit : public ExeUnit {
|
||||
public:
|
||||
NopUnit(Core*);
|
||||
NopUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
@ -65,15 +38,14 @@ public:
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
class LsuUnit : public ExeUnit {
|
||||
private:
|
||||
Core* core_;
|
||||
private:
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_dcache_;
|
||||
pipeline_trace_t* fence_state_;
|
||||
bool fence_lock_;
|
||||
|
||||
public:
|
||||
LsuUnit(Core*);
|
||||
LsuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
@ -82,7 +54,7 @@ public:
|
|||
|
||||
class AluUnit : public ExeUnit {
|
||||
public:
|
||||
AluUnit(Core*);
|
||||
AluUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
@ -91,7 +63,7 @@ public:
|
|||
|
||||
class CsrUnit : public ExeUnit {
|
||||
public:
|
||||
CsrUnit(Core*);
|
||||
CsrUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
@ -100,7 +72,7 @@ public:
|
|||
|
||||
class FpuUnit : public ExeUnit {
|
||||
public:
|
||||
FpuUnit(Core*);
|
||||
FpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
@ -109,14 +81,13 @@ public:
|
|||
|
||||
class GpuUnit : public ExeUnit {
|
||||
private:
|
||||
Core* core_;
|
||||
uint32_t num_threads_;
|
||||
HashTable<std::pair<pipeline_trace_t*, uint32_t>> pending_tex_reqs_;
|
||||
|
||||
bool processTexRequest(uint64_t cycle, pipeline_trace_t* trace);
|
||||
|
||||
public:
|
||||
GpuUnit(Core*);
|
||||
GpuUnit(const SimContext& ctx, Core*);
|
||||
|
||||
void step(uint64_t cycle);
|
||||
};
|
||||
|
|
|
@ -10,6 +10,7 @@ private:
|
|||
MemSim* simobject_;
|
||||
uint32_t num_banks_;
|
||||
uint32_t latency_;
|
||||
PerfStats perf_stats_;
|
||||
|
||||
public:
|
||||
Impl(MemSim* simobject, uint32_t num_banks, uint32_t latency)
|
||||
|
@ -18,16 +19,23 @@ public:
|
|||
, latency_(latency)
|
||||
{}
|
||||
|
||||
const PerfStats& perf_stats() const {
|
||||
return perf_stats_;
|
||||
}
|
||||
|
||||
void step(uint64_t /*cycle*/) {
|
||||
for (uint32_t i = 0, n = num_banks_; i < n; ++i) {
|
||||
auto& mem_req_port = simobject_->MemReqPorts.at(i);
|
||||
if (mem_req_port.empty())
|
||||
continue;
|
||||
auto& mem_req = mem_req_port.top();
|
||||
auto& mem_req = mem_req_port.front();
|
||||
if (!mem_req.write) {
|
||||
MemRsp mem_rsp;
|
||||
mem_rsp.tag = mem_req.tag;
|
||||
simobject_->MemRspPorts.at(i).send(mem_rsp, latency_);
|
||||
++perf_stats_.reads;
|
||||
} else {
|
||||
++perf_stats_.writes;
|
||||
}
|
||||
mem_req_port.pop();
|
||||
}
|
||||
|
@ -40,9 +48,9 @@ MemSim::MemSim(const SimContext& ctx,
|
|||
uint32_t num_banks,
|
||||
uint32_t latency)
|
||||
: SimObject<MemSim>(ctx, "MemSim")
|
||||
, impl_(new Impl(this, num_banks, latency))
|
||||
, MemReqPorts(num_banks, this)
|
||||
, MemRspPorts(num_banks, this)
|
||||
, impl_(new Impl(this, num_banks, latency))
|
||||
{}
|
||||
|
||||
MemSim::~MemSim() {
|
||||
|
|
|
@ -1,47 +1,36 @@
|
|||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include "types.h"
|
||||
#include <vector>
|
||||
#include <list>
|
||||
|
||||
namespace vortex {
|
||||
|
||||
struct MemReq {
|
||||
uint64_t addr;
|
||||
uint32_t tag;
|
||||
bool write;
|
||||
bool is_io;
|
||||
|
||||
MemReq(uint64_t _addr = 0,
|
||||
uint64_t _tag = 0,
|
||||
bool _write = false,
|
||||
bool _is_io = false
|
||||
) : addr(_addr)
|
||||
, tag(_tag)
|
||||
, write(_write)
|
||||
, is_io(_is_io)
|
||||
{}
|
||||
};
|
||||
|
||||
struct MemRsp {
|
||||
uint64_t tag;
|
||||
MemRsp(uint64_t _tag = 0) : tag (_tag) {}
|
||||
};
|
||||
|
||||
class MemSim : public SimObject<MemSim>{
|
||||
private:
|
||||
class Impl;
|
||||
Impl* impl_;
|
||||
|
||||
public:
|
||||
struct PerfStats {
|
||||
uint64_t reads;
|
||||
uint64_t writes;
|
||||
|
||||
MemSim(const SimContext& ctx, uint32_t num_inputs, uint32_t latency);
|
||||
PerfStats()
|
||||
: reads(0)
|
||||
, writes(0)
|
||||
{}
|
||||
};
|
||||
|
||||
std::vector<SimPort<MemReq>> MemReqPorts;
|
||||
std::vector<SimPort<MemRsp>> MemRspPorts;
|
||||
|
||||
MemSim(const SimContext& ctx, uint32_t num_banks, uint32_t latency);
|
||||
~MemSim();
|
||||
|
||||
void step(uint64_t cycle);
|
||||
|
||||
std::vector<SlavePort<MemReq>> MemReqPorts;
|
||||
std::vector<MasterPort<MemRsp>> MemRspPorts;
|
||||
const PerfStats& perf_stats() const;
|
||||
|
||||
private:
|
||||
class Impl;
|
||||
Impl* impl_;
|
||||
};
|
||||
|
||||
};
|
|
@ -12,7 +12,7 @@ namespace vortex {
|
|||
|
||||
struct pipeline_trace_t {
|
||||
//--
|
||||
uint64_t id;
|
||||
uint64_t uuid;
|
||||
|
||||
//--
|
||||
int cid;
|
||||
|
@ -22,7 +22,6 @@ struct pipeline_trace_t {
|
|||
|
||||
//--
|
||||
bool fetch_stall;
|
||||
bool pipeline_stall;
|
||||
|
||||
//--
|
||||
bool wb;
|
||||
|
@ -38,7 +37,7 @@ struct pipeline_trace_t {
|
|||
ExeType exe_type;
|
||||
|
||||
//--
|
||||
std::vector<std::vector<uint64_t>> mem_addrs;
|
||||
std::vector<std::vector<mem_addr_size_t>> mem_addrs;
|
||||
|
||||
//--
|
||||
union {
|
||||
|
@ -53,22 +52,19 @@ struct pipeline_trace_t {
|
|||
} fpu;
|
||||
struct {
|
||||
GpuType type;
|
||||
WarpMask active_warps;
|
||||
} gpu;
|
||||
};
|
||||
|
||||
// stats
|
||||
uint64_t icache_latency;
|
||||
uint64_t dcache_latency;
|
||||
uint64_t tex_latency;
|
||||
bool stalled;
|
||||
|
||||
pipeline_trace_t(uint64_t id_, const ArchDef& arch) {
|
||||
id = id_;
|
||||
pipeline_trace_t(uint64_t uuid_, const ArchDef& arch) {
|
||||
uuid = uuid_;
|
||||
cid = 0;
|
||||
wid = 0;
|
||||
tmask.reset();
|
||||
PC = 0;
|
||||
PC = 0;
|
||||
fetch_stall = false;
|
||||
pipeline_stall = false;
|
||||
wb = false;
|
||||
rdest = 0;
|
||||
rdest_type = RegType::None;
|
||||
|
@ -76,16 +72,18 @@ struct pipeline_trace_t {
|
|||
used_fregs.reset();
|
||||
used_vregs.reset();
|
||||
exe_type = ExeType::NOP;
|
||||
mem_addrs.resize(arch.num_threads());
|
||||
icache_latency = 0;
|
||||
dcache_latency = 0;
|
||||
tex_latency = 0;
|
||||
mem_addrs.resize(arch.num_threads());
|
||||
stalled = false;
|
||||
}
|
||||
|
||||
bool check_stalled(bool stall) {
|
||||
bool old = pipeline_stall;
|
||||
pipeline_stall = stall;
|
||||
return stall ? old : true;
|
||||
bool suspend() {
|
||||
bool old = stalled;
|
||||
stalled = true;
|
||||
return old;
|
||||
}
|
||||
|
||||
void resume() {
|
||||
stalled = false;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -96,16 +94,16 @@ inline std::ostream &operator<<(std::ostream &os, const pipeline_trace_t& state)
|
|||
os << ", rd=" << state.rdest_type << std::dec << state.rdest;
|
||||
}
|
||||
os << ", ex=" << state.exe_type;
|
||||
os << " (#" << std::dec << state.id << ")";
|
||||
os << " (#" << std::dec << state.uuid << ")";
|
||||
return os;
|
||||
}
|
||||
|
||||
class PipelineStage : public Queue<pipeline_trace_t*> {
|
||||
class PipelineLatch : public Queue<pipeline_trace_t*> {
|
||||
protected:
|
||||
const char* name_;
|
||||
|
||||
public:
|
||||
PipelineStage(const char* name = nullptr)
|
||||
PipelineLatch(const char* name = nullptr)
|
||||
: name_(name)
|
||||
{}
|
||||
};
|
||||
|
|
|
@ -18,13 +18,13 @@ Processor::Processor(const ArchDef& arch)
|
|||
|
||||
// connect memory sub-system
|
||||
memsim_ = MemSim::Create(1, MEM_LATENCY);
|
||||
std::vector<SlavePort<MemReq>*> mem_req_ports(1);
|
||||
std::vector<MasterPort<MemRsp>*> mem_rsp_ports(1);
|
||||
std::vector<SimPort<MemReq>*> mem_req_ports(1);
|
||||
std::vector<SimPort<MemRsp>*> mem_rsp_ports(1);
|
||||
mem_req_ports.at(0) = &memsim_->MemReqPorts.at(0);
|
||||
mem_rsp_ports.at(0) = &memsim_->MemRspPorts.at(0);
|
||||
|
||||
if (L3_ENABLE) {
|
||||
l3cache_ = Cache::Create("l3cache", CacheConfig{
|
||||
l3cache_ = Cache::Create("l3cache", Cache::Config{
|
||||
log2ceil(L3_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
|
@ -66,7 +66,7 @@ Processor::Processor(const ArchDef& arch)
|
|||
for (uint32_t i = 0; i < NUM_CLUSTERS; ++i) {
|
||||
if (L2_ENABLE) {
|
||||
auto& l2cache = l2caches_.at(i);
|
||||
l2cache = Cache::Create("l2cache", CacheConfig{
|
||||
l2cache = Cache::Create("l2cache", Cache::Config{
|
||||
log2ceil(L2_CACHE_SIZE), // C
|
||||
log2ceil(MEM_BLOCK_SIZE), // B
|
||||
2, // W
|
||||
|
|
|
@ -96,7 +96,7 @@ public:
|
|||
}
|
||||
uint32_t tag = (state->rdest << 16) | (state->wid << 4) | (int)state->rdest_type;
|
||||
assert(owners_.count(tag) == 0);
|
||||
owners_[tag] = state->id;
|
||||
owners_[tag] = state->uuid;
|
||||
}
|
||||
|
||||
void release(pipeline_trace_t* state) {
|
||||
|
|
93
sim/simX/sharedmem.h
Normal file
93
sim/simX/sharedmem.h
Normal file
|
@ -0,0 +1,93 @@
|
|||
#pragma once
|
||||
|
||||
#include <simobject.h>
|
||||
#include <bitmanip.h>
|
||||
#include <vector>
|
||||
#include "types.h"
|
||||
|
||||
namespace vortex {
|
||||
|
||||
class Core;
|
||||
|
||||
/// Banked shared-memory timing model.
///
/// There is one input port and one output port per request lane
/// (`Config::num_reqs`). On every step() each lane's head request is examined
/// in lane order; the first request that maps to a given bank claims it for the
/// cycle, and any later request mapping to the same bank is left queued and
/// retried on a subsequent step().
class SharedMem : public SimObject<SharedMem> {
public:
  struct Config {
    uint32_t num_reqs;      // number of request lanes (input/output port pairs)
    uint32_t num_banks;     // number of banks requests are distributed over
    uint32_t bank_offset;   // lowest address bit used for bank selection
    uint32_t latency;       // NOTE(review): never read in this class; responses
                            // are sent with a fixed delay of 1 - confirm intent
    bool     write_reponse; // when true, writes also generate a response
  };

  struct PerfStats {
    uint64_t reads;        // serviced read requests
    uint64_t writes;       // serviced write requests
    uint64_t bank_stalls;  // request-cycles lost to bank conflicts

    PerfStats()
      : reads(0)
      , writes(0)
      , bank_stalls(0)
    {}
  };

  std::vector<SimPort<MemReq>> Inputs;
  std::vector<SimPort<MemRsp>> Outputs;

  /// @param ctx     simulation context forwarded to SimObject
  /// @param name    object name forwarded to SimObject
  /// @param config  shared-memory configuration (copied)
  SharedMem(const SimContext& ctx, const char* name, const Config& config)
    : SimObject<SharedMem>(ctx, name)
    , Inputs(config.num_reqs, this)
    , Outputs(config.num_reqs, this)
    , config_(config)
    , bank_sel_addr_start_(config.bank_offset)
    // NOTE(review): with num_banks == 1 this presumably yields end < start
    // (assuming log2up(1) == 0) - confirm bit_getw tolerates that range.
    , bank_sel_addr_end_(config.bank_offset + log2up(config.num_banks)-1)
  {}

  virtual ~SharedMem() {}

  /// Services at most one pending request per bank for the current cycle.
  void step(uint64_t /*cycle*/) {
    // banks already claimed by an earlier lane during this cycle
    std::vector<bool> in_used_banks(config_.num_banks);

    for (uint32_t req_id = 0; req_id < config_.num_reqs; ++req_id) {
      auto& core_req_port = this->Inputs.at(req_id);
      if (core_req_port.empty())
        continue;

      auto& core_req = core_req_port.front();

      uint32_t bank_id = static_cast<uint32_t>(bit_getw(
        core_req.addr, bank_sel_addr_start_, bank_sel_addr_end_));

      // bank conflict check: the request stays at the head of its port and
      // is retried next cycle; account for the lost cycle.
      if (in_used_banks.at(bank_id)) {
        ++perf_stats_.bank_stalls;
        continue;
      }

      in_used_banks.at(bank_id) = true;

      if (!core_req.write || config_.write_reponse) {
        // send response
        MemRsp core_rsp;
        core_rsp.tag = core_req.tag;
        this->Outputs.at(req_id).send(core_rsp, 1);
      }

      // update perf counters
      perf_stats_.reads += !core_req.write;
      perf_stats_.writes += core_req.write;

      // remove input (core_req is a reference into the port; do this last)
      core_req_port.pop();
    }
  }

  /// @return the counters accumulated so far.
  const PerfStats& perf_stats() const {
    return perf_stats_;
  }

protected:
  Config    config_;
  uint32_t  bank_sel_addr_start_;  // first address bit passed to bit_getw
  uint32_t  bank_sel_addr_end_;    // last address bit passed to bit_getw
  PerfStats perf_stats_;
};
|
||||
|
||||
}
|
|
@ -27,7 +27,7 @@ void TexUnit::set_state(uint32_t state, uint32_t value) {
|
|||
uint32_t TexUnit::read(int32_t u,
|
||||
int32_t v,
|
||||
int32_t lod,
|
||||
std::vector<uint64_t>* mem_addrs) {
|
||||
std::vector<mem_addr_size_t>* mem_addrs) {
|
||||
//--
|
||||
auto xu = Fixed<TEX_FXD_FRAC>::make(u);
|
||||
auto xv = Fixed<TEX_FXD_FRAC>::make(v);
|
||||
|
@ -60,10 +60,10 @@ uint32_t TexUnit::read(int32_t u,
|
|||
uint32_t texel10 = core_->dcache_read(addr10, stride);
|
||||
uint32_t texel11 = core_->dcache_read(addr11, stride);
|
||||
|
||||
mem_addrs->push_back(addr00);
|
||||
mem_addrs->push_back(addr01);
|
||||
mem_addrs->push_back(addr10);
|
||||
mem_addrs->push_back(addr11);
|
||||
mem_addrs->push_back({addr00, stride});
|
||||
mem_addrs->push_back({addr01, stride});
|
||||
mem_addrs->push_back({addr10, stride});
|
||||
mem_addrs->push_back({addr11, stride});
|
||||
|
||||
// filtering
|
||||
auto color = TexFilterLinear(
|
||||
|
@ -79,7 +79,7 @@ uint32_t TexUnit::read(int32_t u,
|
|||
|
||||
// memory lookup
|
||||
uint32_t texel = core_->dcache_read(addr, stride);
|
||||
mem_addrs->push_back(addr);
|
||||
mem_addrs->push_back({addr, stride});
|
||||
|
||||
// filtering
|
||||
auto color = TexFilterPoint(format, texel);
|
||||
|
|
|
@ -15,7 +15,7 @@ public:
|
|||
|
||||
void set_state(uint32_t state, uint32_t value);
|
||||
|
||||
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<uint64_t>* mem_addrs);
|
||||
uint32_t read(int32_t u, int32_t v, int32_t lod, std::vector<mem_addr_size_t>* mem_addrs);
|
||||
|
||||
private:
|
||||
|
||||
|
|
125
sim/simX/types.h
125
sim/simX/types.h
|
@ -21,6 +21,8 @@ typedef std::bitset<32> RegMask;
|
|||
typedef std::bitset<32> ThreadMask;
|
||||
typedef std::bitset<32> WarpMask;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class RegType {
|
||||
None,
|
||||
Integer,
|
||||
|
@ -38,6 +40,8 @@ inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
|
|||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class ExeType {
|
||||
NOP,
|
||||
ALU,
|
||||
|
@ -61,6 +65,8 @@ inline std::ostream &operator<<(std::ostream &os, const ExeType& type) {
|
|||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class AluType {
|
||||
ARITH,
|
||||
BRANCH,
|
||||
|
@ -80,6 +86,8 @@ inline std::ostream &operator<<(std::ostream &os, const AluType& type) {
|
|||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class LsuType {
|
||||
LOAD,
|
||||
STORE,
|
||||
|
@ -97,6 +105,47 @@ inline std::ostream &operator<<(std::ostream &os, const LsuType& type) {
|
|||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class AddrType {
|
||||
Global,
|
||||
Shared,
|
||||
IO,
|
||||
};
|
||||
|
||||
inline std::ostream &operator<<(std::ostream &os, const AddrType& type) {
|
||||
switch (type) {
|
||||
case AddrType::Global: os << "Global"; break;
|
||||
case AddrType::Shared: os << "Shared"; break;
|
||||
case AddrType::IO: os << "IO"; break;
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct mem_addr_size_t {
|
||||
uint64_t addr;
|
||||
uint32_t size;
|
||||
};
|
||||
|
||||
/// Classifies a memory access by the address range it starts in.
///
/// - Shared: SM_ENABLE is set and addr lies in
///   [SMEM_BASE_ADDR - SMEM_SIZE, SMEM_BASE_ADDR); the access is asserted
///   not to run past SMEM_BASE_ADDR.
/// - IO:     addr >= IO_BASE_ADDR.
/// - Global: everything else.
///
/// @param addr  start address of the access
/// @param size  access size; only consulted by the assertion
inline AddrType get_addr_type(Word addr, uint32_t size) {
  __unused (size);

  // shared-memory window is checked first, then the I/O range
  if (SM_ENABLE
   && addr >= (SMEM_BASE_ADDR - SMEM_SIZE)
   && addr < SMEM_BASE_ADDR) {
    assert((addr + size) <= SMEM_BASE_ADDR);
    return AddrType::Shared;
  }

  return (addr >= IO_BASE_ADDR) ? AddrType::IO : AddrType::Global;
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class FpuType {
|
||||
FNCP,
|
||||
FMA,
|
||||
|
@ -116,6 +165,8 @@ inline std::ostream &operator<<(std::ostream &os, const FpuType& type) {
|
|||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class GpuType {
|
||||
TMC,
|
||||
WSPAWN,
|
||||
|
@ -137,6 +188,8 @@ inline std::ostream &operator<<(std::ostream &os, const GpuType& type) {
|
|||
return os;
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
enum class ArbiterType {
|
||||
Priority,
|
||||
RoundRobin
|
||||
|
@ -152,6 +205,30 @@ inline std::ostream &operator<<(std::ostream &os, const ArbiterType& type) {
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/// Memory request descriptor carried over the simulator's request ports.
struct MemReq {
  uint64_t addr;   // request address
  uint32_t tag;    // request tag, stored in 32 bits
  bool     write;  // true for a write request
  bool     is_io;  // NOTE(review): presumably marks an I/O-space access - confirm

  /// @param _addr   request address
  /// @param _tag    request tag; accepted as 64-bit for caller compatibility
  ///                but must fit in 32 bits, since that is the field width
  /// @param _write  true for a write request
  /// @param _is_io  value stored in is_io
  MemReq(uint64_t _addr = 0,
         uint64_t _tag = 0,
         bool _write = false,
         bool _is_io = false
  ) : addr(_addr)
    , tag(static_cast<uint32_t>(_tag))
    , write(_write)
    , is_io(_is_io)
  {
    // The narrowing used to be silent; a tag with high bits set would be
    // truncated. Catch that in debug builds.
    assert((_tag >> 32) == 0 && "MemReq tag does not fit in 32 bits");
  }
};
|
||||
|
||||
struct MemRsp {
|
||||
uint64_t tag;
|
||||
MemRsp(uint64_t _tag = 0) : tag (_tag) {}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T>
|
||||
class Queue {
|
||||
protected:
|
||||
|
@ -164,21 +241,29 @@ public:
|
|||
return queue_.empty();
|
||||
}
|
||||
|
||||
const T& top() const {
|
||||
const T& front() const {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
T& top() {
|
||||
T& front() {
|
||||
return queue_.front();
|
||||
}
|
||||
|
||||
void pop() {
|
||||
queue_.pop();
|
||||
const T& back() const {
|
||||
return queue_.back();
|
||||
}
|
||||
|
||||
T& back() {
|
||||
return queue_.back();
|
||||
}
|
||||
|
||||
void push(const T& value) {
|
||||
queue_.push(value);
|
||||
}
|
||||
|
||||
void pop() {
|
||||
queue_.pop();
|
||||
}
|
||||
};
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -187,20 +272,24 @@ template <typename T>
|
|||
class HashTable {
|
||||
private:
|
||||
std::vector<std::pair<bool, T>> entries_;
|
||||
uint32_t capacity_;
|
||||
uint32_t size_;
|
||||
|
||||
public:
|
||||
HashTable(uint32_t size)
|
||||
: entries_(size)
|
||||
, capacity_(0)
|
||||
HashTable(uint32_t capacity)
|
||||
: entries_(capacity)
|
||||
, size_(0)
|
||||
{}
|
||||
|
||||
bool empty() const {
|
||||
return (0 == capacity_);
|
||||
return (0 == size_);
|
||||
}
|
||||
|
||||
bool full() const {
|
||||
return (capacity_ == entries_.size());
|
||||
return (size_ == entries_.size());
|
||||
}
|
||||
|
||||
uint32_t size() const {
|
||||
return size_;
|
||||
}
|
||||
|
||||
bool contains(uint32_t index) const {
|
||||
|
@ -225,7 +314,7 @@ public:
|
|||
if (!entry.first) {
|
||||
entry.first = true;
|
||||
entry.second = value;
|
||||
++capacity_;
|
||||
++size_;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
@ -237,7 +326,7 @@ public:
|
|||
auto& entry = entries_.at(index);
|
||||
assert(entry.first);
|
||||
entry.first = false;
|
||||
--capacity_;
|
||||
--size_;
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -287,7 +376,7 @@ public:
|
|||
uint32_t j = (cursor_ + i) % n;
|
||||
auto& req_in = ReqIn.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.top();
|
||||
auto& req = req_in.front();
|
||||
if (tag_shift_) {
|
||||
req.tag = (req.tag << tag_shift_) | j;
|
||||
}
|
||||
|
@ -300,7 +389,7 @@ public:
|
|||
|
||||
// process incoming responses
|
||||
if (!RspIn.empty()) {
|
||||
auto& rsp = RspIn.top();
|
||||
auto& rsp = RspIn.front();
|
||||
uint32_t port_id = 0;
|
||||
if (tag_shift_) {
|
||||
port_id = rsp.tag & ((1 << tag_shift_)-1);
|
||||
|
@ -317,10 +406,10 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
std::vector<SlavePort<Req>> ReqIn;
|
||||
MasterPort<Req> ReqOut;
|
||||
SlavePort<Rsp> RspIn;
|
||||
std::vector<MasterPort<Rsp>> RspOut;
|
||||
std::vector<SimPort<Req>> ReqIn;
|
||||
SimPort<Req> ReqOut;
|
||||
SimPort<Rsp> RspIn;
|
||||
std::vector<SimPort<Rsp>> RspOut;
|
||||
};
|
||||
|
||||
}
|
|
@ -27,7 +27,7 @@ void Warp::eval(pipeline_trace_t *trace) {
|
|||
DPH(2, "Fetch: coreid=" << core_->id() << ", wid=" << id_ << ", tmask=");
|
||||
for (int i = 0, n = core_->arch().num_threads(); i < n; ++i)
|
||||
DPN(2, tmask_.test(n-i-1));
|
||||
DPN(2, ", PC=0x" << std::hex << PC_ << std::endl);
|
||||
DPN(2, ", PC=0x" << std::hex << PC_ << " (#" << std::dec << trace->uuid << ")" << std::endl);
|
||||
|
||||
/* Fetch and decode. */
|
||||
|
||||
|
@ -38,7 +38,7 @@ void Warp::eval(pipeline_trace_t *trace) {
|
|||
std::abort();
|
||||
}
|
||||
|
||||
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr << " (#" << trace->id << ")");
|
||||
DP(2, "Instr 0x" << std::hex << instr_code << ": " << *instr);
|
||||
|
||||
// Update trace
|
||||
trace->cid = core_->id();
|
||||
|
|
|
@ -46,6 +46,10 @@ public:
|
|||
return active_;
|
||||
}
|
||||
|
||||
void suspend() {
|
||||
active_ = false;
|
||||
}
|
||||
|
||||
void activate() {
|
||||
active_ = true;
|
||||
}
|
||||
|
|
|
@ -62,15 +62,16 @@ int main() {
|
|||
kernel_arg_t* arg = (kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
|
||||
// configure texture unit
|
||||
csr_write(CSR_TEX(0, TEX_STATE_WIDTH), arg->src_logwidth);
|
||||
csr_write(CSR_TEX(0, TEX_STATE_HEIGHT), arg->src_logheight);
|
||||
csr_write(CSR_TEX(0, TEX_STATE_FORMAT), arg->format);
|
||||
csr_write(CSR_TEX(0, TEX_STATE_WRAPU), arg->wrapu);
|
||||
csr_write(CSR_TEX(0, TEX_STATE_WRAPV), arg->wrapv);
|
||||
csr_write(CSR_TEX(0, TEX_STATE_FILTER), (arg->filter ? 1 : 0));
|
||||
csr_write(CSR_TEX(0, TEX_STATE_ADDR), arg->src_addr);
|
||||
csr_write(CSR_TEX_UNIT, 0);
|
||||
csr_write(CSR_TEX_WIDTH, arg->src_logwidth);
|
||||
csr_write(CSR_TEX_HEIGHT, arg->src_logheight);
|
||||
csr_write(CSR_TEX_FORMAT, arg->format);
|
||||
csr_write(CSR_TEX_WRAPU, arg->wrapu);
|
||||
csr_write(CSR_TEX_WRAPV, arg->wrapv);
|
||||
csr_write(CSR_TEX_FILTER, (arg->filter ? 1 : 0));
|
||||
csr_write(CSR_TEX_ADDR, arg->src_addr);
|
||||
static_for_t<int, 0, TEX_LOD_MAX+1>()([&](int i) {
|
||||
csr_write(CSR_TEX(0, TEX_STATE_MIPOFF(i)), arg->mip_offs[i]);
|
||||
csr_write(CSR_TEX_MIPOFF(i), arg->mip_offs[i]);
|
||||
});
|
||||
|
||||
tile_arg_t targ;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue