Fix for Single-Threaded

This commit is contained in:
felsabbagh3 2020-03-22 14:44:46 -07:00
parent 902aa685b1
commit d146070275
16 changed files with 46894 additions and 46887 deletions

View file

@ -311,8 +311,8 @@ module VX_bank
// assign is_fill_in_pipe = (|is_fill_st1) || is_fill_st2;
assign dfpq_pop = !dfpq_empty && !stall_bank_pipe && !dfpq_hazard_st0;
assign mrvq_pop = !dfpq_pop && mrvq_valid_st0 && !stall_bank_pipe && !mrvq_hazard_st0;
assign mrvq_pop = mrvq_valid_st0 && !stall_bank_pipe && !mrvq_hazard_st0;
assign dfpq_pop = !mrvq_pop && !dfpq_empty && !stall_bank_pipe && !dfpq_hazard_st0;
assign reqq_pop = !mrvq_pop && !dfpq_pop && !reqq_empty && reqq_req_st0 && !stall_bank_pipe && !is_fill_st1[0] && !(reqq_hazard_st0 || (mrvq_valid_st0 && mrvq_hazard_st0)) && !is_fill_in_pipe;
assign snrq_pop = !reqq_pop && !reqq_pop && !mrvq_pop && !dfpq_pop && snrq_valid_st0 && !stall_bank_pipe && !snrq_hazard_st0;

View file

@ -85,7 +85,7 @@ module VX_fill_invalidator
if (success_fill) begin
success_found = 1;
success_index = curr_fill[(`vx_clog2(FILL_INVALIDAOR_SIZE))-1:0];
success_index = curr_fill;
end
end
end

View file

@ -73,12 +73,12 @@ module VX_tag_data_access
);
reg[`DBANK_LINE_SIZE_RNG][31:0] readdata_st[STAGE_1_CYCLES-2:0];
reg[`DBANK_LINE_SIZE_RNG][31:0] readdata_st[STAGE_1_CYCLES-1:0];
reg read_valid_st1c[STAGE_1_CYCLES-2:0];
reg read_dirty_st1c[STAGE_1_CYCLES-2:0];
reg[`TAG_SELECT_SIZE_RNG] read_tag_st1c [STAGE_1_CYCLES-2:0];
reg[`DBANK_LINE_SIZE_RNG][31:0] read_data_st1c [STAGE_1_CYCLES-2:0];
reg read_valid_st1c[STAGE_1_CYCLES-1:0];
reg read_dirty_st1c[STAGE_1_CYCLES-1:0];
reg[`TAG_SELECT_SIZE_RNG] read_tag_st1c [STAGE_1_CYCLES-1:0];
reg[`DBANK_LINE_SIZE_RNG][31:0] read_data_st1c [STAGE_1_CYCLES-1:0];
wire qual_read_valid_st1;
@ -94,6 +94,9 @@ module VX_tag_data_access
wire[`DBANK_LINE_SIZE_RNG][31:0] use_write_data;
wire real_writefill = writefill_st1e && miss_st1e;
wire fill_sent;
wire invalidate_line;
VX_tag_data_structure #(
@ -128,13 +131,14 @@ module VX_tag_data_access
.invalidate (invalidate_line),
.write_enable(use_write_enable),
.write_fill (writefill_st1e),
.write_fill (real_writefill),
.write_addr (writeaddr_st1e),
.write_data (use_write_data),
.fill_sent (fill_sent)
);
VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_SIZE_WORDS*32) )) s0_1_c0 (
// VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_SIZE_WORDS*32) )) s0_1_c0 (
VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_SIZE_WORDS*32) ), .Valid(0)) s0_1_c0 (
.clk (clk),
.reset(reset),
.stall(stall),
@ -145,7 +149,7 @@ module VX_tag_data_access
genvar curr_stage;
generate
for (curr_stage = 1; curr_stage < STAGE_1_CYCLES-2; curr_stage = curr_stage + 1) begin
for (curr_stage = 1; curr_stage < STAGE_1_CYCLES-1; curr_stage = curr_stage + 1) begin
VX_generic_register #(.N( 1 + 1 + `TAG_SELECT_NUM_BITS + (`DBANK_LINE_SIZE_WORDS*32) )) s0_1_cc (
.clk (clk),
.reset(reset),
@ -158,13 +162,13 @@ module VX_tag_data_access
endgenerate
assign use_read_valid_st1e = read_valid_st1c[STAGE_1_CYCLES-2] || (FUNC_ID == `SFUNC_ID); // If shared memory, always valid
assign use_read_dirty_st1e = read_dirty_st1c[STAGE_1_CYCLES-2] && (FUNC_ID == `DFUNC_ID); // Dirty only applies in Dcache
assign use_read_tag_st1e = (FUNC_ID == `SFUNC_ID) ? writeaddr_st1e[`TAG_SELECT_ADDR_RNG] : read_tag_st1c [STAGE_1_CYCLES-2]; // Tag is always the same in SM
assign use_read_valid_st1e = read_valid_st1c[STAGE_1_CYCLES-1] || (FUNC_ID == `SFUNC_ID); // If shared memory, always valid
assign use_read_dirty_st1e = read_dirty_st1c[STAGE_1_CYCLES-1] && (FUNC_ID != `SFUNC_ID); // Dirty only applies in Dcache
assign use_read_tag_st1e = (FUNC_ID == `SFUNC_ID) ? writeaddr_st1e[`TAG_SELECT_ADDR_RNG] : read_tag_st1c [STAGE_1_CYCLES-1]; // Tag is always the same in SM
genvar curr_w;
for (curr_w = 0; curr_w < `DBANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[STAGE_1_CYCLES-2][curr_w][31:0];
// assign use_read_data_st1e = read_data_st1c [STAGE_1_CYCLES-2];
for (curr_w = 0; curr_w < `DBANK_LINE_SIZE_WORDS; curr_w = curr_w+1) assign use_read_data_st1e[curr_w][31:0] = read_data_st1c[STAGE_1_CYCLES-1][curr_w][31:0];
// assign use_read_data_st1e = read_data_st1c [STAGE_1_CYCLES-1];
/////////////////////// LOAD LOGIC ///////////////////
@ -182,12 +186,12 @@ module VX_tag_data_access
wire b2 = (byte_select == 2);
wire b3 = (byte_select == 3);
wire[31:0] w0 = read_data_st1c[STAGE_1_CYCLES-2][0][31:0];
wire[31:0] w1 = read_data_st1c[STAGE_1_CYCLES-2][1][31:0];
wire[31:0] w2 = read_data_st1c[STAGE_1_CYCLES-2][2][31:0];
wire[31:0] w3 = read_data_st1c[STAGE_1_CYCLES-2][3][31:0];
wire[31:0] w0 = read_data_st1c[STAGE_1_CYCLES-1][0][31:0];
wire[31:0] w1 = read_data_st1c[STAGE_1_CYCLES-1][1][31:0];
wire[31:0] w2 = read_data_st1c[STAGE_1_CYCLES-1][2][31:0];
wire[31:0] w3 = read_data_st1c[STAGE_1_CYCLES-1][3][31:0];
wire[31:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-2][block_offset][31:0];
wire[31:0] data_unmod = read_data_st1c[STAGE_1_CYCLES-1][block_offset][31:0];
wire[31:0] data_unQual = (b0 || lw) ? (data_unmod) :
b1 ? (data_unmod >> 8) :
@ -234,7 +238,7 @@ module VX_tag_data_access
wire[3:0] sh_mask = (b0 ? 4'b0011 : 4'b1100);
wire should_write = (sw || sb || sh) && valid_req_st1e && use_read_valid_st1e && !miss_st1e;
wire force_write = writefill_st1e && valid_req_st1e && (!use_read_valid_st1e || (use_read_valid_st1e && !miss_st1e));
wire force_write = writefill_st1e && valid_req_st1e && miss_st1e && (!use_read_valid_st1e || (use_read_valid_st1e && !miss_st1e));
wire[`DBANK_LINE_SIZE_RNG][3:0] we;
wire[`DBANK_LINE_SIZE_RNG][31:0] data_write;
@ -262,7 +266,7 @@ module VX_tag_data_access
///////////////////////
if (FUNC_ID == `LLFUNC_ID) begin
assign readword_st1e = read_data_st1c[STAGE_1_CYCLES-2];
assign readword_st1e = read_data_st1c[STAGE_1_CYCLES-1];
end else begin
assign readword_st1e = data_Qual;
end
@ -272,7 +276,7 @@ module VX_tag_data_access
assign readdata_st1e = use_read_data_st1e;
assign readtag_st1e = use_read_tag_st1e;
assign fill_sent = miss_st1e;
assign fill_saw_dirty_st1e = force_write && dirty_st1e;
assign fill_saw_dirty_st1e = force_write && dirty_st1e && miss_st1e;
assign invalidate_line = is_snp_st1e && !miss_st1e;
endmodule

View file

@ -92,6 +92,7 @@ module VX_tag_data_structure
end
end else if (fill_sent) begin
dirty[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0;
valid[write_addr[`LINE_SELECT_ADDR_RNG]] <= 0;
end
if (invalidate) begin

View file

@ -127,7 +127,7 @@
`define NUMBER_CORES (`NUMBER_CORES_PER_CLUSTER*`NUMBER_CLUSTERS)
// `define SINGLE_CORE_BENCH 0
`define SINGLE_CORE_BENCH 1
`define GLOBAL_BLOCK_SIZE_BYTES 16
// ========================================= Dcache Configurable Knobs =========================================
@ -141,7 +141,7 @@
// Number of Word requests per cycle {1, 2, 4, 8, ...}
`define DNUMBER_REQUESTS `NT
// Number of cycles to complete stage 1 (read from memory)
`define DSTAGE_1_CYCLES 2
`define DSTAGE_1_CYCLES 1
// Function ID
`define DFUNC_ID 0
@ -172,7 +172,7 @@
`define DFFSQ_SIZE 8
// Fill Invalidator Size {Fill invalidator must be active}
`define DFILL_INVALIDAOR_SIZE 16
`define DFILL_INVALIDAOR_SIZE 0
// Dram knobs
`define DSIMULATED_DRAM_LATENCY_CYCLES 10
@ -192,7 +192,7 @@
// Number of Word requests per cycle {1, 2, 4, 8, ...}
`define INUMBER_REQUESTS 1
// Number of cycles to complete stage 1 (read from memory)
`define ISTAGE_1_CYCLES 2
`define ISTAGE_1_CYCLES 1
// Function ID
`define IFUNC_ID 1
@ -214,16 +214,16 @@
// Core Writeback Queue Size
`define ICWBQ_SIZE `IREQQ_SIZE
// Dram Writeback Queue Size
`define IDWBQ_SIZE 0
`define IDWBQ_SIZE 16
// Dram Fill Req Queue Size
`define IDFQQ_SIZE `IREQQ_SIZE
// Lower Level Cache Hit Queue Size
`define ILLVQ_SIZE 0
`define ILLVQ_SIZE 16
// Fill Forward SNP Queue
`define IFFSQ_SIZE 8
// Fill Invalidator Size {Fill invalidator must be active}
`define IFILL_INVALIDAOR_SIZE 16
`define IFILL_INVALIDAOR_SIZE 0
// Dram knobs
`define ISIMULATED_DRAM_LATENCY_CYCLES 10
@ -244,7 +244,7 @@
// Number of Word requests per cycle {1, 2, 4, 8, ...}
`define SNUMBER_REQUESTS `NT
// Number of cycles to complete stage 1 (read from memory)
`define SSTAGE_1_CYCLES 2
`define SSTAGE_1_CYCLES 1
// Function ID
`define SFUNC_ID 2
@ -258,24 +258,24 @@
// Miss Reserv Queue Knob
`define SMRVQ_SIZE `SREQQ_SIZE
// Dram Fill Rsp Queue Size
`define SDFPQ_SIZE 0
`define SDFPQ_SIZE 16
// Snoop Req Queue
`define SSNRQ_SIZE 0
`define SSNRQ_SIZE 16
// Queues for writebacks Knobs {1, 2, 4, 8, ...}
// Core Writeback Queue Size
`define SCWBQ_SIZE `SREQQ_SIZE
// Dram Writeback Queue Size
`define SDWBQ_SIZE 0
`define SDWBQ_SIZE 16
// Dram Fill Req Queue Size
`define SDFQQ_SIZE 0
`define SDFQQ_SIZE 16
// Lower Level Cache Hit Queue Size
`define SLLVQ_SIZE 0
`define SLLVQ_SIZE 16
// Fill Forward SNP Queue
`define SFFSQ_SIZE 0
`define SFFSQ_SIZE 16
// Fill Invalidator Size {Fill invalidator must be active}
`define SFILL_INVALIDAOR_SIZE 16
`define SFILL_INVALIDAOR_SIZE 0
// Dram knobs
`define SSIMULATED_DRAM_LATENCY_CYCLES 10
@ -296,7 +296,7 @@
// Number of Word requests per cycle {1, 2, 4, 8, ...}
`define LLNUMBER_REQUESTS (2*`NUMBER_CORES_PER_CLUSTER)
// Number of cycles to complete stage 1 (read from memory)
`define LLSTAGE_1_CYCLES 2
`define LLSTAGE_1_CYCLES 1
// Function ID
`define LLFUNC_ID 3
@ -322,12 +322,12 @@
// Dram Fill Req Queue Size
`define LLDFQQ_SIZE `LLREQQ_SIZE
// Lower Level Cache Hit Queue Size
`define LLLLVQ_SIZE 0
`define LLLLVQ_SIZE 16
// Fill Forward SNP Queue
`define LLFFSQ_SIZE 8
// Fill Invalidator Size {Fill invalidator must be active}
`define LLFILL_INVALIDAOR_SIZE 16
`define LLFILL_INVALIDAOR_SIZE 0
// Dram knobs
`define LLSIMULATED_DRAM_LATENCY_CYCLES 10
@ -348,7 +348,7 @@
// Number of Word requests per cycle {1, 2, 4, 8, ...}
`define L3NUMBER_REQUESTS (`NUMBER_CLUSTERS)
// Number of cycles to complete stage 1 (read from memory)
`define L3STAGE_1_CYCLES 2
`define L3STAGE_1_CYCLES 1
// Function ID
`define L3FUNC_ID 3
@ -379,7 +379,7 @@
`define L3FFSQ_SIZE 8
// Fill Invalidator Size {Fill invalidator must be active}
`define L3FILL_INVALIDAOR_SIZE 16
`define L3FILL_INVALIDAOR_SIZE 0
// Dram knobs
`define L3SIMULATED_DRAM_LATENCY_CYCLES 10

View file

@ -2,9 +2,9 @@
`ifndef VX_DEFINE_SYNTH
`define VX_DEFINE_SYNTH
`define NT 4
`define NT 8
`define NW 8
`define NUMBER_CORES_PER_CLUSTER 2
`define NUMBER_CORES_PER_CLUSTER 1
`define NUMBER_CLUSTERS 1
`define DCACHE_SIZE_BYTES 4096
`define ICACHE_SIZE_BYTES 1024

View file

@ -1,6 +1,6 @@
module VX_generic_register
#( parameter N = 1)
#( parameter N = 1, parameter Valid = 1)
(
input wire clk,
input wire reset,
@ -10,18 +10,26 @@ module VX_generic_register
output wire[(N-1):0] out
);
reg[(N-1):0] value;
if (Valid == 0) begin
always @(posedge clk or posedge reset) begin
if (reset) begin
value <= 0;
end else if (flush) begin
value <= 0;
end else if (~stall) begin
value <= in;
assign out = in;
end else begin
reg[(N-1):0] value;
always @(posedge clk or posedge reset) begin
if (reset) begin
value <= 0;
end else if (flush) begin
value <= 0;
end else if (~stall) begin
value <= in;
end
end
assign out = value;
end
assign out = value;
endmodule

View file

@ -13,10 +13,11 @@ module VX_gpgpu_inst (
wire is_split = (VX_gpu_inst_req.is_split);
wire[`NT_M1:0] tmc_new_mask;
wire all_threads = `NT < VX_gpu_inst_req.a_reg_data[0];
genvar curr_t;
generate
for (curr_t = 0; curr_t < `NT; curr_t=curr_t+1) begin : tmc_new_mask_init
assign tmc_new_mask[curr_t] = curr_t < VX_gpu_inst_req.a_reg_data[0];
assign tmc_new_mask[curr_t] = all_threads ? 1 : curr_t < VX_gpu_inst_req.a_reg_data[0];
end
endgenerate
@ -30,13 +31,14 @@ module VX_gpgpu_inst (
assign VX_warp_ctl.ebreak = VX_warp_ctl.change_mask && (VX_warp_ctl.thread_mask == 0);
wire wspawn = VX_gpu_inst_req.is_wspawn;
wire[31:0] wspawn_pc = VX_gpu_inst_req.rd2;
wire wspawn = VX_gpu_inst_req.is_wspawn;
wire[31:0] wspawn_pc = VX_gpu_inst_req.rd2;
wire all_active = `NW < VX_gpu_inst_req.a_reg_data[0];
wire[`NW-1:0] wspawn_new_active;
genvar curr_w;
generate
for (curr_w = 0; curr_w < `NW; curr_w=curr_w+1) begin : wspawn_new_active_init
assign wspawn_new_active[curr_w] = curr_w < VX_gpu_inst_req.a_reg_data[0];
assign wspawn_new_active[curr_w] = all_active ? 1 : curr_w < VX_gpu_inst_req.a_reg_data[0];
end
endgenerate

View file

@ -218,9 +218,11 @@ module VX_warp_scheduler (
// Lock/Release
if (scheduled_warp && !stall) begin
warp_lock[warp_num] <= 1'b1;
// warp_lock <= {`NW{1'b1}};
end
if (|icache_stage_valids && !stall) begin
warp_lock[icache_stage_wid] <= 1'b0;
// warp_lock <= {`NW{1'b0}};
end
end
@ -292,7 +294,7 @@ module VX_warp_scheduler (
assign hazard = (should_jal || should_bra) && schedule;
assign real_schedule = schedule && !warp_stalled[warp_to_schedule] && !total_barrier_stall[warp_to_schedule];
assign real_schedule = schedule && !warp_stalled[warp_to_schedule] && !total_barrier_stall[warp_to_schedule] && !warp_lock[0];
assign global_stall = (stall || wstall_this_cycle || hazard || !real_schedule || is_join);

View file

@ -1,9 +1,9 @@
COMP = riscv32-unknown-elf-gcc
COMP = /opt/riscv-new/drops/bin/riscv32-unknown-elf-gcc
CC_FLAGS = -march=rv32im -mabi=ilp32 -O0 -Wl,-Bstatic,-T,../vortex_link.ld -ffreestanding -nostdlib
DMP = riscv32-unknown-elf-objdump
CPY = riscv32-unknown-elf-objcopy
DMP = /opt/riscv-new/drops/bin/riscv32-unknown-elf-objdump
CPY = /opt/riscv-new/drops/bin/riscv32-unknown-elf-objcopy
NEWLIB = ../../newlib/newlib.c
@ -13,7 +13,7 @@ VX_IO = ../../io/vx_io.s ../../io/vx_io.c
VX_API = ../../vx_api/vx_api.c
VX_TEST = ../../tests/tests.c
VX_FIO = ../../fileio/fileio.s
LIBS = ../../../../riscv-gnu-toolchain/drops/riscv32-unknown-elf/lib/libc.a ../../../../riscv-gnu-toolchain/drops/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
LIBS = /opt/riscv-new/drops/riscv32-unknown-elf/lib/libc.a /opt/riscv-new/drops/riscv32-unknown-elf/lib/libstdc++.a -static-libgcc -lgcc
VX_MAIN = vx_simple_main

View file

@ -52,6 +52,18 @@ int main()
// Main is called with all threads of warp 0 active
vx_tmc(1);
vx_print_str("Let's start...\n");
unsigned what[36];
for (int i = 0; i < 36; i++)
{
what[i] = i;
}
for (int i = 0; i < 36; i++)
{
vx_printf("Value: ", what[i]);
}
vx_print_str("Simple Main\n");

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -20,22 +20,22 @@ _start:
# Initialize SP
# la sp, __stack_top
la a1, vx_set_sp
li a0, 32
li a0, 4
.word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN)
jal vx_set_sp
li a0, 1
.word 0x0005006b # tmc 1
# li a0, 1
# .word 0x0005006b # tmc 1
# Initialize global pointer
# call __cxx_global_var_init
# Clear the bss segment
la a0, _edata
la a2, _end
sub a2, a2, a0
li a1, 0
call memset
la a0, __libc_fini_array # Register global termination functions
call atexit # to be called upon exit
call __libc_init_array # Run global initialization functions
# la a0, _edata
# la a2, _end
# sub a2, a2, a0
# li a1, 0
# call memset
# la a0, __libc_fini_array # Register global termination functions
# call atexit # to be called upon exit
# call __libc_init_array # Run global initialization functions
# li a0, 4
# .word 0x0005006b # tmc 4
call main
@ -46,7 +46,7 @@ _start:
.type vx_set_sp, @function
.global vx_set_sp
vx_set_sp:
li a0, 32
li a0, 4
.word 0x0005006b # tmc 4
.option push
@ -55,7 +55,7 @@ vx_set_sp:
addi gp, gp, %pcrel_lo(1b)
.option pop
csrr a3, 0x21 # get wid
csrr a3, 0x22 # get wid
slli a3, a3, 0x1a # shift by wid
csrr a2, 0x20 # get tid
slli a1, a2, 10 # multiply tid by 1024

View file

@ -13,6 +13,7 @@ void test_tmc()
vx_tmc(4);
unsigned tid = vx_threadID(); // Get TID
tmc_array[tid] = tid;
vx_tmc(1);
@ -85,6 +86,7 @@ void simple_kernel()
wsapwn_arr[wid] = wid;
wid = vx_warpID();
if (wid != 0)
{
vx_tmc(0);