Added Split/Join - not tested

This commit is contained in:
felsabbagh3 2019-10-21 03:03:15 -04:00
parent 84f5ccb484
commit bab1852a99
12 changed files with 103 additions and 27 deletions

View file

@ -8,6 +8,7 @@ module VX_decode(
// Outputs
VX_frE_to_bckE_req_inter VX_frE_to_bckE_req,
VX_wstall_inter VX_wstall,
VX_join_inter VX_join,
output wire out_ebreak
);
@ -118,6 +119,11 @@ module VX_decode(
assign is_split = is_gpgpu && (func3 == 2); // Goes to BE
assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE
assign VX_join.is_join = is_join;
assign VX_join.join_warp_num = in_warp_num;
assign VX_frE_to_bckE_req.is_wspawn = is_wspawn;
assign VX_frE_to_bckE_req.is_tmc = is_tmc;
assign VX_frE_to_bckE_req.is_split = is_split;
@ -283,7 +289,7 @@ module VX_decode(
assign VX_frE_to_bckE_req.branch_type = temp_branch_type;
assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_join || is_barrier) && (|in_valid);
assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_barrier) && (|in_valid);
assign VX_wstall.warp_num = in_warp_num;
always @(*) begin

View file

@ -4,6 +4,7 @@
module VX_fetch (
input wire clk,
VX_wstall_inter VX_wstall,
VX_join_inter VX_join,
input wire schedule_delay,
VX_icache_response_inter icache_response,
VX_icache_request_inter icache_request,
@ -42,11 +43,16 @@ module VX_fetch (
.wstall (VX_wstall.wstall),
.wstall_warp_num(VX_wstall.warp_num),
// Join
.is_join (VX_join.is_join),
.join_warp_num (VX_join.join_warp_num),
// Split
.is_split (VX_warp_ctl.is_split),
.split_new_mask (VX_warp_ctl.split_new_mask),
.split_later_mask(VX_warp_ctl.split_later_mask),
.split_save_pc (VX_warp_ctl.split_save_pc),
.split_warp_num (VX_warp_ctl.warp_num),
// JAL
.jal (VX_jal_rsp.jal),

View file

@ -37,10 +37,12 @@ wire real_fetch_ebreak;
VX_wstall_inter VX_wstall();
VX_join_inter VX_join();
VX_fetch vx_fetch(
.clk (clk),
.VX_wstall (VX_wstall),
.VX_join (VX_join),
.schedule_delay (schedule_delay),
.VX_jal_rsp (VX_jal_rsp),
.icache_response (icache_response_fe),
@ -65,6 +67,7 @@ VX_decode vx_decode(
.fd_inst_meta_de (fd_inst_meta_de),
.VX_frE_to_bckE_req(VX_frE_to_bckE_req),
.VX_wstall (VX_wstall),
.VX_join (VX_join),
.out_ebreak (fetch_ebreak)
);

View file

@ -5,11 +5,11 @@ module VX_generic_stack
)
(
input wire clk,
input wire reset,
input wire push,
input wire pop,
input wire[WIDTH - 1:0] d,
output reg [WIDTH - 1:0] q,
input reg [WIDTH - 1:0] q1,
input reg [WIDTH - 1:0] q2,
output wire[WIDTH - 1:0] d
);
@ -17,24 +17,22 @@ module VX_generic_stack
reg [WIDTH - 1:0] stack [0:(1 << DEPTH) - 1];
always @(posedge clk) begin
if (reset)
ptr <= 0;
else if (push)
ptr <= ptr + 1;
// if (reset)
// ptr <= 0;
// else
if (push)
ptr <= ptr + 2;
else if (pop)
ptr <= ptr - 1;
end
always @(posedge clk) begin
if (push) begin
if(push)
stack[ptr] <= q;
stack[ptr] <= q1;
stack[ptr+1] <= q2;
end
end
always @(*) begin
if (pop)
q <= stack[ptr - 1];
end
assign d = stack[ptr - 1];
endmodule

View file

@ -32,7 +32,7 @@ module VX_gpgpu_inst (
// VX_gpu_inst_req.pc
genvar curr_s_t;
for (curr_s_t = 0; curr_s_t < `NT; curr_s_t=curr_s_t+1) begin
wire curr_bool = (VX_gpu_inst_req.a_reg_data == 32'b1);
wire curr_bool = (VX_gpu_inst_req.a_reg_data[curr_s_t] == 32'b1);
assign split_new_use_mask[curr_s_t] = VX_gpu_inst_req.valid[curr_s_t] & (curr_bool);
assign split_new_later_mask[curr_s_t] = VX_gpu_inst_req.valid[curr_s_t] & (!curr_bool);
@ -43,7 +43,7 @@ module VX_gpgpu_inst (
always @(*) begin
num_valids = 0;
for (z = 0; z < `NT; z=z+1) begin
if (VX_gpu_inst_req.valid) num_valids = num_valids + 1
if (VX_gpu_inst_req.valid[z]) num_valids = num_valids + 1;
end
end
@ -51,6 +51,7 @@ module VX_gpgpu_inst (
assign VX_warp_ctl.split_new_mask = split_new_use_mask;
assign VX_warp_ctl.split_later_mask = split_new_later_mask;
assign VX_warp_ctl.split_save_pc = VX_gpu_inst_req.pc_next;
assign VX_warp_ctl.split_warp_num = VX_gpu_inst_req.warp_num;
// VX_gpu_inst_req.is_wspawn
// VX_gpu_inst_req.is_split

View file

@ -21,10 +21,15 @@ module VX_warp_scheduler (
input wire[`NW_M1:0] wstall_warp_num,
// Split
input wire is_split,
input wire[`NT_M1:0] split_new_mask,
input wire[`NT_M1:0] split_later_mask,
input wire[31:0] split_save_pc,
input wire is_split,
input wire[`NT_M1:0] split_new_mask,
input wire[`NT_M1:0] split_later_mask,
input wire[31:0] split_save_pc,
input wire[`NW_M1:0] split_warp_num,
// Join
input wire is_join,
input wire[`NW_M1:0] join_warp_num,
// JAL
input wire jal,
@ -114,9 +119,20 @@ module VX_warp_scheduler (
visible_active[wstall_warp_num] <= 0;
end
if (is_split) begin
warp_stalled[split_warp_num] <= 0;
thread_masks[split_warp_num] <= split_new_mask;
end
if (is_join) begin
if (!join_fall) begin
warp_pcs[join_warp_num] <= join_pc;
end
thread_masks[join_warp_num] <= join_tm;
end
// Refilling active warps
if ((visible_active == 0) && !(stall || wstall || hazard)) begin
// if ((num_active <= 1) && !(globa)) begin
if ((visible_active == 0) && !(stall || wstall || hazard || is_join)) begin
visible_active <= warp_active & (~warp_stalled);
end
@ -145,8 +161,36 @@ module VX_warp_scheduler (
end
end
wire[(1+32+`NT_M1):0] q1 = {1'b1, warp_pcs[split_warp_num], thread_masks[split_warp_num]};
wire[(1+32+`NT_M1):0] q2 = {1'b0, split_save_pc , split_later_mask};
wire[(1+32+`NT_M1):0] d;
wire join_fall;
wire[31:0] join_pc;
wire[`NT_M1:0] join_tm;
assign {join_fall, join_pc, join_tm} = d;
genvar curr_warp;
for (curr_warp = 0; curr_warp < `NW; curr_warp = curr_warp + 1) begin
wire correct_warp_s = (curr_warp == split_warp_num);
wire correct_warp_j = (curr_warp == join_warp_num);
wire push = is_split && correct_warp_s;
wire pop = is_join && correct_warp_j;
VX_generic_stack #(.WIDTH(1+32+`NT), .DEPTH($clog2(`NT))) ipdom_stack(
.clk (clk),
.push (push),
.pop (pop),
.d (d),
.q1 (q1),
.q2 (q2)
);
end
// wire should_stall = stall || (jal && (warp_to_schedule == jal_warp_num)) || (branch_dir && (warp_to_schedule == branch_warp_num));
@ -157,7 +201,7 @@ module VX_warp_scheduler (
assign real_schedule = schedule && !warp_stalled[warp_to_schedule];
assign global_stall = (stall || wstall || hazard || !real_schedule);
assign global_stall = (stall || wstall || hazard || !real_schedule || is_join);
assign warp_pc = warp_pcs[warp_to_schedule];

View file

@ -13,7 +13,7 @@ interface VX_gpu_inst_req_inter();
wire is_split;
wire is_barrier;
wire pc_next;
wire[31:0] pc_next;
wire[`NT_M1:0][31:0] a_reg_data;
wire[31:0] rd2;

View file

@ -0,0 +1,17 @@
`include "../VX_define.v"

`ifndef VX_JOIN_INTER
`define VX_JOIN_INTER

// VX_join_inter
//
// Two-signal bundle describing a decoded "join" instruction.
// Within this change set, VX_decode assigns both members
// (is_join from its gpgpu/func3 decode, join_warp_num from in_warp_num),
// and VX_fetch connects them to the is_join / join_warp_num ports of the
// instance it wraps.
//
// No modports are declared, so direction is not enforced by the interface.
interface VX_join_inter ();

	// Warp number of the instruction that raised is_join;
	// width is set by the project-wide `NW_M1 macro.
	wire [`NW_M1:0] join_warp_num;

	// Asserted when the instruction currently being decoded is a join.
	wire            is_join;

endinterface

`endif

View file

@ -18,6 +18,7 @@ interface VX_warp_ctl_inter ();
wire is_split;
wire[`NW_M1:0] split_warp_num;
wire[`NT_M1:0] split_new_mask;
wire[`NT_M1:0] split_later_mask;
wire[31:0] split_save_pc;

View file

@ -3,5 +3,5 @@
# of forwarding stalls: 0
# of branch stalls: 0
# CPI: 2.08333
# time to simulate: 6.95312e-310 milliseconds
# time to simulate: 0 milliseconds
# GRADE: Failed on test: 4294967295

View file

@ -1 +1 @@
#define VCD_OUTPUT
#define VCD_OFF

View file

@ -3,7 +3,7 @@ set link_library [concat * sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_
set symbol_library {}
set target_library [concat sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_m40c.db]
set verilog_files [ list VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
set verilog_files [ list VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
]
analyze -format sverilog $verilog_files