mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 05:17:45 -04:00
Added Split/Join - not tested
This commit is contained in:
parent
84f5ccb484
commit
bab1852a99
12 changed files with 103 additions and 27 deletions
|
@ -8,6 +8,7 @@ module VX_decode(
|
|||
// Outputs
|
||||
VX_frE_to_bckE_req_inter VX_frE_to_bckE_req,
|
||||
VX_wstall_inter VX_wstall,
|
||||
VX_join_inter VX_join,
|
||||
output wire out_ebreak
|
||||
|
||||
);
|
||||
|
@ -118,6 +119,11 @@ module VX_decode(
|
|||
assign is_split = is_gpgpu && (func3 == 2); // Goes to BE
|
||||
assign is_join = is_gpgpu && (func3 == 3); // Doesn't go to BE
|
||||
|
||||
|
||||
assign VX_join.is_join = is_join;
|
||||
assign VX_join.join_warp_num = in_warp_num;
|
||||
|
||||
|
||||
assign VX_frE_to_bckE_req.is_wspawn = is_wspawn;
|
||||
assign VX_frE_to_bckE_req.is_tmc = is_tmc;
|
||||
assign VX_frE_to_bckE_req.is_split = is_split;
|
||||
|
@ -283,7 +289,7 @@ module VX_decode(
|
|||
|
||||
assign VX_frE_to_bckE_req.branch_type = temp_branch_type;
|
||||
|
||||
assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_join || is_barrier) && (|in_valid);
|
||||
assign VX_wstall.wstall = (temp_branch_stall || is_tmc || is_split || is_barrier) && (|in_valid);
|
||||
assign VX_wstall.warp_num = in_warp_num;
|
||||
|
||||
always @(*) begin
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
module VX_fetch (
|
||||
input wire clk,
|
||||
VX_wstall_inter VX_wstall,
|
||||
VX_join_inter VX_join,
|
||||
input wire schedule_delay,
|
||||
VX_icache_response_inter icache_response,
|
||||
VX_icache_request_inter icache_request,
|
||||
|
@ -42,11 +43,16 @@ module VX_fetch (
|
|||
.wstall (VX_wstall.wstall),
|
||||
.wstall_warp_num(VX_wstall.warp_num),
|
||||
|
||||
// Join
|
||||
.is_join (VX_join.is_join),
|
||||
.join_warp_num (VX_join.join_warp_num),
|
||||
|
||||
// Split
|
||||
.is_split (VX_warp_ctl.is_split),
|
||||
.split_new_mask (VX_warp_ctl.split_new_mask),
|
||||
.split_later_mask(VX_warp_ctl.split_later_mask),
|
||||
.split_save_pc (VX_warp_ctl.split_save_pc),
|
||||
.split_warp_num (VX_warp_ctl.warp_num),
|
||||
|
||||
// JAL
|
||||
.jal (VX_jal_rsp.jal),
|
||||
|
|
|
@ -37,10 +37,12 @@ wire real_fetch_ebreak;
|
|||
|
||||
|
||||
VX_wstall_inter VX_wstall();
|
||||
VX_join_inter VX_join();
|
||||
|
||||
VX_fetch vx_fetch(
|
||||
.clk (clk),
|
||||
.VX_wstall (VX_wstall),
|
||||
.VX_join (VX_join),
|
||||
.schedule_delay (schedule_delay),
|
||||
.VX_jal_rsp (VX_jal_rsp),
|
||||
.icache_response (icache_response_fe),
|
||||
|
@ -65,6 +67,7 @@ VX_decode vx_decode(
|
|||
.fd_inst_meta_de (fd_inst_meta_de),
|
||||
.VX_frE_to_bckE_req(VX_frE_to_bckE_req),
|
||||
.VX_wstall (VX_wstall),
|
||||
.VX_join (VX_join),
|
||||
.out_ebreak (fetch_ebreak)
|
||||
);
|
||||
|
||||
|
|
|
@ -5,11 +5,11 @@ module VX_generic_stack
|
|||
)
|
||||
(
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire push,
|
||||
input wire pop,
|
||||
input wire[WIDTH - 1:0] d,
|
||||
output reg [WIDTH - 1:0] q,
|
||||
input reg [WIDTH - 1:0] q1,
|
||||
input reg [WIDTH - 1:0] q2,
|
||||
output wire[WIDTH - 1:0] d
|
||||
);
|
||||
|
||||
|
||||
|
@ -17,24 +17,22 @@ module VX_generic_stack
|
|||
reg [WIDTH - 1:0] stack [0:(1 << DEPTH) - 1];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset)
|
||||
ptr <= 0;
|
||||
else if (push)
|
||||
ptr <= ptr + 1;
|
||||
// if (reset)
|
||||
// ptr <= 0;
|
||||
// else
|
||||
if (push)
|
||||
ptr <= ptr + 2;
|
||||
else if (pop)
|
||||
ptr <= ptr - 1;
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (push) begin
|
||||
if(push)
|
||||
stack[ptr] <= q;
|
||||
stack[ptr] <= q1;
|
||||
stack[ptr+1] <= q2;
|
||||
end
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
if (pop)
|
||||
q <= stack[ptr - 1];
|
||||
end
|
||||
assign d = stack[ptr - 1];
|
||||
|
||||
endmodule
|
|
@ -32,7 +32,7 @@ module VX_gpgpu_inst (
|
|||
// VX_gpu_inst_req.pc
|
||||
genvar curr_s_t;
|
||||
for (curr_s_t = 0; curr_s_t < `NT; curr_s_t=curr_s_t+1) begin
|
||||
wire curr_bool = (VX_gpu_inst_req.a_reg_data == 32'b1);
|
||||
wire curr_bool = (VX_gpu_inst_req.a_reg_data[curr_s_t] == 32'b1);
|
||||
|
||||
assign split_new_use_mask[curr_s_t] = VX_gpu_inst_req.valid[curr_s_t] & (curr_bool);
|
||||
assign split_new_later_mask[curr_s_t] = VX_gpu_inst_req.valid[curr_s_t] & (!curr_bool);
|
||||
|
@ -43,7 +43,7 @@ module VX_gpgpu_inst (
|
|||
always @(*) begin
|
||||
num_valids = 0;
|
||||
for (z = 0; z < `NT; z=z+1) begin
|
||||
if (VX_gpu_inst_req.valid) num_valids = num_valids + 1
|
||||
if (VX_gpu_inst_req.valid[z]) num_valids = num_valids + 1;
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -51,6 +51,7 @@ module VX_gpgpu_inst (
|
|||
assign VX_warp_ctl.split_new_mask = split_new_use_mask;
|
||||
assign VX_warp_ctl.split_later_mask = split_new_later_mask;
|
||||
assign VX_warp_ctl.split_save_pc = VX_gpu_inst_req.pc_next;
|
||||
assign VX_warp_ctl.split_warp_num = VX_gpu_inst_req.warp_num;
|
||||
|
||||
// VX_gpu_inst_req.is_wspawn
|
||||
// VX_gpu_inst_req.is_split
|
||||
|
|
|
@ -21,10 +21,15 @@ module VX_warp_scheduler (
|
|||
input wire[`NW_M1:0] wstall_warp_num,
|
||||
|
||||
// Split
|
||||
input wire is_split,
|
||||
input wire[`NT_M1:0] split_new_mask,
|
||||
input wire[`NT_M1:0] split_later_mask,
|
||||
input wire[31:0] split_save_pc,
|
||||
input wire is_split,
|
||||
input wire[`NT_M1:0] split_new_mask,
|
||||
input wire[`NT_M1:0] split_later_mask,
|
||||
input wire[31:0] split_save_pc,
|
||||
input wire[`NW_M1:0] split_warp_num,
|
||||
|
||||
// Join
|
||||
input wire is_join,
|
||||
input wire[`NW_M1:0] join_warp_num,
|
||||
|
||||
// JAL
|
||||
input wire jal,
|
||||
|
@ -114,9 +119,20 @@ module VX_warp_scheduler (
|
|||
visible_active[wstall_warp_num] <= 0;
|
||||
end
|
||||
|
||||
if (is_split) begin
|
||||
warp_stalled[split_warp_num] <= 0;
|
||||
thread_masks[split_warp_num] <= split_new_mask;
|
||||
end
|
||||
|
||||
if (is_join) begin
|
||||
if (!join_fall) begin
|
||||
warp_pcs[join_warp_num] <= join_pc;
|
||||
end
|
||||
thread_masks[join_warp_num] <= join_tm;
|
||||
end
|
||||
|
||||
// Refilling active warps
|
||||
if ((visible_active == 0) && !(stall || wstall || hazard)) begin
|
||||
// if ((num_active <= 1) && !(globa)) begin
|
||||
if ((visible_active == 0) && !(stall || wstall || hazard || is_join)) begin
|
||||
visible_active <= warp_active & (~warp_stalled);
|
||||
end
|
||||
|
||||
|
@ -145,8 +161,36 @@ module VX_warp_scheduler (
|
|||
end
|
||||
end
|
||||
|
||||
wire[(1+32+`NT_M1):0] q1 = {1'b1, warp_pcs[split_warp_num], thread_masks[split_warp_num]};
|
||||
wire[(1+32+`NT_M1):0] q2 = {1'b0, split_save_pc , split_later_mask};
|
||||
|
||||
|
||||
wire[(1+32+`NT_M1):0] d;
|
||||
|
||||
wire join_fall;
|
||||
wire[31:0] join_pc;
|
||||
wire[`NT_M1:0] join_tm;
|
||||
|
||||
assign {join_fall, join_pc, join_tm} = d;
|
||||
|
||||
|
||||
|
||||
genvar curr_warp;
|
||||
for (curr_warp = 0; curr_warp < `NW; curr_warp = curr_warp + 1) begin
|
||||
wire correct_warp_s = (curr_warp == split_warp_num);
|
||||
wire correct_warp_j = (curr_warp == join_warp_num);
|
||||
|
||||
wire push = is_split && correct_warp_s;
|
||||
wire pop = is_join && correct_warp_j;
|
||||
VX_generic_stack #(.WIDTH(1+32+`NT), .DEPTH($clog2(`NT))) ipdom_stack(
|
||||
.clk (clk),
|
||||
.push (push),
|
||||
.pop (pop),
|
||||
.d (d),
|
||||
.q1 (q1),
|
||||
.q2 (q2)
|
||||
);
|
||||
end
|
||||
|
||||
// wire should_stall = stall || (jal && (warp_to_schedule == jal_warp_num)) || (branch_dir && (warp_to_schedule == branch_warp_num));
|
||||
|
||||
|
@ -157,7 +201,7 @@ module VX_warp_scheduler (
|
|||
|
||||
assign real_schedule = schedule && !warp_stalled[warp_to_schedule];
|
||||
|
||||
assign global_stall = (stall || wstall || hazard || !real_schedule);
|
||||
assign global_stall = (stall || wstall || hazard || !real_schedule || is_join);
|
||||
|
||||
|
||||
assign warp_pc = warp_pcs[warp_to_schedule];
|
||||
|
|
|
@ -13,7 +13,7 @@ interface VX_gpu_inst_req_inter();
|
|||
wire is_split;
|
||||
wire is_barrier;
|
||||
|
||||
wire pc_next;
|
||||
wire[31:0] pc_next;
|
||||
|
||||
wire[`NT_M1:0][31:0] a_reg_data;
|
||||
wire[31:0] rd2;
|
||||
|
|
17
rtl/interfaces/VX_join_inter.v
Normal file
17
rtl/interfaces/VX_join_inter.v
Normal file
|
@ -0,0 +1,17 @@
|
|||
|
||||
`include "../VX_define.v"
|
||||
|
||||
`ifndef VX_JOIN_INTER
|
||||
|
||||
`define VX_JOIN_INTER
|
||||
|
||||
// Join notification bundle: a flag marking a join instruction plus the number of the warp it belongs to.
// In this commit it is driven by VX_decode and forwarded by VX_fetch to the warp scheduler's is_join / join_warp_num ports.
interface VX_join_inter ();
|
||||
|
||||
wire is_join; // set by VX_decode when the instruction is a GPGPU op with func3 == 3
|
||||
wire[`NW_M1:0] join_warp_num; // warp number of that instruction (VX_decode assigns in_warp_num)
|
||||
|
||||
|
||||
endinterface
|
||||
|
||||
|
||||
`endif
|
|
@ -18,6 +18,7 @@ interface VX_warp_ctl_inter ();
|
|||
|
||||
|
||||
wire is_split;
|
||||
wire[`NW_M1:0] split_warp_num;
|
||||
wire[`NT_M1:0] split_new_mask;
|
||||
wire[`NT_M1:0] split_later_mask;
|
||||
wire[31:0] split_save_pc;
|
||||
|
|
|
@ -3,5 +3,5 @@
|
|||
# of forwarding stalls: 0
|
||||
# of branch stalls: 0
|
||||
# CPI: 2.08333
|
||||
# time to simulate: 6.95312e-310 milliseconds
|
||||
# time to simulate: 0 milliseconds
|
||||
# GRADE: Failed on test: 4294967295
|
||||
|
|
|
@ -1 +1 @@
|
|||
#define VCD_OUTPUT
|
||||
#define VCD_OFF
|
||||
|
|
|
@ -3,7 +3,7 @@ set link_library [concat * sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_
|
|||
set symbol_library {}
|
||||
set target_library [concat sc12mc_cln28hpm_base_ulvt_c35_ssg_typical_max_0p81v_m40c.db]
|
||||
|
||||
set verilog_files [ list VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
|
||||
set verilog_files [ list VX_join_inter.v VX_csr_wrapper.v VX_csr_req_inter.v VX_csr_wb_inter.v VX_gpgpu_inst.v VX_gpu_inst_req_inter.v VX_wstall_inter.v VX_inst_exec_wb_inter.v VX_lsu.v VX_execute_unit.v VX_lsu_addr_gen.v VX_inst_multiplex.v VX_exec_unit_req_inter.v VX_lsu_req_inter.v VX_alu.v VX_back_end.v VX_gpr_stage.v VX_gpr_data_inter.v VX_csr_handler.v VX_decode.v VX_define.v VX_scheduler.v VX_fetch.v VX_front_end.v VX_generic_register.v VX_gpr.v VX_gpr_wrapper.v VX_one_counter.v VX_priority_encoder.v VX_warp.v VX_warp_scheduler.v VX_writeback.v Vortex.v byte_enabled_simple_dual_port_ram.v VX_branch_response_inter.v VX_dcache_request_inter.v VX_dcache_response_inter.v VX_frE_to_bckE_req_inter.v VX_gpr_clone_inter.v VX_gpr_jal_inter.v VX_gpr_read_inter.v VX_gpr_wspawn_inter.v VX_icache_request_inter.v VX_icache_response_inter.v VX_inst_mem_wb_inter.v VX_inst_meta_inter.v VX_jal_response_inter.v VX_mem_req_inter.v VX_mw_wb_inter.v VX_warp_ctl_inter.v VX_wb_inter.v VX_d_e_reg.v VX_f_d_reg.v \
|
||||
]
|
||||
|
||||
analyze -format sverilog $verilog_files
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue