Support multi-cycle execution for div/mul

This commit is contained in:
wgulian3 2020-02-13 13:17:46 -05:00
parent c1bd731d7f
commit 8318aff69f
8 changed files with 100 additions and 58 deletions

View file

@ -3,7 +3,7 @@ all: RUNFILE
# /rf2_256x128_wm1/
BaseMEM=../models/memory/cln28hpm
INCLUDE=-I. -Ishared_memory -Icache -I$(BaseMEM)/rf2_128x128_wm1/ -I$(BaseMEM)/rf2_256x128_wm1/ -I$(BaseMEM)/rf2_256x19_wm0/ -I$(BaseMEM)/rf2_32x128_wm1/ -Iinterfaces/ -Ipipe_regs/ -Isimulate
INCLUDE=-I. -Ishared_memory -Icache -I$(BaseMEM)/rf2_128x128_wm1/ -I$(BaseMEM)/rf2_256x128_wm1/ -I$(BaseMEM)/rf2_256x19_wm0/ -I$(BaseMEM)/rf2_32x128_wm1/ -Iinterfaces/ -Ipipe_regs/ -Icompat/ -Isimulate
FILE=Vortex.v
@ -49,4 +49,4 @@ w: VERILATORnoWarnings
$(MAKECPP)
clean:
rm obj_dir/*
rm obj_dir/*

View file

@ -1,6 +1,8 @@
`include "VX_define.v"
module VX_alu(
input wire clk,
input wire reset,
input wire[31:0] in_1,
input wire[31:0] in_2,
input wire in_rs2_src,
@ -8,9 +10,11 @@ module VX_alu(
input wire[19:0] in_upper_immed,
input wire[4:0] in_alu_op,
input wire[31:0] in_curr_PC,
output reg[31:0] out_alu_result
output reg[31:0] out_alu_result,
output reg out_alu_stall
);
localparam div_pipeline_len = 3;
`ifdef SYN_FUNC
wire which_in2;
@ -25,23 +29,25 @@ module VX_alu(
wire[31:0] signed_div_result;
wire[31:0] signed_rem_result;
reg [15:0] inst_delay;
reg [15:0] inst_delay_count;
assign out_alu_stall = inst_delay != 0 || inst_delay_count != 0;
assign which_in2 = in_rs2_src == `RS2_IMMED;
assign ALU_in1 = in_1;
assign ALU_in2 = which_in2 ? in_itype_immed : in_2;
assign upper_immed = {in_upper_immed, {12{1'b0}}};
VX_divide #(
.WIDTHN(32),
.WIDTHD(32),
.SPEED("HIGHEST"),
.PIPELINE(0)
.PIPELINE(div_pipeline_len)
) unsigned_div (
.clk(0),
.clock(clk),
.aclr(0),
.clken(1), // TODO this could be disabled on inactive instructions
.numer(ALU_in1),
@ -56,9 +62,9 @@ module VX_alu(
.NREP("SIGNED"),
.DREP("SIGNED"),
.SPEED("HIGHEST"),
.PIPELINE(0)
.PIPELINE(div_pipeline_len)
) signed_div (
.clk(0),
.clock(clk),
.aclr(0),
.clken(1), // TODO this could be disabled on inactive instructions
.numer(ALU_in1),
@ -101,6 +107,7 @@ module VX_alu(
`MULH: out_alu_result = mult_result[63:32];
`MULHSU: out_alu_result = mult_result[63:32];
`MULHU: out_alu_result = mult_result[63:32];
// TODO: would it be profitable to fold these divide-by-zero special cases into inst_delay, so the divider pipeline delay is skipped when possible?
`DIV: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : signed_div_result;
`DIVU: out_alu_result = (ALU_in2 == 0) ? 32'hffffffff : unsigned_div_result;
`REM: out_alu_result = (ALU_in2 == 0) ? ALU_in1 : signed_rem_result;
@ -109,6 +116,25 @@ module VX_alu(
endcase // in_alu_op
end
// Latency lookup for the opcode currently presented on in_alu_op.
// The divide/remainder opcodes request div_pipeline_len cycles of delay;
// every other opcode requests none.
always @(*) begin
    inst_delay = 0; // default: no extra delay requested
    case (in_alu_op)
        `DIV, `DIVU, `REM, `REMU: inst_delay = div_pipeline_len;
        default: ; // keep the zero assigned above
    endcase // in_alu_op
end
// Countdown register for multi-cycle operations.
//   - reset (asynchronous, active high) clears the counter.
//   - While the counter is nonzero it decrements by one each clock.
//   - When the counter is zero and the current opcode requests a delay
//     (inst_delay != 0), the counter is loaded with inst_delay - 1.
//   - Otherwise the counter holds its value.
// NOTE(review): out_alu_stall is driven elsewhere in this module as
// (inst_delay != 0 || inst_delay_count != 0). inst_delay stays nonzero for
// as long as a div/rem opcode is held on in_alu_op, so it looks like the
// stall cannot deassert once this counter reaches zero (and the counter is
// then reloaded) -- confirm against the exec-stage register's stall
// behaviour.
always @(posedge clk or posedge reset) begin
    if (reset) begin
        inst_delay_count <= 0;
    end else begin
        if (inst_delay_count != 0)
            inst_delay_count <= inst_delay_count - 1; // operation in flight
        else if (inst_delay != 0)
            inst_delay_count <= inst_delay - 1;       // start a new countdown
    end
end
`else
wire which_in2;
@ -169,4 +195,4 @@ module VX_alu(
end
`endif
endmodule
endmodule : VX_alu

View file

@ -6,6 +6,7 @@ module VX_back_end (
input wire schedule_delay,
output wire out_mem_delay,
output wire out_exec_delay,
output wire gpr_stage_delay,
VX_jal_response_inter VX_jal_rsp,
VX_branch_response_inter VX_branch_rsp,
@ -32,7 +33,7 @@ assign VX_writeback_inter.wb_warp_num = VX_writeback_temp.wb_warp_num;
VX_mw_wb_inter VX_mw_wb();
wire no_slot_mem;
wire no_slot_mem, no_slot_exec;
VX_mem_req_inter VX_exe_mem_req();
@ -69,6 +70,7 @@ VX_gpr_stage VX_gpr_stage(
.VX_csr_req (VX_csr_req),
// End new
.memory_delay (out_mem_delay),
.exec_delay (out_exec_delay),
.gpr_stage_delay (gpr_stage_delay)
);
@ -91,7 +93,9 @@ VX_execute_unit VX_execUnit(
.VX_exec_unit_req(VX_exec_unit_req),
.VX_inst_exec_wb (VX_inst_exec_wb),
.VX_jal_rsp (VX_jal_rsp),
.VX_branch_rsp (VX_branch_rsp)
.VX_branch_rsp (VX_branch_rsp),
.out_delay (out_exec_delay),
.no_slot_exec (no_slot_exec)
);
@ -113,7 +117,8 @@ VX_writeback VX_wb(
.VX_csr_wb (VX_csr_wb),
.VX_writeback_inter(VX_writeback_temp),
.no_slot_mem (no_slot_mem)
.no_slot_mem (no_slot_mem),
.no_slot_exec (no_slot_exec)
);
endmodule

View file

@ -12,7 +12,10 @@ module VX_execute_unit (
// JAL Response
VX_jal_response_inter VX_jal_rsp,
// Branch Response
VX_branch_response_inter VX_branch_rsp
VX_branch_response_inter VX_branch_rsp,
input wire no_slot_exec,
output wire out_delay
);
@ -41,10 +44,13 @@ module VX_execute_unit (
wire[`NT_M1:0][31:0] alu_result;
wire[`NT_M1:0] alu_stall;
genvar index_out_reg;
generate
for (index_out_reg = 0; index_out_reg < `NT; index_out_reg = index_out_reg + 1) begin : alu_defs
VX_alu vx_alu(
.clk(clk),
.reset(reset),
// .in_reg_data (in_reg_data[1:0]),
.in_1 (in_a_reg_data[index_out_reg]),
.in_2 (in_b_reg_data[index_out_reg]),
@ -53,11 +59,17 @@ module VX_execute_unit (
.in_upper_immed(in_upper_immed),
.in_alu_op (in_alu_op),
.in_curr_PC (in_curr_PC),
.out_alu_result(alu_result[index_out_reg])
.out_alu_result(alu_result[index_out_reg]),
.out_alu_stall(alu_stall[index_out_reg])
);
end
endgenerate
wire internal_stall;
assign internal_stall = |alu_stall;
assign out_delay = no_slot_exec || internal_stall;
wire [$clog2(`NT)-1:0] jal_branch_use_index;
wire jal_branch_found_valid;
@ -103,7 +115,7 @@ module VX_execute_unit (
// Actual Writeback
assign VX_inst_exec_wb.rd = VX_exec_unit_req.rd;
assign VX_inst_exec_wb.wb = VX_exec_unit_req.wb;
assign VX_inst_exec_wb.wb_valid = VX_exec_unit_req.valid;
assign VX_inst_exec_wb.wb_valid = VX_exec_unit_req.valid && !internal_stall;
assign VX_inst_exec_wb.wb_warp_num = VX_exec_unit_req.warp_num;
assign VX_inst_exec_wb.alu_result = VX_exec_unit_req.jal ? duplicate_PC_data : alu_result;
@ -163,4 +175,4 @@ module VX_execute_unit (
// assign out_is_csr = VX_exec_unit_req.is_csr;
// assign out_csr_address = VX_exec_unit_req.csr_address;
endmodule
endmodule : VX_execute_unit

View file

@ -7,6 +7,7 @@ module VX_gpr_stage (
input wire schedule_delay,
input wire memory_delay,
input wire exec_delay,
output wire gpr_stage_delay,
// inputs
@ -93,7 +94,10 @@ module VX_gpr_stage (
wire stall_lsu = memory_delay;
wire flush_lsu = schedule_delay && !stall_lsu;
assign gpr_stage_delay = stall_lsu;
wire stall_exec = exec_delay;
wire flush_exec = schedule_delay && !stall_exec;
assign gpr_stage_delay = stall_lsu || stall_exec;
`ifdef ASIC
wire delayed_lsu_last_cycle;
@ -145,8 +149,8 @@ module VX_gpr_stage (
VX_generic_register #(.N(224 + `NW_M1 + 1 + (`NT))) exec_unit_reg(
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.stall(stall_exec),
.flush(flush_exec),
.in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}),
.out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask })
);
@ -193,8 +197,8 @@ module VX_gpr_stage (
VX_generic_register #(.N(224 + `NW_M1 + 1 + 65*(`NT))) exec_unit_reg(
.clk (clk),
.reset(reset),
.stall(stall_rest),
.flush(flush_rest),
.stall(stall_exec),
.flush(flush_exec),
.in ({VX_exec_unit_req_temp.valid, VX_exec_unit_req_temp.warp_num, VX_exec_unit_req_temp.curr_PC, VX_exec_unit_req_temp.PC_next, VX_exec_unit_req_temp.rd, VX_exec_unit_req_temp.wb, VX_exec_unit_req_temp.a_reg_data, VX_exec_unit_req_temp.b_reg_data, VX_exec_unit_req_temp.alu_op, VX_exec_unit_req_temp.rs1, VX_exec_unit_req_temp.rs2, VX_exec_unit_req_temp.rs2_src, VX_exec_unit_req_temp.itype_immed, VX_exec_unit_req_temp.upper_immed, VX_exec_unit_req_temp.branch_type, VX_exec_unit_req_temp.jalQual, VX_exec_unit_req_temp.jal, VX_exec_unit_req_temp.jal_offset, VX_exec_unit_req_temp.ebreak, VX_exec_unit_req_temp.wspawn, VX_exec_unit_req_temp.is_csr, VX_exec_unit_req_temp.csr_address, VX_exec_unit_req_temp.csr_immed, VX_exec_unit_req_temp.csr_mask}),
.out ({VX_exec_unit_req.valid , VX_exec_unit_req.warp_num , VX_exec_unit_req.curr_PC , VX_exec_unit_req.PC_next , VX_exec_unit_req.rd , VX_exec_unit_req.wb , VX_exec_unit_req.a_reg_data , VX_exec_unit_req.b_reg_data , VX_exec_unit_req.alu_op , VX_exec_unit_req.rs1 , VX_exec_unit_req.rs2 , VX_exec_unit_req.rs2_src , VX_exec_unit_req.itype_immed , VX_exec_unit_req.upper_immed , VX_exec_unit_req.branch_type , VX_exec_unit_req.jalQual , VX_exec_unit_req.jal , VX_exec_unit_req.jal_offset , VX_exec_unit_req.ebreak , VX_exec_unit_req.wspawn , VX_exec_unit_req.is_csr , VX_exec_unit_req.csr_address , VX_exec_unit_req.csr_immed , VX_exec_unit_req.csr_mask })
);
@ -219,4 +223,4 @@ module VX_gpr_stage (
`endif
endmodule
endmodule : VX_gpr_stage

View file

@ -6,6 +6,7 @@ module VX_scheduler (
input wire clk,
input wire reset,
input wire memory_delay,
input wire exec_delay,
input wire gpr_stage_delay,
VX_frE_to_bckE_req_inter VX_bckE_req,
VX_wb_inter VX_writeback_inter,
@ -27,7 +28,11 @@ module VX_scheduler (
wire is_store = (VX_bckE_req.mem_write != `NO_MEM_WRITE);
wire is_load = (VX_bckE_req.mem_read != `NO_MEM_READ);
// classify our next instruction.
wire is_mem = is_store || is_load;
wire is_gpu = (VX_bckE_req.is_wspawn || VX_bckE_req.is_tmc || VX_bckE_req.is_barrier || VX_bckE_req.is_split);
wire is_csr = VX_bckE_req.is_csr;
wire is_exec = !is_mem && !is_gpu && !is_csr;
wire rs1_pass = ((valid_wb && (VX_writeback_inter.rd == VX_bckE_req.rs1)));
@ -44,8 +49,10 @@ module VX_scheduler (
wire rename_valid = rs1_rename_qual || rs2_rename_qual ;
assign schedule_delay = ((rename_valid) && (|VX_bckE_req.valid)) || (memory_delay && (is_mem)) || (gpr_stage_delay && is_mem);
assign schedule_delay = ((rename_valid) && (|VX_bckE_req.valid))
|| (memory_delay && is_mem)
|| (gpr_stage_delay && (is_mem || is_exec))
|| (exec_delay && is_exec);
integer i;
integer w;

View file

@ -14,10 +14,10 @@ module VX_writeback (
// Actual WB to GPR
VX_wb_inter VX_writeback_inter,
output wire no_slot_mem
output wire no_slot_mem,
output wire no_slot_exec
);
VX_wb_inter VX_writeback_tempp();
wire exec_wb = (VX_inst_exec_wb.wb != 0) && (|VX_inst_exec_wb.wb_valid);
@ -25,38 +25,39 @@ module VX_writeback (
wire csr_wb = (VX_csr_wb.wb != 0) && (|VX_csr_wb.valid);
assign no_slot_mem = mem_wb && (exec_wb || csr_wb);
assign no_slot_mem = mem_wb && (exec_wb || csr_wb);
assign no_slot_exec = exec_wb && (csr_wb);
assign VX_writeback_tempp.write_data = exec_wb ? VX_inst_exec_wb.alu_result :
csr_wb ? VX_csr_wb.csr_result :
assign VX_writeback_tempp.write_data = csr_wb ? VX_csr_wb.csr_result :
exec_wb ? VX_inst_exec_wb.alu_result :
mem_wb ? VX_mem_wb.loaded_data :
0;
assign VX_writeback_tempp.wb_valid = exec_wb ? VX_inst_exec_wb.wb_valid :
csr_wb ? VX_csr_wb.valid :
assign VX_writeback_tempp.wb_valid = csr_wb ? VX_csr_wb.valid :
exec_wb ? VX_inst_exec_wb.wb_valid :
mem_wb ? VX_mem_wb.wb_valid :
0;
assign VX_writeback_tempp.rd = exec_wb ? VX_inst_exec_wb.rd :
csr_wb ? VX_csr_wb.rd :
assign VX_writeback_tempp.rd = csr_wb ? VX_csr_wb.rd :
exec_wb ? VX_inst_exec_wb.rd :
mem_wb ? VX_mem_wb.rd :
0;
assign VX_writeback_tempp.wb = exec_wb ? VX_inst_exec_wb.wb :
csr_wb ? VX_csr_wb.wb :
assign VX_writeback_tempp.wb = csr_wb ? VX_csr_wb.wb :
exec_wb ? VX_inst_exec_wb.wb :
mem_wb ? VX_mem_wb.wb :
0;
assign VX_writeback_tempp.wb_warp_num = exec_wb ? VX_inst_exec_wb.wb_warp_num :
csr_wb ? VX_csr_wb.warp_num :
assign VX_writeback_tempp.wb_warp_num = csr_wb ? VX_csr_wb.warp_num :
exec_wb ? VX_inst_exec_wb.wb_warp_num :
mem_wb ? VX_mem_wb.wb_warp_num :
0;
assign VX_writeback_tempp.wb_pc = exec_wb ? VX_inst_exec_wb.exec_wb_pc :
csr_wb ? 32'hdeadbeef :
assign VX_writeback_tempp.wb_pc = csr_wb ? 32'hdeadbeef :
exec_wb ? VX_inst_exec_wb.exec_wb_pc :
mem_wb ? VX_mem_wb.mem_wb_pc :
32'hdeadbeef;
@ -65,17 +66,6 @@ module VX_writeback (
wire[`NT-1:0][31:0] use_wb_data;
reg prev_is_mem;
always @(posedge clk, posedge reset) begin
if (reset)
begin
prev_is_mem = 0;
end begin
prev_is_mem = mem_wb && !no_slot_mem;
end
end
VX_generic_register #(.N(39 + `NW_M1 + 1 + `NT*33)) wb_register(
.clk (clk),
.reset(reset),
@ -85,14 +75,9 @@ module VX_writeback (
.out ({use_wb_data , VX_writeback_inter.wb_valid, VX_writeback_inter.rd, VX_writeback_inter.wb, VX_writeback_inter.wb_warp_num, VX_writeback_inter.wb_pc})
);
`ifdef SYN
assign VX_writeback_inter.write_data = prev_is_mem ? VX_writeback_tempp.write_data : use_wb_data;
`else
assign VX_writeback_inter.write_data = use_wb_data;
`endif
assign VX_writeback_inter.write_data = use_wb_data;
endmodule // VX_writeback
endmodule : VX_writeback // VX_writeback

View file

@ -46,6 +46,7 @@ module Vortex
wire memory_delay;
wire exec_delay;
wire gpr_stage_delay;
wire schedule_delay;
@ -179,6 +180,7 @@ VX_scheduler schedule(
.clk (clk),
.reset (reset),
.memory_delay (memory_delay),
.exec_delay (exec_delay),
.gpr_stage_delay (gpr_stage_delay),
.VX_bckE_req (VX_bckE_req),
.VX_writeback_inter(VX_writeback_inter),
@ -197,6 +199,7 @@ VX_back_end vx_back_end(
.VX_dcache_req (VX_dcache_req),
.VX_writeback_inter (VX_writeback_inter),
.out_mem_delay (memory_delay),
.out_exec_delay (exec_delay),
.gpr_stage_delay (gpr_stage_delay)
);