mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-22 12:57:41 -04:00
226 lines
No EOL
8 KiB
Systemverilog
226 lines
No EOL
8 KiB
Systemverilog
`include "VX_define.vh"
|
|
|
|
// Integer multiply/divide unit.
//
// An incoming request (valid_in/ready_in handshake) is steered by
// `INST_MUL_IS_DIV(alu_op) onto one of two paths:
//   * multiply path: one VX_multiplier per thread (or dpi_imul when IMUL_DPI
//     is defined); the request metadata travels next to it in a
//     `LATENCY_IMUL-deep VX_shift_register;
//   * divide path: VX_serial_div (or dpi_idiv plus a VX_shift_register when
//     IDIV_DPI is defined).
// Both paths merge into one output VX_pipe_register. When both paths hold a
// valid result in the same cycle the multiply result is selected, and the
// divide path is held off because div_ready_out is low while mul_valid_out
// is high.
//
// uuid/wid/tmask/PC/rd/wb are not interpreted here; they are carried with the
// request and re-emitted with the matching result.
module VX_muldiv (
    input wire clk,
    input wire reset,

    // Inputs
    input wire [`INST_MUL_BITS-1:0]     alu_op,     // selects the mul/div variant
    input wire [`UUID_BITS-1:0]         uuid_in,
    input wire [`NW_BITS-1:0]           wid_in,
    input wire [`NUM_THREADS-1:0]       tmask_in,
    input wire [31:0]                   PC_in,
    input wire [`NR_BITS-1:0]           rd_in,
    input wire                          wb_in,
    input wire [`NUM_THREADS-1:0][31:0] alu_in1,    // per-thread first operand (multiplicand / numerator)
    input wire [`NUM_THREADS-1:0][31:0] alu_in2,    // per-thread second operand (multiplier / denominator)

    // Outputs
    output wire [`UUID_BITS-1:0]         uuid_out,
    output wire [`NW_BITS-1:0]           wid_out,
    output wire [`NUM_THREADS-1:0]       tmask_out,
    output wire [31:0]                   PC_out,
    output wire [`NR_BITS-1:0]           rd_out,
    output wire                          wb_out,
    output wire [`NUM_THREADS-1:0][31:0] data_out,  // per-thread result

    // handshake
    input wire  valid_in,
    output wire ready_in,
    output wire valid_out,
    input wire  ready_out
);

    wire is_div_op = `INST_MUL_IS_DIV(alu_op);

    ///////////////////////////////////////////////////////////////////////////
    // Multiply path

    wire [`NUM_THREADS-1:0][31:0] mul_result;
    wire [`UUID_BITS-1:0]         mul_uuid_out;
    wire [`NW_BITS-1:0]           mul_wid_out;
    wire [`NUM_THREADS-1:0]       mul_tmask_out;
    wire [31:0]                   mul_PC_out;
    wire [`NR_BITS-1:0]           mul_rd_out;
    wire                          mul_wb_out;

    // Asserted when the output register holds a result the consumer has not accepted.
    wire stall_out;

    wire mul_valid_out;
    wire mul_valid_in = valid_in && !is_div_op;

    // The multiply pipeline advances unless a finished multiply result is
    // blocked by a stalled output register.
    wire mul_ready_in = ~stall_out || ~mul_valid_out;

    // Every multiply opcode other than MUL returns the upper product word.
    wire is_mulh_in      = (alu_op != `INST_MUL_MUL);
    // Operand A is treated as signed for everything except MULHU;
    // operand B is treated as signed for everything except MULHU and MULHSU.
    wire is_signed_mul_a = (alu_op != `INST_MUL_MULHU);
    wire is_signed_mul_b = (alu_op != `INST_MUL_MULHU && alu_op != `INST_MUL_MULHSU);

`ifdef IMUL_DPI

    wire [`NUM_THREADS-1:0][31:0] mul_result_tmp;

    wire mul_fire_in = mul_valid_in && mul_ready_in;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        // Declared as variables: they are written as output arguments of a
        // call made from procedural code, which a net may not be.
        reg [31:0] mul_resultl, mul_resulth;
        always @(*) begin
            dpi_imul (mul_fire_in, alu_in1[i], alu_in2[i], is_signed_mul_a, is_signed_mul_b, mul_resultl, mul_resulth);
        end
        // The word is selected at the input side, so the selected result
        // itself is what gets delayed below.
        assign mul_result_tmp[i] = is_mulh_in ? mul_resulth : mul_resultl;
    end

    // Delays {valid, metadata, result} by `LATENCY_IMUL enabled cycles.
    // RESETW(1): presumably only the leading valid bit is reset - confirm
    // against VX_shift_register.
    VX_shift_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
        .DEPTH  (`LATENCY_IMUL),
        .RESETW (1)
    ) mul_shift_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (mul_ready_in),
        .data_in  ({mul_valid_in,  uuid_in,      wid_in,      tmask_in,      PC_in,      rd_in,      wb_in,      mul_result_tmp}),
        .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, mul_result})
    );

`else

    // Word-select flag, delayed alongside the multiplier so it lines up with the product.
    wire is_mulh_out;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        // Widen each operand to 33 bits: bit 32 replicates the sign bit when
        // the operand is signed and is 0 otherwise, so a single signed
        // 33x33 multiplier serves all signedness combinations.
        wire [32:0] mul_in1 = {is_signed_mul_a & alu_in1[i][31], alu_in1[i]};
        wire [32:0] mul_in2 = {is_signed_mul_b & alu_in2[i][31], alu_in2[i]};
    `IGNORE_UNUSED_BEGIN
        wire [65:0] mul_result_tmp; // bits [65:64] are never read
    `IGNORE_UNUSED_END

        VX_multiplier #(
            .WIDTHA  (33),
            .WIDTHB  (33),
            .WIDTHP  (66),
            .SIGNED  (1),
            .LATENCY (`LATENCY_IMUL)
        ) multiplier (
            .clk    (clk),
            .enable (mul_ready_in),
            .dataa  (mul_in1),
            .datab  (mul_in2),
            .result (mul_result_tmp)
        );

        assign mul_result[i] = is_mulh_out ? mul_result_tmp[63:32] : mul_result_tmp[31:0];
    end

    // Carries {valid, metadata, is_mulh} with the same depth and enable as
    // the multipliers above.
    // RESETW(1): presumably only the leading valid bit is reset - confirm
    // against VX_shift_register.
    VX_shift_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1),
        .DEPTH  (`LATENCY_IMUL),
        .RESETW (1)
    ) mul_shift_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (mul_ready_in),
        .data_in  ({mul_valid_in,  uuid_in,      wid_in,      tmask_in,      PC_in,      rd_in,      wb_in,      is_mulh_in}),
        .data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out})
    );

`endif

    ///////////////////////////////////////////////////////////////////////////
    // Divide path

    wire [`NUM_THREADS-1:0][31:0] div_result;
    wire [`UUID_BITS-1:0]         div_uuid_out;
    wire [`NW_BITS-1:0]           div_wid_out;
    wire [`NUM_THREADS-1:0]       div_tmask_out;
    wire [31:0]                   div_PC_out;
    wire [`NR_BITS-1:0]           div_rd_out;
    wire                          div_wb_out;

    wire is_rem_op_in  = (alu_op == `INST_MUL_REM) || (alu_op == `INST_MUL_REMU);
    wire is_signed_div = (alu_op == `INST_MUL_DIV) || (alu_op == `INST_MUL_REM);
    wire div_valid_in  = valid_in && is_div_op;
    wire div_ready_out = ~stall_out && ~mul_valid_out; // arbitration prioritizes MUL
    wire div_ready_in;
    wire div_valid_out;

`ifdef IDIV_DPI

    wire [`NUM_THREADS-1:0][31:0] div_result_tmp;

    wire div_fire_in = div_valid_in && div_ready_in;

    for (genvar i = 0; i < `NUM_THREADS; i++) begin
        // Declared as variables: they are written as output arguments of a
        // call made from procedural code, which a net may not be.
        reg [31:0] div_quotient, div_remainder;
        always @(*) begin
            dpi_idiv (div_fire_in, alu_in1[i], alu_in2[i], is_signed_div, div_quotient, div_remainder);
        end
        assign div_result_tmp[i] = is_rem_op_in ? div_remainder : div_quotient;
    end

    // NOTE(review): the depth here is `LATENCY_IMUL even though this is the
    // divide path - confirm this is intended.
    VX_shift_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
        .DEPTH  (`LATENCY_IMUL),
        .RESETW (1)
    ) div_shift_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (div_ready_in),
        .data_in  ({div_valid_in,  uuid_in,      wid_in,      tmask_in,      PC_in,      rd_in,      wb_in,      div_result_tmp}),
        .data_out ({div_valid_out, div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, div_result})
    );

    // Advance when the output can take the divide result or there is no
    // divide result waiting.
    assign div_ready_in = div_ready_out || ~div_valid_out;

`else

    wire [`NUM_THREADS-1:0][31:0] div_result_tmp, rem_result_tmp;
    wire is_rem_op_out;

    // The tag carries the request metadata plus the quotient/remainder
    // select. Its width is built from `UUID_BITS so that it always equals the
    // width of the tag_in / tag_out concatenations below.
    VX_serial_div #(
        .WIDTHN (32),
        .WIDTHD (32),
        .WIDTHQ (32),
        .WIDTHR (32),
        .LANES  (`NUM_THREADS),
        .TAGW   (`UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1)
    ) divide (
        .clk        (clk),
        .reset      (reset),
        .valid_in   (div_valid_in),
        .ready_in   (div_ready_in),
        .signed_mode(is_signed_div),
        .tag_in     ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op_in}),
        .numer      (alu_in1),
        .denom      (alu_in2),
        .quotient   (div_result_tmp),
        .remainder  (rem_result_tmp),
        .ready_out  (div_ready_out),
        .valid_out  (div_valid_out),
        .tag_out    ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out})
    );

    assign div_result = is_rem_op_out ? rem_result_tmp : div_result_tmp;

`endif

    ///////////////////////////////////////////////////////////////////////////
    // Result merge: a valid multiply result always wins the output slot.

    wire                          rsp_valid = mul_valid_out || div_valid_out;
    wire [`UUID_BITS-1:0]         rsp_uuid  = mul_valid_out ? mul_uuid_out  : div_uuid_out;
    wire [`NW_BITS-1:0]           rsp_wid   = mul_valid_out ? mul_wid_out   : div_wid_out;
    wire [`NUM_THREADS-1:0]       rsp_tmask = mul_valid_out ? mul_tmask_out : div_tmask_out;
    wire [31:0]                   rsp_PC    = mul_valid_out ? mul_PC_out    : div_PC_out;
    wire [`NR_BITS-1:0]           rsp_rd    = mul_valid_out ? mul_rd_out    : div_rd_out;
    wire                          rsp_wb    = mul_valid_out ? mul_wb_out    : div_wb_out;
    wire [`NUM_THREADS-1:0][31:0] rsp_data  = mul_valid_out ? mul_result    : div_result;

    assign stall_out = ~ready_out && valid_out;

    // Output register; it holds its contents while stalled.
    VX_pipe_register #(
        .DATAW  (1 + `UUID_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
        .RESETW (1)
    ) pipe_reg (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({rsp_valid, rsp_uuid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
        .data_out ({valid_out, uuid_out, wid_out, tmask_out, PC_out, rd_out, wb_out, data_out})
    );

    // can accept new request?
    assign ready_in = is_div_op ? div_ready_in : mul_ready_in;

endmodule