fpu implementation (part1)

This commit is contained in:
Blaise Tine 2020-07-23 03:18:09 -07:00
parent 6836f397f8
commit 75e3c31b56
31 changed files with 662 additions and 159 deletions

3
.gitmodules vendored
View file

@ -1,3 +1,6 @@
[submodule "hw/rtl/fp_cores/fpu_div_sqrt_mvp"]
path = hw/rtl/fp_cores/fpu_div_sqrt_mvp
url = https://github.com/pulp-platform/fpu_div_sqrt_mvp.git
[submodule "hw/rtl/fp_cores/fpnew"]
path = hw/rtl/fp_cores/fpnew
url = https://github.com/pulp-platform/fpnew.git

View file

@ -1,4 +1,6 @@
`include "VX_define.vh"
`include "fpnew_pkg.sv"
`include "defs_div_sqrt_mvp.sv"
module VX_alu_unit #(
parameter CORE_ID = 0
@ -99,7 +101,7 @@ module VX_alu_unit #(
);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32))
.N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
) alu_reg (
.clk (clk),
.reset (reset),

View file

@ -11,6 +11,7 @@ module VX_commit #(
VX_commit_if lsu_commit_if,
VX_commit_if mul_commit_if,
VX_commit_if csr_commit_if,
VX_commit_if fpu_commit_if,
VX_commit_if gpu_commit_if,
// outputs
@ -20,9 +21,10 @@ module VX_commit #(
wire [`NUM_EXS-1:0] commited_mask;
assign commited_mask = {((| alu_commit_if.valid) && alu_commit_if.ready),
((| lsu_commit_if.valid) && lsu_commit_if.ready),
((| mul_commit_if.valid) && mul_commit_if.ready),
((| lsu_commit_if.valid) && lsu_commit_if.ready),
((| csr_commit_if.valid) && csr_commit_if.ready),
((| mul_commit_if.valid) && mul_commit_if.ready),
((| fpu_commit_if.valid) && fpu_commit_if.ready),
((| gpu_commit_if.valid) && gpu_commit_if.ready)};
wire [`NE_BITS:0] num_commits;
@ -65,6 +67,7 @@ module VX_commit #(
.lsu_commit_if (lsu_commit_if),
.csr_commit_if (csr_commit_if),
.mul_commit_if (mul_commit_if),
.fpu_commit_if (fpu_commit_if),
.writeback_if (writeback_if)
);
@ -77,11 +80,14 @@ module VX_commit #(
if ((| lsu_commit_if.valid) && lsu_commit_if.ready) begin
$display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=LSU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, lsu_commit_if.warp_num, lsu_commit_if.curr_PC, lsu_commit_if.wb, lsu_commit_if.rd, lsu_commit_if.data);
end
if ((| mul_commit_if.valid) && mul_commit_if.ready) begin
$display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.warp_num, mul_commit_if.curr_PC, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data);
end
if ((| csr_commit_if.valid) && csr_commit_if.ready) begin
$display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=CSR, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, csr_commit_if.warp_num, csr_commit_if.curr_PC, csr_commit_if.wb, csr_commit_if.rd, csr_commit_if.data);
end
if ((| mul_commit_if.valid) && mul_commit_if.ready) begin
$display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=MUL, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.warp_num, mul_commit_if.curr_PC, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data);
end
if ((| fpu_commit_if.valid) && fpu_commit_if.ready) begin
$display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=FPU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, fpu_commit_if.warp_num, fpu_commit_if.curr_PC, fpu_commit_if.wb, fpu_commit_if.rd, fpu_commit_if.data);
end
if ((| gpu_commit_if.valid) && gpu_commit_if.ready) begin
$display("%t: Core%0d-commit: warp=%0d, PC=%0h, ex=GPU, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, gpu_commit_if.warp_num, gpu_commit_if.curr_PC, gpu_commit_if.wb, gpu_commit_if.rd, gpu_commit_if.data);

View file

@ -15,40 +15,41 @@ module VX_csr_arb (
VX_commit_if csr_rsp_if,
// outputs
VX_csr_io_rsp_if csr_io_rsp_if,
VX_commit_if csr_commit_if
VX_commit_if csr_commit_if,
VX_csr_io_rsp_if csr_io_rsp_if,
input wire select_io_req,
input wire select_io_rsp
);
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
wire core_select = ~(| csr_io_req_if.valid);
// requests
assign csr_req_if.valid = core_select ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}};
assign csr_req_if.warp_num = core_select ? csr_core_req_if.warp_num : 0;
assign csr_req_if.curr_PC = core_select ? csr_core_req_if.curr_PC : 0;
assign csr_req_if.csr_op = core_select ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS);
assign csr_req_if.csr_addr = core_select ? csr_core_req_if.csr_addr : csr_io_req_if.addr;
assign csr_req_if.csr_mask = core_select ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
assign csr_req_if.rd = core_select ? csr_core_req_if.rd : 0;
assign csr_req_if.wb = core_select ? csr_core_req_if.wb : 0;
assign csr_req_if.is_io = ~core_select;
assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : {`NUM_THREADS{csr_io_req_if.valid}};
assign csr_req_if.warp_num = (~select_io_req) ? csr_core_req_if.warp_num : 0;
assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0;
assign csr_req_if.csr_op = (~select_io_req) ? csr_core_req_if.csr_op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS);
assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr;
assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
assign csr_req_if.rd = (~select_io_req) ? csr_core_req_if.rd : 0;
assign csr_req_if.wb = (~select_io_req) ? csr_core_req_if.wb : 0;
assign csr_req_if.is_io = select_io_req;
assign csr_core_req_if.ready = csr_req_if.ready && core_select;
assign csr_io_req_if.ready = csr_req_if.ready && ~core_select;
assign csr_core_req_if.ready = csr_req_if.ready && (~select_io_req);
assign csr_io_req_if.ready = csr_req_if.ready && select_io_req;
// responses
assign csr_io_rsp_if.valid = csr_rsp_if.valid[0] & csr_rsp_if.is_io;
assign csr_io_rsp_if.valid = csr_rsp_if.valid[0] & select_io_rsp;
assign csr_io_rsp_if.data = csr_rsp_if.data[0];
assign csr_commit_if.valid = csr_rsp_if.valid & {`NUM_THREADS{~csr_rsp_if.is_io}};
assign csr_commit_if.warp_num = csr_rsp_if.warp_num;
assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC;
assign csr_commit_if.data = csr_rsp_if.data;
assign csr_commit_if.rd = csr_rsp_if.rd;
assign csr_commit_if.wb = csr_rsp_if.wb;
assign csr_commit_if.valid = csr_rsp_if.valid & {`NUM_THREADS{~select_io_rsp}};
assign csr_commit_if.warp_num = csr_rsp_if.warp_num;
assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC;
assign csr_commit_if.data = csr_rsp_if.data;
assign csr_commit_if.rd = csr_rsp_if.rd;
assign csr_commit_if.wb = csr_rsp_if.wb;
assign csr_rsp_if.ready = csr_rsp_if.is_io ? csr_io_rsp_if.ready : csr_commit_if.ready;
assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : csr_commit_if.ready;
endmodule

View file

@ -6,7 +6,8 @@ module VX_csr_unit #(
input wire clk,
input wire reset,
VX_perf_cntrs_if perf_cntrs_if,
VX_perf_cntrs_if perf_cntrs_if,
VX_fpu_to_csr_if fpu_to_csr_if,
VX_csr_io_req_if csr_io_req_if,
VX_csr_io_rsp_if csr_io_rsp_if,
@ -17,15 +18,23 @@ module VX_csr_unit #(
VX_csr_req_if csr_pipe_req_if();
VX_commit_if csr_pipe_commit_if();
wire select_io_req = (| csr_io_req_if.valid);
wire select_io_rsp;
VX_csr_arb csr_arb (
.clk (clk),
.reset (reset),
.csr_core_req_if (csr_req_if),
.csr_io_req_if (csr_io_req_if),
.csr_req_if (csr_pipe_req_if),
.csr_rsp_if (csr_pipe_commit_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_commit_if (csr_commit_if)
.csr_commit_if (csr_commit_if),
.select_io_req (select_io_req),
.select_io_rsp (select_io_rsp)
);
wire [`CSR_ADDR_SIZE-1:0] csr_addr_s2;
@ -68,14 +77,14 @@ module VX_csr_unit #(
wire stall = ~csr_pipe_commit_if.ready && (| csr_pipe_commit_if.valid);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + `CSR_ADDR_SIZE + 1 + 32 + 32)
.N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + `CSR_ADDR_SIZE + 1 + 32 + 32)
) csr_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({csr_pipe_req_if.valid, csr_pipe_req_if.warp_num, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}),
.out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.warp_num, csr_pipe_commit_if.curr_PC, csr_pipe_commit_if.rd, csr_pipe_commit_if.wb, csr_addr_s2, csr_pipe_commit_if.is_io, csr_read_data_s2, csr_updated_data_s2})
.in ({csr_pipe_req_if.valid, csr_pipe_req_if.warp_num, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data, csr_updated_data}),
.out ({csr_pipe_commit_if.valid, csr_pipe_commit_if.warp_num, csr_pipe_commit_if.curr_PC, csr_pipe_commit_if.rd, csr_pipe_commit_if.wb, csr_addr_s2, select_io_rsp, csr_read_data_s2, csr_updated_data_s2})
);
genvar i;

View file

@ -19,10 +19,11 @@ module VX_decode #(
wire [31:0] instr = ifetch_rsp_if.instr;
reg [`ALU_BITS-1:0] alu_op;
reg [`BR_BITS-1:0] br_op;
reg [`MUL_BITS-1:0] mul_op;
reg [`BR_BITS-1:0] br_op;
wire [`LSU_BITS-1:0] lsu_op;
reg [`CSR_BITS-1:0] csr_op;
reg [`MUL_BITS-1:0] mul_op;
reg [`FPU_BITS-1:0] fpu_op;
reg [`GPU_BITS-1:0] gpu_op;
reg [19:0] upper_imm;
@ -37,6 +38,7 @@ module VX_decode #(
wire [`NR_BITS-1:0] rd = instr[11:7];
wire [`NR_BITS-1:0] rs1 = instr[19:15];
wire [`NR_BITS-1:0] rs2 = instr[24:20];
wire [`NR_BITS-1:0] rs3 = instr[31:27];
// opcode types
wire is_rtype = (opcode == `INST_R);
@ -51,10 +53,9 @@ module VX_decode #(
wire is_jals = (opcode == `INST_SYS) && (func3 == 0);
wire is_csr = (opcode == `INST_SYS) && (func3 != 0);
wire is_gpu = (opcode == `INST_GPU);
wire is_br = (is_btype || is_jal || is_jalr || is_jals);
wire is_mul = is_rtype && (func7 == 7'h1);
// upper immediate
always @(*) begin
case (opcode)
`INST_LUI: upper_imm = {func7, rs2, rs1, func3};
@ -63,20 +64,8 @@ module VX_decode #(
endcase
end
// JAL
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
wire [31:0] jal_offset = {{11{jal_imm[20]}}, jal_imm};
wire [11:0] jalr_imm = {func7, rs2};
wire [31:0] jalr_offset = {{20{jalr_imm[11]}}, jalr_imm};
always @(*) begin
case (opcode)
`INST_JAL: jalx_offset = jal_offset;
`INST_JALR: jalx_offset = jalr_offset;
default: jalx_offset = 32'd4;
endcase
end
// I-type immediate
wire alu_shift_i = (func3 == 3'h1) || (func3 == 3'h5);
wire [11:0] alu_shift_imm = {{7{1'b0}}, rs2};
wire [11:0] alu_imm = alu_shift_i ? alu_shift_imm : u_12;
@ -88,9 +77,26 @@ module VX_decode #(
`INST_B: src2_imm = {{20{instr[31]}}, instr[7], instr[30:25], instr[11:8], 1'b0};
default: src2_imm = 32'hdeadbeef;
endcase
end
end
// JAL
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
wire [31:0] jal_offset = {{11{jal_imm[20]}}, jal_imm};
wire [11:0] jalr_imm = {func7, rs2};
wire [31:0] jalr_offset = {{20{jalr_imm[11]}}, jalr_imm};
always @(*) begin
case (opcode)
`INST_JAL: jalx_offset = jal_offset;
`INST_JALR: jalx_offset = jalr_offset;
default: jalx_offset = 32'd4;
endcase
end
// BRANCH
wire is_br = (is_btype || is_jal || is_jalr || is_jals);
always @(*) begin
br_op = `BR_EQ;
case (opcode)
@ -119,6 +125,7 @@ module VX_decode #(
end
// ALU
always @(*) begin
alu_op = `ALU_OTHER;
if (is_lui) begin
@ -140,7 +147,29 @@ module VX_decode #(
end
end
// MUL
// LSU
wire is_lsu = (is_ltype || is_stype);
assign lsu_op = {is_stype, func3};
// CSR
wire is_csr_imm = is_csr && (func3[2] == 1);
always @(*) begin
csr_op = `CSR_OTHER;
case (func3[1:0])
2'h1: csr_op = `CSR_RW;
2'h2: csr_op = `CSR_RS;
2'h3: csr_op = `CSR_RC;
default:;
endcase
end
// MUL
wire is_mul = is_rtype && (func7 == 7'h1);
always @(*) begin
mul_op = `MUL_MUL;
case (func3)
@ -156,23 +185,50 @@ module VX_decode #(
endcase
end
// LSU
wire is_lsu = (is_ltype || is_stype);
assign lsu_op = {is_stype, func3};
// FPU
// CSR
wire is_csr_imm = is_csr && (func3[2] == 1);
always @(*) begin
csr_op = `CSR_OTHER;
case (func3[1:0])
2'h1: csr_op = `CSR_RW;
2'h2: csr_op = `CSR_RS;
2'h3: csr_op = `CSR_RC;
default:;
endcase
wire is_fl = (opcode == `INST_FL) && ((func3 == 2));
wire is_fs = (opcode == `INST_FS) && ((func3 == 2));
wire is_fci = (opcode == `INST_FCI);
wire is_fmadd = (opcode == `INST_FMADD);
wire is_fmsub = (opcode == `INST_FMSUB);
wire is_fnmsub = (opcode == `INST_FNMSUB);
wire is_fnmadd = (opcode == `INST_FNMADD);
wire is_fr4 = is_fmadd || is_fmsub || is_fnmsub || is_fnmadd;
wire is_fpu = (is_fl || is_fs || is_fci || is_fr4);
always @(*) begin
fpu_op = `FPU_OTHER;
if (is_fr4) begin
case ({is_fmadd, is_fmsub, is_fnmsub, is_fnmadd})
4'b1000: fpu_op = `FPU_MADD;
4'b0100: fpu_op = `FPU_MSUB;
4'b0010: fpu_op = `FPU_NMSUB;
4'b0001: fpu_op = `FPU_NMADD;
default:;
endcase
end
else begin
case (func7)
7'h00: fpu_op = `FPU_ADD;
7'h04: fpu_op = `FPU_SUB;
7'h08: fpu_op = `FPU_MUL;
7'h0C: fpu_op = `FPU_DIV;
7'h2C: fpu_op = `FPU_SQRT;
7'h14: fpu_op = (func3 == 3'h0) ? `FPU_MIN : `FPU_MAX;
7'h60: fpu_op = (instr[20]) ? `FPU_CVTWUS : `FPU_CVTWS; // doesn't need rs2, and read rs1 from fpReg, WB to intReg
7'h68: fpu_op = (instr[20]) ? `FPU_CVTSWU : `FPU_CVTSW; // doesn't need rs2, and read rs1 from intReg
7'h70: fpu_op = (func3 == 3'h0) ? `FPU_MVXW : `FPU_CLASS; // both wb to intReg
7'h78: fpu_op = `FPU_MVWX;
7'h50: fpu_op = `FPU_CMP; // wb to intReg
7'h10: fpu_op = (func3[1]) ? `FPU_SGNJX : ((func3[0]) ? `FPU_SGNJN : `FPU_SGNJ);
default:;
endcase
end
end
// GPU
always @(*) begin
gpu_op = `GPU_OTHER;
case (func3)
@ -195,23 +251,23 @@ module VX_decode #(
assign decode_tmp_if.ex_type = is_lsu ? `EX_LSU :
is_csr ? `EX_CSR :
is_mul ? `EX_MUL :
is_gpu ? `EX_GPU :
is_br ? `EX_ALU :
(is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU :
`EX_NOP;
is_fpu ? `EX_FPU :
is_gpu ? `EX_GPU :
is_br ? `EX_ALU :
(is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU :
`EX_NOP;
assign decode_tmp_if.instr_op = is_lsu ? `OP_BITS'(lsu_op) :
is_csr ? `OP_BITS'(csr_op) :
is_mul ? `OP_BITS'(mul_op) :
is_gpu ? `OP_BITS'(gpu_op) :
is_br ? `OP_BITS'({1'b1, br_op}) :
(is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) :
0;
is_fpu ? `OP_BITS'(fpu_op) :
is_gpu ? `OP_BITS'(gpu_op) :
is_br ? `OP_BITS'({1'b1, br_op}) :
(is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) :
0;
assign decode_tmp_if.rd = rd;
assign decode_tmp_if.rs1 = is_lui ? `NR_BITS'(0) : rs1;
assign decode_tmp_if.rs2 = rs2;
assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} :
@ -220,20 +276,22 @@ module VX_decode #(
src2_imm;
assign decode_tmp_if.rs1_is_PC = is_auipc;
assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm;
assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm;
assign decode_tmp_if.use_rs1 = (decode_tmp_if.rs1 != 0)
&& (is_jalr || is_btype || is_ltype || is_stype || is_itype || is_rtype || ~is_csr_imm || is_gpu);
assign decode_tmp_if.use_rs2 = (decode_tmp_if.rs2 != 0)
&& (is_btype || is_stype || is_rtype || (is_gpu && (gpu_op == `GPU_BAR || gpu_op == `GPU_WSPAWN)));
assign decode_tmp_if.rs1_is_fp = (is_fci && ((func7 != 7'h68) && (fpu_op != `FPU_MVWX)) || is_fr4);
assign decode_tmp_if.rs2_is_fp = is_fs || (is_fci && ((func7 != 7'h60) && (func7 != 7'h68)) || is_fr4);
assign decode_tmp_if.rs3 = rs3;
assign decode_tmp_if.use_rs3 = is_fr4;
assign decode_tmp_if.frm = func3;
assign decode_tmp_if.wb = (rd == 0) ? `WB_NO : // disable writeback to r0
(is_itype || is_rtype || is_lui || is_auipc || is_csr) ? `WB_ALU :
(is_jal || is_jalr || is_jals) ? `WB_JAL :
is_ltype ? `WB_MEM :
`WB_NO;
// Writeback enable (single bit).
//  - FPU instructions: enabled for FP loads, FCI-class ops and the fused (R4) ops.
//  - Non-FPU instructions: enabled for every result-producing class, except when rd == 0.
// NOTE(review): the term ((func7 != 7'h50) || (func7 != 7'h70) || (func7 != 7'h60)) is true for
// every func7 value (no value can equal all three constants), so it never masks anything and
// every is_fci instruction enables writeback. If the intent was to exclude these func7 groups
// the operators should be && -- confirm the intended behavior before changing.
assign decode_tmp_if.wb = (is_fpu && (is_fl || (is_fci && ((func7 != 7'h50) || (func7 != 7'h70) || (func7 != 7'h60))) || is_fr4))
|| (~is_fpu && (rd != 0) && (is_itype || is_rtype || is_lui || is_auipc || is_csr || is_jal || is_jalr || is_jals || is_ltype));
assign join_if.is_join = in_valid && is_gpu && (gpu_op == `GPU_JOIN);
assign join_if.warp_num = ifetch_rsp_if.warp_num;
@ -241,17 +299,17 @@ module VX_decode #(
assign wstall_if.wstall = in_valid && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR)));
assign wstall_if.warp_num = ifetch_rsp_if.warp_num;
wire stall = ~decode_if.ready && (| decode_if.valid);
wire stall = ~decode_if.ready && (| decode_if.valid);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + `WB_BITS)
.N(`NUM_THREADS + `NW_BITS + 32 + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + 1 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + 1 + `FRM_BITS)
) decode_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb}),
.out ({decode_if.valid, decode_if.warp_num, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb})
.in ({decode_tmp_if.valid, decode_tmp_if.warp_num, decode_tmp_if.curr_PC, decode_tmp_if.next_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2, decode_tmp_if.ex_type, decode_tmp_if.instr_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.rs1_is_fp, decode_tmp_if.rs2_is_fp, decode_tmp_if.frm}),
.out ({decode_if.valid, decode_if.warp_num, decode_if.curr_PC, decode_if.next_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.use_rs1, decode_if.use_rs2, decode_if.ex_type, decode_if.instr_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.rs1_is_fp, decode_if.rs2_is_fp, decode_if.frm})
);
assign ifetch_rsp_if.ready = ~stall;
@ -263,9 +321,7 @@ module VX_decode #(
print_ex_type(decode_tmp_if.ex_type);
$write(", op=");
print_instr_op(decode_tmp_if.ex_type, decode_tmp_if.instr_op);
$write(", wb=");
print_wb(decode_tmp_if.wb);
$write(", rd=%0d, rs1=%0d, rs2=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b\n", decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2);
$write(", wb=%b, rd=%0d, rs1=%0d, rs2=%0d, imm=%0h, use_pc=%b, use_imm=%b, use_rs1=%b, use_rs2=%b\n", decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.use_rs1, decode_tmp_if.use_rs2);
// trap unsupported instructions
assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && `ALU_OP(decode_tmp_if.instr_op) == `ALU_OTHER));

View file

@ -19,12 +19,13 @@ module VX_execute #(
// perf
VX_perf_cntrs_if perf_cntrs_if,
// inputs
VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_mul_req_if mul_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if,
// outputs
@ -34,10 +35,13 @@ module VX_execute #(
VX_commit_if lsu_commit_if,
VX_commit_if csr_commit_if,
VX_commit_if mul_commit_if,
VX_commit_if fpu_commit_if,
VX_commit_if gpu_commit_if,
output wire ebreak
);
VX_fpu_to_csr_if fpu_to_csr_if();
VX_fpu_from_csr_if fpu_from_csr_if();
VX_alu_unit #(
.CORE_ID(CORE_ID)
@ -67,6 +71,7 @@ module VX_execute #(
.clk (clk),
.reset (reset),
.perf_cntrs_if (perf_cntrs_if),
.fpu_to_csr_if (fpu_to_csr_if),
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_req_if (csr_req_if),
@ -82,6 +87,17 @@ module VX_execute #(
.mul_commit_if (mul_commit_if)
);
VX_fpu_unit #(
.CORE_ID(CORE_ID)
) fpu_unit (
.clk (clk),
.reset (reset),
.fpu_req_if (fpu_req_if),
.fpu_from_csr_if(fpu_from_csr_if),
.fpu_to_csr_if (fpu_to_csr_if),
.fpu_commit_if (fpu_commit_if)
);
VX_gpu_unit #(
.CORE_ID(CORE_ID)
) gpu_unit (

140
hw/rtl/VX_fpu_unit.v Normal file
View file

@ -0,0 +1,140 @@
`include "VX_define.vh"
// Floating-point execution unit.
//
// Wraps one vectorial fpnew_top instance (one 32-bit lane per thread), translates the
// decoded `FPU_* operation into fpnew's {op, op_mod, rnd_mode} encoding, and forwards the
// result to the commit interface and the exception flags to the CSR interface.
//
// Request metadata (thread mask, warp, PC, wb, rd) is carried through the FPU pipeline in
// fpnew's operation tag: the request is consumed as soon as fpnew accepts it, so the request
// interface no longer describes the operation whose result appears at the output.
module VX_fpu_unit #(
    parameter CORE_ID = 0
) (
    // inputs
    input wire clk,
    input wire reset,

    // inputs
    VX_fpu_req_if       fpu_req_if,
    VX_fpu_from_csr_if  fpu_from_csr_if,

    // outputs
    VX_commit_if        fpu_commit_if,
    VX_fpu_to_csr_if    fpu_to_csr_if
);
    localparam FOP_BITS  = fpnew_pkg::OP_BITS;
    localparam FMTF_BITS = $clog2(fpnew_pkg::NUM_FP_FORMATS);
    localparam FMTI_BITS = $clog2(fpnew_pkg::NUM_INT_FORMATS);

    localparam int FPU_DPATHW = `NUM_THREADS * 32;

    // Tag layout: {thread mask, warp_num, curr_PC, wb, rd}.
    // Field widths mirror the ones used for the FPU issue register (valid, warp_num, 32-bit PC,
    // 1-bit wb, rd).
    localparam FPU_TAG_BITS = `NUM_THREADS + `NW_BITS + 32 + 1 + `NR_BITS;

    localparam fpnew_pkg::fpu_features_t FPU_FEATURES = '{
        Width:         FPU_DPATHW,
        EnableVectors: 1,
        EnableNanBox:  1,
        FpFmtMask:     5'b10000,
        IntFmtMask:    4'b0010
    };

    localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
        PipeRegs:'{'{`LATENCY_FMULADD, 0, 0, 0, 0}, // ADDMUL
                   '{default: `LATENCY_FDIVSQRT},   // DIVSQRT
                   '{default: `LATENCY_FNONCOMP},   // NONCOMP
                   '{default: `LATENCY_FCONV}},     // CONV
        UnitTypes:'{'{default: fpnew_pkg::PARALLEL}, // ADDMUL
                    '{default: fpnew_pkg::MERGED},   // DIVSQRT
                    '{default: fpnew_pkg::PARALLEL}, // NONCOMP
                    '{default: fpnew_pkg::MERGED}},  // CONV
        PipeConfig: fpnew_pkg::DISTRIBUTED
    };

    wire fpu_in_ready;
    wire fpu_in_valid;
    wire fpu_out_ready;
    wire fpu_out_valid;

    reg [2:0][`NUM_THREADS-1:0][31:0] fpu_operands;

    wire [FMTF_BITS-1:0] fpu_src_fmt = fpnew_pkg::FP32;
    wire [FMTF_BITS-1:0] fpu_dst_fmt = fpnew_pkg::FP32;
    wire [FMTI_BITS-1:0] fpu_int_fmt = fpnew_pkg::INT32;

    // An operation is offered to the FPU whenever at least one thread is active.
    assign fpu_in_valid     = (| fpu_req_if.valid);
    assign fpu_req_if.ready = fpu_in_ready;

    wire [`NUM_THREADS-1:0][31:0] fpu_result;
    fpnew_pkg::status_t fpu_status;

    // Metadata travelling with the operation.
    wire [FPU_TAG_BITS-1:0] fpu_tag_in = {fpu_req_if.valid,
                                          fpu_req_if.warp_num,
                                          fpu_req_if.curr_PC,
                                          fpu_req_if.wb,
                                          fpu_req_if.rd};
    wire [FPU_TAG_BITS-1:0] fpu_tag_out;

    wire [`NUM_THREADS-1:0] rsp_tmask;
    wire [`NW_BITS-1:0]     rsp_warp_num;
    wire [31:0]             rsp_curr_PC;
    wire                    rsp_wb;
    wire [`NR_BITS-1:0]     rsp_rd;

    assign {rsp_tmask, rsp_warp_num, rsp_curr_PC, rsp_wb, rsp_rd} = fpu_tag_out;

    reg [FOP_BITS-1:0]  fpu_op;
    reg [`FRM_BITS-1:0] fpu_rnd;
    reg                 fpu_op_mod;

    // Translate the decoded operation into fpnew's encoding.
    // Several fpnew operations reuse the rounding-mode input as a sub-operation selector
    // (SGNJ, MINMAX), which is why fpu_rnd is overridden for those entries.
    always @(*) begin
        fpu_op     = fpnew_pkg::SGNJ;
        fpu_op_mod = 0;
        fpu_rnd    = fpu_req_if.frm;

        // Default operand placement: rs1 -> op[0], rs2 -> op[1], rs3 -> op[2].
        fpu_operands[0] = fpu_req_if.rs1_data;
        fpu_operands[1] = fpu_req_if.rs2_data;
        fpu_operands[2] = fpu_req_if.rs3_data;

        case (fpu_req_if.fpu_op)
            // fpnew's ADD computes op[1] + op[2], so the two sources are shifted up by one slot.
            `FPU_ADD: begin
                fpu_op          = fpnew_pkg::ADD;
                fpu_operands[1] = fpu_req_if.rs1_data;
                fpu_operands[2] = fpu_req_if.rs2_data;
            end
            `FPU_SUB: begin
                fpu_op          = fpnew_pkg::ADD;
                fpu_op_mod      = 1;
                fpu_operands[1] = fpu_req_if.rs1_data;
                fpu_operands[2] = fpu_req_if.rs2_data;
            end
            `FPU_MUL:   fpu_op = fpnew_pkg::MUL;
            `FPU_DIV:   fpu_op = fpnew_pkg::DIV;
            `FPU_SQRT:  fpu_op = fpnew_pkg::SQRT;
            `FPU_MADD:  fpu_op = fpnew_pkg::FMADD;
            `FPU_MSUB:  begin fpu_op = fpnew_pkg::FMADD;  fpu_op_mod = 1; end
            `FPU_NMSUB: fpu_op = fpnew_pkg::FNMSUB;
            `FPU_NMADD: begin fpu_op = fpnew_pkg::FNMSUB; fpu_op_mod = 1; end
            `FPU_SGNJ:  begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RNE; end
            `FPU_SGNJN: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RTZ; end
            `FPU_SGNJX: begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RDN; end
            `FPU_MIN:   begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RNE; end
            `FPU_MAX:   begin fpu_op = fpnew_pkg::MINMAX; fpu_rnd = `FRM_RTZ; end
            `FPU_CVTWS: fpu_op = fpnew_pkg::F2I;
            // Unsigned float-to-int is F2I with op_mod set (was wrongly mapped to ADD).
            `FPU_CVTWUS:begin fpu_op = fpnew_pkg::F2I;    fpu_op_mod = 1; end
            `FPU_CVTSW: fpu_op = fpnew_pkg::I2F;
            `FPU_CVTSWU:begin fpu_op = fpnew_pkg::I2F;    fpu_op_mod = 1; end
            `FPU_MVXW:  begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RUP; end
            `FPU_MVWX:  begin fpu_op = fpnew_pkg::SGNJ;   fpu_rnd = `FRM_RUP; end
            `FPU_CLASS: fpu_op = fpnew_pkg::CLASSIFY;
            `FPU_CMP:   fpu_op = fpnew_pkg::CMP;
            default:;
        endcase
    end

    // rst_ni is tied high; `reset` is routed to flush_i instead.
    fpnew_top #(
        .Features       (FPU_FEATURES),
        .Implementation (FPU_IMPLEMENTATION),
        .TagType        (logic [FPU_TAG_BITS-1:0])
    ) fpnew_core (
        .clk_i          (clk),
        .rst_ni         (1'b1),
        .operands_i     (fpu_operands),
        .rnd_mode_i     (fpu_rnd),
        .op_i           (fpu_op),
        .op_mod_i       (fpu_op_mod),
        .src_fmt_i      (fpu_src_fmt),
        .dst_fmt_i      (fpu_dst_fmt),
        .int_fmt_i      (fpu_int_fmt),
        .vectorial_op_i (1'b1),
        .tag_i          (fpu_tag_in),
        .in_valid_i     (fpu_in_valid),
        .in_ready_o     (fpu_in_ready),
        .flush_i        (reset),
        .result_o       (fpu_result),
        .status_o       (fpu_status),
        .tag_o          (fpu_tag_out),
        .out_valid_o    (fpu_out_valid),
        .out_ready_i    (fpu_out_ready),
        `UNUSED_PIN (busy_o)
    );

    // Commit: every field describes the operation leaving the FPU (taken from the tag).
    assign fpu_commit_if.valid    = rsp_tmask & {`NUM_THREADS{fpu_out_valid}};
    assign fpu_commit_if.warp_num = rsp_warp_num;
    assign fpu_commit_if.curr_PC  = rsp_curr_PC;
    assign fpu_commit_if.data     = fpu_result;
    assign fpu_commit_if.wb       = rsp_wb;
    assign fpu_commit_if.rd       = rsp_rd;

    assign fpu_out_ready = fpu_commit_if.ready;

    // Exception flags reported to the CSR unit for the warp that produced them.
    assign fpu_to_csr_if.valid     = fpu_out_valid;
    assign fpu_to_csr_if.warp_num  = rsp_warp_num;
    assign fpu_to_csr_if.fflags_NV = fpu_status.NV;
    assign fpu_to_csr_if.fflags_DZ = fpu_status.DZ;
    assign fpu_to_csr_if.fflags_OF = fpu_status.OF;
    assign fpu_to_csr_if.fflags_UF = fpu_status.UF;
    assign fpu_to_csr_if.fflags_NX = fpu_status.NX;

endmodule

94
hw/rtl/VX_gpr_fp_ctrl.v Normal file
View file

@ -0,0 +1,94 @@
`include "VX_define.vh"
// Control module supporting the multi-cycle register read needed by three-operand
// (use_rs3) floating-point instructions.
//
// Both register files expose two read ports. For a use_rs3 instruction the read takes two
// cycles: in the first cycle rs1/rs2 are read and latched, in the second cycle read port 1
// is re-addressed with rs3 and its FP data is presented as rs3_data. gpr_delay is raised
// during the first of these cycles. All other instructions are served combinationally.
module VX_gpr_fp_ctrl (
    input wire clk,
    input wire reset,

    // inputs
    VX_decode_if decode_if,
    input wire [`NUM_THREADS-1:0][31:0] rs1_int_data,
    input wire [`NUM_THREADS-1:0][31:0] rs2_int_data,
    input wire [`NUM_THREADS-1:0][31:0] rs1_fp_data,
    input wire [`NUM_THREADS-1:0][31:0] rs2_fp_data,

    // outputs
    output wire [`NR_BITS-1:0] raddr1,
    output wire [`NR_BITS-1:0] raddr2,
    VX_gpr_data_if gpr_data_if,

    input  wire schedule_delay,
    output wire gpr_delay
);
    // param
    localparam GPR_DELAY_WID = 1;

    // 0: first (or only) read cycle, 1: second cycle of a use_rs3 read
    reg [GPR_DELAY_WID-1:0] multi_cyc_state;

    reg [`NUM_THREADS-1:0][31:0] tmp_rs1_data;
    reg [`NUM_THREADS-1:0][31:0] tmp_rs2_data;

    reg [`NUM_THREADS-1:0][31:0] rs1_data;
    reg [`NUM_THREADS-1:0][31:0] rs2_data;
    reg [`NUM_THREADS-1:0][31:0] rs3_data;

    // Operand selection shared by the latched and the combinational paths:
    // FP register data when the operand is FP, otherwise the integer-side value with the
    // PC (rs1) or the immediate (rs2) substituted when the decoder requests it.
    wire [`NUM_THREADS-1:0][31:0] rs1_int_sel = decode_if.rs1_is_PC  ? {`NUM_THREADS{decode_if.curr_PC}} : rs1_int_data;
    wire [`NUM_THREADS-1:0][31:0] rs2_int_sel = decode_if.rs2_is_imm ? {`NUM_THREADS{decode_if.imm}}     : rs2_int_data;

    wire [`NUM_THREADS-1:0][31:0] rs1_sel = decode_if.rs1_is_fp ? rs1_fp_data : rs1_int_sel;
    wire [`NUM_THREADS-1:0][31:0] rs2_sel = decode_if.rs2_is_fp ? rs2_fp_data : rs2_int_sel;

    // Read-cycle tracker: advances to 1 after the first cycle of a use_rs3 read, and
    // returns to 0 afterwards or whenever the scheduler reports a delay.
    always @(posedge clk) begin
        if (reset) begin
            multi_cyc_state <= 0;
        end else if (!schedule_delay) begin
            multi_cyc_state <= decode_if.use_rs3 && (multi_cyc_state == 0);
        end else begin
            multi_cyc_state <= 0;
        end
    end

    // Latch the selected rs1/rs2 values so they remain available while port 1 reads rs3.
    always @(posedge clk) begin
        if (reset) begin
            tmp_rs1_data <= 0;
            tmp_rs2_data <= 0;
        end else begin
            tmp_rs1_data <= rs1_sel;
            tmp_rs2_data <= rs2_sel;
        end
    end

    // outputs
    assign gpr_delay = (multi_cyc_state == 0) && decode_if.use_rs3;

    assign raddr1 = multi_cyc_state ? decode_if.rs3 : decode_if.rs1;
    assign raddr2 = decode_if.rs2;

    always @(*) begin
        if (decode_if.use_rs3) begin
            // rs1/rs2 come from the latched copy, rs3 from FP read port 1
            rs1_data = tmp_rs1_data;
            rs2_data = tmp_rs2_data;
            rs3_data = rs1_fp_data;
        end else begin
            // Single-cycle read. The PC / immediate substitution must also apply here,
            // otherwise rs1_is_PC and rs2_is_imm instructions would see raw register data.
            rs1_data = rs1_sel;
            rs2_data = rs2_sel;
            rs3_data = {`NUM_THREADS{32'h8000_0000}}; // default value: -0 in single fp
        end
    end

    assign gpr_data_if.rs1_data = rs1_data;
    assign gpr_data_if.rs2_data = rs2_data;
    assign gpr_data_if.rs3_data = rs3_data;

endmodule

View file

@ -4,42 +4,76 @@ module VX_gpr_stage #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// inputs
VX_wb_if writeback_if,
VX_decode_if decode_if,
VX_decode_if decode_if,
// outputs
VX_gpr_data_if gpr_data_if
VX_gpr_data_if gpr_data_if,
input wire schedule_delay,
output wire gpr_delay
);
wire [`NUM_THREADS-1:0][31:0] rs1_data_all [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs2_data_all [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs1_PC;
wire [`NUM_THREADS-1:0][31:0] rs2_imm;
wire [`NUM_THREADS-1:0][31:0] rs1_int_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs2_int_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs1_fp_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0][31:0] rs2_fp_data [`NUM_WARPS-1:0];
wire [`NUM_THREADS-1:0] we [`NUM_WARPS-1:0];
wire [`NR_BITS-1:0] raddr1;
wire [`NR_BITS-1:0] raddr2;
genvar i;
for (i = 0; i < `NUM_THREADS; i++) begin
assign rs1_PC[i] = decode_if.curr_PC;
assign rs2_imm[i] = decode_if.imm;
end
assign gpr_data_if.rs1_data = decode_if.rs1_is_PC ? rs1_PC : rs1_data_all[decode_if.warp_num];
assign gpr_data_if.rs2_data = decode_if.rs2_is_imm ? rs2_imm : rs2_data_all[decode_if.warp_num];
for (i = 0; i < `NUM_WARPS; i++) begin
assign we[i] = writeback_if.valid & {`NUM_THREADS{(i == writeback_if.warp_num)}};
VX_gpr_ram gpr_ram (
// Int GPRs
VX_gpr_ram gpr_int_ram (
.clk (clk),
.we (we[i]),
.we (we[i] & {`NUM_THREADS{~writeback_if.is_fp}}),
.waddr (writeback_if.rd),
.wdata (writeback_if.data),
.rs1 (decode_if.rs1),
.rs2 (decode_if.rs2),
.rs1_data (rs1_data_all[i]),
.rs2_data (rs2_data_all[i])
.rs1 (raddr1),
.rs2 (raddr2),
.rs1_data (rs1_int_data[i]),
.rs2_data (rs2_int_data[i])
);
// FP GPRs
VX_gpr_ram gpr_fp_ram (
.clk (clk),
.we (we[i] & {`NUM_THREADS{writeback_if.is_fp}}),
.waddr (writeback_if.rd),
.wdata (writeback_if.data),
.rs1 (raddr1),
.rs2 (raddr2),
.rs1_data (rs1_fp_data[i]),
.rs2_data (rs2_fp_data[i])
);
// controller for multi-cycle read
VX_gpr_fp_ctrl VX_gpr_fp_ctrl (
.clk (clk),
.reset (reset),
//inputs
.decode_if (decode_if),
.rs1_int_data (rs1_int_data[i]),
.rs2_int_data (rs2_int_data[i]),
.rs1_fp_data (rs1_fp_data[i]),
.rs2_fp_data (rs2_fp_data[i]),
// outputs
.raddr1 (raddr1),
.raddr2 (raddr2),
.gpr_data_if (gpr_data_if),
.schedule_delay (schedule_delay),
.gpr_delay (gpr_delay)
);
end
assign writeback_if.ready = 1'b1;

View file

@ -79,7 +79,7 @@ module VX_gpu_unit #(
assign gpu_commit_if.valid = gpu_req_if.valid;
assign gpu_commit_if.warp_num = gpu_req_if.warp_num;
assign gpu_commit_if.curr_PC = gpu_req_if.curr_PC;
assign gpu_commit_if.wb = `WB_NO;
assign gpu_commit_if.wb = 0;
assign gpu_commit_if.rd = 0;
assign gpu_commit_if.data = 0;

View file

@ -13,16 +13,19 @@ module VX_issue #(
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_mul_req_if mul_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if
);
VX_gpr_data_if gpr_data_if();
wire schedule_delay;
wire gpr_delay;
wire alu_busy = ~alu_req_if.ready/* && (| alu_req_if.valid)*/;
wire lsu_busy = ~lsu_req_if.ready/* && (| lsu_req_if.valid)*/;
wire csr_busy = ~csr_req_if.ready/* && (| csr_req_if.valid)*/;
wire mul_busy = ~mul_req_if.ready/* && (| mul_req_if.valid)*/;
wire gpu_busy = ~gpu_req_if.ready/* && (| gpu_req_if.valid)*/;
wire alu_busy = ~alu_req_if.ready;
wire lsu_busy = ~lsu_req_if.ready;
wire csr_busy = ~csr_req_if.ready;
wire mul_busy = ~mul_req_if.ready;
wire fpu_busy = ~fpu_req_if.ready; // FPU busy must follow the FPU request port (was mul_req_if: copy-paste error)
wire gpu_busy = ~gpu_req_if.ready;
VX_scheduler #(
.CORE_ID(CORE_ID)
@ -31,10 +34,12 @@ module VX_issue #(
.reset (reset),
.decode_if (decode_if),
.writeback_if (writeback_if),
.gpr_busy (gpr_delay),
.alu_busy (alu_busy),
.lsu_busy (lsu_busy),
.csr_busy (csr_busy),
.mul_busy (mul_busy),
.fpu_busy (fpu_busy),
.gpu_busy (gpu_busy),
.schedule_delay (schedule_delay),
`UNUSED_PIN (is_empty)
@ -43,16 +48,20 @@ module VX_issue #(
VX_gpr_stage #(
.CORE_ID(CORE_ID)
) gpr_stage (
.clk (clk),
.clk (clk),
.reset (reset),
.decode_if (decode_if),
.writeback_if (writeback_if),
.gpr_data_if (gpr_data_if)
.gpr_data_if (gpr_data_if),
.schedule_delay (schedule_delay),
.gpr_delay (gpr_delay)
);
VX_alu_req_if alu_req_tmp_if();
VX_lsu_req_if lsu_req_tmp_if();
VX_csr_req_if csr_req_tmp_if();
VX_mul_req_if mul_req_tmp_if();
VX_fpu_req_if fpu_req_tmp_if();
VX_gpu_req_if gpu_req_tmp_if();
VX_issue_mux issue_mux (
@ -62,6 +71,7 @@ module VX_issue #(
.lsu_req_if (lsu_req_tmp_if),
.csr_req_if (csr_req_tmp_if),
.mul_req_if (mul_req_tmp_if),
.fpu_req_if (fpu_req_tmp_if),
.gpu_req_if (gpu_req_tmp_if)
);
@ -69,16 +79,18 @@ module VX_issue #(
wire stall_lsu = ~lsu_req_if.ready || schedule_delay;
wire stall_csr = ~csr_req_if.ready || schedule_delay;
wire stall_mul = ~mul_req_if.ready || schedule_delay;
wire stall_fpu = ~fpu_req_if.ready || schedule_delay;
wire stall_gpu = ~gpu_req_if.ready || schedule_delay;
wire flush_alu = alu_req_if.ready && schedule_delay;
wire flush_lsu = lsu_req_if.ready && schedule_delay;
wire flush_csr = csr_req_if.ready && schedule_delay;
wire flush_mul = mul_req_if.ready && schedule_delay;
wire flush_fpu = fpu_req_if.ready && schedule_delay;
wire flush_gpu = gpu_req_if.ready && schedule_delay;
VX_generic_register #(
.N(`NUM_THREADS +`NW_BITS + 32 + `ALU_BITS + `WB_BITS + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32)
.N(`NUM_THREADS +`NW_BITS + 32 + `ALU_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32 + 32)
) alu_reg (
.clk (clk),
.reset (reset),
@ -89,7 +101,7 @@ module VX_issue #(
);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + 1 + `BYTEEN_BITS + `WB_BITS + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32)
.N(`NUM_THREADS + `NW_BITS + 32 + 1 + `BYTEEN_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + 32)
) lsu_reg (
.clk (clk),
.reset (reset),
@ -100,7 +112,7 @@ module VX_issue #(
);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + `CSR_BITS + `WB_BITS + `NR_BITS + `CSR_ADDR_SIZE + 32 + 1)
.N(`NUM_THREADS + `NW_BITS + 32 + `CSR_BITS + 1 + `NR_BITS + `CSR_ADDR_SIZE + 32 + 1)
) csr_reg (
.clk (clk),
.reset (reset),
@ -110,8 +122,8 @@ module VX_issue #(
.out ({csr_req_if.valid, csr_req_if.warp_num, csr_req_if.curr_PC, csr_req_if.csr_op, csr_req_if.wb, csr_req_if.rd, csr_req_if.csr_addr, csr_req_if.csr_mask, csr_req_if.is_io})
);
VX_generic_register #(
.N(`NUM_THREADS +`NW_BITS + 32 + `MUL_BITS + `WB_BITS + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32))
VX_generic_register #(
.N(`NUM_THREADS +`NW_BITS + 32 + `MUL_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32))
) mul_reg (
.clk (clk),
.reset (reset),
@ -121,6 +133,17 @@ module VX_issue #(
.out ({mul_req_if.valid, mul_req_if.warp_num, mul_req_if.curr_PC, mul_req_if.mul_op, mul_req_if.wb, mul_req_if.rd, mul_req_if.rs1_data, mul_req_if.rs2_data})
);
// Stage register carrying the FPU request from fpu_req_tmp_if (issue mux side) to fpu_req_if.
// N is the sum of the widths of the fields packed into .in/.out, in this order:
//   valid (NUM_THREADS), warp_num (NW_BITS), curr_PC (32), fpu_op (FPU_BITS), wb (1), rd (NR_BITS),
//   rs1_data, rs2_data, rs3_data (NUM_THREADS * 32 each), frm (FRM_BITS).
VX_generic_register #(
.N(`NUM_THREADS +`NW_BITS + 32 + `FPU_BITS + 1 + `NR_BITS + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + (`NUM_THREADS * 32) + `FRM_BITS)
) fpu_reg (
.clk (clk),
.reset (reset),
// stall_fpu is set when fpu_req_if is not ready or schedule_delay is asserted
.stall (stall_fpu),
// flush_fpu is set when fpu_req_if is ready while schedule_delay is asserted
.flush (flush_fpu),
// .in and .out must list the same fields in the same order; adding a field requires updating N as well
.in ({fpu_req_tmp_if.valid, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data, fpu_req_tmp_if.frm}),
.out ({fpu_req_if.valid, fpu_req_if.warp_num, fpu_req_if.curr_PC, fpu_req_if.fpu_op, fpu_req_if.wb, fpu_req_if.rd, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data, fpu_req_if.frm})
);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + `GPU_BITS + (`NUM_THREADS * 32) + 32 + 32)
) gpu_reg (
@ -140,6 +163,9 @@ module VX_issue #(
if ((| mul_req_tmp_if.valid) && ~stall_mul) begin
$display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=MUL, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h", $time, CORE_ID, mul_req_tmp_if.warp_num, mul_req_tmp_if.curr_PC, mul_req_tmp_if.mul_op, mul_req_tmp_if.wb, mul_req_tmp_if.rd, mul_req_tmp_if.rs1_data, mul_req_tmp_if.rs2_data);
end
// Trace an FPU request when at least one thread is valid and the FPU stage register is not stalled.
// Tagged ex=FPU (was mislabelled ex=MUL) and extended with rs3 and frm, which FPU requests carry.
if ((| fpu_req_tmp_if.valid) && ~stall_fpu) begin
    $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=FPU, op=%0d, wb=%d, rd=%0d, rs1=%0h, rs2=%0h, rs3=%0h, frm=%0d", $time, CORE_ID, fpu_req_tmp_if.warp_num, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.fpu_op, fpu_req_tmp_if.wb, fpu_req_tmp_if.rd, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data, fpu_req_tmp_if.frm);
end
// Trace an LSU request when at least one thread is valid and the LSU stage register is not stalled.
// Arguments follow the format string order (wb, then rd); they were previously passed as rd, wb,
// so each value was printed under the other's label.
if ((| lsu_req_tmp_if.valid) && ~stall_lsu) begin
    $display("%t: Core%0d-issue: warp=%0d, PC=%0h, ex=LSU, rw=%b, wb=%0d, rd=%0d, byteen=%b, baddr=%0h, offset=%0h", $time, CORE_ID, lsu_req_tmp_if.warp_num, lsu_req_tmp_if.curr_PC, lsu_req_tmp_if.rw, lsu_req_tmp_if.wb, lsu_req_tmp_if.rd, lsu_req_tmp_if.byteen, lsu_req_tmp_if.base_addr, lsu_req_tmp_if.offset);
end

View file

@ -10,6 +10,7 @@ module VX_issue_mux (
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_mul_req_if mul_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if
);
@ -17,6 +18,7 @@ module VX_issue_mux (
wire[`NUM_THREADS-1:0] is_lsu = {`NUM_THREADS{decode_if.ex_type == `EX_LSU}};
wire[`NUM_THREADS-1:0] is_csr = {`NUM_THREADS{decode_if.ex_type == `EX_CSR}};
wire[`NUM_THREADS-1:0] is_mul = {`NUM_THREADS{decode_if.ex_type == `EX_MUL}};
wire[`NUM_THREADS-1:0] is_fpu = {`NUM_THREADS{decode_if.ex_type == `EX_FPU}};
wire[`NUM_THREADS-1:0] is_gpu = {`NUM_THREADS{decode_if.ex_type == `EX_GPU}};
// ALU unit
@ -64,6 +66,18 @@ module VX_issue_mux (
assign mul_req_if.rd = decode_if.rd;
assign mul_req_if.wb = decode_if.wb;
// FPU unit
// Per-thread valid mask: decode valid bits gated by "this instruction targets the FPU".
assign fpu_req_if.valid    = is_fpu & decode_if.valid;

// Instruction identity and operation fields taken from decode.
assign fpu_req_if.warp_num = decode_if.warp_num;
assign fpu_req_if.curr_PC  = decode_if.curr_PC;
assign fpu_req_if.fpu_op   = `FPU_OP(decode_if.instr_op);
assign fpu_req_if.frm      = decode_if.frm;

// Destination register and write-back enable.
assign fpu_req_if.wb       = decode_if.wb;
assign fpu_req_if.rd       = decode_if.rd;

// Source operands read from the register file (three sources are forwarded).
assign fpu_req_if.rs3_data = gpr_data_if.rs3_data;
assign fpu_req_if.rs2_data = gpr_data_if.rs2_data;
assign fpu_req_if.rs1_data = gpr_data_if.rs1_data;
// GPU unit
assign gpu_req_if.valid = decode_if.valid & is_gpu;
assign gpu_req_if.warp_num = decode_if.warp_num;

View file

@ -28,7 +28,7 @@ module VX_lsu_unit #(
wire [`BYTEEN_BITS-1:0] mem_byteen;
wire [`NR_BITS-1:0] use_rd;
wire [`NW_BITS-1:0] use_warp_num;
wire [`WB_BITS-1:0] use_wb;
wire use_wb;
wire [31:0] use_pc;
wire mrq_full;
@ -69,7 +69,7 @@ module VX_lsu_unit #(
`IGNORE_WARNINGS_END
VX_generic_register #(
.N(`NUM_THREADS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + `NW_BITS + `WB_BITS + 32)
.N(`NUM_THREADS + (`NUM_THREADS * 32) + `BYTEEN_BITS + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + `NW_BITS + 1 + 32)
) mem_req_reg (
.clk (clk),
.reset (reset),
@ -97,7 +97,7 @@ module VX_lsu_unit #(
wire mrq_pop = mrq_pop_part && (0 == mem_rsp_mask_upd);
VX_index_queue #(
.DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + `WB_BITS + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS),
.DATAW (`LOG2UP(`DCREQ_SIZE) + 32 + 1 + (`NUM_THREADS * 2) + `BYTEEN_BITS + `NR_BITS + `NW_BITS),
.SIZE (`DCREQ_SIZE)
) mem_req_queue (
.clk (clk),

View file

@ -36,7 +36,7 @@ module VX_mul_unit #(
.WIDTHB(33),
.WIDTHP(64),
.SIGNED(1),
.PIPELINE(`MUL_LATENCY)
.PIPELINE(`LATENCY_IMUL)
) multiplier (
.clk(clk),
.reset(reset),
@ -52,7 +52,7 @@ module VX_mul_unit #(
.WIDTHR(32),
.NSIGNED(1),
.DSIGNED(1),
.PIPELINE(`DIV_LATENCY)
.PIPELINE(`LATENCY_IDIV)
) sdiv (
.clk(clk),
.reset(reset),
@ -81,7 +81,7 @@ module VX_mul_unit #(
reg result_avail;
reg [4:0] pending_ctr;
wire [4:0] instr_delay = `IS_DIV_OP(alu_op) ? `DIV_LATENCY : `MUL_LATENCY;
wire [4:0] instr_delay = `IS_DIV_OP(alu_op) ? `LATENCY_IDIV : `LATENCY_IMUL;
always @(posedge clk) begin
if (reset) begin
@ -112,7 +112,7 @@ module VX_mul_unit #(
wire flush = mul_commit_if.ready && pipeline_stall;
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + `WB_BITS + (`NUM_THREADS * 32))
.N(`NUM_THREADS + `NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
) mul_reg (
.clk (clk),
.reset (reset),

View file

@ -110,6 +110,7 @@ module VX_pipeline #(
VX_lsu_req_if lsu_req_if();
VX_csr_req_if csr_req_if();
VX_mul_req_if mul_req_if();
VX_fpu_req_if fpu_req_if();
VX_gpu_req_if gpu_req_if();
VX_wb_if writeback_if();
VX_wstall_if wstall_if();
@ -118,6 +119,7 @@ module VX_pipeline #(
VX_commit_if lsu_commit_if();
VX_commit_if csr_commit_if();
VX_commit_if mul_commit_if();
VX_commit_if fpu_commit_if();
VX_commit_if gpu_commit_if();
VX_fetch #(
@ -159,6 +161,7 @@ module VX_pipeline #(
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
.mul_req_if (mul_req_if),
.fpu_req_if (fpu_req_if),
.gpu_req_if (gpu_req_if)
);
@ -181,6 +184,7 @@ module VX_pipeline #(
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
.mul_req_if (mul_req_if),
.fpu_req_if (fpu_req_if),
.gpu_req_if (gpu_req_if),
.warp_ctl_if (warp_ctl_if),
@ -189,6 +193,7 @@ module VX_pipeline #(
.lsu_commit_if (lsu_commit_if),
.csr_commit_if (csr_commit_if),
.mul_commit_if (mul_commit_if),
.fpu_commit_if (fpu_commit_if),
.gpu_commit_if (gpu_commit_if),
.ebreak (ebreak)
@ -204,6 +209,7 @@ module VX_pipeline #(
.lsu_commit_if (lsu_commit_if),
.csr_commit_if (csr_commit_if),
.mul_commit_if (mul_commit_if),
.fpu_commit_if (fpu_commit_if),
.gpu_commit_if (gpu_commit_if),
.writeback_if (writeback_if),

View file

@ -8,10 +8,12 @@ module VX_scheduler #(
VX_decode_if decode_if,
VX_wb_if writeback_if,
input wire gpr_busy,
input wire alu_busy,
input wire lsu_busy,
input wire csr_busy,
input wire mul_busy,
input wire fpu_busy,
input wire gpu_busy,
output wire schedule_delay,
output wire is_empty
@ -19,23 +21,27 @@ module VX_scheduler #(
localparam CTVW = `CLOG2(`NUM_WARPS * `NUM_REGS + 1);
reg [`NUM_REGS-1:0][`NUM_THREADS-1:0] rename_table [`NUM_WARPS-1:0];
reg [`NUM_REGS-1:0] busy_table[`NUM_WARPS-1:0];
reg [`NUM_REGS-1:0] busy_table [`NUM_WARPS-1:0];
reg [CTVW-1:0] count_valid;
wire rs1_rename = busy_table[decode_if.warp_num][decode_if.rs1];
wire rs2_rename = busy_table[decode_if.warp_num][decode_if.rs2];
wire rs3_rename = busy_table[decode_if.warp_num][decode_if.rs3];
wire rd_rename = busy_table[decode_if.warp_num][decode_if.rd];
wire rs1_rename_qual = (rs1_rename) && (decode_if.use_rs1);
wire rs2_rename_qual = (rs2_rename) && (decode_if.use_rs2);
wire rd_rename_qual = (rd_rename) && (decode_if.wb != 0);
wire rs1_rename_qual = rs1_rename && decode_if.use_rs1;
wire rs2_rename_qual = rs2_rename && decode_if.use_rs2;
wire rs3_rename_qual = rs3_rename && decode_if.use_rs3;
wire rd_rename_qual = rd_rename && decode_if.wb;
wire rename_valid = (rs1_rename_qual || rs2_rename_qual || rd_rename_qual);
wire rename_valid = (rs1_rename_qual || rs2_rename_qual || rs3_rename_qual || rd_rename_qual);
wire ex_stalled = ((alu_busy && (decode_if.ex_type == `EX_ALU))
wire ex_stalled = ((gpr_busy)
|| (alu_busy && (decode_if.ex_type == `EX_ALU))
|| (lsu_busy && (decode_if.ex_type == `EX_LSU))
|| (csr_busy && (decode_if.ex_type == `EX_CSR))
|| (mul_busy && (decode_if.ex_type == `EX_MUL))
|| (fpu_busy && (decode_if.ex_type == `EX_FPU))
|| (gpu_busy && (decode_if.ex_type == `EX_GPU)));
wire stall = (ex_stalled || rename_valid) && (| decode_if.valid);
@ -82,7 +88,7 @@ module VX_scheduler #(
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (stall) begin
$display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, rename=%b%b%b, alu=%b, lsu=%b, csr=%b, mul=%b, gpu=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, rd_rename_qual, rs1_rename_qual, rs2_rename_qual, alu_busy, lsu_busy, csr_busy, mul_busy, gpu_busy);
// rename bits are printed in the order rd, rs1, rs2, rs3; gpr/fpu busy are included because both can cause the stall.
$display("%t: Core%0d-stall: warp=%0d, PC=%0h, rd=%0d, wb=%0d, rename=%b%b%b%b, gpr=%b, alu=%b, lsu=%b, csr=%b, mul=%b, fpu=%b, gpu=%b", $time, CORE_ID, decode_if.warp_num, decode_if.curr_PC, decode_if.rd, decode_if.wb, rd_rename_qual, rs1_rename_qual, rs2_rename_qual, rs3_rename_qual, gpr_busy, alu_busy, lsu_busy, csr_busy, mul_busy, fpu_busy, gpu_busy);
end
end
`endif

View file

@ -9,17 +9,19 @@ module VX_writeback #(
// inputs
VX_commit_if alu_commit_if,
VX_commit_if lsu_commit_if,
VX_commit_if mul_commit_if,
VX_commit_if mul_commit_if,
VX_commit_if fpu_commit_if,
VX_commit_if csr_commit_if,
// outputs
VX_wb_if writeback_if
);
wire lsu_valid = (| lsu_commit_if.valid) && (lsu_commit_if.wb != `WB_NO);
wire mul_valid = (| mul_commit_if.valid) && (mul_commit_if.wb != `WB_NO);
wire alu_valid = (| alu_commit_if.valid) && (alu_commit_if.wb != `WB_NO);
wire csr_valid = (| csr_commit_if.valid) && (csr_commit_if.wb != `WB_NO);
wire alu_valid = (| alu_commit_if.valid) && alu_commit_if.wb;
wire lsu_valid = (| lsu_commit_if.valid) && lsu_commit_if.wb;
wire csr_valid = (| csr_commit_if.valid) && csr_commit_if.wb;
wire mul_valid = (| mul_commit_if.valid) && mul_commit_if.wb;
wire fpu_valid = (| fpu_commit_if.valid) && fpu_commit_if.wb;
VX_wb_if writeback_tmp_if();
@ -47,23 +49,26 @@ module VX_writeback #(
csr_valid ? csr_commit_if.rd :
0;
assign writeback_tmp_if.is_fp = fpu_valid && fpu_commit_if.ready;
wire stall = ~writeback_if.ready && (| writeback_if.valid);
VX_generic_register #(
.N(`NUM_THREADS + `NW_BITS + `NR_BITS + (`NUM_THREADS * 32))
.N(`NUM_THREADS + `NW_BITS + `NR_BITS + (`NUM_THREADS * 32) + 1)
) wb_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.data}),
.out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.data})
.in ({writeback_tmp_if.valid, writeback_tmp_if.warp_num, writeback_tmp_if.rd, writeback_tmp_if.data, writeback_tmp_if.is_fp}),
.out ({writeback_if.valid, writeback_if.warp_num, writeback_if.rd, writeback_if.data, writeback_if.is_fp})
);
assign lsu_commit_if.ready = !stall;
assign mul_commit_if.ready = !stall && !lsu_valid;
assign alu_commit_if.ready = !stall && !lsu_valid && !mul_valid;
assign csr_commit_if.ready = !stall && !lsu_valid && !mul_valid && !alu_valid;
assign fpu_commit_if.ready = !stall && !lsu_valid;
assign mul_commit_if.ready = !stall && !lsu_valid && !fpu_valid;
assign alu_commit_if.ready = !stall && !lsu_valid && !fpu_valid && !mul_valid;
assign csr_commit_if.ready = !stall && !lsu_valid && !fpu_valid && !mul_valid && !alu_valid;
// special workaround to control RISC-V benchmarks termination on Verilator
reg [31:0] last_data_wb /* verilator public */;

1
hw/rtl/fp_cores/fpnew Submodule

@ -0,0 +1 @@
Subproject commit 1def7bb630ceae2ebc58921f6b5ee3e686fb6d5a

View file

@ -11,7 +11,7 @@ interface VX_alu_req_if ();
wire [`ALU_BITS-1:0] alu_op;
wire [`WB_BITS-1:0] wb;
wire wb;
wire [`NR_BITS-1:0] rd;
wire [`NUM_THREADS-1:0][31:0] rs1_data;

View file

@ -10,8 +10,7 @@ interface VX_commit_if ();
wire [31:0] curr_PC;
wire [`NUM_THREADS-1:0][31:0] data;
wire [`NR_BITS-1:0] rd;
wire [`WB_BITS-1:0] wb;
wire is_io;
wire wb;
wire ready;
endinterface

View file

@ -15,7 +15,7 @@ interface VX_csr_req_if ();
wire [31:0] csr_mask;
wire [`NR_BITS-1:0] rd;
wire [`WB_BITS-1:0] wb;
wire wb;
wire is_io;
wire ready;

View file

@ -19,12 +19,19 @@ interface VX_decode_if ();
wire [31:0] imm;
wire rs1_is_PC;
wire rs2_is_imm;
wire rs2_is_imm;
wire use_rs1;
wire use_rs2;
wire [`WB_BITS-1:0] wb;
// FP states
wire [`NR_BITS-1:0] rs3;
wire use_rs3;
wire rs1_is_fp;
wire rs2_is_fp;
wire [`FRM_BITS-1:0] frm;
wire wb;
wire ready;

View file

@ -0,0 +1,16 @@
// Interface bundling the per-warp `frm` value exposed to the FPU.
// NOTE(review): the name suggests the CSR unit drives this and that `frm` is the
// floating-point rounding mode; neither the driver nor the encoding is visible here, so confirm.
`ifndef VX_FPU_FROM_CSR_IF
`define VX_FPU_FROM_CSR_IF
`include "VX_define.vh"
interface VX_fpu_from_csr_if ();
// The declarations below are wrapped in the project's IGNORE_WARNINGS macro pair.
`IGNORE_WARNINGS_BEGIN
// One FRM_BITS-wide entry per warp (NUM_WARPS entries).
wire [`NUM_WARPS-1:0][`FRM_BITS-1:0] frm;
`IGNORE_WARNINGS_END
endinterface
`endif

View file

@ -0,0 +1,26 @@
`ifndef VX_FPU_REQ_IF
`define VX_FPU_REQ_IF

`include "VX_define.vh"

// Request bundle handed from the issue stage to the FPU.
interface VX_fpu_req_if ();

    // Handshake: one valid bit per thread, single ready wire.
    wire [`NUM_THREADS-1:0]         valid;
    wire                            ready;

    // Instruction identity.
    wire [`NW_BITS-1:0]             warp_num;
    wire [31:0]                     curr_PC;

    // Operation selector and the frm field forwarded with the request.
    wire [`FPU_BITS-1:0]            fpu_op;
    wire [`FRM_BITS-1:0]            frm;

    // Destination register and single-bit write-back enable.
    wire [`NR_BITS-1:0]             rd;
    wire                            wb;

    // Up to three 32-bit source operands per thread.
    wire [`NUM_THREADS-1:0][31:0]   rs1_data;
    wire [`NUM_THREADS-1:0][31:0]   rs2_data;
    wire [`NUM_THREADS-1:0][31:0]   rs3_data;

endinterface

`endif

View file

@ -0,0 +1,23 @@
// Interface bundling a valid bit, a warp number and five floating-point flag wires.
// NOTE(review): the name suggests the FPU drives this toward the CSR unit so the flags can be
// accumulated per warp; the driver and consumer are not visible here, so confirm.
`ifndef VX_FPU_TO_CSR_IF
`define VX_FPU_TO_CSR_IF
`include "VX_define.vh"
interface VX_fpu_to_csr_if ();
// The declarations below are wrapped in the project's IGNORE_WARNINGS macro pair.
`IGNORE_WARNINGS_BEGIN
wire valid;
// Warp the flag wires refer to (NW_BITS wide).
wire [`NW_BITS-1:0] warp_num;
// Flag names match the RISC-V fcsr.fflags bits: NV = invalid operation, DZ = divide by zero,
// OF = overflow, UF = underflow, NX = inexact.
wire fflags_NV;
wire fflags_DZ;
wire fflags_OF;
wire fflags_UF;
wire fflags_NX;
`IGNORE_WARNINGS_END
endinterface
`endif

View file

@ -7,6 +7,7 @@ interface VX_gpr_data_if ();
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data;
endinterface

View file

@ -12,7 +12,7 @@ interface VX_lsu_req_if ();
wire rw;
wire [`BYTEEN_BITS-1:0] byteen;
wire [`WB_BITS-1:0] wb;
wire wb;
wire [`NR_BITS-1:0] rd;
wire [`NUM_THREADS-1:0][31:0] store_data;

View file

@ -11,7 +11,7 @@ interface VX_mul_req_if ();
wire [`MUL_BITS-1:0] mul_op;
wire [`WB_BITS-1:0] wb;
wire wb;
wire [`NR_BITS-1:0] rd;
wire [`NUM_THREADS-1:0][31:0] rs1_data;

View file

@ -9,6 +9,7 @@ interface VX_wb_if ();
wire [`NW_BITS-1:0] warp_num;
wire [`NR_BITS-1:0] rd;
wire [`NUM_THREADS-1:0][31:0] data;
wire is_fp;
wire ready;
endinterface

11
hw/simulate/verilator.vlt Normal file
View file

@ -0,0 +1,11 @@
`verilator_config
lint_off -rule BLKANDNBLK -file "../rtl/fp_cores/fpnew/*"
lint_off -rule UNOPTFLAT -file "../rtl/fp_cores/fpnew/*"
lint_off -rule WIDTH -file "../rtl/fp_cores/fpnew/*"
lint_off -rule UNUSED -file "../rtl/fp_cores/fpnew/*"
lint_off -rule LITENDIAN -file "../rtl/fp_cores/fpnew/*"
lint_off -rule IMPORTSTAR -file "../rtl/fp_cores/fpnew/*"
lint_off -rule PINCONNECTEMPTY -file "../rtl/fp_cores/fpnew/*"
//lint_off -rule CASEINCOMPLETE -file "../rtl/fp_cores/fpnew/*"