ibuffer addition

This commit is contained in:
Blaise Tine 2020-08-22 00:22:04 -07:00
parent 6c12391338
commit 0b355f228e
80 changed files with 1811 additions and 1528 deletions

View file

@ -89,4 +89,26 @@ extern int vx_upload_kernel_file(vx_device_h device, const char* filename) {
delete[] content;
return err;
}
extern int vx_get_perf(vx_device_h device, uint64_t* cycles, uint64_t* instrs) {
int ret = 0;
unsigned value;
if (cycles) {
ret |= vx_csr_get(device, 0, CSR_CYCLE_H, &value);
*cycles = value;
ret |= vx_csr_get(device, 0, CSR_CYCLE, &value);
*cycles = (*cycles << 32) | value;
}
if (instrs) {
ret |= vx_csr_get(device, 0, CSR_INSTRET_H, &value);
*instrs = value;
ret |= vx_csr_get(device, 0, CSR_INSTRET, &value);
*instrs = (*instrs << 32) | value;
}
return ret;
}

View file

@ -71,6 +71,9 @@ int vx_upload_kernel_bytes(vx_device_h device, const void* content, size_t size)
// upload kernel file to device
int vx_upload_kernel_file(vx_device_h device, const char* filename);
// get performance counters
// Reads the device's 64-bit cycle counter (CSR_CYCLE_H:CSR_CYCLE) and
// instruction counter (CSR_INSTRET_H:CSR_INSTRET).
// - cycles: receives the cycle count; pass NULL to skip reading it.
// - instrs: receives the instruction count; pass NULL to skip reading it.
// Note the parameter order: cycles first, then instrs.
// Returns the bitwise OR of the underlying vx_csr_get() status codes
// (callers treat 0 as success).
int vx_get_perf(vx_device_h device, uint64_t* cycles, uint64_t* instrs);
#ifdef __cplusplus
}
#endif

View file

@ -212,25 +212,11 @@ extern int vx_dev_close(vx_device_h hdevice) {
#endif
{
// Dump performance stats
// Dump perf stats
uint64_t instrs, cycles;
unsigned value;
int ret = 0;
ret |= vx_csr_get(hdevice, 0, CSR_INSTRET_H, &value);
instrs = value;
ret |= vx_csr_get(hdevice, 0, CSR_INSTRET, &value);
instrs = (instrs << 32) | value;
ret |= vx_csr_get(hdevice, 0, CSR_CYCLE_H, &value);
cycles = value;
ret |= vx_csr_get(hdevice, 0, CSR_CYCLE, &value);
cycles = (cycles << 32) | value;
// vx_get_perf() is declared as (device, cycles, instrs): pass the output
// pointers in that order, otherwise the two counters are swapped and the
// reported IPC is inverted.
int ret = vx_get_perf(hdevice, &cycles, &instrs);
float IPC = (float)(double(instrs) / double(cycles));
// uint64_t is not guaranteed to be `long`; cast to unsigned long long and
// print with %llu so the conversion matches the argument on every platform.
fprintf(stdout, "PERF: instrs=%llu, cycles=%llu, IPC=%f\n", (unsigned long long)instrs, (unsigned long long)cycles, IPC);
assert(ret == 0);
}

View file

@ -68,7 +68,8 @@ public:
simulator_.attach_ram(&ram_);
}
~vx_device() {
~vx_device() {
simulator_.print_stats(std::cout);
if (future_.valid()) {
future_.wait();
}

View file

@ -155,7 +155,7 @@ int run_kernel_test(const kernel_arg_t& kernel_arg,
int32_t curr = ((int32_t*)vx_host_ptr(buffer))[i];
int32_t ref = i;
if (curr != ref) {
std::cout << "error at value " << i
std::cout << "error at result #" << i
<< ": actual 0x" << curr << ", expected 0x" << ref << std::endl;
++errors;
}
@ -238,7 +238,7 @@ int main(int argc, char *argv[]) {
std::cout << "cleanup" << std::endl;
cleanup();
std::cout << "Test PASSED" << std::endl;
std::cout << "Test PASSED" << std::endl;
return 0;
}

View file

@ -86,7 +86,7 @@ int run_test(const kernel_arg_t& kernel_arg,
int ref = i + i;
int cur = buf_ptr[i];
if (cur != ref) {
std::cout << "error at value " << i
std::cout << "error at result #" << i
<< ": actual 0x" << cur << ", expected 0x" << ref << std::endl;
++errors;
}

View file

@ -57,7 +57,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (c[i] != ref) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -85,7 +85,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (c[i] != ref) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -113,7 +113,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (c[i] != ref) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -143,7 +143,7 @@ public:
auto y = a[i] * b[i];
auto ref = x + y;
if (c[i] != ref) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -171,7 +171,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] + b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -199,7 +199,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] - b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -227,7 +227,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] * b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -255,7 +255,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] * b[i] + 0.5f;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -283,7 +283,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] * b[i] - 0.5f;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -311,7 +311,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] - 0.5f;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -339,7 +339,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = -a[i] * b[i] + 0.5f;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -369,7 +369,7 @@ public:
auto y = a[i] * b[i] + 0.5f;
auto ref = x + y;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -397,7 +397,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = a[i] / b[i];
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -427,7 +427,7 @@ public:
auto y = b[i] / a[i];
auto ref = x + y;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -456,7 +456,7 @@ public:
for (int i = 0; i < n; ++i) {
auto ref = sqrt(a[i] * b[i]);
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -485,7 +485,7 @@ public:
auto x = a[i] + b[i];
auto ref = (int32_t)x;
if (c[i] != ref) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -514,7 +514,7 @@ public:
auto x = a[i] + b[i];
auto ref = (uint32_t)x;
if (c[i] != ref) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -543,7 +543,7 @@ public:
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}
@ -572,7 +572,7 @@ public:
auto x = a[i] + b[i];
auto ref = (float)x;
if (!almost_equal(c[i], ref)) {
std::cout << "error at value " << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
std::cout << "error at result #" << i << ": expected " << ref << ", actual " << c[i] << ", a=" << a[i] << ", b=" << b[i] << std::endl;
++errors;
}
}

View file

@ -75,6 +75,9 @@ tar -zcvf output_files_1c.tar.gz `find ./build_fpga_1c -type f \( -iname \*.rpt
# compress VCD trace
tar -zcvf vortex.vcd.tar.gz ./build_ase_1c/work/vortex.vcd
tar -zcvf trace.vcd.tar.gz obj_dir/trace.vcd
tar -zcvf trace.vcd.tar.gz trace.vcd
tar -zcvf run.log.tar.gz run.log
# decompress VCD trace
tar -zxvf /mnt/c/Users/Blaise/Downloads/vortex.vcd.tar.gz

View file

@ -3,90 +3,143 @@
module VX_alu_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Inputs
VX_alu_req_if alu_req_if,
VX_alu_req_if alu_req_if,
// Outputs
VX_exu_to_cmt_if alu_commit_if
VX_branch_ctl_if branch_ctl_if,
VX_exu_to_cmt_if alu_commit_if
);
reg [`NUM_THREADS-1:0][31:0] alu_result;
wire [`NUM_THREADS-1:0][31:0] addsub_result;
wire [`NUM_THREADS-1:0] less_result;
wire [`NUM_THREADS-1:0][31:0] shift_result;
reg [`NUM_THREADS-1:0][31:0] misc_result;
reg [`NUM_THREADS-1:0][31:0] alu_result;
reg [`NUM_THREADS-1:0][31:0] add_result;
reg [`NUM_THREADS-1:0][32:0] sub_result;
reg [`NUM_THREADS-1:0][31:0] shift_result;
reg [`NUM_THREADS-1:0][31:0] misc_result;
wire valid_r;
wire [`NW_BITS-1:0] wid_r;
wire [`NUM_THREADS-1:0] thread_mask_r;
wire [31:0] curr_PC_r;
wire [`NR_BITS-1:0] rd_r;
wire wb_r;
wire [`NT_BITS-1:0] tid_r;
wire is_sub_r;
wire [`BR_BITS-1:0] br_op_r;
wire is_br_op_r, is_br_op_s;
wire [1:0] alu_op_class_r;
wire [31:0] next_PC_r;
wire is_br_op = `IS_BR_OP(alu_req_if.op);
wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op);
wire [`BR_BITS-1:0] br_op = `BR_OP(alu_req_if.op);
wire alu_signed = `ALU_SIGNED(alu_op);
wire [1:0] alu_op_class = `ALU_OP_CLASS(alu_op);
wire is_sub = (alu_op == `ALU_SUB);
wire [`ALU_BITS-1:0] alu_op = `ALU_OP(alu_req_if.op);
wire [`NUM_THREADS-1:0][31:0] alu_in1 = alu_req_if.rs1_data;
wire [`NUM_THREADS-1:0][31:0] alu_in2 = alu_req_if.rs2_data;
wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.curr_PC}} : alu_in1;
wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
wire negate_add = (alu_op == `ALU_SUB);
wire signed_less = (alu_op == `ALU_SLT);
wire signed_shift = (alu_op == `ALU_SRA);
wire [`NUM_THREADS-1:0][31:0] alu_in1_PC = alu_req_if.rs1_is_PC ? {`NUM_THREADS{alu_req_if.curr_PC}} : alu_in1;
wire [`NUM_THREADS-1:0][31:0] alu_in2_imm = alu_req_if.rs2_is_imm ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
wire [`NUM_THREADS-1:0][31:0] alu_in2_less = (alu_req_if.rs2_is_imm && ~is_br_op) ? {`NUM_THREADS{alu_req_if.imm}} : alu_in2;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] addsub_in1 = {alu_in1_PC[i], 1'b1};
wire [32:0] addsub_in2 = {alu_in2_imm[i], 1'b0} ^ {33{negate_add}};
`IGNORE_WARNINGS_BEGIN
wire [32:0] addsub_addd = addsub_in1 + addsub_in2;
`IGNORE_WARNINGS_END
assign addsub_result[i] = addsub_addd[32:1];
always @(posedge clk) begin
add_result[i] <= alu_in1_PC[i] + alu_in2_imm[i];
end
end
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] less_in1 = {signed_less & alu_in1[i][31], alu_in1[i]};
wire [32:0] less_in2 = {signed_less & alu_in2_imm[i][31], alu_in2_imm[i]};
assign less_result[i] = $signed(less_in1) < $signed(less_in2);
wire [32:0] sub_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
wire [32:0] sub_in2 = {alu_signed & alu_in2_less[i][31], alu_in2_less[i]};
always @(posedge clk) begin
sub_result[i] <= $signed(sub_in1) - $signed(sub_in2);
end
end
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [32:0] shift_in1 = {signed_shift & alu_in1[i][31], alu_in1[i]};
wire [32:0] shift_in1 = {alu_signed & alu_in1[i][31], alu_in1[i]};
`IGNORE_WARNINGS_BEGIN
wire [32:0] shift_value = $signed(shift_in1) >>> alu_in2_imm[i][4:0];
`IGNORE_WARNINGS_END
assign shift_result[i] = shift_value[31:0];
always @(posedge clk) begin
shift_result[i] <= shift_value[31:0];
end
end
for (genvar i = 0; i < `NUM_THREADS; i++) begin
always @(*) begin
always @(posedge clk) begin
case (alu_op)
`ALU_AND: misc_result[i] = alu_in1[i] & alu_in2_imm[i];
`ALU_OR: misc_result[i] = alu_in1[i] | alu_in2_imm[i];
`ALU_XOR: misc_result[i] = alu_in1[i] ^ alu_in2_imm[i];
`ALU_AND: misc_result[i] <= alu_in1[i] & alu_in2_imm[i];
`ALU_OR: misc_result[i] <= alu_in1[i] | alu_in2_imm[i];
`ALU_XOR: misc_result[i] <= alu_in1[i] ^ alu_in2_imm[i];
//`ALU_SLL,
default: misc_result[i] = alu_in1[i] << alu_in2_imm[i][4:0];
default: misc_result[i] <= alu_in1[i] << alu_in2_imm[i][4:0];
endcase
end
end
// Sequential PC (curr_PC + 4) of the incoming request; it is fed into
// alu_shift_reg and later selected as the JAL/JALR result.
// Declared as a wire so this is a continuous assignment that follows
// curr_PC. An initializer on a `reg` declaration is evaluated only once at
// time zero and would not track later requests.
wire [31:0] next_PC = alu_req_if.curr_PC + 4;
VX_shift_register #(
.DATAW(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `NT_BITS + 1 + 1 + `BR_BITS + 2 + 32),
.DEPTH(1)
) alu_shift_reg (
.clk(clk),
.reset(reset),
.enable(alu_req_if.ready),
.in({alu_req_if.valid, alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid, is_sub, is_br_op, br_op, alu_op_class, next_PC}),
.out({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, tid_r, is_sub_r, is_br_op_r, br_op_r, alu_op_class_r, next_PC_r})
);
for (genvar i = 0; i < `NUM_THREADS; i++) begin
always @(*) begin
case (`ALU_OP_CLASS(alu_op))
0: alu_result[i] = addsub_result[i];
1: alu_result[i] = {31'b0, less_result[i]};
case (alu_op_class_r)
0: alu_result[i] = is_sub_r ? sub_result[i][31:0] : add_result[i];
1: alu_result[i] = {31'b0, sub_result[i][32]};
2: alu_result[i] = shift_result[i];
default: alu_result[i] = misc_result[i];
endcase
end
end
end
// branch handling
wire br_neg = `BR_NEG(br_op_r);
wire br_less = `BR_LESS(br_op_r);
wire br_static = `BR_STATIC(br_op_r);
wire is_jal = is_br_op_r && (br_op_r == `BR_JAL || br_op_r == `BR_JALR);
wire [31:0] br_dest = add_result[tid_r];
wire [32:0] cmp_result = sub_result[tid_r];
wire is_less = cmp_result[32];
wire is_equal = ~(| cmp_result[31:0]);
wire br_taken = ((br_less ? is_less : is_equal) ^ br_neg) | br_static;
wire [`NUM_THREADS-1:0][31:0] alu_jal_result = is_jal ? {`NUM_THREADS{next_PC_r}} : alu_result;
// output
wire stall_out = ~alu_commit_if.ready && alu_commit_if.valid;
VX_generic_register #(
.N(1 + `ISTAG_BITS + (`NUM_THREADS * 32))
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + 1 + 32)
) alu_reg (
.clk (clk),
.reset (reset),
.stall (0),
.stall (stall_out),
.flush (0),
.in ({alu_req_if.valid, alu_req_if.issue_tag, alu_result}),
.out ({alu_commit_if.valid, alu_commit_if.issue_tag, alu_commit_if.data})
.in ({valid_r, wid_r, thread_mask_r, curr_PC_r, rd_r, wb_r, alu_jal_result, is_br_op_r, br_taken, br_dest}),
.out ({alu_commit_if.valid, alu_commit_if.wid, alu_commit_if.thread_mask, alu_commit_if.curr_PC, alu_commit_if.rd, alu_commit_if.wb, alu_commit_if.data, is_br_op_s, branch_ctl_if.taken, branch_ctl_if.dest})
);
assign alu_req_if.ready = 1'b1;
assign branch_ctl_if.valid = alu_commit_if.valid && alu_commit_if.ready && is_br_op_s;
assign branch_ctl_if.wid = alu_commit_if.wid;
// can accept new request?
assign alu_req_if.ready = ~stall_out;
endmodule

View file

@ -1,56 +0,0 @@
`include "VX_define.vh"
module VX_bru_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
// Inputs
VX_bru_req_if bru_req_if,
// Outputs
VX_branch_ctl_if branch_ctl_if,
VX_exu_to_cmt_if bru_commit_if
);
wire [`BRU_BITS-1:0] bru_op = bru_req_if.op;
wire bru_neg = `BRU_NEG(bru_op);
wire bru_less = `BRU_LESS(bru_op);
wire bru_signed = `BRU_SIGNED(bru_op);
wire bru_static = `BRU_STATIC(bru_op);
wire [31:0] rs1_data = bru_req_if.rs1_data;
wire [31:0] rs2_data = bru_req_if.rs2_data;
wire [32:0] signed_in1 = {bru_signed & rs1_data[31], rs1_data};
wire [32:0] signed_in2 = {bru_signed & rs2_data[31], rs2_data};
wire is_less = $signed(signed_in1) < $signed(signed_in2);
wire is_equal = (rs1_data == rs2_data);
wire taken = ((bru_less ? is_less : is_equal) ^ bru_neg) | bru_static;
wire [31:0] base_addr = bru_req_if.rs1_is_PC ? bru_req_if.curr_PC : rs1_data;
wire [31:0] dest = base_addr + bru_req_if.offset;
wire [31:0] jal_result = bru_req_if.curr_PC + 4;
wire [31:0] jal_result_r;
VX_generic_register #(
.N(1 + `NW_BITS + `ISTAG_BITS + 1 + 32 + 32)
) bru_reg (
.clk (clk),
.reset (reset),
.stall (0),
.flush (0),
.in ({bru_req_if.valid, bru_req_if.wid, bru_req_if.issue_tag, taken, dest, jal_result}),
.out ({bru_commit_if.valid, branch_ctl_if.wid, bru_commit_if.issue_tag, branch_ctl_if.taken, branch_ctl_if.dest, jal_result_r})
);
assign branch_ctl_if.valid = bru_commit_if.valid;
assign bru_commit_if.data = {`NUM_THREADS{jal_result_r}};
assign bru_req_if.ready = 1'b1;
endmodule

View file

@ -8,7 +8,6 @@ module VX_commit #(
// inputs
VX_exu_to_cmt_if alu_commit_if,
VX_exu_to_cmt_if bru_commit_if,
VX_exu_to_cmt_if lsu_commit_if,
VX_exu_to_cmt_if mul_commit_if,
VX_exu_to_cmt_if csr_commit_if,
@ -16,15 +15,13 @@ module VX_commit #(
VX_exu_to_cmt_if gpu_commit_if,
// outputs
VX_cmt_to_issue_if cmt_to_issue_if,
VX_wb_if writeback_if,
VX_writeback_if writeback_if,
VX_cmt_to_csr_if cmt_to_csr_if
);
// update CRSs
// CSRs update
wire [`NUM_EXS-1:0] commited_mask;
assign commited_mask = {alu_commit_if.valid,
bru_commit_if.valid,
assign commited_mask = {alu_commit_if.valid,
lsu_commit_if.valid,
csr_commit_if.valid,
mul_commit_if.valid,
@ -44,7 +41,7 @@ module VX_commit #(
always @(*) begin
fflags = 0;
for (integer i = 0; i < `NUM_THREADS; i++) begin
if (cmt_to_issue_if.fpu_data.thread_mask[i]) begin
if (fpu_commit_if.thread_mask[i]) begin
fflags.NX |= fpu_commit_if.fflags[i].NX;
fflags.UF |= fpu_commit_if.fflags[i].UF;
fflags.OF |= fpu_commit_if.fflags[i].OF;
@ -64,7 +61,7 @@ module VX_commit #(
csr_update_r <= (| commited_mask);
fflags_r <= fflags;
has_fflags_r <= fpu_commit_if.valid && fpu_commit_if.has_fflags;
wid_r <= cmt_to_issue_if.fpu_data.wid;
wid_r <= fpu_commit_if.wid;
num_commits_r <= num_commits;
end
@ -74,23 +71,7 @@ module VX_commit #(
assign cmt_to_csr_if.has_fflags = has_fflags_r;
assign cmt_to_csr_if.fflags = fflags_r;
// Notify issue stage
assign cmt_to_issue_if.alu_valid = alu_commit_if.valid;
assign cmt_to_issue_if.bru_valid = bru_commit_if.valid;
assign cmt_to_issue_if.lsu_valid = lsu_commit_if.valid;
assign cmt_to_issue_if.csr_valid = csr_commit_if.valid;
assign cmt_to_issue_if.mul_valid = mul_commit_if.valid;
assign cmt_to_issue_if.fpu_valid = fpu_commit_if.valid;
assign cmt_to_issue_if.gpu_valid = gpu_commit_if.valid;
assign cmt_to_issue_if.alu_tag = alu_commit_if.issue_tag;
assign cmt_to_issue_if.bru_tag = bru_commit_if.issue_tag;
assign cmt_to_issue_if.lsu_tag = lsu_commit_if.issue_tag;
assign cmt_to_issue_if.csr_tag = csr_commit_if.issue_tag;
assign cmt_to_issue_if.mul_tag = mul_commit_if.issue_tag;
assign cmt_to_issue_if.fpu_tag = fpu_commit_if.issue_tag;
assign cmt_to_issue_if.gpu_tag = gpu_commit_if.issue_tag;
// Writeback
VX_writeback #(
.CORE_ID(CORE_ID)
@ -99,41 +80,38 @@ module VX_commit #(
.reset (reset),
.alu_commit_if (alu_commit_if),
.bru_commit_if (bru_commit_if),
.lsu_commit_if (lsu_commit_if),
.csr_commit_if (csr_commit_if),
.mul_commit_if (mul_commit_if),
.fpu_commit_if (fpu_commit_if),
.gpu_commit_if (gpu_commit_if),
.cmt_to_issue_if(cmt_to_issue_if),
.writeback_if (writeback_if)
);
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_commit_if.valid) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.alu_data.wid, cmt_to_issue_if.alu_data.curr_PC, alu_commit_if.issue_tag, cmt_to_issue_if.alu_data.thread_mask, cmt_to_issue_if.alu_data.wb, cmt_to_issue_if.alu_data.rd, alu_commit_if.data);
if (alu_commit_if.valid && alu_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=ALU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, alu_commit_if.wid, alu_commit_if.curr_PC, alu_commit_if.thread_mask, alu_commit_if.wb, alu_commit_if.rd, alu_commit_if.data);
end
if (bru_commit_if.valid) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=BRU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.bru_data.wid, cmt_to_issue_if.bru_data.curr_PC, bru_commit_if.issue_tag, cmt_to_issue_if.bru_data.thread_mask, cmt_to_issue_if.bru_data.wb, cmt_to_issue_if.bru_data.rd, bru_commit_if.data);
if (lsu_commit_if.valid && lsu_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=LSU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, lsu_commit_if.wid, lsu_commit_if.curr_PC, lsu_commit_if.thread_mask, lsu_commit_if.wb, lsu_commit_if.rd, lsu_commit_if.data);
end
if (lsu_commit_if.valid) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.lsu_data.wid, cmt_to_issue_if.lsu_data.curr_PC, lsu_commit_if.issue_tag, cmt_to_issue_if.lsu_data.thread_mask, cmt_to_issue_if.lsu_data.wb, cmt_to_issue_if.lsu_data.rd, lsu_commit_if.data);
end
if (csr_commit_if.valid) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.csr_data.wid, cmt_to_issue_if.csr_data.curr_PC, csr_commit_if.issue_tag, cmt_to_issue_if.csr_data.thread_mask, cmt_to_issue_if.csr_data.wb, cmt_to_issue_if.csr_data.rd, csr_commit_if.data);
if (csr_commit_if.valid && csr_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=CSR, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, csr_commit_if.wid, csr_commit_if.curr_PC, csr_commit_if.thread_mask, csr_commit_if.wb, csr_commit_if.rd, csr_commit_if.data);
end
if (mul_commit_if.validy) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.mul_data.wid, cmt_to_issue_if.mul_data.curr_PC, mul_commit_if.issue_tag, cmt_to_issue_if.mul_data.thread_mask, cmt_to_issue_if.mul_data.wb, cmt_to_issue_if.mul_data.rd, mul_commit_if.data);
if (mul_commit_if.valid && mul_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=MUL, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, mul_commit_if.wid, mul_commit_if.curr_PC, mul_commit_if.thread_mask, mul_commit_if.wb, mul_commit_if.rd, mul_commit_if.data);
end
if (fpu_commit_if.valid) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.fpu_data.wid, cmt_to_issue_if.fpu_data.curr_PC, fpu_commit_if.issue_tag, cmt_to_issue_if.fpu_data.thread_mask, cmt_to_issue_if.fpu_data.wb, cmt_to_issue_if.fpu_data.rd, fpu_commit_if.data);
if (fpu_commit_if.valid && fpu_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=FPU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, fpu_commit_if.wid, fpu_commit_if.curr_PC, fpu_commit_if.thread_mask, fpu_commit_if.wb, fpu_commit_if.rd, fpu_commit_if.data);
end
if (gpu_commit_if.valid) begin
$display("%t: Core%0d-commit: wid=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, cmt_to_issue_if.gpu_data.wid, cmt_to_issue_if.gpu_data.curr_PC, gpu_commit_if.issue_tag, cmt_to_issue_if.gpu_data.thread_mask, cmt_to_issue_if.gpu_data.wb, cmt_to_issue_if.gpu_data.rd, gpu_commit_if.data);
if (gpu_commit_if.valid && gpu_commit_if.ready) begin
$display("%t: core%0d-commit: wid=%0d, PC=%0h, ex=GPU, tmask=%b, wb=%0d, rd=%0d, data=%0h", $time, CORE_ID, gpu_commit_if.wid, gpu_commit_if.curr_PC, gpu_commit_if.thread_mask, gpu_commit_if.wb, gpu_commit_if.rd, gpu_commit_if.data);
end
end
`else
`UNUSED_FIELD(fpu_commit_if, curr_PC)
`endif
endmodule

View file

@ -60,16 +60,6 @@
`define ARCHITECTURE_ID 0
`define IMPLEMENTATION_ID 0
// Size of MUL Request Queue Size
`ifndef MULRQ_SIZE
`define MULRQ_SIZE 8
`endif
// Size of issue queue
`ifndef ISSUEQ_SIZE
`define ISSUEQ_SIZE (8 + `NUM_WARPS)
`endif
// CSR Addresses //////////////////////////////////////////////////////////////
`define CSR_FFLAGS 12'h001
@ -109,6 +99,28 @@
`define CSR_MIMPID 12'hF13
`define CSR_MHARTID 12'hF14
// Pipeline Queues ============================================================
// Size of instruction queue
`ifndef IBUF_SIZE
`define IBUF_SIZE 8
`endif
// Size of LSU Request Queue
`ifndef LSUQ_SIZE
`define LSUQ_SIZE 8
`endif
// Size of MUL Request Queue
`ifndef MULQ_SIZE
`define MULQ_SIZE 8
`endif
// Size of FPU Request Queue
`ifndef FPUQ_SIZE
`define FPUQ_SIZE 8
`endif
// Dcache Configurable Knobs ==================================================
// Size of cache in bytes

View file

@ -1,9 +1,6 @@
`include "VX_define.vh"
module VX_csr_arb (
input wire clk,
input wire reset,
module VX_csr_arb (
// inputs
VX_csr_req_if csr_core_req_if,
VX_csr_io_req_if csr_io_req_if,
@ -12,7 +9,7 @@ module VX_csr_arb (
VX_csr_req_if csr_req_if,
// input
VX_csr_rsp_if csr_rsp_if,
VX_exu_to_cmt_if csr_rsp_if,
// outputs
VX_exu_to_cmt_if csr_commit_if,
@ -21,33 +18,33 @@ module VX_csr_arb (
input wire select_io_req,
input wire select_io_rsp
);
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
// requests
assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : csr_io_req_if.valid;
assign csr_req_if.issue_tag = (~select_io_req) ? csr_core_req_if.issue_tag : 0;
assign csr_req_if.wid = (~select_io_req) ? csr_core_req_if.wid : 0;
assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0;
assign csr_req_if.op = (~select_io_req) ? csr_core_req_if.op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS);
assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr;
assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
assign csr_req_if.rd = (~select_io_req) ? csr_core_req_if.rd : 0;
assign csr_req_if.wb = (~select_io_req) ? csr_core_req_if.wb : 0;
assign csr_req_if.valid = (~select_io_req) ? csr_core_req_if.valid : csr_io_req_if.valid;
assign csr_req_if.wid = (~select_io_req) ? csr_core_req_if.wid : 0;
assign csr_req_if.thread_mask = (~select_io_req) ? csr_core_req_if.thread_mask : 0;
assign csr_req_if.curr_PC = (~select_io_req) ? csr_core_req_if.curr_PC : 0;
assign csr_req_if.op = (~select_io_req) ? csr_core_req_if.op : (csr_io_req_if.rw ? `CSR_RW : `CSR_RS);
assign csr_req_if.csr_addr = (~select_io_req) ? csr_core_req_if.csr_addr : csr_io_req_if.addr;
assign csr_req_if.csr_mask = (~select_io_req) ? csr_core_req_if.csr_mask : (csr_io_req_if.rw ? csr_io_req_if.data : 32'b0);
assign csr_req_if.rd = (~select_io_req) ? csr_core_req_if.rd : 0;
assign csr_req_if.wb = (~select_io_req) ? csr_core_req_if.wb : 0;
assign csr_req_if.is_io = select_io_req;
assign csr_core_req_if.ready = csr_req_if.ready && (~select_io_req);
assign csr_io_req_if.ready = csr_req_if.ready && select_io_req;
assign csr_io_req_if.ready = csr_req_if.ready && select_io_req;
// responses
assign csr_io_rsp_if.valid = csr_rsp_if.valid & select_io_rsp;
assign csr_io_rsp_if.data = csr_rsp_if.data[0];
assign csr_commit_if.valid = csr_rsp_if.valid & ~select_io_rsp;
assign csr_commit_if.issue_tag= csr_rsp_if.issue_tag;
assign csr_commit_if.data = csr_rsp_if.data;
assign csr_commit_if.valid = csr_rsp_if.valid & ~select_io_rsp;
assign csr_commit_if.wid = csr_rsp_if.wid;
assign csr_commit_if.thread_mask = csr_rsp_if.thread_mask;
assign csr_commit_if.curr_PC = csr_rsp_if.curr_PC;
assign csr_commit_if.rd = csr_rsp_if.rd;
assign csr_commit_if.wb = csr_rsp_if.wb;
assign csr_commit_if.data = csr_rsp_if.data;
assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : 1'b1;
assign csr_rsp_if.ready = select_io_rsp ? csr_io_rsp_if.ready : csr_commit_if.ready;
endmodule

View file

@ -7,7 +7,7 @@ module VX_csr_data #(
input wire reset,
VX_cmt_to_csr_if cmt_to_csr_if,
VX_csr_to_fpu_if csr_to_fpu_if,
VX_csr_to_issue_if csr_to_issue_if,
input wire[`NW_BITS-1:0] wid,
@ -129,11 +129,11 @@ module VX_csr_data #(
`CSR_MIMPID : read_data = `IMPLEMENTATION_ID;
default: begin
assert(~read_enable) else $error("%t: invalid CSR read address: %0h", $time, read_addr);
end
assert(~read_enable) else $error("%t: invalid CSR read address: %0h", $time, read_addr);
end
endcase
end
assign csr_to_fpu_if.frm = csr_frm[csr_to_fpu_if.wid];
assign csr_to_issue_if.frm = csr_frm[csr_to_issue_if.wid];
endmodule

View file

@ -7,7 +7,7 @@ module VX_csr_unit #(
input wire reset,
VX_cmt_to_csr_if cmt_to_csr_if,
VX_csr_to_fpu_if csr_to_fpu_if,
VX_csr_to_issue_if csr_to_issue_if,
VX_csr_io_req_if csr_io_req_if,
VX_csr_io_rsp_if csr_io_rsp_if,
@ -15,16 +15,13 @@ module VX_csr_unit #(
VX_csr_req_if csr_req_if,
VX_exu_to_cmt_if csr_commit_if
);
VX_csr_req_if csr_pipe_req_if();
VX_csr_rsp_if csr_pipe_rsp_if();
VX_csr_req_if csr_pipe_req_if();
VX_exu_to_cmt_if csr_pipe_rsp_if();
wire select_io_req = csr_io_req_if.valid;
wire select_io_rsp;
VX_csr_arb csr_arb (
.clk (clk),
.reset (reset),
.csr_core_req_if (csr_req_if),
.csr_io_req_if (csr_io_req_if),
.csr_req_if (csr_pipe_req_if),
@ -41,7 +38,6 @@ module VX_csr_unit #(
wire [`CSR_ADDR_BITS-1:0] csr_addr_s1;
wire [31:0] csr_read_data, csr_read_data_s1;
wire [31:0] csr_updated_data_s1;
wire [`NW_BITS-1:0] wid_s1;
VX_csr_data #(
.CORE_ID(CORE_ID)
@ -49,7 +45,7 @@ module VX_csr_unit #(
.clk (clk),
.reset (reset),
.cmt_to_csr_if (cmt_to_csr_if),
.csr_to_fpu_if (csr_to_fpu_if),
.csr_to_issue_if (csr_to_issue_if),
.read_enable (csr_pipe_req_if.valid),
.read_addr (csr_pipe_req_if.csr_addr),
.read_data (csr_read_data),
@ -60,7 +56,7 @@ module VX_csr_unit #(
);
wire csr_hazard = (csr_addr_s1 == csr_pipe_req_if.csr_addr)
&& (wid_s1 == csr_pipe_req_if.wid)
&& (csr_pipe_rsp_if.wid == csr_pipe_req_if.wid)
&& csr_pipe_rsp_if.valid;
wire [31:0] csr_read_data_qual = csr_hazard ? csr_updated_data_s1 : csr_read_data;
@ -86,21 +82,21 @@ module VX_csr_unit #(
end
default: csr_updated_data = 32'hdeadbeef;
endcase
end
end
wire csr_we_s0 = csr_we_s0_unqual && csr_pipe_req_if.valid;
wire stall = ~csr_pipe_rsp_if.ready && csr_pipe_rsp_if.valid;
VX_generic_register #(
.N(1 + `ISTAG_BITS + `NW_BITS + 1 + `CSR_ADDR_BITS + 1 + 32 + 32)
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + 1 + `CSR_ADDR_BITS + 1 + 32 + 32)
) csr_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({csr_pipe_req_if.valid, csr_pipe_req_if.issue_tag, csr_pipe_req_if.wid, csr_we_s0, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}),
.out ({csr_pipe_rsp_if.valid, csr_pipe_rsp_if.issue_tag, wid_s1, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1})
.in ({csr_pipe_req_if.valid, csr_pipe_req_if.wid, csr_pipe_req_if.thread_mask, csr_pipe_req_if.curr_PC, csr_pipe_req_if.rd, csr_pipe_req_if.wb, csr_we_s0, csr_pipe_req_if.csr_addr, csr_pipe_req_if.is_io, csr_read_data_qual, csr_updated_data}),
.out ({csr_pipe_rsp_if.valid, csr_pipe_rsp_if.wid, csr_pipe_rsp_if.thread_mask, csr_pipe_rsp_if.curr_PC, csr_pipe_rsp_if.rd, csr_pipe_rsp_if.wb, csr_we_s1, csr_addr_s1, select_io_rsp, csr_read_data_s1, csr_updated_data_s1})
);
for (genvar i = 0; i < `NUM_THREADS; i++) begin
@ -109,6 +105,7 @@ module VX_csr_unit #(
csr_read_data_s1;
end
// can accept new request?
assign csr_pipe_req_if.ready = ~stall;
endmodule

View file

@ -15,11 +15,13 @@ module VX_decode #(
VX_wstall_if wstall_if,
VX_join_if join_if
);
wire valid_in = ifetch_rsp_if.valid;
wire [31:0] instr = ifetch_rsp_if.instr;
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
wire [31:0] instr = ifetch_rsp_if.instr;
reg [`ALU_BITS-1:0] alu_op;
reg [`BRU_BITS-1:0] br_op;
reg [`BR_BITS-1:0] br_op;
reg [`LSU_BITS-1:0] lsu_op;
reg [`CSR_BITS-1:0] csr_op;
reg [`MUL_BITS-1:0] mul_op;
@ -100,27 +102,27 @@ module VX_decode #(
wire is_br = (is_btype || is_jal || is_jalr || is_jals);
always @(*) begin
br_op = `BRU_OTHER;
br_op = `BR_OTHER;
case (opcode)
`INST_B: begin
case (func3)
3'h0: br_op = `BRU_EQ;
3'h1: br_op = `BRU_NE;
3'h4: br_op = `BRU_LT;
3'h5: br_op = `BRU_GE;
3'h6: br_op = `BRU_LTU;
3'h7: br_op = `BRU_GEU;
3'h0: br_op = `BR_EQ;
3'h1: br_op = `BR_NE;
3'h4: br_op = `BR_LT;
3'h5: br_op = `BR_GE;
3'h6: br_op = `BR_LTU;
3'h7: br_op = `BR_GEU;
default:;
endcase
end
`INST_JAL: br_op = `BRU_JAL;
`INST_JALR: br_op = `BRU_JALR;
`INST_JAL: br_op = `BR_JAL;
`INST_JALR: br_op = `BR_JALR;
`INST_SYS: begin
if (is_jals && u_12 == 12'h000) br_op = `BRU_ECALL;
if (is_jals && u_12 == 12'h001) br_op = `BRU_EBREAK;
if (is_jals && u_12 == 12'h302) br_op = `BRU_MRET;
if (is_jals && u_12 == 12'h102) br_op = `BRU_SRET;
if (is_jals && u_12 == 12'h7B2) br_op = `BRU_DRET;
if (is_jals && u_12 == 12'h000) br_op = `BR_ECALL;
if (is_jals && u_12 == 12'h001) br_op = `BR_EBREAK;
if (is_jals && u_12 == 12'h302) br_op = `BR_MRET;
if (is_jals && u_12 == 12'h102) br_op = `BR_SRET;
if (is_jals && u_12 == 12'h7B2) br_op = `BR_DRET;
end
default:;
endcase
@ -290,104 +292,93 @@ module VX_decode #(
///////////////////////////////////////////////////////////////////////////
VX_decode_if decode_tmp_if();
assign decode_if.valid = ifetch_rsp_if.valid
&& (decode_if.ex_type != `EX_NOP); // skip noop
assign decode_tmp_if.valid = ifetch_rsp_if.valid;
assign decode_tmp_if.wid = ifetch_rsp_if.wid;
assign decode_tmp_if.thread_mask = ifetch_rsp_if.thread_mask;
assign decode_tmp_if.curr_PC = ifetch_rsp_if.curr_PC;
assign decode_if.wid = ifetch_rsp_if.wid;
assign decode_if.thread_mask = ifetch_rsp_if.thread_mask;
assign decode_if.curr_PC = ifetch_rsp_if.curr_PC;
assign decode_tmp_if.ex_type = is_lsu ? `EX_LSU :
is_csr ? `EX_CSR :
is_mul ? `EX_MUL :
is_fpu ? `EX_FPU :
is_gpu ? `EX_GPU :
is_br ? `EX_BRU :
(is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU :
`EX_NOP;
assign decode_if.ex_type = is_lsu ? `EX_LSU :
is_csr ? `EX_CSR :
is_mul ? `EX_MUL :
is_fpu ? `EX_FPU :
is_gpu ? `EX_GPU :
is_br ? `EX_ALU :
(is_rtype || is_itype || is_lui || is_auipc) ? `EX_ALU :
`EX_NOP;
assign decode_tmp_if.ex_op = is_lsu ? `OP_BITS'(lsu_op) :
is_csr ? `OP_BITS'(csr_op) :
is_mul ? `OP_BITS'(mul_op) :
is_fpu ? `OP_BITS'(fpu_op) :
is_gpu ? `OP_BITS'(gpu_op) :
is_br ? `OP_BITS'(br_op) :
(is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'(alu_op) :
0;
assign decode_if.ex_op = is_lsu ? `OP_BITS'(lsu_op) :
is_csr ? `OP_BITS'(csr_op) :
is_mul ? `OP_BITS'(mul_op) :
is_fpu ? `OP_BITS'(fpu_op) :
is_gpu ? `OP_BITS'(gpu_op) :
is_br ? `OP_BITS'({1'b1, br_op}) :
(is_rtype || is_itype || is_lui || is_auipc) ? `OP_BITS'({1'b0, alu_op}) :
0;
assign decode_tmp_if.wb = use_rd;
assign decode_if.wb = use_rd;
`ifdef EXT_F_ENABLE
wire rd_is_fp = is_fpu && ~(is_fcmp || is_fcvti || (fpu_op == `FPU_MVXW || fpu_op == `FPU_CLASS));
wire rs1_is_fp = is_fr4 || (is_fci && ~(is_fcvtf || (fpu_op == `FPU_MVWX)));
wire rs2_is_fp = is_fs || is_fr4 || is_fci;
assign decode_tmp_if.rd = {rd_is_fp, rd};
assign decode_tmp_if.rs1 = {rs1_is_fp, rs1_qual};
assign decode_tmp_if.rs2 = {rs2_is_fp, rs2};
assign decode_tmp_if.rs3 = {1'b1, rs3};
assign decode_if.rd = {rd_is_fp, rd};
assign decode_if.rs1 = {rs1_is_fp, rs1_qual};
assign decode_if.rs2 = {rs2_is_fp, rs2};
assign decode_if.rs3 = {1'b1, rs3};
`else
assign decode_tmp_if.rd = rd;
assign decode_tmp_if.rs1 = rs1_qual;
assign decode_tmp_if.rs2 = rs2;
assign decode_tmp_if.rs3 = rs3;
assign decode_if.rd = rd;
assign decode_if.rs1 = rs1_qual;
assign decode_if.rs2 = rs2;
assign decode_if.rs3 = rs3;
`endif
assign decode_tmp_if.use_rs3 = use_rs3;
assign decode_if.use_rs3 = use_rs3;
assign decode_tmp_if.reg_use_mask = ((`NUM_REGS)'(use_rd) << decode_tmp_if.rd)
| ((`NUM_REGS)'(use_rs1) << decode_tmp_if.rs1)
| ((`NUM_REGS)'(use_rs2) << decode_tmp_if.rs2)
| ((`NUM_REGS)'(use_rs3) << decode_tmp_if.rs3);
assign decode_if.used_regs = ((`NUM_REGS)'(use_rd) << decode_if.rd)
| ((`NUM_REGS)'(use_rs1) << decode_if.rs1)
| ((`NUM_REGS)'(use_rs2) << decode_if.rs2)
| ((`NUM_REGS)'(use_rs3) << decode_if.rs3);
assign decode_tmp_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} :
(is_jal || is_jalr || is_jals) ? jalx_offset :
is_csr ? 32'(u_12) :
src2_imm;
assign decode_if.imm = (is_lui || is_auipc) ? {upper_imm, 12'(0)} :
(is_jal || is_jalr || is_jals) ? jalx_offset :
is_csr ? 32'(u_12) :
src2_imm;
assign decode_tmp_if.rs1_is_PC = is_auipc || is_btype || is_jal || is_jals;
assign decode_tmp_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm;
assign decode_if.rs1_is_PC = is_auipc || is_btype || is_jal || is_jals;
assign decode_if.rs2_is_imm = is_itype || is_lui || is_auipc || is_csr_imm || is_br;
assign decode_tmp_if.frm = func3;
assign decode_if.frm = func3;
assign join_if.is_join = valid_in && is_gpu && (gpu_op == `GPU_JOIN);
///////////////////////////////////////////////////////////////////////////
wire decode_fire = decode_if.valid && decode_if.ready;
assign join_if.is_join = decode_fire && is_gpu && (gpu_op == `GPU_JOIN);
assign join_if.wid = ifetch_rsp_if.wid;
assign wstall_if.wstall = valid_in && (is_btype || is_jal || is_jalr || (is_gpu && (gpu_op == `GPU_TMC || gpu_op == `GPU_SPLIT || gpu_op == `GPU_BAR)));
assign wstall_if.wstall = decode_fire && (is_btype || is_jal || is_jalr
|| (is_gpu && (gpu_op == `GPU_TMC
|| gpu_op == `GPU_SPLIT
|| gpu_op == `GPU_BAR)));
assign wstall_if.wid = ifetch_rsp_if.wid;
wire stall = ~decode_if.ready && decode_if.valid;
///////////////////////////////////////////////////////////////////////////
VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + `NR_BITS + `NR_BITS + 32 + 1 + `EX_BITS + `OP_BITS + 1 + `NR_BITS + 1 + 1 + `FRM_BITS + `NUM_REGS)
) decode_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (0),
.in ({decode_tmp_if.valid, decode_tmp_if.wid, decode_tmp_if.thread_mask, decode_tmp_if.curr_PC, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm, decode_tmp_if.ex_type, decode_tmp_if.ex_op, decode_tmp_if.wb, decode_tmp_if.rs3, decode_tmp_if.use_rs3, decode_tmp_if.frm, decode_tmp_if.reg_use_mask}),
.out ({decode_if.valid, decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.rs3, decode_if.use_rs3, decode_if.frm, decode_if.reg_use_mask})
);
assign ifetch_rsp_if.ready = ~stall;
assign ifetch_rsp_if.ready = decode_if.ready;
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (decode_tmp_if.valid && ~stall) begin
$write("%t: Core%0d-Decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_tmp_if.wid, decode_tmp_if.curr_PC);
print_ex_type(decode_tmp_if.ex_type);
if (decode_if.valid && decode_if.ready) begin
$write("%t: core%0d-decode: wid=%0d, PC=%0h, ex=", $time, CORE_ID, decode_if.wid, decode_if.curr_PC);
print_ex_type(decode_if.ex_type);
$write(", op=");
print_ex_op(decode_tmp_if.ex_type, decode_tmp_if.ex_op);
$write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_tmp_if.thread_mask, decode_tmp_if.wb, decode_tmp_if.rd, decode_tmp_if.rs1, decode_tmp_if.rs2, decode_tmp_if.rs3, decode_tmp_if.imm, decode_tmp_if.rs1_is_PC, decode_tmp_if.rs2_is_imm);
print_frm(decode_tmp_if.frm);
$write("\n");
// trap unsupported instructions
assert(~(~stall && (decode_tmp_if.ex_type == `EX_ALU) && `ALU_OP(decode_tmp_if.ex_op) == `ALU_OTHER));
assert(~(~stall && (decode_tmp_if.ex_type == `EX_BRU) && `BRU_OP(decode_tmp_if.ex_op) == `BRU_OTHER));
assert(~(~stall && (decode_tmp_if.ex_type == `EX_CSR) && `CSR_OP(decode_tmp_if.ex_op) == `CSR_OTHER));
assert(~(~stall && (decode_tmp_if.ex_type == `EX_GPU) && `GPU_OP(decode_tmp_if.ex_op) == `GPU_OTHER));
print_ex_op(decode_if.ex_type, decode_if.ex_op);
$write(", tmask=%b, wb=%b, rd=%0d, rs1=%0d, rs2=%0d, rs3=%0d, imm=%0h, use_pc=%b, use_imm=%b, frm=", decode_if.thread_mask, decode_if.wb, decode_if.rd, decode_if.rs1, decode_if.rs2, decode_if.rs3, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm);
print_frm(decode_if.frm);
$write("\n");
end
end
`endif

View file

@ -33,8 +33,6 @@
`define CSR_WIDTH 12
`define ISTAG_BITS `LOG2UP(`ISSUEQ_SIZE)
///////////////////////////////////////////////////////////////////////////////
`define LATENCY_IDIV 33
@ -98,15 +96,14 @@
`define EX_NOP 3'h0
`define EX_ALU 3'h1
`define EX_BRU 3'h2
`define EX_LSU 3'h3
`define EX_CSR 3'h4
`define EX_MUL 3'h5
`define EX_FPU 3'h6
`define EX_GPU 3'h7
`define EX_LSU 3'h2
`define EX_CSR 3'h3
`define EX_MUL 3'h4
`define EX_FPU 3'h5
`define EX_GPU 3'h6
`define EX_BITS 3
`define NUM_EXS 7
`define NUM_EXS 6
`define NE_BITS `LOG2UP(`NUM_EXS)
///////////////////////////////////////////////////////////////////////////////
@ -117,8 +114,8 @@
`define ALU_SUB 4'b0001
`define ALU_LUI 4'b0010
`define ALU_AUIPC 4'b0011
`define ALU_SLT 4'b0100
`define ALU_SLTU 4'b0101
`define ALU_SLTU 4'b0100
`define ALU_SLT 4'b0101
`define ALU_SRL 4'b1000
`define ALU_SRA 4'b1001
`define ALU_AND 4'b1100
@ -129,27 +126,31 @@
`define ALU_BITS 4
`define ALU_OP(x) x[`ALU_BITS-1:0]
`define ALU_OP_CLASS(x) x[3:2]
`define ALU_SIGNED(x) x[0]
`define BRU_EQ 4'b0000
`define BRU_NE 4'b0001
`define BRU_LTU 4'b0010
`define BRU_GEU 4'b0011
`define BRU_LT 4'b0110
`define BRU_GE 4'b0111
`define BRU_JAL 4'b1000
`define BRU_JALR 4'b1001
`define BRU_ECALL 4'b1010
`define BRU_EBREAK 4'b1011
`define BRU_MRET 4'b1100
`define BRU_SRET 4'b1101
`define BRU_DRET 4'b1110
`define BRU_OTHER 4'b1111
`define BRU_BITS 4
`define BRU_OP(x) x[`BRU_BITS-1:0]
`define BRU_NEG(x) x[0]
`define BRU_LESS(x) x[1]
`define BRU_SIGNED(x) x[2]
`define BRU_STATIC(x) x[3]
`define BR_EQ 4'b0000
`define BR_NE 4'b0010
`define BR_LTU 4'b0100
`define BR_GEU 4'b0110
`define BR_LT 4'b0101
`define BR_GE 4'b0111
`define BR_JAL 4'b1000
`define BR_JALR 4'b1001
`define BR_ECALL 4'b1010
`define BR_EBREAK 4'b1011
`define BR_MRET 4'b1100
`define BR_SRET 4'b1101
`define BR_DRET 4'b1110
`define BR_OTHER 4'b1111
`define BR_BITS 4
`define BR_OP(x) x[`BR_BITS-1:0]
`define BR_NEG(x) x[1]
`define BR_LESS(x) x[2]
`define BR_STATIC(x) x[3]
`define ALU_BR_BITS 5
`define ALU_BR_OP(x) x[`ALU_BR_BITS-1:0]
`define IS_BR_OP(x) x[4]
`define LSU_LB {1'b0, `BYTEEN_SB}
`define LSU_LH {1'b0, `BYTEEN_SH}
@ -262,10 +263,10 @@
///////////////////////////////////////////////////////////////////////////////
`ifdef DBG_CORE_REQ_INFO // pc, wb, rd, wid
`define DEBUG_CORE_REQ_MDATA_WIDTH (32 + 1 + `NR_BITS + `NW_BITS)
`ifdef DBG_CORE_REQ_INFO // pc, rd, wid
`define DBG_CORE_REQ_MDATAW (32 + `NR_BITS + `NW_BITS)
`else
`define DEBUG_CORE_REQ_MDATA_WIDTH 0
`define DBG_CORE_REQ_MDATAW 0
`endif
////////////////////////// Dcache Configurable Knobs //////////////////////////
@ -274,10 +275,10 @@
`define DCACHE_ID (((`L3_ENABLE && `L2_ENABLE) ? 2 : `L2_ENABLE ? 1 : 0) + (CORE_ID * 3) + 0)
// TAG sharing enable
`define DCORE_TAG_ID_BITS `ISTAG_BITS
`define DCORE_TAG_ID_BITS `LOG2UP(`LSUQ_SIZE)
// Core request tag bits
`define DCORE_TAG_WIDTH (`DEBUG_CORE_REQ_MDATA_WIDTH + `DCORE_TAG_ID_BITS)
`define DCORE_TAG_WIDTH (`DBG_CORE_REQ_MDATAW + `DCORE_TAG_ID_BITS)
// DRAM request data bits
`define DDRAM_LINE_WIDTH (`DBANK_LINE_SIZE * 8)
@ -312,7 +313,7 @@
`define ICORE_TAG_ID_BITS `NW_BITS
// Core request tag bits
`define ICORE_TAG_WIDTH (`DEBUG_CORE_REQ_MDATA_WIDTH + `ICORE_TAG_ID_BITS)
`define ICORE_TAG_WIDTH (`DBG_CORE_REQ_MDATAW + `ICORE_TAG_ID_BITS)
// DRAM request data bits
`define IDRAM_LINE_WIDTH (`IBANK_LINE_SIZE * 8)

View file

@ -22,7 +22,6 @@ module VX_execute #(
// inputs
VX_alu_req_if alu_req_if,
VX_bru_req_if bru_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_mul_req_if mul_req_if,
@ -30,10 +29,10 @@ module VX_execute #(
VX_gpu_req_if gpu_req_if,
// outputs
VX_csr_to_issue_if csr_to_issue_if,
VX_branch_ctl_if branch_ctl_if,
VX_warp_ctl_if warp_ctl_if,
VX_exu_to_cmt_if alu_commit_if,
VX_exu_to_cmt_if bru_commit_if,
VX_exu_to_cmt_if lsu_commit_if,
VX_exu_to_cmt_if csr_commit_if,
VX_exu_to_cmt_if mul_commit_if,
@ -43,25 +42,14 @@ module VX_execute #(
output wire ebreak
);
VX_csr_to_fpu_if csr_to_fpu_if();
VX_alu_unit #(
.CORE_ID(CORE_ID)
) alu_unit (
.clk (clk),
.reset (reset),
.alu_req_if (alu_req_if),
.alu_commit_if (alu_commit_if)
);
VX_bru_unit #(
.CORE_ID(CORE_ID)
) bru_unit (
.clk (clk),
.reset (reset),
.bru_req_if (bru_req_if),
.branch_ctl_if (branch_ctl_if),
.bru_commit_if (bru_commit_if)
.alu_commit_if (alu_commit_if)
);
VX_lsu_unit #(
@ -82,7 +70,7 @@ module VX_execute #(
.clk (clk),
.reset (reset),
.cmt_to_csr_if (cmt_to_csr_if),
.csr_to_fpu_if (csr_to_fpu_if),
.csr_to_issue_if (csr_to_issue_if),
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_req_if (csr_req_if),
@ -95,8 +83,8 @@ module VX_execute #(
) mul_unit (
.clk (clk),
.reset (reset),
.alu_req_if (mul_req_if),
.alu_commit_if (mul_commit_if)
.mul_req_if (mul_req_if),
.mul_commit_if (mul_commit_if)
);
`else
assign mul_req_if.ready = 0;
@ -112,7 +100,6 @@ module VX_execute #(
.clk (clk),
.reset (reset),
.fpu_req_if (fpu_req_if),
.csr_to_fpu_if (csr_to_fpu_if),
.fpu_commit_if (fpu_commit_if)
);
`else
@ -134,9 +121,10 @@ module VX_execute #(
.gpu_commit_if (gpu_commit_if)
);
assign ebreak = bru_req_if.valid
&& (bru_req_if.op == `BRU_EBREAK
|| bru_req_if.op == `BRU_ECALL);
assign ebreak = alu_req_if.valid
&& `IS_BR_OP(alu_req_if.op)
&& (`BR_OP(alu_req_if.op) == `BR_EBREAK
|| `BR_OP(alu_req_if.op) == `BR_ECALL);
`SCOPE_ASSIGN (scope_decode_valid, decode_if.valid);
`SCOPE_ASSIGN (scope_decode_wid, decode_if.wid);

View file

@ -9,59 +9,81 @@ module VX_fpu_unit #(
// inputs
VX_fpu_req_if fpu_req_if,
VX_csr_to_fpu_if csr_to_fpu_if,
// outputs
VX_fpu_to_cmt_if fpu_commit_if
);
VX_fpu_req_if fpu_req_tmp_if();
);
localparam FPUQ_BITS = `LOG2UP(`FPUQ_SIZE);
// resolve dynamic FRM
wire [`FRM_BITS-1:0] frm, frm_tmp;
assign csr_to_fpu_if.wid = fpu_req_if.wid;
assign frm = (fpu_req_if.frm == `FRM_DYN) ? csr_to_fpu_if.frm : fpu_req_if.frm;
wire ready_in;
wire valid_out;
wire ready_out;
// use a skid buffer since fpcore has realtime backpressure
VX_elastic_buffer #(
.DATAW (`ISTAG_BITS + `NW_BITS + 32 + `FPU_BITS + `FRM_BITS + (3 * `NUM_THREADS * 32)),
.SIZE (0)
) input_buffer (
.clk (clk),
.reset (reset),
.valid_in (fpu_req_if.valid),
.ready_in (fpu_req_if.ready),
.data_in ({fpu_req_if.issue_tag, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.op, frm, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.data_out ({fpu_req_tmp_if.issue_tag, fpu_req_tmp_if.wid, fpu_req_tmp_if.curr_PC, fpu_req_tmp_if.op, frm_tmp, fpu_req_tmp_if.rs1_data, fpu_req_tmp_if.rs2_data, fpu_req_tmp_if.rs3_data}),
.ready_out (fpu_req_tmp_if.ready),
.valid_out (fpu_req_tmp_if.valid)
wire [`NW_BITS-1:0] rsp_wid;
wire [`NUM_THREADS-1:0] rsp_thread_mask;
wire [31:0] rsp_curr_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire has_fflags;
fflags_t [`NUM_THREADS-1:0] fflags;
wire [`NUM_THREADS-1:0][31:0] result;
wire [FPUQ_BITS-1:0] tag_in, tag_out;
wire fpuq_full;
wire fpuq_push = fpu_req_if.valid && fpu_req_if.ready;
wire fpuq_pop = valid_out && ready_out;
VX_cam_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
.SIZE (`FPUQ_SIZE)
) mul_queue (
.clk (clk),
.reset (reset),
.acquire_slot (fpuq_push),
.write_addr (tag_in),
.read_addr (tag_out),
.release_addr (tag_out),
.write_data ({fpu_req_if.wid, fpu_req_if.thread_mask, fpu_req_if.curr_PC, fpu_req_if.rd, fpu_req_if.wb}),
.read_data ({rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb}),
.release_slot (fpuq_pop),
.full (fpuq_full)
);
wire valid_in = fpu_req_if.valid && ~fpuq_full;
// can accept new request?
assign fpu_req_if.ready = ready_in && ~fpuq_full;
`ifdef SYNTHESIS
VX_fp_fpga fp_core (
VX_fp_fpga #(
.TAGW (FPUQ_BITS)
) fp_core (
.clk (clk),
.reset (reset),
.valid_in (fpu_req_tmp_if.valid),
.ready_in (fpu_req_tmp_if.ready),
.valid_in (valid_in),
.ready_in (ready_in),
.tag_in (fpu_req_tmp_if.issue_tag),
.tag_in (tag_in),
.op (fpu_req_tmp_if.op),
.frm (frm_tmp),
.op (fpu_req_if.op),
.frm (fpu_req_if.frm),
.dataa (fpu_req_tmp_if.rs1_data),
.datab (fpu_req_tmp_if.rs2_data),
.datac (fpu_req_tmp_if.rs3_data),
.result (fpu_commit_if.data),
.dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data),
.datac (fpu_req_if.rs3_data),
.result (result),
.has_fflags (fpu_commit_if.has_fflags),
.fflags (fpu_commit_if.fflags),
.has_fflags (has_fflags),
.fflags (fflags),
.tag_out (fpu_commit_if.issue_tag),
.tag_out (tag_out),
.ready_out (1'b1),
.valid_out (fpu_commit_if.valid)
.ready_out (ready_out),
.valid_out (valid_out)
);
`else
@ -70,33 +92,49 @@ module VX_fpu_unit #(
.FMULADD (1),
.FDIVSQRT (1),
.FNONCOMP (1),
.FCONV (1)
.FCONV (1),
.TAGW (FPUQ_BITS)
) fp_core (
.clk (clk),
.reset (reset),
.valid_in (fpu_req_tmp_if.valid),
.ready_in (fpu_req_tmp_if.ready),
.valid_in (valid_in),
.ready_in (ready_in),
.tag_in (fpu_req_tmp_if.issue_tag),
.tag_in (tag_in),
.op (fpu_req_tmp_if.op),
.frm (frm_tmp),
.op (fpu_req_if.op),
.frm (fpu_req_if.frm),
.dataa (fpu_req_tmp_if.rs1_data),
.datab (fpu_req_tmp_if.rs2_data),
.datac (fpu_req_tmp_if.rs3_data),
.result (fpu_commit_if.data),
.dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data),
.datac (fpu_req_if.rs3_data),
.result (result),
.has_fflags (fpu_commit_if.has_fflags),
.fflags (fpu_commit_if.fflags),
.has_fflags (has_fflags),
.fflags (fflags),
.tag_out (fpu_commit_if.issue_tag),
.tag_out (tag_out),
.ready_out (1'b1),
.valid_out (fpu_commit_if.valid)
.ready_out (ready_out),
.valid_out (valid_out)
);
`endif
wire stall_out = ~fpu_commit_if.ready && fpu_commit_if.valid;
VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1 + (`NUM_THREADS * `FFG_BITS))
) fpu_reg (
.clk (clk),
.reset (reset),
.stall (stall_out),
.flush (1'b0),
.in ({valid_out, rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb, result, has_fflags, fflags}),
.out ({fpu_commit_if.valid, fpu_commit_if.wid, fpu_commit_if.thread_mask, fpu_commit_if.curr_PC, fpu_commit_if.rd, fpu_commit_if.wb, fpu_commit_if.data, fpu_commit_if.has_fflags, fpu_commit_if.fflags})
);
assign ready_out = ~stall_out;
endmodule

53
hw/rtl/VX_gpr_bypass.v Normal file
View file

@ -0,0 +1,53 @@
`include "VX_platform.vh"
// Small holding queue placed on a register-file data path.
//
// `push` is registered for one cycle, so `data_in` is captured on the cycle
// *after* `push` was asserted. While nothing is held, `data_in` is forwarded
// straight to `data_out`; otherwise the held head entry is presented.
// Up to two entries can be held (head + one overflow entry).
//
// Parameters:
//   DATAW    - width in bits of the data path.
//   BUFFERED - not referenced inside this module.
//
// Ports:
//   push     - announces that `data_in` will carry a new value next cycle.
//   pop      - consumer takes the value currently shown on `data_out`.
//   data_in  - incoming value.
//   data_out - head of the queue, or `data_in` when the queue is empty.
module VX_gpr_bypass #(
    parameter DATAW    = 1,
    parameter BUFFERED = 1
) (
    input wire clk,
    input wire reset,
    input wire push,
    input reg pop,
    input wire [DATAW-1:0] data_in,
    output wire [DATAW-1:0] data_out
);
    // slot0 is the queue head, slot1 the overflow entry behind it
    reg [DATAW-1:0] slot0_data, slot1_data;
    reg             slot0_valid, slot1_valid;

    // `push` delayed by one clock: marks the cycle in which data_in is valid
    reg push_d;

    always @(posedge clk) begin
        if (reset) begin
            push_d      <= 1'b0;
            slot0_valid <= 1'b0;
            slot1_valid <= 1'b0;
        end else begin
            push_d <= push;

            // the overflow slot may only be occupied when the head is
            assert(slot0_valid || !slot1_valid);

            if (push_d) begin
                if (slot0_valid) begin
                    assert(!slot1_valid); // queue full!
                    if (pop) begin
                        // head leaves, incoming value becomes the new head
                        slot0_data  <= data_in;
                        slot1_valid <= 1'b0;
                    end else begin
                        // head stays, incoming value queues up behind it
                        slot1_data  <= data_in;
                        slot1_valid <= 1'b1;
                    end
                    slot0_valid <= 1'b1;
                end else if (!pop) begin
                    // nothing held and nobody consuming: keep the value
                    slot0_data  <= data_in;
                    slot0_valid <= 1'b1;
                end
                // nothing held and pop asserted: the value is consumed
                // directly through the bypass path, nothing to store
            end else if (pop && slot0_valid) begin
                // plain pop: advance the overflow entry to the head
                slot0_data  <= slot1_data;
                slot0_valid <= slot1_valid;
                slot1_valid <= 1'b0;
            end
        end
    end

    // bypass the incoming value whenever no entry is being held
    assign data_out = slot0_valid ? slot0_data : data_in;

endmodule

View file

@ -16,25 +16,30 @@ module VX_gpr_fp_ctrl (
reg [`NUM_THREADS-1:0][31:0] rs1_tmp_data, rs2_tmp_data, rs3_tmp_data;
reg read_rs3;
reg [`NW_BITS-1:0] rs3_wid;
wire rs3_delay = gpr_read_if.valid && gpr_read_if.use_rs3 && ~read_rs3;
wire read_fire = gpr_read_if.valid && read_rs3;
wire read_fire = gpr_read_if.valid && gpr_read_if.ready_out;
always @(posedge clk) begin
if (reset) begin
read_rs3 <= 0;
rs3_wid <= 0;
end else begin
if (rs3_delay) begin
read_rs3 <= 1;
rs3_wid <= gpr_read_if.wid;
end else if (read_fire) begin
read_rs3 <= 0;
end
if (read_rs3) begin
assert(rs3_wid == gpr_read_if.wid);
end
end
end
// backup original rs1 data
always @(posedge clk) begin
if (~gpr_read_if.use_rs3 || rs3_delay) begin
always @(posedge clk) begin
if (~read_rs3) begin
rs1_tmp_data <= rs1_data;
end
rs2_tmp_data <= rs2_data;
@ -44,7 +49,7 @@ module VX_gpr_fp_ctrl (
// outputs
wire [`NR_BITS-1:0] rs1 = read_rs3 ? gpr_read_if.rs3 : gpr_read_if.rs1;
assign raddr1 = {gpr_read_if.wid, rs1};
assign gpr_read_if.ready = ~rs3_delay;
assign gpr_read_if.ready_in = ~rs3_delay;
assign gpr_read_if.rs1_data = rs1_tmp_data;
assign gpr_read_if.rs2_data = rs2_tmp_data;
assign gpr_read_if.rs3_data = rs3_tmp_data;

View file

@ -7,7 +7,7 @@ module VX_gpr_stage #(
input wire reset,
// inputs
VX_wb_if writeback_if,
VX_writeback_if writeback_if,
// outputs
VX_gpr_read_if gpr_read_if
@ -50,14 +50,14 @@ module VX_gpr_stage #(
assign gpr_read_if.rs1_data = rs1_tmp_data;
assign gpr_read_if.rs2_data = rs2_tmp_data;
assign gpr_read_if.rs3_data = 0;
assign gpr_read_if.ready = 1;
assign gpr_read_if.ready_in = 1;
wire valid = gpr_read_if.valid;
wire use_rs3 = gpr_read_if.use_rs3;
wire [`NR_BITS-1:0] rs3 = gpr_read_if.rs3;
`UNUSED_VAR (valid);
`UNUSED_VAR (use_rs3);
`UNUSED_VAR (rs3);
`UNUSED_FIELD (gpr_read_if, valid);
`UNUSED_FIELD (gpr_read_if, use_rs3);
`UNUSED_FIELD (gpr_read_if, rs3);
`UNUSED_FIELD (gpr_read_if, ready_out);
`endif
assign writeback_if.ready = 1'b1;
endmodule

View file

@ -3,15 +3,15 @@
module VX_gpu_unit #(
parameter CORE_ID = 0
) (
input wire clk,
input wire reset,
input wire clk,
input wire reset,
// Input
VX_gpu_req_if gpu_req_if,
VX_gpu_req_if gpu_req_if,
// Output
VX_warp_ctl_if warp_ctl_if,
VX_exu_to_cmt_if gpu_commit_if
VX_warp_ctl_if warp_ctl_if,
VX_exu_to_cmt_if gpu_commit_if
);
gpu_tmc_t tmc;
gpu_wspawn_t wspawn;
@ -23,15 +23,13 @@ module VX_gpu_unit #(
wire is_split = (gpu_req_if.op == `GPU_SPLIT);
wire is_bar = (gpu_req_if.op == `GPU_BAR);
wire gpu_req_fire = gpu_req_if.valid;
// tmc
wire [`NUM_THREADS-1:0] tmc_new_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]);
end
assign tmc.valid = gpu_req_fire && is_tmc;
assign tmc.valid = is_tmc;
assign tmc.thread_mask = tmc_new_mask;
// wspawn
@ -41,7 +39,7 @@ module VX_gpu_unit #(
for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]);
end
assign wspawn.valid = gpu_req_fire && is_wspawn;
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
assign wspawn.pc = wspawn_pc;
@ -56,7 +54,7 @@ module VX_gpu_unit #(
assign split_else_mask[i] = gpu_req_if.thread_mask[i] & ~taken;
end
assign split.valid = gpu_req_fire && is_split;
assign split.valid = is_split;
assign split.diverged = (| split_then_mask) && (| split_else_mask);
assign split.then_mask = split_then_mask;
assign split.else_mask = split_else_mask;
@ -64,23 +62,29 @@ module VX_gpu_unit #(
// barrier
assign barrier.valid = is_bar && gpu_req_fire;
assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0];
assign barrier.num_warps = (`NW_BITS+1)'(gpu_req_if.rs2_data - 1);
assign barrier.valid = is_bar;
assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0];
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1);
// output
wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid;
VX_generic_register #(
.N(1 + `ISTAG_BITS + `NW_BITS + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t))
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + $bits(gpu_tmc_t) + $bits(gpu_wspawn_t) + $bits(gpu_split_t) + $bits(gpu_barrier_t))
) gpu_reg (
.clk (clk),
.reset (reset),
.stall (0),
.stall (stall),
.flush (0),
.in ({gpu_req_if.valid, gpu_req_if.issue_tag, gpu_req_if.wid, tmc, wspawn, split, barrier}),
.out ({gpu_commit_if.valid, gpu_commit_if.issue_tag, warp_ctl_if.wid, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier})
.in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}),
.out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.thread_mask, gpu_commit_if.curr_PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier})
);
assign gpu_req_if.ready = 1'b1;
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready;
assign warp_ctl_if.wid = gpu_commit_if.wid;
// can accept new request?
assign gpu_req_if.ready = ~stall;
endmodule

187
hw/rtl/VX_ibuffer.v Normal file
View file

@ -0,0 +1,187 @@
`include "VX_define.vh"
// Per-warp instruction buffer between decode and issue.
//
// Each warp owns a small FIFO of decoded instructions. The head of every
// FIFO is kept in a dedicated register (`q_data_out[i]`) and the remaining
// entries are stored in the `entries` array. A scheduler selects one warp
// with buffered instructions at a time and presents its head instruction
// on `ibuf_deq_if`.
//
// Ports:
//   freeze      - when set, the currently selected warp/instruction is held
//                 (except for the first enqueue into a completely empty
//                 buffer, which is always forwarded).
//   ibuf_enq_if - decoded instruction being enqueued; `ready` is deasserted
//                 when the target warp's FIFO is full.
//   ibuf_deq_if - instruction offered to the next stage.
module VX_ibuffer #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

    // inputs
    input wire   freeze, // do not switch to another warp
    VX_decode_if ibuf_enq_if,

    // outputs
    VX_decode_if ibuf_deq_if
);
    // width of one buffered instruction (everything except valid and wid)
    localparam DATAW = `NUM_THREADS + 32 + `EX_BITS + `OP_BITS + `FRM_BITS + 1 + (`NR_BITS * 4) + 32 + 1 + 1 + 1 + `NUM_REGS;
    localparam SIZE  = `IBUF_SIZE;

    // per-warp storage for the entries queued behind the head register
    `USE_FAST_BRAM reg [DATAW-1:0] entries [`NUM_WARPS-1:0][SIZE-1:0];
    reg [`LOG2UP(SIZE+1)-1:0] size_r [`NUM_WARPS-1:0];
    reg [`LOG2UP(SIZE):0] rd_ptr_r [`NUM_WARPS-1:0];
    reg [`LOG2UP(SIZE):0] wr_ptr_r [`NUM_WARPS-1:0];

    wire [`NUM_WARPS-1:0] q_full;
    wire [`NUM_WARPS-1:0][`LOG2UP(SIZE+1)-1:0] q_size;
    wire [DATAW-1:0] q_data_in;
    wire [`NUM_WARPS-1:0][DATAW-1:0] q_data_prev;   // entry that becomes head after a dequeue
    reg  [`NUM_WARPS-1:0][DATAW-1:0] q_data_out;    // current head of each warp's FIFO

    wire enq_fire = ibuf_enq_if.valid && ibuf_enq_if.ready;
    wire deq_fire = ibuf_deq_if.valid && ibuf_deq_if.ready;

    for (genvar i = 0; i < `NUM_WARPS; ++i) begin

        wire writing = enq_fire && (i == ibuf_enq_if.wid);
        wire reading = deq_fire && (i == ibuf_deq_if.wid);

        wire [`LOG2UP(SIZE-1)-1:0] rd_ptr_a = rd_ptr_r[i][`LOG2UP(SIZE-1)-1:0];
        wire [`LOG2UP(SIZE-1)-1:0] wr_ptr_a = wr_ptr_r[i][`LOG2UP(SIZE-1)-1:0];

        always @(posedge clk) begin
            if (reset) begin
                rd_ptr_r[i] <= 0;
                wr_ptr_r[i] <= 0;
                size_r[i]   <= 0;
            end else begin
                if (writing) begin
                    // the new instruction becomes the head when the FIFO is
                    // empty, or when its only entry is leaving this cycle
                    if ((0 == size_r[i]) || ((1 == size_r[i]) && reading)) begin
                        q_data_out[i] <= q_data_in;
                    end else begin
                        entries[i][wr_ptr_a] <= q_data_in;
                        wr_ptr_r[i] <= wr_ptr_r[i] + 1;
                    end
                    if (!reading) begin
                        size_r[i] <= size_r[i] + 1;
                    end
                end
                if (reading) begin
                    // promote the next queued entry to the head register
                    if (size_r[i] != 1) begin
                        q_data_out[i] <= q_data_prev[i];
                        rd_ptr_r[i]   <= rd_ptr_r[i] + 1;
                    end
                    if (!writing) begin
                        size_r[i] <= size_r[i] - 1;
                    end
                end
            end
        end

        // Compare THIS warp's pointers only. Comparing the whole
        // wr_ptr_r/rd_ptr_r arrays would make the selection depend on the
        // fill state of unrelated warps.
        assign q_data_prev[i] = (wr_ptr_r[i] != rd_ptr_r[i]) ? entries[i][rd_ptr_a] : q_data_in;
        assign q_full[i]      = (size_r[i] == SIZE);
        assign q_size[i]      = size_r[i];
    end

    ///////////////////////////////////////////////////////////////////////////

    reg [`NUM_WARPS-1:0] valid_table, valid_table_n;    // warps holding at least one instruction
    reg [`NUM_WARPS-1:0] ready_table, ready_table_n;    // warps still to be served in the current pass
    reg [`LOG2UP(`NUM_WARPS+1)-1:0] active_warps;       // number of warps with a non-empty FIFO
    reg [`NW_BITS-1:0] deq_wid, deq_wid_n;
    reg deq_valid, deq_valid_n;
    reg [DATAW-1:0] deq_instr, deq_instr_n;

    // next value of the "has instructions" table
    always @(*) begin
        valid_table_n = valid_table;
        if (deq_fire) begin
            valid_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1);
        end
        if (enq_fire) begin
            valid_table_n[ibuf_enq_if.wid] = 1;
        end
    end

    // select the next warp to present: lowest-indexed warp whose ready bit
    // is set; the bit is cleared once the warp has been picked
    always @(*) begin
        deq_wid_n     = 0;
        deq_valid_n   = 0;
        // default assignment so that no latch is inferred when no warp is
        // ready; the value is irrelevant in that case (deq_valid_n == 0)
        deq_instr_n   = deq_instr;
        ready_table_n = ready_table;
        if (deq_fire) begin
            ready_table_n[ibuf_deq_if.wid] = (q_size[ibuf_deq_if.wid] != 1);
        end
        for (integer i = 0; i < `NUM_WARPS; i++) begin
            if (ready_table_n[i]) begin
                deq_wid_n   = `NW_BITS'(i);
                deq_valid_n = 1;
                // if this warp is dequeuing right now, its head register is
                // stale: use the entry that is about to become the head
                deq_instr_n = (deq_fire && (ibuf_deq_if.wid == `NW_BITS'(i))) ? q_data_prev[i] : q_data_out[i];
                ready_table_n[i] = 0;
                break;
            end
        end
    end

    // a warp is added when its FIFO goes from empty to non-empty and
    // removed when its last entry leaves without being replaced
    wire warp_added   = enq_fire && (0 == q_size[ibuf_enq_if.wid]) && (!deq_fire || ibuf_enq_if.wid != ibuf_deq_if.wid);
    wire warp_removed = deq_fire && (1 == q_size[ibuf_deq_if.wid]) && (!enq_fire || ibuf_enq_if.wid != ibuf_deq_if.wid);

    always @(posedge clk) begin
        if (reset) begin
            valid_table  <= 0;
            ready_table  <= 0;
            deq_valid    <= 0;
            active_warps <= 0;
        end else begin
            valid_table <= valid_table_n;
            // once every ready warp has been served, start a new pass over
            // all warps that currently hold instructions
            ready_table <= (| ready_table_n) ? ready_table_n : valid_table_n;

            if (enq_fire && (0 == active_warps)) begin
                // buffer completely empty: forward the incoming instruction
                deq_valid <= 1;
                deq_wid   <= ibuf_enq_if.wid;
                deq_instr <= q_data_in;
            end else if (!freeze) begin
                deq_valid <= deq_valid_n;
                deq_wid   <= deq_wid_n;
                deq_instr <= deq_instr_n;
            end

            if (warp_added && !warp_removed) begin
                active_warps <= active_warps + 1;
            end else if (warp_removed && !warp_added) begin
                active_warps <= active_warps - 1;
            end

            // consistency checks (simulation only)
            begin
                integer k;
                // explicit assignment: a declaration initializer on a static
                // variable would only run once at the start of simulation
                k = 0;
                for (integer i = 0; i < `NUM_WARPS; i++) begin
                    k += 32'(q_size[i] != 0);
                end
                assert(k == 32'(active_warps));
                assert(~deq_fire || active_warps != 0);
            end
        end
    end

    // accept a new instruction unless the target warp's FIFO is full
    assign ibuf_enq_if.ready = ~q_full[ibuf_enq_if.wid];

    // pack the enqueued instruction; must match the unpack order below
    assign q_data_in = {ibuf_enq_if.thread_mask,
                        ibuf_enq_if.curr_PC,
                        ibuf_enq_if.ex_type,
                        ibuf_enq_if.ex_op,
                        ibuf_enq_if.frm,
                        ibuf_enq_if.wb,
                        ibuf_enq_if.rd,
                        ibuf_enq_if.rs1,
                        ibuf_enq_if.rs2,
                        ibuf_enq_if.rs3,
                        ibuf_enq_if.imm,
                        ibuf_enq_if.rs1_is_PC,
                        ibuf_enq_if.rs2_is_imm,
                        ibuf_enq_if.use_rs3,
                        ibuf_enq_if.used_regs};

    assign ibuf_deq_if.valid = deq_valid;
    assign ibuf_deq_if.wid   = deq_wid;
    assign {ibuf_deq_if.thread_mask,
            ibuf_deq_if.curr_PC,
            ibuf_deq_if.ex_type,
            ibuf_deq_if.ex_op,
            ibuf_deq_if.frm,
            ibuf_deq_if.wb,
            ibuf_deq_if.rd,
            ibuf_deq_if.rs1,
            ibuf_deq_if.rs2,
            ibuf_deq_if.rs3,
            ibuf_deq_if.imm,
            ibuf_deq_if.rs1_is_PC,
            ibuf_deq_if.rs2_is_imm,
            ibuf_deq_if.use_rs3,
            ibuf_deq_if.used_regs} = deq_instr;

endmodule

View file

@ -46,7 +46,7 @@ module VX_icache_stage #(
assign ifetch_req_if.ready = icache_req_if.ready;
`ifdef DBG_CORE_REQ_INFO
assign icache_req_if.tag = {ifetch_req_if.curr_PC, 1'b0, `NR_BITS'(0), ifetch_req_if.wid, req_tag};
assign icache_req_if.tag = {ifetch_req_if.curr_PC, `NR_BITS'(0), ifetch_req_if.wid, req_tag};
`else
assign icache_req_if.tag = req_tag;
`endif

233
hw/rtl/VX_instr_demux.v Normal file
View file

@ -0,0 +1,233 @@
`include "VX_define.vh"
// VX_instr_demux: steers the instruction presented on execute_if to exactly one
// per-unit request interface (ALU, LSU, CSR, MUL, FPU, GPU), chosen by
// execute_if.ex_type.
//
// Each unit uses the same two-part structure:
//   * a VX_skid_buffer carrying the decoded control fields, with the
//     valid_in/ready_in handshake on the execute_if side and the
//     valid_out/ready_out handshake on the unit side;
//   * a VX_gpr_bypass carrying register operands from gpr_read_if, pushed when
//     the request is accepted (valid && ready on the input side) and popped
//     when the unit consumes it (valid && ready on the unit side).
// NOTE(review): the internal timing of VX_skid_buffer / VX_gpr_bypass is not
// visible from this file; confirm operand/control alignment against them.
module VX_instr_demux (
input wire clk,
input wire reset,
// inputs
VX_decode_if execute_if,
VX_gpr_read_if gpr_read_if,
VX_csr_to_issue_if csr_to_issue_if,
// outputs
VX_alu_req_if alu_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_mul_req_if mul_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if
);
// ALU unit
// Request is valid only when the incoming instruction targets the ALU.
wire alu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_ALU);
// Driven by the ALU skid buffer's ready_in port (no other driver in this module).
wire alu_req_ready;
// Thread index derived from the thread mask by a priority encoder; it is
// forwarded to the ALU as alu_req_if.tid.
// NOTE(review): presumably the index of the first active thread - confirm
// against VX_priority_encoder.
wire [`NT_BITS-1:0] tid;
VX_priority_encoder #(
.N(`NUM_THREADS)
) tid_select (
.data_in (execute_if.thread_mask),
.data_out (tid),
`UNUSED_PIN (valid_out)
);
// DATAW terms follow the field order of data_in/data_out below:
// wid, thread_mask, curr_PC, op, imm, rs1_is_PC, rs2_is_imm, rd, wb, tid.
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `ALU_BR_BITS + 32 + 1 + 1 + `NR_BITS + 1 + `NT_BITS)
) alu_reg (
.clk (clk),
.reset (reset),
.ready_in (alu_req_ready),
.valid_in (alu_req_valid),
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `ALU_BR_OP(execute_if.ex_op), execute_if.imm, execute_if.rs1_is_PC, execute_if.rs2_is_imm, execute_if.rd, execute_if.wb, tid}),
.data_out ({alu_req_if.wid, alu_req_if.thread_mask, alu_req_if.curr_PC, alu_req_if.op, alu_req_if.imm, alu_req_if.rs1_is_PC, alu_req_if.rs2_is_imm, alu_req_if.rd, alu_req_if.wb, alu_req_if.tid}),
.ready_out (alu_req_if.ready),
.valid_out (alu_req_if.valid)
);
// rs1/rs2 operands for all threads travel alongside the control fields.
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) alu_bypass (
.clk (clk),
.reset (reset),
.push (alu_req_valid && alu_req_ready),
.data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data}),
.data_out ({alu_req_if.rs1_data, alu_req_if.rs2_data}),
.pop (alu_req_if.valid && alu_req_if.ready)
);
// lsu unit
wire lsu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_LSU);
// Driven by the LSU skid buffer's ready_in port.
wire lsu_req_ready;
// Field order: wid, thread_mask, curr_PC, rw, byteen, offset (imm), rd, wb.
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 1 + `BYTEEN_BITS + 32 + `NR_BITS + 1)
) lsu_reg (
.clk (clk),
.reset (reset),
.ready_in (lsu_req_ready),
.valid_in (lsu_req_valid),
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `LSU_RW(execute_if.ex_op), `LSU_BE(execute_if.ex_op), execute_if.imm, execute_if.rd, execute_if.wb}),
.data_out ({lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.curr_PC, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb}),
.ready_out (lsu_req_if.ready),
.valid_out (lsu_req_if.valid)
);
// rs1 is forwarded as the base address and rs2 as the store data.
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) lsu_bypass (
.clk (clk),
.reset (reset),
.push (lsu_req_valid && lsu_req_ready),
.data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data}),
.data_out ({lsu_req_if.base_addr, lsu_req_if.store_data}),
.pop (lsu_req_if.valid && lsu_req_if.ready)
);
// csr unit
wire csr_req_valid = execute_if.valid && (execute_if.ex_type == `EX_CSR);
// Driven by the CSR skid buffer's ready_in port.
wire csr_req_ready;
// Field order: wid, thread_mask, curr_PC, op, csr_addr (low bits of imm), rd,
// wb, is_io. is_io is tied to 1'b0 for requests issued from this path.
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `CSR_BITS + `CSR_ADDR_BITS + `NR_BITS + 1 + 1)
) csr_reg (
.clk (clk),
.reset (reset),
.ready_in (csr_req_ready),
.valid_in (csr_req_valid),
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `CSR_OP(execute_if.ex_op), execute_if.imm[`CSR_ADDR_BITS-1:0], execute_if.rd, execute_if.wb, 1'b0}),
.data_out ({csr_req_if.wid, csr_req_if.thread_mask, csr_req_if.curr_PC, csr_req_if.op, csr_req_if.csr_addr, csr_req_if.rd, csr_req_if.wb, csr_req_if.is_io}),
.ready_out (csr_req_if.ready),
.valid_out (csr_req_if.valid)
);
// rs2_is_imm and rs1 are sampled on every clock edge (no enable, no reset), so
// the mask mux below sees them one cycle after they appear on execute_if.
// NOTE(review): the delay presumably lines these fields up with the GPR read
// data consumed by csr_bypass - confirm against VX_gpr_stage / VX_gpr_bypass.
reg tmp_rs2_is_imm;
reg [`NR_BITS-1:0] tmp_rs1;
always @(posedge clk) begin
tmp_rs2_is_imm <= execute_if.rs2_is_imm;
tmp_rs1 <= execute_if.rs1;
end
// CSR mask: the rs1 field itself widened to 32 bits when rs2_is_imm is set,
// otherwise element 0 of the rs1 register data.
wire [31:0] csr_req_mask = tmp_rs2_is_imm ? 32'(tmp_rs1) : gpr_read_if.rs1_data[0];
VX_gpr_bypass #(
.DATAW (32)
) csr_bypass (
.clk (clk),
.reset (reset),
.push (csr_req_valid && csr_req_ready),
.data_in (csr_req_mask),
.data_out (csr_req_if.csr_mask),
.pop (csr_req_if.valid && csr_req_if.ready)
);
// mul unit
// Only instantiated when EXT_M_ENABLE is defined; otherwise this module does
// not drive mul_req_if.
`ifdef EXT_M_ENABLE
wire mul_req_valid = execute_if.valid && (execute_if.ex_type == `EX_MUL);
// Driven by the MUL skid buffer's ready_in port.
wire mul_req_ready;
// Field order: wid, thread_mask, curr_PC, op, rd, wb.
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `MUL_BITS + `NR_BITS + 1)
) mul_reg (
.clk (clk),
.reset (reset),
.ready_in (mul_req_ready),
.valid_in (mul_req_valid),
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `MUL_OP(execute_if.ex_op), execute_if.rd, execute_if.wb}),
.data_out ({mul_req_if.wid, mul_req_if.thread_mask, mul_req_if.curr_PC, mul_req_if.op, mul_req_if.rd, mul_req_if.wb}),
.ready_out (mul_req_if.ready),
.valid_out (mul_req_if.valid)
);
VX_gpr_bypass #(
.DATAW ((2 * `NUM_THREADS * 32))
) mul_bypass (
.clk (clk),
.reset (reset),
.push (mul_req_valid && mul_req_ready),
.data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data}),
.data_out ({mul_req_if.rs1_data, mul_req_if.rs2_data}),
.pop (mul_req_if.valid && mul_req_if.ready)
);
`endif
// fpu unit
// Only instantiated when EXT_F_ENABLE is defined; otherwise this module drives
// neither fpu_req_if nor csr_to_issue_if.wid.
`ifdef EXT_F_ENABLE
wire fpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_FPU);
// Driven by the FPU skid buffer's ready_in port.
wire fpu_req_ready;
// resolve dynamic FRM
// The current warp id is presented on csr_to_issue_if.wid; when the
// instruction's frm field equals `FRM_DYN, csr_to_issue_if.frm is used in its
// place.
assign csr_to_issue_if.wid = execute_if.wid;
wire [`FRM_BITS-1:0] fpu_frm = (execute_if.frm == `FRM_DYN) ? csr_to_issue_if.frm : execute_if.frm;
// Field order: wid, thread_mask, curr_PC, op, frm (resolved), rd, wb.
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `FPU_BITS + `FRM_BITS + `NR_BITS + 1)
) fpu_reg (
.clk (clk),
.reset (reset),
.ready_in (fpu_req_ready),
.valid_in (fpu_req_valid),
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `FPU_OP(execute_if.ex_op), fpu_frm, execute_if.rd, execute_if.wb}),
.data_out ({fpu_req_if.wid, fpu_req_if.thread_mask, fpu_req_if.curr_PC, fpu_req_if.op, fpu_req_if.frm, fpu_req_if.rd, fpu_req_if.wb}),
.ready_out (fpu_req_if.ready),
.valid_out (fpu_req_if.valid)
);
// The FPU is the only unit here that also receives rs3.
VX_gpr_bypass #(
.DATAW ((3 * `NUM_THREADS * 32))
) fpu_bypass (
.clk (clk),
.reset (reset),
.push (fpu_req_valid && fpu_req_ready),
.data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data, gpr_read_if.rs3_data}),
.data_out ({fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data}),
.pop (fpu_req_if.valid && fpu_req_if.ready)
);
`endif
// gpu unit
wire gpu_req_valid = execute_if.valid && (execute_if.ex_type == `EX_GPU);
// Driven by the GPU skid buffer's ready_in port.
wire gpu_req_ready;
// Field order: wid, thread_mask, curr_PC, op, rd, wb.
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `GPU_BITS + `NR_BITS + 1)
) gpu_reg (
.clk (clk),
.reset (reset),
.ready_in (gpu_req_ready),
.valid_in (gpu_req_valid),
.data_in ({execute_if.wid, execute_if.thread_mask, execute_if.curr_PC, `GPU_OP(execute_if.ex_op), execute_if.rd, execute_if.wb}),
.data_out ({gpu_req_if.wid, gpu_req_if.thread_mask, gpu_req_if.curr_PC, gpu_req_if.op, gpu_req_if.rd, gpu_req_if.wb}),
.ready_out (gpu_req_if.ready),
.valid_out (gpu_req_if.valid)
);
// rs1 is forwarded for every thread, but only element 0 of rs2 is forwarded,
// hence DATAW = NUM_THREADS*32 + 32.
VX_gpr_bypass #(
.DATAW ((`NUM_THREADS * 32) + 32)
) gpu_bypass (
.clk (clk),
.reset (reset),
.push (gpu_req_valid && gpu_req_ready),
.data_in ({gpr_read_if.rs1_data, gpr_read_if.rs2_data[0]}),
.data_out ({gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.pop (gpu_req_if.valid && gpu_req_if.ready)
);
// can take next request?
// Ready is the ready of the buffer selected by ex_type. If ex_type matches no
// enabled unit, no term is true and execute_if.ready stays low.
assign execute_if.ready = (alu_req_ready && (execute_if.ex_type == `EX_ALU))
|| (lsu_req_ready && (execute_if.ex_type == `EX_LSU))
|| (csr_req_ready && (execute_if.ex_type == `EX_CSR))
`ifdef EXT_M_ENABLE
|| (mul_req_ready && (execute_if.ex_type == `EX_MUL))
`endif
`ifdef EXT_F_ENABLE
|| (fpu_req_ready && (execute_if.ex_type == `EX_FPU))
`endif
|| (gpu_req_ready && (execute_if.ex_type == `EX_GPU));
endmodule

View file

@ -7,53 +7,51 @@ module VX_issue #(
input wire reset,
VX_decode_if decode_if,
VX_wb_if writeback_if,
VX_cmt_to_issue_if cmt_to_issue_if,
VX_writeback_if writeback_if,
VX_csr_to_issue_if csr_to_issue_if,
VX_alu_req_if alu_req_if,
VX_bru_req_if bru_req_if,
VX_lsu_req_if lsu_req_if,
VX_csr_req_if csr_req_if,
VX_mul_req_if mul_req_if,
VX_fpu_req_if fpu_req_if,
VX_gpu_req_if gpu_req_if
);
wire [`ISTAG_BITS-1:0] issue_tag;
wire schedule_delay;
VX_decode_if ibuf_deq_if();
VX_decode_if execute_if();
VX_gpr_read_if gpr_read_if();
assign gpr_read_if.valid = decode_if.valid && ~schedule_delay;
assign gpr_read_if.wid = decode_if.wid;
assign gpr_read_if.rs1 = decode_if.rs1;
assign gpr_read_if.rs2 = decode_if.rs2;
assign gpr_read_if.rs3 = decode_if.rs3;
assign gpr_read_if.use_rs3 = decode_if.use_rs3;
wire ex_busy = (~alu_req_if.ready && (decode_if.ex_type == `EX_ALU))
|| (~bru_req_if.ready && (decode_if.ex_type == `EX_BRU))
|| (~lsu_req_if.ready && (decode_if.ex_type == `EX_LSU))
|| (~csr_req_if.ready && (decode_if.ex_type == `EX_CSR))
`ifdef EXT_M_ENABLE
|| (~mul_req_if.ready && (decode_if.ex_type == `EX_MUL))
`endif
`ifdef EXT_F_ENABLE
|| (~fpu_req_if.ready && (decode_if.ex_type == `EX_FPU))
`endif
|| (~gpu_req_if.ready && (decode_if.ex_type == `EX_GPU));
wire scoreboard_delay;
VX_ibuffer #(
.CORE_ID(CORE_ID)
) ibuffer (
.clk (clk),
.reset (reset),
.ibuf_enq_if (decode_if),
.ibuf_deq_if (ibuf_deq_if),
.freeze (~gpr_read_if.ready_in)
);
VX_scoreboard #(
.CORE_ID(CORE_ID)
) scoreboard (
.clk (clk),
.reset (reset),
.decode_if (decode_if),
.ibuf_deq_if (ibuf_deq_if),
.writeback_if (writeback_if),
.cmt_to_issue_if(cmt_to_issue_if),
.ex_busy (ex_busy),
.issue_tag (issue_tag),
.schedule_delay (schedule_delay)
.exe_delay (~execute_if.ready),
.gpr_delay (~gpr_read_if.ready_in),
.delay (scoreboard_delay)
);
assign gpr_read_if.valid = ibuf_deq_if.valid && ~scoreboard_delay;
assign gpr_read_if.wid = ibuf_deq_if.wid;
assign gpr_read_if.rs1 = ibuf_deq_if.rs1;
assign gpr_read_if.rs2 = ibuf_deq_if.rs2;
assign gpr_read_if.rs3 = ibuf_deq_if.rs3;
assign gpr_read_if.use_rs3 = ibuf_deq_if.use_rs3;
assign gpr_read_if.ready_out = execute_if.ready;
VX_gpr_stage #(
.CORE_ID(CORE_ID)
@ -63,72 +61,54 @@ module VX_issue #(
.writeback_if (writeback_if),
.gpr_read_if (gpr_read_if)
);
VX_issue_if issue_if();
assign issue_if.rs1_data = gpr_read_if.rs1_data;
assign issue_if.rs2_data = gpr_read_if.rs2_data;
assign issue_if.rs3_data = gpr_read_if.rs3_data;
wire [`NT_BITS-1:0] tid;
VX_priority_encoder #(
.N(`NUM_THREADS)
) sel_src (
.data_in (decode_if.thread_mask),
.data_out (tid),
`UNUSED_PIN (valid_out)
);
wire stall = schedule_delay || ~gpr_read_if.ready;
wire flush = stall; // clear output on stall
VX_generic_register #(
.N(1 + `ISTAG_BITS + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + `NR_BITS + 32 + 1 + 1 + `EX_BITS + `OP_BITS + 1 + `FRM_BITS + `NT_BITS)
) issue_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (flush),
.in ({decode_if.valid, issue_tag, decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.rs1, decode_if.imm, decode_if.rs1_is_PC, decode_if.rs2_is_imm, decode_if.ex_type, decode_if.ex_op, decode_if.wb, decode_if.frm, tid}),
.out ({issue_if.valid, issue_if.issue_tag, issue_if.wid, issue_if.thread_mask, issue_if.curr_PC, issue_if.rd, issue_if.rs1, issue_if.imm, issue_if.rs1_is_PC, issue_if.rs2_is_imm, issue_if.ex_type, issue_if.ex_op, issue_if.wb, issue_if.frm, issue_if.tid})
);
assign decode_if.ready = issue_if.ready;
assign issue_if.ready = ~stall;
VX_issue_demux issue_demux (
.issue_if (issue_if),
.alu_req_if (alu_req_if),
.bru_req_if (bru_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
.mul_req_if (mul_req_if),
.fpu_req_if (fpu_req_if),
.gpu_req_if (gpu_req_if)
);
assign execute_if.valid = ibuf_deq_if.valid && gpr_read_if.ready_in && ~scoreboard_delay;
assign execute_if.wid = ibuf_deq_if.wid;
assign execute_if.thread_mask = ibuf_deq_if.thread_mask;
assign execute_if.curr_PC = ibuf_deq_if.curr_PC;
assign execute_if.ex_type = ibuf_deq_if.ex_type;
assign execute_if.ex_op = ibuf_deq_if.ex_op;
assign execute_if.frm = ibuf_deq_if.frm;
assign execute_if.wb = ibuf_deq_if.wb;
assign execute_if.rd = ibuf_deq_if.rd;
assign execute_if.rs1 = ibuf_deq_if.rs1;
assign execute_if.imm = ibuf_deq_if.imm;
assign execute_if.rs1_is_PC = ibuf_deq_if.rs1_is_PC;
assign execute_if.rs2_is_imm = ibuf_deq_if.rs2_is_imm;
VX_instr_demux instr_demux (
.clk (clk),
.reset (reset),
.execute_if (execute_if),
.gpr_read_if (gpr_read_if),
.csr_to_issue_if(csr_to_issue_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
.mul_req_if (mul_req_if),
.fpu_req_if (fpu_req_if),
.gpu_req_if (gpu_req_if)
);
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (alu_req_if.valid && alu_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=ALU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.curr_PC, alu_req_if.issue_tag, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data);
end
if (bru_req_if.valid && bru_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=BRU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, offset=%0h", $time, CORE_ID, bru_req_if.wid, bru_req_if.curr_PC, bru_req_if.issue_tag, bru_req_if.thread_mask, bru_req_if.rs1_data, bru_req_if.rs2_data, bru_req_if.offset);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=ALU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, alu_req_if.wid, alu_req_if.curr_PC, alu_req_if.thread_mask, alu_req_if.rs1_data, alu_req_if.rs2_data);
end
if (lsu_req_if.valid && lsu_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=LSU, istag=%0d, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.curr_PC, lsu_req_if.issue_tag, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=LSU, tmask=%b, rw=%b, byteen=%b, baddr=%0h, offset=%0h, data=%0h", $time, CORE_ID, lsu_req_if.wid, lsu_req_if.curr_PC, lsu_req_if.thread_mask, lsu_req_if.rw, lsu_req_if.byteen, lsu_req_if.base_addr, lsu_req_if.offset, lsu_req_if.store_data);
end
if (csr_req_if.valid && csr_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=CSR, istag=%0d, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.curr_PC, csr_req_if.issue_tag, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=CSR, tmask=%b, addr=%0h, mask=%0h", $time, CORE_ID, csr_req_if.wid, csr_req_if.curr_PC, csr_req_if.thread_mask, csr_req_if.csr_addr, csr_req_if.csr_mask);
end
if (mul_req_if.valid && mul_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=MUL, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.curr_PC, mul_req_if.issue_tag, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=MUL, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, mul_req_if.wid, mul_req_if.curr_PC, mul_req_if.thread_mask, mul_req_if.rs1_data, mul_req_if.rs2_data);
end
if (fpu_req_if.valid && fpu_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=FPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.issue_tag, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=FPU, tmask=%b, rs1_data=%0h, rs2_data=%0h, rs3_data=%0h", $time, CORE_ID, fpu_req_if.wid, fpu_req_if.curr_PC, fpu_req_if.thread_mask, fpu_req_if.rs1_data, fpu_req_if.rs2_data, fpu_req_if.rs3_data);
end
if (gpu_req_if.valid && gpu_req_if.ready) begin
$display("%t: Core%0d-issue: wid=%0d, PC=%0h, ex=GPU, istag=%0d, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.curr_PC, gpu_req_if.issue_tag, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data);
$display("%t: core%0d-issue: wid=%0d, PC=%0h, ex=GPU, tmask=%b, rs1_data=%0h, rs2_data=%0h", $time, CORE_ID, gpu_req_if.wid, gpu_req_if.curr_PC, gpu_req_if.thread_mask, gpu_req_if.rs1_data, gpu_req_if.rs2_data);
end
end
`endif

View file

@ -1,102 +0,0 @@
`include "VX_define.vh"
// VX_issue_demux: purely combinational steering of the issued instruction on
// issue_if onto the per-unit request interfaces. Instruction fields are
// forwarded unconditionally; only each unit's valid is qualified by
// issue_if.ex_type, so at most the matching unit sees a valid request.
module VX_issue_demux (
    // inputs
    VX_issue_if     issue_if,

    // outputs
    VX_alu_req_if   alu_req_if,
    VX_bru_req_if   bru_req_if,
    VX_lsu_req_if   lsu_req_if,
    VX_csr_req_if   csr_req_if,
    VX_mul_req_if   mul_req_if,
    VX_fpu_req_if   fpu_req_if,
    VX_gpu_req_if   gpu_req_if
);
    // Per-unit select: high when ex_type names that unit.
    wire sel_alu = (issue_if.ex_type == `EX_ALU);
    wire sel_bru = (issue_if.ex_type == `EX_BRU);
    wire sel_lsu = (issue_if.ex_type == `EX_LSU);
    wire sel_csr = (issue_if.ex_type == `EX_CSR);
    wire sel_gpu = (issue_if.ex_type == `EX_GPU);

    // ---------------------------------------------------------------- ALU
    assign alu_req_if.valid       = issue_if.valid && sel_alu;
    assign alu_req_if.issue_tag   = issue_if.issue_tag;
    assign alu_req_if.wid         = issue_if.wid;
    assign alu_req_if.thread_mask = issue_if.thread_mask;
    assign alu_req_if.curr_PC     = issue_if.curr_PC;
    assign alu_req_if.op          = `ALU_OP(issue_if.ex_op);
    assign alu_req_if.imm         = issue_if.imm;
    assign alu_req_if.rs1_is_PC   = issue_if.rs1_is_PC;
    assign alu_req_if.rs2_is_imm  = issue_if.rs2_is_imm;
    assign alu_req_if.rs1_data    = issue_if.rs1_data;
    assign alu_req_if.rs2_data    = issue_if.rs2_data;

    // ---------------------------------------------------------------- BRU
    // Operands are taken from the single thread selected by issue_if.tid.
    assign bru_req_if.valid       = issue_if.valid && sel_bru;
    assign bru_req_if.issue_tag   = issue_if.issue_tag;
    assign bru_req_if.wid         = issue_if.wid;
    assign bru_req_if.thread_mask = issue_if.thread_mask;
    assign bru_req_if.curr_PC     = issue_if.curr_PC;
    assign bru_req_if.op          = `BRU_OP(issue_if.ex_op);
    assign bru_req_if.offset      = issue_if.imm;
    assign bru_req_if.rs1_is_PC   = issue_if.rs1_is_PC;
    assign bru_req_if.rs1_data    = issue_if.rs1_data[issue_if.tid];
    assign bru_req_if.rs2_data    = issue_if.rs2_data[issue_if.tid];

    // ---------------------------------------------------------------- LSU
    // rs1 supplies the base address, rs2 the store data, imm the offset.
    assign lsu_req_if.valid       = issue_if.valid && sel_lsu;
    assign lsu_req_if.issue_tag   = issue_if.issue_tag;
    assign lsu_req_if.wid         = issue_if.wid;
    assign lsu_req_if.thread_mask = issue_if.thread_mask;
    assign lsu_req_if.curr_PC     = issue_if.curr_PC;
    assign lsu_req_if.rw          = `LSU_RW(issue_if.ex_op);
    assign lsu_req_if.byteen      = `LSU_BE(issue_if.ex_op);
    assign lsu_req_if.offset      = issue_if.imm;
    assign lsu_req_if.base_addr   = issue_if.rs1_data;
    assign lsu_req_if.store_data  = issue_if.rs2_data;
    assign lsu_req_if.rd          = issue_if.rd;
    assign lsu_req_if.wb          = issue_if.wb;

    // ---------------------------------------------------------------- CSR
    // The mask is the rs1 field widened to 32 bits when rs2_is_imm is set,
    // otherwise element 0 of the rs1 register data. is_io is tied low here.
    assign csr_req_if.valid       = issue_if.valid && sel_csr;
    assign csr_req_if.issue_tag   = issue_if.issue_tag;
    assign csr_req_if.wid         = issue_if.wid;
    assign csr_req_if.thread_mask = issue_if.thread_mask;
    assign csr_req_if.curr_PC     = issue_if.curr_PC;
    assign csr_req_if.op          = `CSR_OP(issue_if.ex_op);
    assign csr_req_if.csr_addr    = issue_if.imm[`CSR_ADDR_BITS-1:0];
    assign csr_req_if.csr_mask    = issue_if.rs2_is_imm ? 32'(issue_if.rs1) : issue_if.rs1_data[0];
    assign csr_req_if.is_io       = 1'b0;

    // ---------------------------------------------------------------- MUL
    // Driven only when the M extension is enabled.
`ifdef EXT_M_ENABLE
    wire sel_mul = (issue_if.ex_type == `EX_MUL);

    assign mul_req_if.valid       = issue_if.valid && sel_mul;
    assign mul_req_if.issue_tag   = issue_if.issue_tag;
    assign mul_req_if.wid         = issue_if.wid;
    assign mul_req_if.thread_mask = issue_if.thread_mask;
    assign mul_req_if.curr_PC     = issue_if.curr_PC;
    assign mul_req_if.op          = `MUL_OP(issue_if.ex_op);
    assign mul_req_if.rs1_data    = issue_if.rs1_data;
    assign mul_req_if.rs2_data    = issue_if.rs2_data;
`endif

    // ---------------------------------------------------------------- FPU
    // Driven only when the F extension is enabled.
`ifdef EXT_F_ENABLE
    wire sel_fpu = (issue_if.ex_type == `EX_FPU);

    assign fpu_req_if.valid       = issue_if.valid && sel_fpu;
    assign fpu_req_if.issue_tag   = issue_if.issue_tag;
    assign fpu_req_if.wid         = issue_if.wid;
    assign fpu_req_if.thread_mask = issue_if.thread_mask;
    assign fpu_req_if.curr_PC     = issue_if.curr_PC;
    assign fpu_req_if.op          = `FPU_OP(issue_if.ex_op);
    assign fpu_req_if.frm         = issue_if.frm;
    assign fpu_req_if.rs1_data    = issue_if.rs1_data;
    assign fpu_req_if.rs2_data    = issue_if.rs2_data;
    assign fpu_req_if.rs3_data    = issue_if.rs3_data;
`endif

    // ---------------------------------------------------------------- GPU
    // rs1 is forwarded as-is; only element 0 of rs2 is forwarded.
    assign gpu_req_if.valid       = issue_if.valid && sel_gpu;
    assign gpu_req_if.issue_tag   = issue_if.issue_tag;
    assign gpu_req_if.wid         = issue_if.wid;
    assign gpu_req_if.thread_mask = issue_if.thread_mask;
    assign gpu_req_if.curr_PC     = issue_if.curr_PC;
    assign gpu_req_if.op          = `GPU_OP(issue_if.ex_op);
    assign gpu_req_if.rs1_data    = issue_if.rs1_data;
    assign gpu_req_if.rs2_data    = issue_if.rs2_data[0];

endmodule

View file

@ -18,10 +18,6 @@ module VX_lsu_unit #(
// outputs
VX_exu_to_cmt_if lsu_commit_if
);
wire valid_in;
wire ready_in;
wire [`NUM_THREADS-1:0] req_thread_mask;
wire req_rw;
wire [`NUM_THREADS-1:0][29:0] req_addr;
@ -30,10 +26,9 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0][31:0] req_data;
wire [1:0] req_sext;
wire [`NR_BITS-1:0] req_rd;
wire [`NW_BITS-1:0] req_wid;
wire [`ISTAG_BITS-1:0] req_issue_tag;
wire req_wb;
wire [31:0] req_pc;
wire [`NW_BITS-1:0] req_wid;
wire [31:0] req_curr_PC;
wire [`NUM_THREADS-1:0][31:0] full_address;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
@ -74,121 +69,127 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0][31:0] req_address;
`IGNORE_WARNINGS_END
// use a skid buffer because the dcache's ready signal is combinational
// use buffer size of two for stall-free execution
VX_elastic_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + `ISTAG_BITS + (`NUM_THREADS * 32) + 2 + 1 + (`NUM_THREADS * (30 + 2 + 4 + 32)) + `NR_BITS + 1 + 32),
.SIZE (2)
) input_buffer (
.clk (clk),
.reset (reset),
.valid_in (lsu_req_if.valid),
.ready_in (lsu_req_if.ready),
.data_in ({lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.issue_tag, full_address, mem_req_sext, lsu_req_if.rw, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.curr_PC}),
.data_out ({req_wid, req_thread_mask, req_issue_tag, req_address, req_sext, req_rw, req_addr, req_offset, req_byteen, req_data, req_rd, req_wb, req_pc}),
.ready_out (ready_in),
.valid_out (valid_in)
wire valid_in;
wire stall_in;
VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + 1 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 2 + (`NUM_THREADS * (30 + 2 + 4 + 32)))
) lsu_req_reg (
.clk (clk),
.reset (reset),
.stall (stall_in),
.flush (0),
.in ({lsu_req_if.valid, lsu_req_if.wid, lsu_req_if.thread_mask, lsu_req_if.curr_PC, lsu_req_if.rw, lsu_req_if.rd, lsu_req_if.wb, full_address, mem_req_sext, mem_req_addr, mem_req_offset, mem_req_byteen, mem_req_data}),
.out ({valid_in, req_wid, req_thread_mask, req_curr_PC, req_rw, req_rd, req_wb, req_address, req_sext, req_addr, req_offset, req_byteen, req_data})
);
reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0] mem_rsp_mask_buf;
reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0][31:0] mem_rsp_data_prev_buf;
wire [`NW_BITS-1:0] rsp_wid;
wire [31:0] rsp_curr_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [`NUM_THREADS-1:0][1:0] rsp_offset;
wire [1:0] rsp_sext;
reg [`NUM_THREADS-1:0][31:0] rsp_data;
reg [`NUM_THREADS-1:0][1:0] mem_rsp_offset_buf [`ISSUEQ_SIZE-1:0];
reg [1:0] mem_rsp_sext_buf [`ISSUEQ_SIZE-1:0];
reg [`NW_BITS-1:0] mem_rsp_wid_buf [`ISSUEQ_SIZE-1:0];
reg [31:0] mem_rsp_curr_PC_buf [`ISSUEQ_SIZE-1:0];
reg [`NR_BITS-1:0] mem_rsp_rd_buf [`ISSUEQ_SIZE-1:0];
reg [`NUM_THREADS-1:0] mem_rsp_mask[`LSUQ_SIZE-1:0];
reg [`NUM_THREADS-1:0][31:0] mem_rsp_data_curr;
wire [`DCORE_TAG_ID_BITS-1:0] req_tag, rsp_tag;
wire lsuq_full;
wire [`ISTAG_BITS-1:0] rsp_issue_tag = dcache_rsp_if.tag[0][`ISTAG_BITS-1:0];
wire lsuq_push = (| dcache_req_if.valid) && dcache_req_if.ready
&& (0 == req_rw); // only loads
wire [`NUM_THREADS-1:0] mem_rsp_mask = mem_rsp_mask_buf [rsp_issue_tag];
wire [`NUM_THREADS-1:0][1:0] mem_rsp_offset = mem_rsp_offset_buf [rsp_issue_tag];
wire [1:0] mem_rsp_sext = mem_rsp_sext_buf [rsp_issue_tag];
wire [`NUM_THREADS-1:0][31:0] mem_rsp_data_prev= mem_rsp_data_prev_buf [rsp_issue_tag];
wire [`NW_BITS-1:0] mem_rsp_wid = mem_rsp_wid_buf [rsp_issue_tag];
wire [31:0] mem_rsp_curr_PC = mem_rsp_curr_PC_buf [rsp_issue_tag];
wire [`NR_BITS-1:0] mem_rsp_rd = mem_rsp_rd_buf [rsp_issue_tag];
wire lsuq_pop_part = (| dcache_rsp_if.valid) && dcache_rsp_if.ready;
assign rsp_tag = dcache_rsp_if.tag[0][`DCORE_TAG_ID_BITS-1:0];
wire dcache_req_fire = (| dcache_req_if.valid) && dcache_req_if.ready;
wire dcache_rsp_fire = (| dcache_rsp_if.valid) && dcache_rsp_if.ready;
wire [`NUM_THREADS-1:0] mem_rsp_mask_upd = mem_rsp_mask[rsp_tag] & ~dcache_rsp_if.valid;
wire [`NUM_THREADS-1:0] mem_rsp_mask_n = mem_rsp_mask & ~dcache_rsp_if.valid;
wire lsuq_pop = lsuq_pop_part && (0 == mem_rsp_mask_upd);
VX_cam_buffer #(
.DATAW (`NW_BITS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 2) + 2),
.SIZE (`LSUQ_SIZE)
) lsu_queue (
.clk (clk),
.reset (reset),
.write_addr (req_tag),
.acquire_slot (lsuq_push),
.read_addr (rsp_tag),
.write_data ({req_wid, req_curr_PC, req_rd, req_wb, req_offset, req_sext}),
.read_data ({rsp_wid, rsp_curr_PC, rsp_rd, rsp_wb, rsp_offset, rsp_sext}),
.release_addr (rsp_tag),
.release_slot (lsuq_pop),
.full (lsuq_full)
);
always @(posedge clk) begin
if (dcache_req_fire && (0 == req_rw)) begin
mem_rsp_mask_buf [req_issue_tag] <= req_thread_mask;
mem_rsp_data_prev_buf [req_issue_tag] <= 0;
if (lsuq_push) begin
mem_rsp_mask[req_tag] <= req_thread_mask;
end
if (dcache_rsp_fire) begin
mem_rsp_mask_buf [rsp_issue_tag] <= mem_rsp_mask_n;
mem_rsp_data_prev_buf [rsp_issue_tag] <= mem_rsp_data_curr | mem_rsp_data_prev;
if (lsuq_pop_part) begin
mem_rsp_mask[rsp_tag] <= mem_rsp_mask_upd;
end
end
always @(posedge clk) begin
if (dcache_req_fire && (0 == req_rw)) begin
mem_rsp_offset_buf [req_issue_tag] <= req_offset;
mem_rsp_sext_buf [req_issue_tag] <= req_sext;
mem_rsp_wid_buf [req_issue_tag] <= req_wid;
mem_rsp_curr_PC_buf [req_issue_tag] <= req_pc;
mem_rsp_rd_buf [req_issue_tag] <= req_rd;
end
end
wire stall_in;
wire store_stall = valid_in && req_rw && stall_out;
// Core Request
assign dcache_req_if.valid = {`NUM_THREADS{valid_in && ~stall_in}} & req_thread_mask;
assign dcache_req_if.valid = {`NUM_THREADS{valid_in && ~lsuq_full && ~store_stall}} & req_thread_mask;
assign dcache_req_if.rw = {`NUM_THREADS{req_rw}};
assign dcache_req_if.byteen = req_byteen;
assign dcache_req_if.addr = req_addr;
assign dcache_req_if.data = req_data;
assign ready_in = dcache_req_if.ready && ~stall_in;
`ifdef DBG_CORE_REQ_INFO
assign dcache_req_if.tag = {req_pc, req_wb, req_rd, req_wid, req_issue_tag};
assign dcache_req_if.tag = {req_curr_PC, req_rd, req_wid, req_tag};
`else
assign dcache_req_if.tag = req_issue_tag;
assign dcache_req_if.tag = req_tag;
`endif
assign stall_in = ~dcache_req_if.ready || lsuq_full || store_stall;
// Can accept new request?
assign lsu_req_if.ready = ~stall_in;
// Core Response
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire [31:0] rsp_data_shifted = dcache_rsp_if.data[i] >> {mem_rsp_offset[i], 3'b0};
wire [31:0] rsp_data_shifted = dcache_rsp_if.data[i] >> {rsp_offset[i], 3'b0};
always @(*) begin
case (mem_rsp_sext)
1: mem_rsp_data_curr[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]};
2: mem_rsp_data_curr[i] = {{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]};
default: mem_rsp_data_curr[i] = rsp_data_shifted;
case (rsp_sext)
1: rsp_data[i] = {{24{rsp_data_shifted[7]}}, rsp_data_shifted[7:0]};
2: rsp_data[i] = {{16{rsp_data_shifted[15]}}, rsp_data_shifted[15:0]};
default: rsp_data[i] = rsp_data_shifted;
endcase
end
end
reg is_load_rsp;
reg [`NUM_THREADS-1:0][31:0] load_data;
reg [`ISTAG_BITS-1:0] rsp_issue_tag_r;
wire is_store_req = valid_in && ~lsuq_full && req_rw && dcache_req_if.ready;
wire is_load_rsp = (| dcache_rsp_if.valid);
always @(posedge clk) begin
if (reset) begin
is_load_rsp <= 0;
end else begin
is_load_rsp <= dcache_rsp_fire && (0 == mem_rsp_mask_n);
load_data <= mem_rsp_data_curr | mem_rsp_data_prev;
rsp_issue_tag_r <= rsp_issue_tag;
end
end
wire stall_out = ~lsu_commit_if.ready && lsu_commit_if.valid;
wire mem_rsp_stall = is_load_rsp && is_store_req; // arbitration prioritizes stores
wire is_store_req = dcache_req_fire && req_rw;
assign stall_in = is_load_rsp && valid_in && req_rw; // LOAD has priority
wire arb_valid = is_store_req || is_load_rsp;
wire [`NW_BITS-1:0] arb_wid = is_store_req ? req_wid : rsp_wid;
wire [`NUM_THREADS-1:0] arb_thread_mask = is_store_req ? req_thread_mask : dcache_rsp_if.valid;
wire [31:0] arb_curr_PC = is_store_req ? req_curr_PC : rsp_curr_PC;
wire [`NR_BITS-1:0] arb_rd = is_store_req ? 0 : rsp_rd;
wire arb_wb = is_store_req ? 0 : rsp_wb;
assign lsu_commit_if.valid = is_load_rsp || is_store_req;
assign lsu_commit_if.issue_tag = is_load_rsp ? rsp_issue_tag_r : req_issue_tag;
assign lsu_commit_if.data = load_data;
VX_generic_register #(
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
) lsu_rsp_reg (
.clk (clk),
.reset (reset),
.stall (stall_out),
.flush (1'b0),
.in ({arb_valid, arb_wid, arb_thread_mask, arb_curr_PC, arb_rd, arb_wb, rsp_data}),
.out ({lsu_commit_if.valid, lsu_commit_if.wid, lsu_commit_if.thread_mask, lsu_commit_if.curr_PC, lsu_commit_if.rd, lsu_commit_if.wb, lsu_commit_if.data})
);
// Can accept new cache response?
assign dcache_rsp_if.ready = 1'b1;
assign dcache_rsp_if.ready = ~(stall_out || mem_rsp_stall);
// scope registration
`SCOPE_ASSIGN (scope_dcache_req_valid, dcache_req_if.valid);
@ -198,28 +199,23 @@ module VX_lsu_unit #(
`SCOPE_ASSIGN (scope_dcache_req_data, dcache_req_if.data);
`SCOPE_ASSIGN (scope_dcache_req_tag, dcache_req_if.tag);
`SCOPE_ASSIGN (scope_dcache_req_ready, dcache_req_if.ready);
`SCOPE_ASSIGN (scope_dcache_req_wid, req_wid);
`SCOPE_ASSIGN (scope_dcache_req_wid, req_wid);
`SCOPE_ASSIGN (scope_dcache_req_curr_PC, req_pc);
`SCOPE_ASSIGN (scope_dcache_rsp_valid, dcache_rsp_if.valid);
`SCOPE_ASSIGN (scope_dcache_rsp_data, dcache_rsp_if.data);
`SCOPE_ASSIGN (scope_dcache_rsp_tag, dcache_rsp_if.tag);
`SCOPE_ASSIGN (scope_dcache_rsp_ready, dcache_rsp_if.ready);
`UNUSED_VAR (mem_rsp_wid)
`UNUSED_VAR (mem_rsp_curr_PC)
`UNUSED_VAR (mem_rsp_rd)
`UNUSED_VAR (req_wb)
`ifdef DBG_PRINT_CORE_DCACHE
always @(posedge clk) begin
if ((| dcache_req_if.valid) && dcache_req_if.ready) begin
$display("%t: D$%0d req: wid=%0d, PC=%0h, tmask=%b, addr=%0h, tag=%0h, rd=%0d, rw=%0b, byteen=%0h, data=%0h",
$time, CORE_ID, req_wid, req_pc, dcache_req_if.valid, req_address, dcache_req_if.tag, req_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data);
$time, CORE_ID, req_wid, req_curr_PC, dcache_req_if.valid, req_address, dcache_req_if.tag, req_rd, dcache_req_if.rw, dcache_req_if.byteen, dcache_req_if.data);
end
if ((| dcache_rsp_if.valid) && dcache_rsp_if.ready) begin
$display("%t: D$%0d rsp: valid=%b, wid=%0d, PC=%0h, tag=%0h, rd=%0d, data=%0h",
$time, CORE_ID, dcache_rsp_if.valid, mem_rsp_wid, mem_rsp_curr_PC, dcache_rsp_if.tag, mem_rsp_rd, dcache_rsp_if.data);
$time, CORE_ID, dcache_rsp_if.valid, rsp_wid, rsp_curr_PC, dcache_rsp_if.tag, rsp_rd, dcache_rsp_if.data);
end
end
`endif

View file

@ -7,34 +7,52 @@ module VX_mul_unit #(
input wire reset,
// Inputs
VX_mul_req_if alu_req_if,
VX_mul_req_if mul_req_if,
// Outputs
VX_exu_to_cmt_if alu_commit_if
VX_exu_to_cmt_if mul_commit_if
);
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`MUL_BITS-1:0] alu_op;
wire [`NUM_THREADS-1:0][31:0] alu_in1, alu_in2;
wire valid_in, ready_in;
// use a skid buffer due to MUL/DIV output arbitration adding realtime backpressure
VX_elastic_buffer #(
.DATAW (`ISTAG_BITS + `MUL_BITS + (2 * `NUM_THREADS * 32)),
.SIZE (0)
) input_buffer (
.clk (clk),
.reset (reset),
.valid_in (alu_req_if.valid),
.ready_in (alu_req_if.ready),
.data_in ({alu_req_if.issue_tag, alu_req_if.op, alu_req_if.rs1_data, alu_req_if.rs2_data}),
.data_out ({issue_tag, alu_op, alu_in1, alu_in2}),
.ready_out (ready_in),
.valid_out (valid_in)
);
localparam MULQ_BITS = `LOG2UP(`MULQ_SIZE);
wire [`MUL_BITS-1:0] alu_op = mul_req_if.op;
wire [`NUM_THREADS-1:0][31:0] alu_in1 = mul_req_if.rs1_data;
wire [`NUM_THREADS-1:0][31:0] alu_in2 = mul_req_if.rs2_data;
wire [`NW_BITS-1:0] rsp_wid;
wire [`NUM_THREADS-1:0] rsp_thread_mask;
wire [31:0] rsp_curr_PC;
wire [`NR_BITS-1:0] rsp_rd;
wire rsp_wb;
wire [MULQ_BITS-1:0] tag_in, tag_out;
wire valid_out;
wire stall_out;
wire mulq_full;
wire mulq_push = mul_req_if.valid && mul_req_if.ready;
wire mulq_pop = valid_out && ~stall_out;
VX_cam_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1),
.SIZE (`MULQ_SIZE)
) mul_queue (
.clk (clk),
.reset (reset),
.acquire_slot (mulq_push),
.write_addr (tag_in),
.read_addr (tag_out),
.release_addr (tag_out),
.write_data ({mul_req_if.wid, mul_req_if.thread_mask, mul_req_if.curr_PC, mul_req_if.rd, mul_req_if.wb}),
.read_data ({rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb}),
.release_slot (mulq_pop),
.full (mulq_full)
);
///////////////////////////////////////////////////////////////////////////
wire [`NUM_THREADS-1:0][31:0] mul_result;
wire is_mulw = (alu_op == `MUL_MUL);
wire is_mulw_out;
wire stall_mul;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
@ -51,7 +69,7 @@ module VX_mul_unit #(
) multiplier (
.clk(clk),
.reset(reset),
.clk_en(1'b1),
.clk_en(~stall_mul),
.dataa(mul_in1),
.datab(mul_in2),
.result(mul_result_tmp)
@ -60,20 +78,20 @@ module VX_mul_unit #(
assign mul_result[i] = is_mulw_out ? mul_result_tmp[31:0] : mul_result_tmp[63:32];
end
wire [`ISTAG_BITS-1:0] mul_issue_tag;
wire [MULQ_BITS-1:0] mul_tag;
wire mul_valid_out;
wire mul_fire = valid_in && ready_in && ~`IS_DIV_OP(alu_op);
wire mul_fire = mul_req_if.valid && mul_req_if.ready && ~`IS_DIV_OP(alu_op);
VX_shift_register #(
.DATAW(1 + `ISTAG_BITS + 1),
.DATAW(1 + MULQ_BITS + 1),
.DEPTH(`LATENCY_IMUL)
) mul_shift_reg (
.clk(clk),
.reset(reset),
.enable(1'b1),
.in({mul_fire, issue_tag, is_mulw}),
.out({mul_valid_out, mul_issue_tag, is_mulw_out})
.enable(~stall_mul),
.in({mul_fire, tag_in, is_mulw}),
.out({mul_valid_out, mul_tag, is_mulw_out})
);
///////////////////////////////////////////////////////////////////////////
@ -81,8 +99,8 @@ module VX_mul_unit #(
wire [`NUM_THREADS-1:0][31:0] div_result;
wire is_div = (alu_op == `MUL_DIV || alu_op == `MUL_DIVU);
wire is_signed_div = (alu_op == `MUL_DIV || alu_op == `MUL_REM);
reg [`NUM_THREADS-1:0] is_div_qual;
wire [`NUM_THREADS-1:0] is_div_out;
reg [`NUM_THREADS-1:0] is_div_qual;
wire is_div_out;
wire stall_div;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
@ -95,8 +113,8 @@ module VX_mul_unit #(
always @(*) begin
if (~stall_div) begin
is_div_qual[i] = is_div;
div_in1_qual = alu_in1[i];
div_in2_qual = alu_in2[i];
div_in1_qual = alu_in1[i];
div_in2_qual = alu_in2[i];
if (0 == alu_in2[i]) begin
div_in2_qual = 1;
if (is_div) begin
@ -134,34 +152,52 @@ module VX_mul_unit #(
.remainder(rem_result_tmp)
);
assign div_result[i] = is_div_out[i] ? div_result_tmp : rem_result_tmp;
assign div_result[i] = is_div_out ? div_result_tmp : rem_result_tmp;
end
wire [`ISTAG_BITS-1:0] div_issue_tag;
wire [MULQ_BITS-1:0] div_tag;
wire div_valid_out;
wire div_fire = valid_in && ready_in && `IS_DIV_OP(alu_op);
wire div_fire = mul_req_if.valid && mul_req_if.ready && `IS_DIV_OP(alu_op);
VX_shift_register #(
.DATAW(1 + `ISTAG_BITS + `NUM_THREADS),
.DATAW(1 + MULQ_BITS + 1),
.DEPTH(`LATENCY_IDIV + 1)
) div_shift_reg (
.clk(clk),
.reset(reset),
.enable(~stall_div),
.in({div_fire, issue_tag, is_div_qual}),
.out({div_valid_out, div_issue_tag, is_div_out})
.in({div_fire, tag_in, (| is_div_qual)}),
.out({div_valid_out, div_tag, is_div_out})
);
///////////////////////////////////////////////////////////////////////////
assign stall_div = mul_valid_out && div_valid_out; // arbitration prioritizes MUL
// Output arbitration and stall generation for the multiply and divide paths.

// Both paths present a valid result in the same cycle.
wire arbiter_hazard = mul_valid_out && div_valid_out;
// The commit interface holds a valid entry that has not been accepted yet.
assign stall_out = ~mul_commit_if.ready && mul_commit_if.valid;
// The multiply path stalls on commit back-pressure or when mulq_full is set.
assign stall_mul = stall_out || mulq_full;
// The divide path stalls on the same conditions, and also whenever it
// collides with a multiply result.
assign stall_div = stall_out || mulq_full
|| arbiter_hazard; // arbitration prioritizes MUL
// Set when either path is stalled.
wire stall_in = stall_mul || stall_div;
// A result is available when either path has one; the multiply tag and
// result are selected whenever the multiply path is valid.
assign valid_out = mul_valid_out || div_valid_out;
assign tag_out = mul_valid_out ? mul_tag : div_tag;
wire [`NUM_THREADS-1:0][31:0] result = mul_valid_out ? mul_result : div_result;
// Register stage in front of mul_commit_if. Its input is the arbitrated
// valid flag, the response metadata (wid, thread mask, PC, rd, wb) and the
// selected per-thread result; its output drives the mul_commit_if fields.
VX_generic_register #(
// The terms of N appear in the same order as the fields of `.in` below.
.N(1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32))
) alu_reg (
.clk (clk),
.reset (reset),
// stall_out is high while mul_commit_if is valid but not ready.
// NOTE(review): presumably the register keeps its contents while stalled - confirm in VX_generic_register.
.stall (stall_out),
.flush (0),
.in ({valid_out, rsp_wid, rsp_thread_mask, rsp_curr_PC, rsp_rd, rsp_wb, result}),
.out ({mul_commit_if.valid, mul_commit_if.wid, mul_commit_if.thread_mask, mul_commit_if.curr_PC, mul_commit_if.rd, mul_commit_if.wb, mul_commit_if.data})
);
// can accept new request?
assign ready_in = ~stall_div;
assign alu_commit_if.valid = mul_valid_out || div_valid_out;
assign alu_commit_if.issue_tag = mul_valid_out ? mul_issue_tag : div_issue_tag;
assign alu_commit_if.data = mul_valid_out ? mul_result : div_result;
assign mul_req_if.ready = ~stall_in;
endmodule

View file

@ -101,24 +101,22 @@ module VX_pipeline #(
assign csr_io_rsp_data = csr_io_rsp_if.data;
assign csr_io_rsp_if.ready = csr_io_rsp_ready;
VX_csr_to_issue_if csr_to_issue_if();
VX_cmt_to_csr_if cmt_to_csr_if();
VX_decode_if decode_if();
VX_branch_ctl_if branch_ctl_if();
VX_warp_ctl_if warp_ctl_if();
VX_ifetch_rsp_if ifetch_rsp_if();
VX_alu_req_if alu_req_if();
VX_bru_req_if bru_req_if();
VX_lsu_req_if lsu_req_if();
VX_csr_req_if csr_req_if();
VX_mul_req_if mul_req_if();
VX_fpu_req_if fpu_req_if();
VX_gpu_req_if gpu_req_if();
VX_wb_if writeback_if();
VX_cmt_to_issue_if cmt_to_issue_if();
VX_writeback_if writeback_if();
VX_wstall_if wstall_if();
VX_join_if join_if();
VX_exu_to_cmt_if alu_commit_if();
VX_exu_to_cmt_if bru_commit_if();
VX_exu_to_cmt_if lsu_commit_if();
VX_exu_to_cmt_if csr_commit_if();
VX_exu_to_cmt_if mul_commit_if();
@ -159,10 +157,9 @@ module VX_pipeline #(
.decode_if (decode_if),
.writeback_if (writeback_if),
.cmt_to_issue_if(cmt_to_issue_if),
.csr_to_issue_if(csr_to_issue_if),
.alu_req_if (alu_req_if),
.bru_req_if (bru_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
.mul_req_if (mul_req_if),
@ -183,10 +180,10 @@ module VX_pipeline #(
.csr_io_req_if (csr_io_req_if),
.csr_io_rsp_if (csr_io_rsp_if),
.csr_to_issue_if(csr_to_issue_if),
.cmt_to_csr_if (cmt_to_csr_if),
.alu_req_if (alu_req_if),
.bru_req_if (bru_req_if),
.lsu_req_if (lsu_req_if),
.csr_req_if (csr_req_if),
.mul_req_if (mul_req_if),
@ -196,7 +193,6 @@ module VX_pipeline #(
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.alu_commit_if (alu_commit_if),
.bru_commit_if (bru_commit_if),
.lsu_commit_if (lsu_commit_if),
.csr_commit_if (csr_commit_if),
.mul_commit_if (mul_commit_if),
@ -213,14 +209,12 @@ module VX_pipeline #(
.reset (reset),
.alu_commit_if (alu_commit_if),
.bru_commit_if (bru_commit_if),
.lsu_commit_if (lsu_commit_if),
.csr_commit_if (csr_commit_if),
.mul_commit_if (mul_commit_if),
.fpu_commit_if (fpu_commit_if),
.gpu_commit_if (gpu_commit_if),
.cmt_to_issue_if(cmt_to_issue_if),
.writeback_if (writeback_if),
.cmt_to_csr_if (cmt_to_csr_if)
);

View file

@ -35,6 +35,10 @@
wire [$bits(x)-1:0] __``x``__ = x; \
/* verilator lint_on UNUSED */
// UNUSED_FIELD(x, y): references member `y` of `x` through a dummy wire
// named __y__ that has the same bit width as x.y. Verilator's UNUSED lint is
// switched off around the declaration and back on afterwards.
`define UNUSED_FIELD(x,y) /* verilator lint_off UNUSED */ \
wire [$bits(x.y)-1:0] __``y``__ = x.y; \
/* verilator lint_on UNUSED */
// UNUSED_PIN(x): expands to an empty named port connection `.x ()` for use
// inside an instance port list. Verilator's PINCONNECTEMPTY lint is switched
// off around the connection and back on afterwards.
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
. x () \
/* verilator lint_on PINCONNECTEMPTY */

View file

@ -24,39 +24,40 @@ task print_ex_op;
begin
case (ex)
`EX_ALU: begin
case (`ALU_BITS'(op))
`ALU_ADD: $write("ADD");
`ALU_SUB: $write("SUB");
`ALU_SLL: $write("SLL");
`ALU_SRL: $write("SRL");
`ALU_SRA: $write("SRA");
`ALU_SLT: $write("SLT");
`ALU_SLTU: $write("SLTU");
`ALU_XOR: $write("XOR");
`ALU_OR: $write("OR");
`ALU_AND: $write("AND");
`ALU_LUI: $write("LUI");
`ALU_AUIPC: $write("AUIPC");
default: $write("?");
endcase
end
`EX_BRU: begin
case (`BRU_BITS'(op))
`BRU_EQ: $write("BEQ");
`BRU_NE: $write("BNE");
`BRU_LT: $write("BLT");
`BRU_GE: $write("BGE");
`BRU_LTU: $write("BLTU");
`BRU_GEU: $write("BGEU");
`BRU_JAL: $write("JAL");
`BRU_JALR: $write("JALR");
`BRU_ECALL: $write("ECALL");
`BRU_EBREAK:$write("EBREAK");
`BRU_MRET: $write("MRET");
`BRU_SRET: $write("SRET");
`BRU_DRET: $write("DRET");
default: $write("?");
endcase
// An op that satisfies IS_BR_OP is printed with its branch mnemonic from the
// BR_* table; any other op is printed with its mnemonic from the ALU_* table.
// An op matching no entry in the chosen table prints "?".
if (`IS_BR_OP(op)) begin
case (`BR_BITS'(op))
`BR_EQ: $write("BEQ");
`BR_NE: $write("BNE");
`BR_LT: $write("BLT");
`BR_GE: $write("BGE");
`BR_LTU: $write("BLTU");
`BR_GEU: $write("BGEU");
`BR_JAL: $write("JAL");
`BR_JALR: $write("JALR");
`BR_ECALL: $write("ECALL");
`BR_EBREAK:$write("EBREAK");
`BR_MRET: $write("MRET");
`BR_SRET: $write("SRET");
`BR_DRET: $write("DRET");
default: $write("?");
endcase
end else begin
case (`ALU_BITS'(op))
`ALU_ADD: $write("ADD");
`ALU_SUB: $write("SUB");
`ALU_SLL: $write("SLL");
`ALU_SRL: $write("SRL");
`ALU_SRA: $write("SRA");
`ALU_SLT: $write("SLT");
`ALU_SLTU: $write("SLTU");
`ALU_XOR: $write("XOR");
`ALU_OR: $write("OR");
`ALU_AND: $write("AND");
`ALU_LUI: $write("LUI");
`ALU_AUIPC: $write("AUIPC");
default: $write("?");
endcase
end
end
`EX_LSU: begin
case (`LSU_BITS'(op))

View file

@ -6,66 +6,56 @@ module VX_scoreboard #(
input wire clk,
input wire reset,
VX_decode_if decode_if,
VX_wb_if writeback_if,
VX_cmt_to_issue_if cmt_to_issue_if,
input wire ex_busy,
output wire [`ISTAG_BITS-1:0] issue_tag,
output wire schedule_delay
VX_decode_if ibuf_deq_if,
VX_writeback_if writeback_if,
input wire exe_delay,
input wire gpr_delay,
output wire delay
);
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
reg [`NUM_THREADS-1:0] inuse_registers [(`NUM_WARPS * `NUM_REGS)-1:0];
reg [`NUM_REGS-1:0] inuse_reg_mask [`NUM_WARPS-1:0];
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[decode_if.wid] & decode_if.reg_use_mask;
wire inuse_hazard = (inuse_mask != 0);
wire [`NUM_REGS-1:0] inuse_mask = inuse_reg_mask[ibuf_deq_if.wid] & ibuf_deq_if.used_regs;
wire issue_buf_full;
assign schedule_delay = ex_busy || inuse_hazard || issue_buf_full;
wire issue_fire = decode_if.valid && decode_if.ready;
assign delay = (| inuse_mask);
wire reserve_rd = issue_fire && (decode_if.wb != 0);
wire reserve_reg = ibuf_deq_if.valid && ibuf_deq_if.ready && (ibuf_deq_if.wb != 0);
wire release_rd = writeback_if.valid;
wire release_reg = writeback_if.valid && writeback_if.ready;
wire [`NUM_THREADS-1:0] inuse_registers_n = inuse_registers[{writeback_if.wid, writeback_if.rd}] & ~writeback_if.thread_mask;
always @(posedge clk) begin
if (reset) begin
for (integer i = 0; i < `NUM_WARPS; i++) begin
inuse_reg_mask[i] <= `NUM_REGS'(0);
for (integer w = 0; w < `NUM_WARPS; w++) begin
for (integer i = 0; i < `NUM_REGS; i++) begin
inuse_registers[w * `NUM_REGS + i] <= 0;
end
inuse_reg_mask [w] <= `NUM_REGS'(0);
end
end else begin
if (reserve_rd) begin
inuse_reg_mask[decode_if.wid][decode_if.rd] <= 1;
if (reserve_reg) begin
inuse_registers[{ibuf_deq_if.wid, ibuf_deq_if.rd}] <= ibuf_deq_if.thread_mask;
inuse_reg_mask[ibuf_deq_if.wid][ibuf_deq_if.rd] <= 1;
end
if (release_rd) begin
if (release_reg) begin
assert(inuse_reg_mask[writeback_if.wid][writeback_if.rd] != 0);
inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= 0;
inuse_registers[{writeback_if.wid, writeback_if.rd}] <= inuse_registers_n;
inuse_reg_mask[writeback_if.wid][writeback_if.rd] <= (| inuse_registers_n);
end
end
end
VX_cam_buffer #(
.DATAW ($bits(issue_data_t)),
.SIZE (`ISSUEQ_SIZE),
.RPORTS (`NUM_EXS)
) issue_table (
.clk (clk),
.reset (reset),
.write_data ({decode_if.wid, decode_if.thread_mask, decode_if.curr_PC, decode_if.rd, decode_if.wb}),
.write_addr (issue_tag),
.acquire_slot (issue_fire),
.release_slot ({cmt_to_issue_if.alu_valid, cmt_to_issue_if.bru_valid, cmt_to_issue_if.lsu_valid, cmt_to_issue_if.csr_valid, cmt_to_issue_if.mul_valid, cmt_to_issue_if.fpu_valid, cmt_to_issue_if.gpu_valid}),
.read_addr ({cmt_to_issue_if.alu_tag, cmt_to_issue_if.bru_tag, cmt_to_issue_if.lsu_tag, cmt_to_issue_if.csr_tag, cmt_to_issue_if.mul_tag, cmt_to_issue_if.fpu_tag, cmt_to_issue_if.gpu_tag}),
.read_data ({cmt_to_issue_if.alu_data, cmt_to_issue_if.bru_data, cmt_to_issue_if.lsu_data, cmt_to_issue_if.csr_data, cmt_to_issue_if.mul_data, cmt_to_issue_if.fpu_data, cmt_to_issue_if.gpu_data}),
.full (issue_buf_full)
);
// issue the instruction
assign ibuf_deq_if.ready = ~(delay || exe_delay || gpr_delay);
`ifdef DBG_PRINT_PIPELINE
always @(posedge clk) begin
if (decode_if.valid && ~decode_if.ready) begin
$display("%t: Core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, ib_full=%b, inuse=%b%b%b%b, ex_busy=%b",
$time, CORE_ID, decode_if.wid, decode_if.curr_PC, decode_if.rd, decode_if.wb, issue_buf_full,
inuse_mask[decode_if.rd], inuse_mask[decode_if.rs1], inuse_mask[decode_if.rs2], inuse_mask[decode_if.rs3], ex_busy);
if (ibuf_deq_if.valid && ~ibuf_deq_if.ready) begin
$display("%t: core%0d-stall: wid=%0d, PC=%0h, rd=%0d, wb=%0d, inuse=%b%b%b%b, exe=%b, gpr=%b",
$time, CORE_ID, ibuf_deq_if.wid, ibuf_deq_if.curr_PC, ibuf_deq_if.rd, ibuf_deq_if.wb,
inuse_mask[ibuf_deq_if.rd], inuse_mask[ibuf_deq_if.rs1], inuse_mask[ibuf_deq_if.rs2], inuse_mask[ibuf_deq_if.rs3], exe_delay, gpr_delay);
end
end
`endif

View file

@ -3,14 +3,6 @@
`include "VX_define.vh"
typedef struct packed {
logic [`NW_BITS-1:0] wid;
logic [`NUM_THREADS-1:0] thread_mask;
logic [31:0] curr_PC;
logic [`NR_BITS-1:0] rd;
logic wb;
} issue_data_t;
typedef struct packed {
logic is_normal;
logic is_zero;
@ -53,7 +45,7 @@ typedef struct packed {
typedef struct packed {
logic valid;
logic [`NB_BITS-1:0] id;
logic [`NW_BITS:0] num_warps;
logic [`NW_BITS-1:0] size_m1;
} gpu_barrier_t;
`endif

View file

@ -20,36 +20,46 @@ module VX_warp_sched #(
wire [31:0] join_pc;
wire [`NUM_THREADS-1:0] join_tm;
reg [`NUM_WARPS-1:0] warp_active;
reg [`NUM_WARPS-1:0] warp_stalled;
reg [`NUM_WARPS-1:0] visible_active;
wire update_visible_active;
reg [`NUM_WARPS-1:0] warp_active; // real active warps (updated when a warp is activated or disabled)
reg [`NUM_WARPS-1:0] warp_stalled; // asserted when a branch/gpgpu instruction is issued
reg [`NUM_WARPS-1:0] warp_ready, warp_ready_n; // enforces round-robin, barrier, and non-speculating branches
reg [`NUM_WARPS-1:0] warp_lock;
// Lock warp until instruction decode to resolve branches
reg [`NUM_WARPS-1:0] fetch_lock;
reg [`NUM_THREADS-1:0] thread_masks[`NUM_WARPS-1:0];
reg [31:0] warp_pcs[`NUM_WARPS-1:0];
// barriers
reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0];
wire reached_barrier_limit;
reg [`NUM_WARPS-1:0] total_barrier_stall;
reg [`NUM_WARPS-1:0] barrier_stall_mask[`NUM_BARRIERS-1:0]; // warps waiting on barrier
wire reached_barrier_limit; // the expected number of warps reached the barrier
// wspawn
reg [31:0] use_wspawn_pc;
reg [`NUM_WARPS-1:0] use_wspawn;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] warp_pc;
wire [`NW_BITS-1:0] warp_to_schedule;
wire scheduled_warp;
wire [`NUM_WARPS-1:0] total_warp_stalled;
wire stall_out;
wire global_stall;
wire real_schedule;
reg didnt_split;
reg didnt_split;
// Combinational next-state of the warp_ready mask. Starting from the current
// mask, a warp's bit is cleared when:
//   - a valid TMC control sets that warp's thread mask to zero,
//   - the warp is flagged by wstall_if this cycle, or
//   - the warp is the one being scheduled this cycle.
always @(*) begin
warp_ready_n = warp_ready;
if (warp_ctl_if.valid
&& warp_ctl_if.tmc.valid
&& (0 == warp_ctl_if.tmc.thread_mask)) begin
warp_ready_n[warp_ctl_if.wid] = 0;
end
if (wstall_if.wstall) begin
warp_ready_n[wstall_if.wid] = 0;
end
if (scheduled_warp) begin
warp_ready_n[warp_to_schedule] = 0;
end
end
always @(posedge clk) begin
if (reset) begin
@ -57,42 +67,41 @@ module VX_warp_sched #(
barrier_stall_mask[i] <= 0;
end
use_wspawn_pc <= 0;
use_wspawn <= 0;
warp_pcs[0] <= `STARTUP_ADDR;
warp_active[0] <= 1; // Activating first warp
visible_active[0] <= 1; // Activating first warp
thread_masks[0] <= 1; // Activating first thread in first warp
warp_stalled <= 0;
didnt_split <= 0;
warp_lock <= 0;
use_wspawn_pc <= 0;
use_wspawn <= 0;
warp_pcs[0] <= `STARTUP_ADDR;
warp_active[0] <= 1; // Activating first warp
warp_ready[0] <= 1; // set first warp as ready
thread_masks[0] <= 1; // Activating first thread in first warp
warp_stalled <= 0;
didnt_split <= 0;
fetch_lock <= 0;
for (integer i = 1; i < `NUM_WARPS; i++) begin
warp_pcs[i] <= 0;
warp_active[i] <= 0; // Activating first warp
visible_active[i] <= 0; // Activating first warp
thread_masks[i] <= 1; // Activating first thread in first warp
warp_pcs[i] <= 0;
warp_active[i] <= 0;
warp_ready[i] <= 0;
thread_masks[i] <= 0;
end
end else begin
if (warp_ctl_if.wspawn.valid) begin
if (warp_ctl_if.valid && warp_ctl_if.wspawn.valid) begin
warp_active <= warp_ctl_if.wspawn.wmask;
use_wspawn <= warp_ctl_if.wspawn.wmask & (~`NUM_WARPS'(1));
use_wspawn_pc <= warp_ctl_if.wspawn.pc;
end
if (warp_ctl_if.barrier.valid) begin
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
warp_stalled[warp_ctl_if.wid] <= 0;
if (reached_barrier_limit) begin
barrier_stall_mask[warp_ctl_if.barrier.id] <= 0;
end else begin
barrier_stall_mask[warp_ctl_if.barrier.id][warp_ctl_if.wid] <= 1;
end
end else if (warp_ctl_if.tmc.valid) begin
end else if (warp_ctl_if.valid && warp_ctl_if.tmc.valid) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.tmc.thread_mask;
warp_stalled[warp_ctl_if.wid] <= 0;
if (0 == warp_ctl_if.tmc.thread_mask) begin
warp_active[warp_ctl_if.wid] <= 0;
visible_active[warp_ctl_if.wid] <= 0;
warp_active[warp_ctl_if.wid] <= 0;
end
end else if (join_if.is_join && !didnt_split) begin
if (!join_fall) begin
@ -100,7 +109,7 @@ module VX_warp_sched #(
end
thread_masks[join_if.wid] <= join_tm;
didnt_split <= 0;
end else if (warp_ctl_if.split.valid) begin
end else if (warp_ctl_if.valid && warp_ctl_if.split.valid) begin
warp_stalled[warp_ctl_if.wid] <= 0;
if (warp_ctl_if.split.diverged) begin
thread_masks[warp_ctl_if.wid] <= warp_ctl_if.split.then_mask;
@ -110,26 +119,19 @@ module VX_warp_sched #(
end
end
if (use_wspawn[warp_to_schedule] && !global_stall) begin
if (use_wspawn[warp_to_schedule] && scheduled_warp) begin
use_wspawn[warp_to_schedule] <= 0;
thread_masks[warp_to_schedule] <= 1;
end
// Stalling the scheduling of warps
if (wstall_if.wstall) begin
warp_stalled[wstall_if.wid] <= 1;
visible_active[wstall_if.wid] <= 0;
warp_stalled[wstall_if.wid] <= 1;
end
// Refilling active warps
if (update_visible_active) begin
visible_active <= warp_active & ~warp_stalled & ~total_barrier_stall & ~warp_lock;
end
// Don't change state if stall
if (!global_stall && real_schedule && (thread_mask != 0)) begin
visible_active[warp_to_schedule] <= 0;
warp_pcs[warp_to_schedule] <= warp_pc + 4;
// update 'warp_ready' when a warp is scheduled (update round-robin warp schedule)
if (scheduled_warp) begin
warp_pcs[warp_to_schedule] <= warp_pc + 4;
end
// Branch
@ -140,38 +142,42 @@ module VX_warp_sched #(
warp_stalled[branch_ctl_if.wid] <= 0;
end
// Lock/Release
if (scheduled_warp && !stall_out) begin
warp_lock[warp_to_schedule] <= 1;
// Lock warp until instruction decode to resolve branches
if (scheduled_warp) begin
fetch_lock[warp_to_schedule] <= 1;
end
if (ifetch_rsp_if.valid && ifetch_rsp_if.ready) begin
warp_lock[ifetch_rsp_if.wid] <= 0;
fetch_lock[ifetch_rsp_if.wid] <= 0;
end
// reset 'warp_ready' when it goes to zero (reset round-robin warp schedule)
warp_ready <= (| warp_ready_n) ? warp_ready_n : (warp_active & ~total_warp_stalled);
end
end
wire [`NUM_WARPS-1:0] b_mask = barrier_stall_mask[warp_ctl_if.barrier.id][`NUM_WARPS-1:0];
wire [`NW_BITS:0] b_count;
// calculate active barrier status
`IGNORE_WARNINGS_BEGIN
wire [`NW_BITS:0] active_barrier_count;
`IGNORE_WARNINGS_END
VX_countones #(
.N(`NUM_WARPS)
) barrier_count (
.valids(b_mask),
.count (b_count)
);
.valids(barrier_stall_mask[warp_ctl_if.barrier.id]),
.count (active_barrier_count)
);
wire [`NW_BITS:0] count_visible_active;
wire reached_barrier_limit = (active_barrier_count[`NW_BITS-1:0] == warp_ctl_if.barrier.size_m1);
VX_countones #(
.N(`NUM_WARPS)
) num_visible (
.valids(visible_active),
.count (count_visible_active)
);
assign reached_barrier_limit = (b_count == warp_ctl_if.barrier.num_warps);
// Bitwise OR of every barrier's stall mask: bit w is set when warp w is
// marked in at least one of the NUM_BARRIERS barrier_stall_mask entries.
reg [`NUM_WARPS-1:0] total_barrier_stall;
always @(*) begin
total_barrier_stall = barrier_stall_mask[0];
for (integer i = 1; i < `NUM_BARRIERS; ++i) begin
total_barrier_stall |= barrier_stall_mask[i];
end
end
assign total_barrier_stall = barrier_stall_mask[0] | barrier_stall_mask[1] | barrier_stall_mask[2] | barrier_stall_mask[3];
// split/join stack management
wire [(1+32+`NUM_THREADS-1):0] ipdom[`NUM_WARPS-1:0];
wire [(1+32+`NUM_THREADS-1):0] q1 = {1'b1, 32'b0, thread_masks[warp_ctl_if.wid]};
@ -180,7 +186,8 @@ module VX_warp_sched #(
assign {join_fall, join_pc, join_tm} = ipdom[join_if.wid];
for (genvar i = 0; i < `NUM_WARPS; i++) begin
wire push = warp_ctl_if.split.valid
wire push = warp_ctl_if.valid
&& warp_ctl_if.split.valid
&& warp_ctl_if.split.diverged
&& (i == warp_ctl_if.wid);
@ -203,46 +210,40 @@ module VX_warp_sched #(
);
end
// calculate next warp schedule
wire schedule;
wire branch_hazard = schedule
&& branch_ctl_if.valid
&& branch_ctl_if.taken
&& (branch_ctl_if.wid == warp_to_schedule);
assign real_schedule = schedule
&& !warp_stalled[warp_to_schedule]
&& !total_barrier_stall[warp_to_schedule]
&& !warp_lock[0];
wire wstall_this_cycle = wstall_if.wstall && (wstall_if.wid == warp_to_schedule); // Maybe bug
assign update_visible_active = (0 == count_visible_active) && !(stall_out || wstall_this_cycle || branch_hazard || join_if.is_join);
assign global_stall = stall_out || wstall_this_cycle || branch_hazard || !real_schedule || join_if.is_join;
assign scheduled_warp = !(wstall_this_cycle || branch_hazard || !real_schedule || join_if.is_join) && !reset;
assign warp_pc = use_wspawn[warp_to_schedule] ? use_wspawn_pc : warp_pcs[warp_to_schedule];
assign thread_mask = global_stall ? 0 : (use_wspawn[warp_to_schedule] ? `NUM_THREADS'(1) : thread_masks[warp_to_schedule]);
assign total_warp_stalled = warp_stalled | total_barrier_stall | fetch_lock;
wire [`NUM_WARPS-1:0] use_active = (count_visible_active != 0) ? visible_active :
(warp_active & ~warp_stalled & ~total_barrier_stall & ~warp_lock);
wire [`NUM_WARPS-1:0] use_ready = warp_ready & ~total_warp_stalled;
// Choosing a warp to schedule
VX_fixed_arbiter #(
.N(`NUM_WARPS)
) choose_schedule (
.clk (clk),
.reset (reset),
.requests (use_active),
.requests (use_ready),
.grant_index (warp_to_schedule),
.grant_valid (schedule),
`UNUSED_PIN (grant_onehot)
);
);
assign stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid;
// The fetch request is valid but has not been accepted yet.
wire stall_out = ~ifetch_req_if.ready && ifetch_req_if.valid;
// A taken branch is being reported for the warp the arbiter has selected.
wire branch_hazard = branch_ctl_if.valid
&& branch_ctl_if.taken
&& (branch_ctl_if.wid == warp_to_schedule);
// The selected warp is being stalled through wstall_if in this same cycle.
wire wstall_this_cycle = wstall_if.wstall && (wstall_if.wid == warp_to_schedule);
// Any of the conditions above, or a join in progress, blocks scheduling.
wire stall = stall_out || wstall_this_cycle || branch_hazard || join_if.is_join;
// A warp is scheduled only when the arbiter has a grant and nothing blocks it.
assign scheduled_warp = schedule && ~stall;
// While use_wspawn is set for the selected warp, it uses a thread mask of 1
// and the wspawn PC instead of its stored thread mask and PC.
wire [`NUM_THREADS-1:0] thread_mask = use_wspawn[warp_to_schedule] ? `NUM_THREADS'(1) : thread_masks[warp_to_schedule];
assign warp_pc = use_wspawn[warp_to_schedule] ? use_wspawn_pc : warp_pcs[warp_to_schedule];
VX_generic_register #(
.N(1 + `NUM_THREADS + 32 + `NW_BITS)
@ -251,7 +252,7 @@ module VX_warp_sched #(
.reset (reset),
.stall (stall_out),
.flush (0),
.in ({(| thread_mask), thread_mask, warp_pc, warp_to_schedule}),
.in ({scheduled_warp, thread_mask, warp_pc, warp_to_schedule}),
.out ({ifetch_req_if.valid, ifetch_req_if.thread_mask, ifetch_req_if.curr_PC, ifetch_req_if.wid})
);

View file

@ -8,165 +8,82 @@ module VX_writeback #(
// inputs
VX_exu_to_cmt_if alu_commit_if,
VX_exu_to_cmt_if bru_commit_if,
VX_exu_to_cmt_if lsu_commit_if,
VX_exu_to_cmt_if csr_commit_if,
VX_exu_to_cmt_if mul_commit_if,
VX_fpu_to_cmt_if fpu_commit_if,
VX_exu_to_cmt_if gpu_commit_if,
VX_cmt_to_issue_if cmt_to_issue_if,
// outputs
VX_wb_if writeback_if
VX_writeback_if writeback_if
);
reg [`ISSUEQ_SIZE-1:0] wb_valid_table, wb_valid_table_n;
reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0][31:0] wb_data_table, wb_data_table_n;
reg [`ISSUEQ_SIZE-1:0][`NW_BITS-1:0] wb_wid_table, wb_wid_table_n;
reg [`ISSUEQ_SIZE-1:0][`NUM_THREADS-1:0] wb_thread_mask_table, wb_thread_mask_table_n;
reg [`ISSUEQ_SIZE-1:0][31:0] wb_curr_PC_table, wb_curr_PC_table_n;
reg [`ISSUEQ_SIZE-1:0][`NR_BITS-1:0] wb_rd_table, wb_rd_table_n;
wire alu_valid = alu_commit_if.valid && alu_commit_if.wb;
wire lsu_valid = lsu_commit_if.valid && lsu_commit_if.wb;
wire csr_valid = csr_commit_if.valid && csr_commit_if.wb;
wire mul_valid = mul_commit_if.valid && mul_commit_if.wb;
wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb;
reg wb_valid, wb_valid_n;
reg [`NUM_THREADS-1:0][31:0] wb_data, wb_data_n;
reg [`NW_BITS-1:0] wb_wid, wb_wid_n;
reg [`NUM_THREADS-1:0] wb_thread_mask, wb_thread_mask_n;
reg [31:0] wb_curr_PC, wb_curr_PC_n;
reg [`NR_BITS-1:0] wb_rd, wb_rd_n;
VX_writeback_if writeback_tmp_if();
reg [`ISTAG_BITS-1:0] wb_index;
reg [`ISTAG_BITS-1:0] wb_index_n;
// Fixed-priority selection of the writeback source: the first unit in the
// order ALU, LSU, CSR, MUL, FPU whose *_valid qualifier is set supplies the
// valid flag; the result is 0 when no qualifier is set.
assign writeback_tmp_if.valid = alu_valid ? alu_commit_if.valid :
lsu_valid ? lsu_commit_if.valid :
csr_valid ? csr_commit_if.valid :
mul_valid ? mul_commit_if.valid :
fpu_valid ? fpu_commit_if.valid :
0;
always @(*) begin
wb_valid_table_n = wb_valid_table;
wb_wid_table_n = wb_wid_table;
wb_thread_mask_table_n = wb_thread_mask_table;
wb_curr_PC_table_n = wb_curr_PC_table;
wb_rd_table_n = wb_rd_table;
wb_data_table_n = wb_data_table;
assign writeback_tmp_if.wid = alu_valid ? alu_commit_if.wid :
lsu_valid ? lsu_commit_if.wid :
csr_valid ? csr_commit_if.wid :
mul_valid ? mul_commit_if.wid :
fpu_valid ? fpu_commit_if.wid :
0;
assign writeback_tmp_if.thread_mask = alu_valid ? alu_commit_if.thread_mask :
lsu_valid ? lsu_commit_if.thread_mask :
csr_valid ? csr_commit_if.thread_mask :
mul_valid ? mul_commit_if.thread_mask :
fpu_valid ? fpu_commit_if.thread_mask :
0;
if (wb_valid) begin
wb_valid_table_n[wb_index] = 0;
end
assign writeback_tmp_if.rd = alu_valid ? alu_commit_if.rd :
lsu_valid ? lsu_commit_if.rd :
csr_valid ? csr_commit_if.rd :
mul_valid ? mul_commit_if.rd :
fpu_valid ? fpu_commit_if.rd :
0;
if (alu_commit_if.valid) begin
wb_valid_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.wb;
wb_thread_mask_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.thread_mask;
wb_data_table_n [alu_commit_if.issue_tag] = alu_commit_if.data;
wb_wid_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.wid;
wb_curr_PC_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.curr_PC;
wb_rd_table_n [alu_commit_if.issue_tag] = cmt_to_issue_if.alu_data.rd;
end
assign writeback_tmp_if.data = alu_valid ? alu_commit_if.data :
lsu_valid ? lsu_commit_if.data :
csr_valid ? csr_commit_if.data :
mul_valid ? mul_commit_if.data :
fpu_valid ? fpu_commit_if.data :
0;
if (bru_commit_if.valid) begin
wb_valid_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.wb;
wb_thread_mask_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.thread_mask;
wb_data_table_n [bru_commit_if.issue_tag] = bru_commit_if.data;
wb_wid_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.wid;
wb_curr_PC_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.curr_PC;
wb_rd_table_n [bru_commit_if.issue_tag] = cmt_to_issue_if.bru_data.rd;
end
wire stall = ~writeback_if.ready && writeback_if.valid;
if (lsu_commit_if.valid) begin
wb_valid_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.wb;
wb_thread_mask_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.thread_mask;
wb_data_table_n [lsu_commit_if.issue_tag] = lsu_commit_if.data;
wb_wid_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.wid;
wb_curr_PC_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.curr_PC;
wb_rd_table_n [lsu_commit_if.issue_tag] = cmt_to_issue_if.lsu_data.rd;
end
// Register stage between the selected writeback candidate
// (writeback_tmp_if) and the outgoing writeback_if.
VX_generic_register #(
// The terms of N appear in the same order as the fields of `.in` below:
// valid, wid, thread_mask, rd, data.
.N(1 + `NW_BITS + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * 32))
) wb_reg (
.clk (clk),
.reset (reset),
// NOTE(review): presumably the register keeps its contents while `stall` is high - confirm in VX_generic_register.
.stall (stall),
.flush (0),
.in ({writeback_tmp_if.valid, writeback_tmp_if.wid, writeback_tmp_if.thread_mask, writeback_tmp_if.rd, writeback_tmp_if.data}),
.out ({writeback_if.valid, writeback_if.wid, writeback_if.thread_mask, writeback_if.rd, writeback_if.data})
);
if (csr_commit_if.valid) begin
wb_valid_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.wb;
wb_thread_mask_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.thread_mask;
wb_data_table_n [csr_commit_if.issue_tag] = csr_commit_if.data;
wb_wid_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.wid;
wb_curr_PC_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.curr_PC;
wb_rd_table_n [csr_commit_if.issue_tag] = cmt_to_issue_if.csr_data.rd;
end
if (mul_commit_if.valid) begin
wb_valid_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.wb;
wb_thread_mask_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.thread_mask;
wb_data_table_n [mul_commit_if.issue_tag] = mul_commit_if.data;
wb_wid_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.wid;
wb_curr_PC_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.curr_PC;
wb_rd_table_n [mul_commit_if.issue_tag] = cmt_to_issue_if.mul_data.rd;
end
if (fpu_commit_if.valid) begin
wb_valid_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.wb;
wb_thread_mask_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.thread_mask;
wb_data_table_n [fpu_commit_if.issue_tag] = fpu_commit_if.data;
wb_wid_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.wid;
wb_curr_PC_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.curr_PC;
wb_rd_table_n [fpu_commit_if.issue_tag] = cmt_to_issue_if.fpu_data.rd;
end
if (gpu_commit_if.valid) begin
wb_valid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wb;
wb_thread_mask_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.thread_mask;
wb_data_table_n [gpu_commit_if.issue_tag] = gpu_commit_if.data;
wb_wid_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.wid;
wb_curr_PC_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.curr_PC;
wb_rd_table_n [gpu_commit_if.issue_tag] = cmt_to_issue_if.gpu_data.rd;
end
end
always @(*) begin
wb_index_n = 0;
wb_valid_n = 0;
wb_thread_mask_n = {`NUM_THREADS{1'bx}};
wb_wid_n = {`NW_BITS{1'bx}};
wb_curr_PC_n = {32{1'bx}};
wb_data_n = {(`NUM_THREADS * 32){1'bx}};
for (integer i = `ISSUEQ_SIZE-1; i >= 0; i--) begin
if (wb_valid_table_n[i]) begin
wb_index_n = `ISTAG_BITS'(i);
wb_valid_n = 1;
wb_thread_mask_n= wb_thread_mask_table_n[i];
wb_wid_n = wb_wid_table_n[i];
wb_curr_PC_n = wb_curr_PC_table_n[i];
wb_rd_n = wb_rd_table_n[i];
wb_data_n = wb_data_table_n[i];
end
end
end
always @(posedge clk) begin
if (reset) begin
wb_valid_table <= 0;
wb_index <= 0;
wb_valid <= 0;
end else begin
wb_valid_table <= wb_valid_table_n;
wb_thread_mask_table <= wb_thread_mask_table_n;
wb_wid_table <= wb_wid_table_n;
wb_curr_PC_table <= wb_curr_PC_table_n;
wb_rd_table <= wb_rd_table_n;
wb_data_table <= wb_data_table_n;
wb_index <= wb_index_n;
wb_valid <= wb_valid_n;
wb_thread_mask <= wb_thread_mask_n;
wb_wid <= wb_wid_n;
wb_curr_PC <= wb_curr_PC_n;
wb_rd <= wb_rd_n;
wb_data <= wb_data_n;
end
end
// writeback request
assign writeback_if.valid = wb_valid;
assign writeback_if.thread_mask = wb_thread_mask;
assign writeback_if.wid = wb_wid;
assign writeback_if.curr_PC = wb_curr_PC;
assign writeback_if.rd = wb_rd;
assign writeback_if.data = wb_data;
// Ready signals back to the execution units, in fixed priority order
// ALU > LSU > CSR > MUL > FPU. A unit is ready only when `stall` is low and
// no higher-priority unit has its *_valid qualifier set.
assign alu_commit_if.ready = !stall;
assign lsu_commit_if.ready = !stall && !alu_valid;
assign csr_commit_if.ready = !stall && !alu_valid && !lsu_valid;
assign mul_commit_if.ready = !stall && !alu_valid && !lsu_valid && !csr_valid;
assign fpu_commit_if.ready = !stall && !alu_valid && !lsu_valid && !csr_valid && !mul_valid;
// The GPU unit is always reported ready.
assign gpu_commit_if.ready = 1'b1;
// special workaround to get RISC-V tests Pass/Fail status
reg [31:0] last_wb_value [`NUM_REGS-1:0] /* verilator public */;
always @(posedge clk) begin
if (writeback_if.valid) begin
if (writeback_if.valid && writeback_if.ready) begin
last_wb_value[writeback_if.rd] <= writeback_if.data[0];
end
end

View file

@ -106,7 +106,6 @@ module VX_bank #(
`ifdef DBG_CORE_REQ_INFO
/* verilator lint_off UNUSED */
wire[31:0] debug_pc_st0;
wire debug_wb_st0;
wire[`NR_BITS-1:0] debug_rd_st0;
wire[`NW_BITS-1:0] debug_wid_st0;
wire debug_rw_st0;
@ -115,7 +114,6 @@ module VX_bank #(
wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st0;
wire[31:0] debug_pc_st1e;
wire debug_wb_st1e;
wire[`NR_BITS-1:0] debug_rd_st1e;
wire[`NW_BITS-1:0] debug_wid_st1e;
wire debug_rw_st1e;
@ -124,7 +122,6 @@ module VX_bank #(
wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e;
wire[31:0] debug_pc_st2;
wire debug_wb_st2;
wire[`NR_BITS-1:0] debug_rd_st2;
wire[`NW_BITS-1:0] debug_wid_st2;
wire debug_rw_st2;
@ -359,7 +356,7 @@ module VX_bank #(
`ifdef DBG_CORE_REQ_INFO
if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin
assign {debug_pc_st0, debug_wb_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = qual_inst_meta_st0;
assign {debug_pc_st0, debug_rd_st0, debug_wid_st0, debug_tagid_st0, debug_rw_st0, debug_byteen_st0, debug_tid_st0} = qual_inst_meta_st0;
end
`endif
@ -446,7 +443,6 @@ module VX_bank #(
`ifdef DBG_CORE_REQ_INFO
.debug_pc_st1e(debug_pc_st1e),
.debug_wb_st1e(debug_wb_st1e),
.debug_rd_st1e(debug_rd_st1e),
.debug_wid_st1e(debug_wid_st1e),
.debug_tagid_st1e(debug_tagid_st1e),
@ -488,7 +484,7 @@ module VX_bank #(
`ifdef DBG_CORE_REQ_INFO
if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin
assign {debug_pc_st1e, debug_wb_st1e, debug_rd_st1e, debug_wid_st1e, debug_tagid_st1e, debug_rw_st1e, debug_byteen_st1e, debug_tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1];
assign {debug_pc_st1e, debug_rd_st1e, debug_wid_st1e, debug_tagid_st1e, debug_rw_st1e, debug_byteen_st1e, debug_tid_st1e} = inst_meta_st1[STAGE_1_CYCLES-1];
end
`endif
@ -529,7 +525,7 @@ module VX_bank #(
`ifdef DBG_CORE_REQ_INFO
if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin
assign {debug_pc_st2, debug_wb_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2;
assign {debug_pc_st2, debug_rd_st2, debug_wid_st2, debug_tagid_st2, debug_rw_st2, debug_byteen_st2, debug_tid_st2} = inst_meta_st2;
end
`endif
@ -740,25 +736,25 @@ module VX_bank #(
// Simulation-only tracing, compiled in when DBG_PRINT_CACHE_BANK is defined.
// On every rising clock edge, prints one line for each interface whose
// valid/ready pair is asserted in that cycle. Each message is prefixed with
// $time, CACHE_ID and BANK_ID.
// NOTE(review): every branch below carries two $display statements that differ
// only in the "bank"/"cache" message prefix. This looks like the before/after
// pair of a diff hunk rather than two intended prints; confirm which one
// the real file keeps.
`ifdef DBG_PRINT_CACHE_BANK
always @(posedge clk) begin
// core request: fires when any bit of core_req_valid is set and core_req_ready is high;
// only the address at index 0 is printed
if ((|core_req_valid) && core_req_ready) begin
$display("%t: bank%0d:%0d core req: addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(core_req_addr[0], BANK_ID), core_req_tag);
$display("%t: cache%0d:%0d core req: addr=%0h, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(core_req_addr[0], BANK_ID), core_req_tag);
end
// core response: tag and data
if (core_rsp_valid && core_rsp_ready) begin
$display("%t: bank%0d:%0d core rsp: tag=%0h, data=%0h", $time, CACHE_ID, BANK_ID, core_rsp_tag, core_rsp_data);
$display("%t: cache%0d:%0d core rsp: tag=%0h, data=%0h", $time, CACHE_ID, BANK_ID, core_rsp_tag, core_rsp_data);
end
// DRAM fill request: address is passed through LINE_TO_BYTE_ADDR before printing
if (dram_fill_req_valid && dram_fill_req_ready) begin
$display("%t: bank%0d:%0d dram_fill req: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_req_addr, BANK_ID));
$display("%t: cache%0d:%0d dram_fill req: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_req_addr, BANK_ID));
end
// DRAM write-back request: address and data
if (dram_wb_req_valid && dram_wb_req_ready) begin
$display("%t: bank%0d:%0d dram_wb req: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_wb_req_addr, BANK_ID), dram_wb_req_data);
$display("%t: cache%0d:%0d dram_wb req: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_wb_req_addr, BANK_ID), dram_wb_req_data);
end
// DRAM fill response: address and data
if (dram_fill_rsp_valid && dram_fill_rsp_ready) begin
$display("%t: bank%0d:%0d dram_fill rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_rsp_addr, BANK_ID), dram_fill_rsp_data);
$display("%t: cache%0d:%0d dram_fill rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_rsp_addr, BANK_ID), dram_fill_rsp_data);
end
// snoop request: address, invalidate flag and tag
if (snp_req_valid && snp_req_ready) begin
$display("%t: bank%0d:%0d snp req: addr=%0h, invalidate=%0d, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(snp_req_addr, BANK_ID), snp_req_invalidate, snp_req_tag);
$display("%t: cache%0d:%0d snp req: addr=%0h, invalidate=%0d, tag=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(snp_req_addr, BANK_ID), snp_req_invalidate, snp_req_tag);
end
// snoop response: tag only
if (snp_rsp_valid && snp_rsp_ready) begin
$display("%t: bank%0d:%0d snp rsp: tag=%0h", $time, CACHE_ID, BANK_ID, snp_rsp_tag);
$display("%t: cache%0d:%0d snp rsp: tag=%0h", $time, CACHE_ID, BANK_ID, snp_rsp_tag);
end
end
`endif

View file

@ -130,14 +130,13 @@ module VX_cache #(
// Debug-only view of the first core request tag, compiled in when
// DBG_CORE_REQ_INFO is defined. The wires exist purely for inspection, which
// is why the UNUSED lint is switched off around their declarations.
`ifdef DBG_CORE_REQ_INFO
/* verilator lint_off UNUSED */
wire[31:0] debug_core_req_use_pc;
wire debug_core_req_wb;
wire[`NR_BITS-1:0] debug_core_req_rd;
wire[`NW_BITS-1:0] debug_core_req_wid;
wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_core_req_idx;
/* verilator lint_on UNUSED */
// The tag is only unpacked when WORD_SIZE differs from GLOBAL_BLOCK_SIZE.
// NOTE(review): two unpack assignments follow, one with the debug_core_req_wb
// field and one without. They would drive the same wires with different
// field layouts; this appears to be the before/after pair of a diff hunk
// (the wb field being dropped), so confirm which one the real file keeps.
if (WORD_SIZE != `GLOBAL_BLOCK_SIZE) begin
assign {debug_core_req_use_pc, debug_core_req_wb, debug_core_req_rd, debug_core_req_wid, debug_core_req_idx} = core_req_tag[0];
assign {debug_core_req_use_pc, debug_core_req_rd, debug_core_req_wid, debug_core_req_idx} = core_req_tag[0];
end
`endif

View file

@ -157,7 +157,7 @@ module VX_cache_miss_resrv #(
`ifdef DBG_PRINT_CACHE_MSRQ
always @(posedge clk) begin
if (mrvq_push || mrvq_pop || increment_head || recover_state) begin
$write("%t: bank%0d:%0d msrq: push=%b pop=%b incr=%d recv=%d", $time, CACHE_ID, BANK_ID, mrvq_push, mrvq_pop, increment_head, recover_state);
$write("%t: cache%0d:%0d msrq: push=%b pop=%b incr=%d recv=%d", $time, CACHE_ID, BANK_ID, mrvq_push, mrvq_pop, increment_head, recover_state);
for (integer j = 0; j < MRVQ_SIZE; j++) begin
if (valid_table[j]) begin
$write(" ");

View file

@ -65,12 +65,13 @@ module VX_snp_forwarder #(
) snp_fwd_buffer (
.clk (clk),
.reset (reset),
.write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}),
.write_addr (sfq_write_addr),
.acquire_slot (sfq_acquire),
.release_slot (sfq_release),
.write_addr (sfq_write_addr),
.acquire_slot (sfq_acquire),
.read_addr (sfq_read_addr),
.write_data ({snp_req_addr, snp_req_invalidate, snp_req_tag}),
.read_data ({snp_rsp_addr, snp_rsp_invalidate, snp_rsp_tag}),
.release_addr (sfq_read_addr),
.release_slot (sfq_release),
.full (sfq_full)
);

View file

@ -28,7 +28,6 @@ module VX_tag_data_access #(
`ifdef DBG_CORE_REQ_INFO
`IGNORE_WARNINGS_BEGIN
input wire[31:0] debug_pc_st1e,
input wire debug_wb_st1e,
input wire[`NR_BITS-1:0] debug_rd_st1e,
input wire[`NW_BITS-1:0] debug_wid_st1e,
input wire[`UP(CORE_TAG_ID_BITS)-1:0] debug_tagid_st1e,
@ -217,15 +216,15 @@ module VX_tag_data_access #(
if (valid_req_st1e) begin
if ((| use_write_enable)) begin
if (writefill_st1e) begin
$display("%t: bank%0d:%0d store-fill: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, use_write_data);
$display("%t: cache%0d:%0d store-fill: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, use_write_data);
end else begin
$display("%t: bank%0d:%0d store-write: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, wordsel_st1e, writeword_st1e);
$display("%t: cache%0d:%0d store-write: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e, writeladdr_st1e, writetag_st1e, wordsel_st1e, writeword_st1e);
end
end else
if (miss_st1e) begin
$display("%t: bank%0d:%0d store-miss: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e);
$display("%t: cache%0d:%0d store-miss: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e);
end else begin
$display("%t: bank%0d:%0d store-read: wid=%0d, PC=%0h, tag=%0h, wb=%b, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_wb_st1e, debug_rd_st1e, dirty_st1e, readaddr_st10, qual_read_tag_st1, wordsel_st1e, qual_read_data_st1);
$display("%t: cache%0d:%0d store-read: wid=%0d, PC=%0h, tag=%0h, rd=%0d, dirty=%b, blk_addr=%0d, tag_id=%0h, wsel=%0d, data=%0h", $time, CACHE_ID, BANK_ID, debug_wid_st1e, debug_pc_st1e, debug_tagid_st1e, debug_rd_st1e, dirty_st1e, readaddr_st10, qual_read_tag_st1, wordsel_st1e, qual_read_data_st1);
end
end
end

View file

@ -1,14 +1,16 @@
`include "VX_define.vh"
`include "dspba_library_ver.sv"
module VX_fp_fpga (
module VX_fp_fpga #(
parameter TAGW = 1
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`FPU_BITS-1:0] op,
input wire [`FRM_BITS-1:0] frm,
@ -21,7 +23,7 @@ module VX_fp_fpga (
output wire has_fflags,
output fflags_t [`NUM_THREADS-1:0] fflags,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -31,7 +33,7 @@ module VX_fp_fpga (
wire [NUM_FPC-1:0] per_core_ready_in;
wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result;
wire [NUM_FPC-1:0][`ISTAG_BITS-1:0] per_core_tag_out;
wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
wire [NUM_FPC-1:0] per_core_ready_out;
wire [NUM_FPC-1:0] per_core_valid_out;
@ -62,7 +64,10 @@ module VX_fp_fpga (
endcase
end
VX_fp_noncomp fp_noncomp (
VX_fp_noncomp #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_noncomp (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 0)),
@ -80,7 +85,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[0])
);
VX_fp_add fp_add (
VX_fp_add #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_add (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 1)),
@ -94,7 +102,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[1])
);
VX_fp_sub fp_sub (
VX_fp_sub #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_sub (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 2)),
@ -108,7 +119,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[2])
);
VX_fp_mul fp_mul (
VX_fp_mul #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_mul (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
@ -122,7 +136,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[3])
);
VX_fp_madd fp_madd (
VX_fp_madd #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_madd (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
@ -138,7 +155,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[4])
);
VX_fp_msub fp_msub (
VX_fp_msub #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_msub (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 5)),
@ -154,7 +174,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[5])
);
VX_fp_div fp_div (
VX_fp_div #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 6)),
@ -168,7 +191,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[6])
);
VX_fp_sqrt fp_sqrt (
VX_fp_sqrt #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 7)),
@ -181,7 +207,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[7])
);
VX_fp_ftoi fp_ftoi (
VX_fp_ftoi #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_ftoi (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 8)),
@ -194,7 +223,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[8])
);
VX_fp_ftou fp_ftou (
VX_fp_ftou #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_ftou (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 9)),
@ -207,7 +239,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[9])
);
VX_fp_itof fp_itof (
VX_fp_itof #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_itof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 10)),
@ -220,7 +255,10 @@ module VX_fp_fpga (
.valid_out (per_core_valid_out[10])
);
VX_fp_utof fp_utof (
VX_fp_utof #(
.TAGW (TAGW),
.LANES(`NUM_THREADS)
) fp_utof (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 11)),
@ -248,21 +286,10 @@ module VX_fp_fpga (
assign per_core_ready_out[i] = ready_out && (i == fp_index);
end
wire tmp_valid = fp_valid;
wire [`ISTAG_BITS-1:0] tmp_tag = per_core_tag_out[fp_index];
wire [`NUM_THREADS-1:0][31:0] tmp_result = per_core_result[fp_index];
wire tmp_has_fflags = fpnew_has_fflags && (fp_index == 0);
fflags_t [`NUM_THREADS-1:0] tmp_flags = fpnew_fflags;
VX_generic_register #(
.N(1 + `ISTAG_BITS + (`NUM_THREADS * 32) + 1 + `FFG_BITS)
) nc_reg (
.clk (clk),
.reset (reset),
.stall (stall),
.flush (1'b0),
.in ({tmp_valid, tmp_tag, tmp_result, tmp_has_fflags, tmp_fflags}),
.out ({valid_out, tag_out, result, has_fflags, fflags})
);
assign valid_out = fp_valid;
assign tag_out = per_core_tag_out[fp_index];
assign result = per_core_result[fp_index];
assign has_fflags = fpnew_has_fflags && (fp_index == 0);
assign fflags = fpnew_fflags;
endmodule

View file

@ -1,25 +1,28 @@
`include "VX_define.vh"
module VX_fp_noncomp (
module VX_fp_noncomp #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`FPU_BITS-1:0] op,
input wire [`FRM_BITS-1:0] frm,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire has_fflags,
output fflags_t [`NUM_THREADS-1:0] fflags,
output fflags_t [LANES-1:0] fflags,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -35,21 +38,21 @@ module VX_fp_noncomp (
SIG_NAN = 32'h00000100,
QUT_NAN = 32'h00000200;
wire [`NUM_THREADS-1:0] a_sign, b_sign;
wire [`NUM_THREADS-1:0][7:0] a_exponent, b_exponent;
wire [`NUM_THREADS-1:0][22:0] a_mantissa, b_mantissa;
fp_type_t [`NUM_THREADS-1:0] a_type, b_type;
wire [LANES-1:0] a_sign, b_sign;
wire [LANES-1:0][7:0] a_exponent, b_exponent;
wire [LANES-1:0][22:0] a_mantissa, b_mantissa;
fp_type_t [LANES-1:0] a_type, b_type;
wire [`NUM_THREADS-1:0] a_smaller, ab_equal;
wire [LANES-1:0] a_smaller, ab_equal;
reg [`NUM_THREADS-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg
reg [`NUM_THREADS-1:0][31:0] fminmax_res; // result of fmin/fmax
reg [`NUM_THREADS-1:0][31:0] fsgnj_res; // result of sign injection
reg [`NUM_THREADS-1:0][31:0] fcmp_res; // result of comparison
reg [`NUM_THREADS-1:0][ 4:0] fcmp_excp; // exception of comparison
reg [LANES-1:0][31:0] fclass_mask; // generate a 10-bit mask for integer reg
reg [LANES-1:0][31:0] fminmax_res; // result of fmin/fmax
reg [LANES-1:0][31:0] fsgnj_res; // result of sign injection
reg [LANES-1:0][31:0] fcmp_res; // result of comparison
reg [LANES-1:0][ 4:0] fcmp_excp; // exception of comparison
// Setup
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
assign a_sign[i] = dataa[i][31];
assign a_exponent[i] = dataa[i][30:23];
assign a_mantissa[i] = dataa[i][22:0];
@ -75,7 +78,7 @@ module VX_fp_noncomp (
end
// FCLASS
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
if (a_type[i].is_normal) begin
fclass_mask[i] = a_sign[i] ? NEG_NORM : POS_NORM;
@ -99,7 +102,7 @@ module VX_fp_noncomp (
end
// Min/Max
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
if (a_type[i].is_nan && b_type[i].is_nan)
fminmax_res[i] = {1'b0, 8'hff, 1'b1, 22'd0}; // canonical qNaN
@ -118,7 +121,7 @@ module VX_fp_noncomp (
end
// Sign Injection
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (op)
`FPU_SGNJ: fsgnj_res[i] = { b_sign[i], a_exponent[i], a_mantissa[i]};
@ -130,7 +133,7 @@ module VX_fp_noncomp (
end
// Comparison
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (frm)
`FRM_RNE: begin
@ -176,8 +179,8 @@ module VX_fp_noncomp (
reg tmp_valid;
reg tmp_has_fflags;
fflags_t [`NUM_THREADS-1:0] tmp_fflags;
reg [`NUM_THREADS-1:0][31:0] tmp_result;
fflags_t [LANES-1:0] tmp_fflags;
reg [LANES-1:0][31:0] tmp_result;
always @(*) begin
case (op)
@ -191,7 +194,7 @@ module VX_fp_noncomp (
endcase
end
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
tmp_valid = 1'b1;
case (op)
@ -228,7 +231,7 @@ module VX_fp_noncomp (
assign ready_in = ~stall;
VX_generic_register #(
.N(1 + `ISTAG_BITS + (`NUM_THREADS * 32) + 1 + (`NUM_THREADS * `FFG_BITS))
.N(1 + TAGW + (LANES * 32) + 1 + (LANES * `FFG_BITS))
) nc_reg (
.clk (clk),
.reset (reset),

View file

@ -3,6 +3,7 @@
`include "defs_div_sqrt_mvp.sv"
module VX_fpnew #(
parameter TAGW = 1,
parameter FMULADD = 1,
parameter FDIVSQRT = 1,
parameter FNONCOMP = 1,
@ -14,7 +15,7 @@ module VX_fpnew #(
input wire valid_in,
output wire ready_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`FPU_BITS-1:0] op,
input wire [`FRM_BITS-1:0] frm,
@ -27,7 +28,7 @@ module VX_fpnew #(
output wire has_fflags,
output fflags_t [`NUM_THREADS-1:0] fflags,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -66,7 +67,7 @@ module VX_fpnew #(
wire fpu_ready_in, fpu_valid_in;
wire fpu_ready_out, fpu_valid_out;
reg [`ISTAG_BITS-1:0] fpu_tag_in, fpu_tag_out;
reg [TAGW-1:0] fpu_tag_in, fpu_tag_out;
reg [2:0][`NUM_THREADS-1:0][31:0] fpu_operands;
@ -77,9 +78,6 @@ module VX_fpnew #(
wire [`NUM_THREADS-1:0][31:0] fpu_result;
fpnew_pkg::status_t [0:`NUM_THREADS-1] fpu_status;
wire is_class_op, is_class_op_out;
assign is_class_op = (op == `FPU_CLASS);
reg [FOP_BITS-1:0] fpu_op;
reg [`FRM_BITS-1:0] fpu_rnd;
reg fpu_op_mod;
@ -136,7 +134,7 @@ module VX_fpnew #(
fpnew_top #(
.Features (FPU_FEATURES),
.Implementation (FPU_IMPLEMENTATION),
.TagType (logic[`ISTAG_BITS+1+1-1:0])
.TagType (logic[TAGW+1+1-1:0])
) fpnew_core (
.clk_i (clk),
.rst_ni (1'b1),
@ -148,13 +146,13 @@ module VX_fpnew #(
.dst_fmt_i (fpnew_pkg::fp_format_e'(fpu_dst_fmt)),
.int_fmt_i (fpnew_pkg::int_format_e'(fpu_int_fmt)),
.vectorial_op_i (1'b0),
.tag_i ({fpu_tag_in, fpu_has_fflags, is_class_op}),
.tag_i ({fpu_tag_in, fpu_has_fflags}),
.in_valid_i (fpu_valid_in),
.in_ready_o (fpu_ready_in),
.flush_i (reset),
.result_o (fpu_result[0]),
.status_o (fpu_status[0]),
.tag_o ({fpu_tag_out, fpu_has_fflags_out, is_class_op_out}),
.tag_o ({fpu_tag_out, fpu_has_fflags_out}),
.out_valid_o (fpu_valid_out),
.out_ready_i (fpu_ready_out),
`UNUSED_PIN (busy_o)

View file

@ -1,19 +1,22 @@
`include "VX_define.vh"
module VX_fp_add (
module VX_fp_add #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -22,7 +25,7 @@ module VX_fp_add (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys (
// inputs
.accumulate(),
@ -65,7 +68,7 @@ module VX_fp_add (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg (
.clk(clk),

View file

@ -1,19 +1,22 @@
`include "VX_define.vh"
module VX_fp_div (
module VX_fp_div #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -22,7 +25,7 @@ module VX_fp_div (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
acl_fp_div fdiv (
.clk (clk),
.areset (1'b0),
@ -34,7 +37,7 @@ module VX_fp_div (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_FDIV)
) shift_reg (
.clk(clk),

View file

@ -1,18 +1,21 @@
`include "VX_define.vh"
module VX_fp_ftoi (
module VX_fp_ftoi #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -21,7 +24,7 @@ module VX_fp_ftoi (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
acl_fp_ftoi ftoi (
.clk (clk),
.areset (1'b0),
@ -32,7 +35,7 @@ module VX_fp_ftoi (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_FTOI)
) shift_reg (
.clk(clk),

View file

@ -1,18 +1,21 @@
`include "VX_define.vh"
module VX_fp_ftou (
module VX_fp_ftou #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -21,7 +24,7 @@ module VX_fp_ftou (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
acl_fp_ftou ftou (
.clk (clk),
.areset (1'b0),
@ -32,7 +35,7 @@ module VX_fp_ftou (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_FTOU)
) shift_reg (
.clk(clk),

View file

@ -1,18 +1,21 @@
`include "VX_define.vh"
module VX_fp_itof (
module VX_fp_itof #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -21,7 +24,7 @@ module VX_fp_itof (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
acl_fp_itof itof (
.clk (clk),
.areset (1'b0),
@ -32,7 +35,7 @@ module VX_fp_itof (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_ITOF)
) shift_reg (
.clk(clk),

View file

@ -1,22 +1,25 @@
`include "VX_define.vh"
module VX_fp_madd (
module VX_fp_madd #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
input wire [`NUM_THREADS-1:0][31:0] datac,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
output wire [LANES-1:0][31:0] result,
input wire negate,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -24,11 +27,11 @@ module VX_fp_madd (
wire enable0, enable1;
assign ready_in = enable0 && enable1;
wire [`NUM_THREADS-1:0][31:0] result_st0, result_st1;
wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1;
wire [LANES-1:0][31:0] result_st0, result_st1;
wire [TAGW-1:0] out_tag_st0, out_tag_st1;
wire in_valid_st0, out_valid_st0, out_valid_st1;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys0 (
// inputs
.accumulate(),
@ -111,7 +114,7 @@ module VX_fp_madd (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1 + 1),
.DATAW(TAGW + 1 + 1),
.DEPTH(1)
) shift_reg0 (
.clk(clk),
@ -122,7 +125,7 @@ module VX_fp_madd (
);
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg1 (
.clk(clk),

View file

@ -1,22 +1,25 @@
`include "VX_define.vh"
module VX_fp_msub (
module VX_fp_msub #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
input wire [`NUM_THREADS-1:0][31:0] datac,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
input wire [LANES-1:0][31:0] datac,
output wire [LANES-1:0][31:0] result,
input wire negate,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -24,11 +27,11 @@ module VX_fp_msub (
wire enable0, enable1;
assign ready_in = enable0 && enable1;
wire [`NUM_THREADS-1:0][31:0] result_st0, result_st1;
wire [`ISTAG_BITS-1:0] out_tag_st0, out_tag_st1;
wire [LANES-1:0][31:0] result_st0, result_st1;
wire [TAGW-1:0] out_tag_st0, out_tag_st1;
wire in_valid_st0, out_valid_st0, out_valid_st1;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys0 (
// inputs
.accumulate(),
@ -111,7 +114,7 @@ module VX_fp_msub (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1 + 1),
.DATAW(TAGW + 1 + 1),
.DEPTH(1)
) shift_reg0 (
.clk(clk),
@ -122,7 +125,7 @@ module VX_fp_msub (
);
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg1 (
.clk(clk),

View file

@ -1,19 +1,22 @@
`include "VX_define.vh"
module VX_fp_mul (
module VX_fp_mul #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -22,7 +25,7 @@ module VX_fp_mul (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys (
// inputs
.accumulate(),
@ -65,7 +68,7 @@ module VX_fp_mul (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg (
.clk(clk),

View file

@ -1,18 +1,21 @@
`include "VX_define.vh"
module VX_fp_sqrt (
module VX_fp_sqrt #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -21,7 +24,7 @@ module VX_fp_sqrt (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
acl_fp_sqrt fsqrt (
.clk (clk),
.areset (1'b0),
@ -32,7 +35,7 @@ module VX_fp_sqrt (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_FSQRT)
) shift_reg (
.clk(clk),

View file

@ -1,19 +1,22 @@
`include "VX_define.vh"
module VX_fp_sub (
module VX_fp_sub #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
input wire [LANES-1:0][31:0] datab,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -22,7 +25,7 @@ module VX_fp_sub (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
twentynm_fp_mac mac_fp_wys (
// inputs
.accumulate(),
@ -65,7 +68,7 @@ module VX_fp_sub (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(1)
) shift_reg (
.clk(clk),

View file

@ -1,18 +1,21 @@
`include "VX_define.vh"
module VX_fp_utof (
module VX_fp_utof #(
parameter TAGW = 1,
parameter LANES = 1
) (
input wire clk,
input wire reset,
output wire ready_in,
input wire valid_in,
input wire [`ISTAG_BITS-1:0] tag_in,
input wire [TAGW-1:0] tag_in,
input wire [`NUM_THREADS-1:0][31:0] dataa,
output wire [`NUM_THREADS-1:0][31:0] result,
input wire [LANES-1:0][31:0] dataa,
output wire [LANES-1:0][31:0] result,
output wire [`ISTAG_BITS-1:0] tag_out,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
@ -21,7 +24,7 @@ module VX_fp_utof (
wire enable = ~stall;
assign ready_in = enable;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
for (genvar i = 0; i < LANES; i++) begin
acl_fp_utof utof (
.clk (clk),
.areset (1'b0),
@ -32,7 +35,7 @@ module VX_fp_utof (
end
VX_shift_register #(
.DATAW(`ISTAG_BITS + 1),
.DATAW(TAGW + 1),
.DEPTH(`LATENCY_UTOF)
) shift_reg (
.clk(clk),

View file

@ -5,23 +5,20 @@
interface VX_alu_req_if ();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
`DEBUG_BEGIN
wire [`NW_BITS-1:0] wid;
wire valid;
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
`DEBUG_END
wire [31:0] curr_PC;
wire [`ALU_BITS-1:0] op;
wire [`ALU_BR_BITS-1:0] op;
wire rs1_is_PC;
wire rs2_is_imm;
wire [31:0] imm;
wire [`NT_BITS-1:0] tid;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NR_BITS-1:0] rd;
wire wb;
wire ready;

View file

@ -1,29 +0,0 @@
// Include guard: the interface is declared once per compilation unit.
`ifndef VX_BRANCH_REQ_IF
`define VX_BRANCH_REQ_IF
`include "VX_define.vh"
// Signal bundle for a branch request (the guard macro names it BRANCH_REQ;
// "bru" presumably abbreviates branch unit). All members are plain wires and
// no modports are declared, so signal direction is not fixed by this interface.
interface VX_bru_req_if ();
// presumably forms a valid/ready handshake with "ready" below; confirm against users
wire valid;
// ISTAG_BITS-wide tag carried with the request
wire [`ISTAG_BITS-1:0] issue_tag;
// NW_BITS-wide field; presumably the warp id, verify in VX_define.vh
wire [`NW_BITS-1:0] wid;
// thread_mask holds one bit per thread (NUM_THREADS wide) and is wrapped in
// the DEBUG_BEGIN/DEBUG_END macros, which are defined outside this file
`DEBUG_BEGIN
wire [`NUM_THREADS-1:0] thread_mask;
`DEBUG_END
// 32-bit program counter value
wire [31:0] curr_PC;
// operation selector, BRU_BITS wide
wire [`BRU_BITS-1:0] op;
// single-bit flag; by its name it selects the PC as the first operand (TODO confirm)
wire rs1_is_PC;
// operands are single 32-bit values here, not per-thread arrays
wire [31:0] rs1_data;
wire [31:0] rs2_data;
// 32-bit offset; presumably the branch target offset, confirm against the consumer
wire [31:0] offset;
wire ready;
endinterface
`endif

View file

@ -1,36 +0,0 @@
// Include guard: the interface is declared once per compilation unit.
`ifndef VX_CMT_TO_ISSUE_IF
`define VX_CMT_TO_ISSUE_IF
`include "VX_define.vh"
// Signal bundle named "commit to issue". It carries three parallel groups of
// signals (valid flag, tag, data), each with one member per unit prefix:
// alu, bru, lsu, csr, mul, fpu and gpu. No modports or ready signals are
// declared, so direction and back-pressure are not expressed here.
interface VX_cmt_to_issue_if ();
// one valid flag per unit
wire alu_valid;
wire bru_valid;
wire lsu_valid;
wire csr_valid;
wire mul_valid;
wire fpu_valid;
wire gpu_valid;
// one ISTAG_BITS-wide tag per unit
wire [`ISTAG_BITS-1:0] alu_tag;
wire [`ISTAG_BITS-1:0] bru_tag;
wire [`ISTAG_BITS-1:0] lsu_tag;
wire [`ISTAG_BITS-1:0] csr_tag;
wire [`ISTAG_BITS-1:0] mul_tag;
wire [`ISTAG_BITS-1:0] fpu_tag;
wire [`ISTAG_BITS-1:0] gpu_tag;
// one issue_data_t record per unit; the type is declared outside this file.
// The declarations sit between the IGNORE_WARNINGS_BEGIN/END macros,
// presumably to silence lint on fields that go unused; confirm in VX_define.vh
`IGNORE_WARNINGS_BEGIN
issue_data_t alu_data;
issue_data_t bru_data;
issue_data_t lsu_data;
issue_data_t csr_data;
issue_data_t mul_data;
issue_data_t fpu_data;
issue_data_t gpu_data;
`IGNORE_WARNINGS_END
endinterface
`endif

View file

@ -6,18 +6,13 @@
interface VX_csr_req_if ();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`NW_BITS-1:0] wid;
`DEBUG_BEGIN
wire [`NUM_THREADS-1:0] thread_mask;
`DEBUG_END
wire [31:0] curr_PC;
wire [31:0] curr_PC;
wire [`CSR_BITS-1:0] op;
wire [`CSR_ADDR_BITS-1:0] csr_addr;
wire [31:0] csr_mask;
wire [`NR_BITS-1:0] rd;
wire wb;
wire is_io;
@ -26,4 +21,4 @@ interface VX_csr_req_if ();
endinterface
`endif
`endif

View file

@ -1,15 +0,0 @@
// Include guard: the interface is declared once per compilation unit.
`ifndef VX_CSR_RSP_IF
`define VX_CSR_RSP_IF
`include "VX_define.vh"
// Signal bundle for a CSR response. All members are plain wires with no
// modports, so signal direction is not fixed by this interface.
interface VX_csr_rsp_if ();
// presumably forms a valid/ready handshake with "ready" below; confirm against users
wire valid;
// ISTAG_BITS-wide tag carried with the response
wire [`ISTAG_BITS-1:0] issue_tag;
// one 32-bit data word per thread (NUM_THREADS entries)
wire [`NUM_THREADS-1:0][31:0] data;
wire ready;
endinterface
`endif

View file

@ -1,5 +1,5 @@
`ifndef VX_CSR_TO_FPU_IF
`define VX_CSR_TO_FPU_IF
`ifndef VX_CSR_TO_ISSUE_IF
`define VX_CSR_TO_ISSUE_IF
`include "VX_define.vh"
@ -7,7 +7,7 @@
`IGNORE_WARNINGS_BEGIN
`endif
interface VX_csr_to_fpu_if ();
interface VX_csr_to_issue_if ();
wire [`NW_BITS-1:0] wid;
wire [`FRM_BITS-1:0] frm;

View file

@ -6,29 +6,26 @@
interface VX_decode_if ();
wire valid;
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
wire [`EX_BITS-1:0] ex_type;
wire [`OP_BITS-1:0] ex_op;
wire [`FRM_BITS-1:0] frm;
wire wb;
wire [`NR_BITS-1:0] rd;
wire [`NR_BITS-1:0] rs1;
wire [`NR_BITS-1:0] rs2;
wire [`NR_BITS-1:0] rs3;
wire [31:0] imm;
wire rs1_is_PC;
wire rs2_is_imm;
wire [`NUM_REGS-1:0] reg_use_mask;
// FP states
wire [`NR_BITS-1:0] rs3;
wire rs2_is_imm;
wire use_rs3;
wire [`FRM_BITS-1:0] frm;
wire wb;
wire [`NUM_REGS-1:0] used_regs;
wire ready;

View file

@ -5,9 +5,14 @@
interface VX_exu_to_cmt_if ();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`NUM_THREADS-1:0][31:0] data;
wire valid;
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
wire [`NUM_THREADS-1:0][31:0] data;
wire [`NR_BITS-1:0] rd;
wire wb;
wire ready;
endinterface

View file

@ -10,20 +10,18 @@
interface VX_fpu_req_if ();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`NW_BITS-1:0] wid;
`DEBUG_BEGIN
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
`DEBUG_END
wire [`FPU_BITS-1:0] op;
wire [`FRM_BITS-1:0] frm;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data;
wire [`NR_BITS-1:0] rd;
wire wb;
wire ready;
endinterface

View file

@ -5,11 +5,16 @@
interface VX_fpu_to_cmt_if ();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`NUM_THREADS-1:0][31:0] data;
wire has_fflags;
fflags_t [`NUM_THREADS-1:0] fflags;
wire valid;
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
wire [`NUM_THREADS-1:0][31:0] data;
wire [`NR_BITS-1:0] rd;
wire wb;
wire has_fflags;
fflags_t [`NUM_THREADS-1:0] fflags;
wire ready;
endinterface

View file

@ -19,7 +19,8 @@ interface VX_gpr_read_if ();
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NUM_THREADS-1:0][31:0] rs3_data;
wire ready;
wire ready_in;
wire ready_out;
endinterface

View file

@ -6,15 +6,15 @@
interface VX_gpu_req_if();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
wire [`GPU_BITS-1:0] op;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [31:0] rs2_data;
wire [`NR_BITS-1:0] rd;
wire wb;
wire ready;

View file

@ -1,39 +0,0 @@
`ifndef VX_ISSUE_IF
`define VX_ISSUE_IF
`include "VX_define.vh"
// Issue interface: a plain bundle of wires (no modports) carrying one
// instruction together with its per-thread operand values.
// NOTE(review): the field meanings below are inferred from names and widths
// only; confirm against the modules that drive and consume this interface.
interface VX_issue_if ();
wire valid; // qualifies the other signals
wire [`ISTAG_BITS-1:0] issue_tag; // `ISTAG_BITS-wide tag; presumably identifies the issued instruction
wire [`NW_BITS-1:0] wid; // `NW_BITS wide; presumably the warp id
wire [`NUM_THREADS-1:0] thread_mask; // one bit per thread
wire [31:0] curr_PC; // 32-bit value; presumably the instruction's program counter
wire [`EX_BITS-1:0] ex_type; // presumably selects the execution unit
wire [`OP_BITS-1:0] ex_op; // presumably the operation within that unit
wire [`FRM_BITS-1:0] frm; // presumably the floating-point rounding mode
wire wb; // presumably set when the instruction writes a register back
wire [`NR_BITS-1:0] rd; // `NR_BITS wide; presumably the destination register index
wire [`NUM_THREADS-1:0][31:0] rs1_data; // one 32-bit word per thread
wire [`NUM_THREADS-1:0][31:0] rs2_data; // one 32-bit word per thread
wire [`NUM_THREADS-1:0][31:0] rs3_data; // one 32-bit word per thread
wire [`NR_BITS-1:0] rs1; // `NR_BITS wide; presumably the first source register index
wire [31:0] imm; // 32-bit immediate
wire rs1_is_PC; // presumably: use curr_PC in place of rs1_data
wire rs2_is_imm; // presumably: use imm in place of rs2_data
wire [`NT_BITS-1:0] tid; // `NT_BITS wide; presumably a thread index
wire ready; // presumably back-pressure from the consumer
endinterface
`endif

View file

@ -6,9 +6,9 @@
interface VX_lsu_req_if ();
wire valid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [`ISTAG_BITS-1:0] issue_tag;
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
wire rw;

View file

@ -10,16 +10,15 @@
interface VX_mul_req_if ();
wire valid;
wire [`ISTAG_BITS-1:0] issue_tag;
`DEBUG_BEGIN
wire [`NW_BITS-1:0] wid;
wire [`NUM_THREADS-1:0] thread_mask;
wire [31:0] curr_PC;
`DEBUG_END
wire [`MUL_BITS-1:0] op;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [`NUM_THREADS-1:0][31:0] rs2_data;
wire [`NR_BITS-1:0] rd;
wire wb;
wire ready;

View file

@ -5,6 +5,7 @@
interface VX_warp_ctl_if ();
wire valid;
wire [`NW_BITS-1:0] wid;
gpu_tmc_t tmc;

View file

@ -1,9 +1,9 @@
`ifndef VX_WB_IF
`define VX_WB_IF
`ifndef VX_WRITEBACK_IF
`define VX_WRITEBACK_IF
`include "VX_define.vh"
interface VX_wb_if ();
interface VX_writeback_if ();
wire valid;
wire [`NUM_THREADS-1:0] thread_mask;
@ -16,6 +16,8 @@ interface VX_wb_if ();
wire [`NR_BITS-1:0] rd;
wire [`NUM_THREADS-1:0][31:0] data;
wire ready;
endinterface
`endif

View file

@ -1,19 +1,21 @@
`include "VX_platform.vh"
module VX_cam_buffer #(
parameter DATAW = 1,
parameter SIZE = 1,
parameter RPORTS = 1,
parameter DATAW = 1,
parameter SIZE = 1,
parameter RPORTS = 1,
parameter CPORTS = 1,
parameter ADDRW = `LOG2UP(SIZE)
) (
input wire clk,
input wire reset,
input wire [DATAW-1:0] write_data,
output wire [ADDRW-1:0] write_addr,
input wire [DATAW-1:0] write_data,
input wire acquire_slot,
input wire [RPORTS-1:0][ADDRW-1:0] read_addr,
output reg [RPORTS-1:0][DATAW-1:0] read_data,
input wire [RPORTS-1:0] release_slot,
input wire [CPORTS-1:0][ADDRW-1:0] release_addr,
input wire [CPORTS-1:0] release_slot,
output wire full
);
reg [DATAW-1:0] entries [SIZE-1:0];
@ -34,12 +36,11 @@ module VX_cam_buffer #(
always @(*) begin
free_slots_n = free_slots;
for (integer i = 0; i < RPORTS; i++) begin
for (integer i = 0; i < CPORTS; i++) begin
if (release_slot[i]) begin
free_slots_n[read_addr[i]] = 1;
end
read_data[i] = entries[read_addr[i]];
end
free_slots_n[release_addr[i]] = 1;
end
end
if (acquire_slot) begin
free_slots_n[write_addr_r] = 0;
end
@ -55,15 +56,19 @@ module VX_cam_buffer #(
assert(1 == free_slots[write_addr]) else $display("%t: inused slot at port %d", $time, write_addr);
entries[write_addr] <= write_data;
end
for (integer i = 0; i < RPORTS; i++) begin
for (integer i = 0; i < CPORTS; i++) begin
if (release_slot[i]) begin
assert(0 == free_slots[read_addr[i]]) else $display("%t: freed slot at port %d", $time, read_addr[i]);
assert(0 == free_slots[release_addr[i]]) else $display("%t: freed slot at port %d", $time, release_addr[i]);
end
end
free_slots <= free_slots_n;
write_addr_r <= free_index;
full_r <= ~free_valid;
end
end
for (genvar i = 0; i < RPORTS; i++) begin
assign read_data[i] = entries[read_addr[i]];
end
assign write_addr = write_addr_r;

View file

@ -14,53 +14,25 @@ module VX_elastic_buffer #(
input wire ready_out,
output wire valid_out
);
if (0 == SIZE) begin
wire empty, full;
reg [DATAW-1:0] skid_buffer;
reg skid_valid;
VX_generic_queue #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (BUFFERED)
) queue (
.clk (clk),
.reset (reset),
.push (valid_in),
.pop (ready_out),
.data_in(data_in),
.data_out(data_out),
.empty (empty),
.full (full),
`UNUSED_PIN (size)
);
always @(posedge clk) begin
if (reset) begin
skid_valid <= 0;
end else begin
if (valid_in && ~ready_out) begin
assert(~skid_valid);
skid_buffer <= data_in;
skid_valid <= 1;
end
if (ready_out) begin
skid_valid <= 0;
end
end
end
assign ready_in = ready_out || ~skid_valid;
assign data_out = skid_valid ? skid_buffer : data_in;
assign valid_out = valid_in || skid_valid;
end else begin
wire empty, full;
VX_generic_queue #(
.DATAW (DATAW),
.SIZE (SIZE),
.BUFFERED (BUFFERED)
) queue (
.clk (clk),
.reset (reset),
.push (valid_in),
.pop (ready_out),
.data_in(data_in),
.data_out(data_out),
.empty (empty),
.full (full),
`UNUSED_PIN (size)
);
assign ready_in = ~full;
assign valid_out = ~empty;
end
assign ready_in = ~full;
assign valid_out = ~empty;
endmodule

View file

@ -70,7 +70,6 @@ module VX_generic_queue #(
if (writing) begin
data[wr_ptr_a] <= data_in;
wr_ptr_r <= wr_ptr_r + 1;
if (!reading) begin
size_r <= size_r + 1;
end

View file

@ -36,14 +36,14 @@ module VX_rr_arbiter #(
end
end
grant_onehot_r = N'(0);
grant_onehot_r[grant_index] = 1;
grant_onehot_r[grant_table[state]] = 1;
end
always @(posedge clk) begin
if (reset) begin
state <= 0;
end else begin
state <= grant_index;
state <= grant_table[state];
end
end

View file

@ -0,0 +1,65 @@
`include "VX_platform.vh"

// Skid buffer with a registered output stage.
//
// valid_out/data_out are registers, and a single side register (`buffer`)
// captures an incoming word when the output register is occupied and the
// consumer is stalling. ready_in is deasserted only while that side register
// holds a word (`use_buffer`).
//
// Parameters:
//   DATAW     - width in bits of data_in / data_out.
//
// Ports:
//   clk       - clock; all state updates on its rising edge.
//   reset     - synchronous reset; clears use_buffer and valid_out
//               (data_out and buffer are not reset).
//   valid_in  - input word is valid.
//   ready_in  - this buffer can accept a word (combinational, = !use_buffer).
//   data_in   - input word.
//   data_out  - registered output word.
//   ready_out - consumer accepts the current output word.
//   valid_out - registered output-valid flag.
module VX_skid_buffer #(
    parameter DATAW = 1
) (
    input  wire             clk,
    input  wire             reset,
    input  wire             valid_in,
    // Driven by a continuous assignment below, so it is declared as a net
    // rather than a reg (a reg target of `assign` is rejected by
    // Verilog-2001 tools and flagged by linters).
    output wire             ready_in,
    input  wire [DATAW-1:0] data_in,
    output reg  [DATAW-1:0] data_out,
    input  wire             ready_out,
    output reg              valid_out
);
    reg [DATAW-1:0] buffer;     // side register holding the "skidded" word
    reg             use_buffer; // set while `buffer` holds a pending word

    // A word is accepted from the producer this cycle.
    wire push = valid_in && ready_in;

    always @(posedge clk) begin
        if (reset) begin
            use_buffer <= 0;
            valid_out  <= 0;
        end else begin
            // Output register is full and stalled: park the incoming word.
            if (push && (valid_out && !ready_out)) begin
                assert(!use_buffer);
                use_buffer <= 1;
            end
            // Consumer advanced: the parked word (if any) moves to the
            // output register below, so the side register becomes free.
            if (ready_out) begin
                use_buffer <= 0;
            end
            if (push) begin
                buffer <= data_in;
            end
            // Output register is empty or being drained: refill it, giving
            // priority to the parked word to preserve ordering.
            if (!valid_out || ready_out) begin
                valid_out <= valid_in || use_buffer;
                data_out  <= use_buffer ? buffer : data_in;
            end
        end
    end

    assign ready_in = !use_buffer;

    // Alternative queue-based implementation, kept disabled.
    /*wire empty, full;
    VX_generic_queue #(
        .DATAW (DATAW),
        .SIZE (2),
        .BUFFERED (0)
    ) queue (
        .clk (clk),
        .reset (reset),
        .push (valid_in),
        .pop (ready_out),
        .data_in(data_in),
        .data_out(data_out),
        .empty (empty),
        .full (full),
        `UNUSED_PIN (size)
    );
    assign ready_in = ~full;
    assign valid_out = ~empty;*/
endmodule

View file

@ -3,7 +3,7 @@
#include <fstream>
#include <iomanip>
#define ALL_TESTS
//#define ALL_TESTS
int main(int argc, char **argv) {
if (argc == 1) {

View file

@ -1,6 +1,6 @@
set_time_format -unit ns -decimal_places 3
create_clock -name {clk} -period "300 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
create_clock -name {clk} -period "240 MHz" -waveform { 0.0 1.0 } [get_ports {clk}]
derive_pll_clocks -create_base_clocks
derive_clock_uncertainty