Merge pull request #26 from CKeilbar/fpu-squashed

Add FPU (squashed)
This commit is contained in:
Mike Thompson 2024-04-02 22:37:02 -04:00 committed by GitHub
commit f0b92a923a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
122 changed files with 11241 additions and 4515 deletions

View file

@ -26,11 +26,11 @@ build-toolchain:
- mv -f taiga taiga-project/
- cd taiga-project
- source settings.sh
- git clone https://github.com/gcc-mirror/gcc.git --branch releases/gcc-10 --single-branch tool-chain/gcc
- git clone https://gitlab.com/sfu-rcl/taiga-picolibc.git --branch master --single-branch tool-chain/picolibc
- git clone https://github.com/gcc-mirror/gcc.git --branch releases/gcc-11 --single-branch tool-chain/gcc
- git clone https://gitlab.com/sfu-rcl/taiga-picolibc.git --branch main --single-branch tool-chain/picolibc
- git clone http://git.veripool.org/git/verilator --branch master --single-branch tool-chain/verilator
- git clone https://sourceware.org/git/binutils-gdb.git --branch binutils-2_35-branch --single-branch tool-chain/binutils-gdb
- git clone https://sourceware.org/git/newlib-cygwin.git --branch master --single-branch tool-chain/newlib-cygwin
- git clone https://github.com/bminor/binutils-gdb.git --branch binutils-2_36-branch --single-branch tool-chain/binutils-gdb
- git clone https://github.com/mirror/newlib-cygwin.git --branch master --single-branch tool-chain/newlib-cygwin
- git clone https://gitlab.com/sfu-rcl/taiga-embench.git --branch taiga-picolibc --single-branch benchmarks/embench
- git clone https://gitlab.com/sfu-rcl/taiga-riscv-compliance.git --branch taiga-sim --single-branch benchmarks/riscv-compliance
- git clone https://gitlab.com/sfu-rcl/taiga-dhrystone.git --branch master --single-branch benchmarks/taiga-dhrystone

View file

@ -1,6 +1,6 @@
# CVA5
CVA5 is a 32-bit RISC-V processor designed for FPGAs supporting the Multiply/Divide and Atomic extensions (RV32IMA). The processor is written in SystemVerilog and has been designed to be both highly extensible and highly configurable.
CVA5 is a 32-bit RISC-V processor designed for FPGAs supporting the Multiply/Divide and Double-precision Floating-Point extensions (RV32IMD). The processor is written in SystemVerilog and has been designed to be both highly extensible and highly configurable.
The CVA5 is derived from the Taiga Project from Simon Fraser University.

View file

@ -1,93 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//ALU execution unit.
//Purely combinational: issue.ready is tied high and wb.done mirrors
//issue.possible_issue, so a result is presented in the cycle of issue.
//clk and rst are part of the common unit port list but are not used here.
//
//Ports:
//  issue      - unit issue interface (ready/possible_issue/id are used)
//  alu_inputs - pre-decoded operands and operation selects
//  wb         - writeback interface (rd/done/id are driven)
module alu_unit

    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;

    (
        input logic clk,
        input logic rst,

        unit_issue_interface.unit issue,
        input alu_inputs_t alu_inputs,
        unit_writeback_interface.unit wb
    );

    //Adder operands/result carry one extra bit (XLEN+1 bits wide); bit XLEN of
    //the sum is what ALU_SLT returns.
    //NOTE(review): presumably in1/in2 arrive already sign/zero extended to
    //XLEN+1 bits by the decode stage -- confirm against alu_inputs_t.
    logic[XLEN:0] add_sub_result;
    logic add_sub_carry_in; //discarded LSB of the widened addition below
    logic[XLEN:0] adder_in1;
    logic[XLEN:0] adder_in2;
    logic[XLEN-1:0] shift_result;
    logic[XLEN-1:0] result;

    ////////////////////////////////////////////////////
    //Implementation

    //Logic ops are put through the adder carry chain to reduce resources:
    //for XOR/OR/AND the logic result is the first operand and the second
    //operand is zero; otherwise (ADD/SUB/SLT/SLTU) the operands pass through,
    //with in2 bitwise inverted when subtracting.
    always_comb begin
        case (alu_inputs.logic_op)
            ALU_LOGIC_XOR : begin
                adder_in1 = alu_inputs.in1 ^ alu_inputs.in2;
                adder_in2 = 0;
            end
            ALU_LOGIC_OR : begin
                adder_in1 = alu_inputs.in1 | alu_inputs.in2;
                adder_in2 = 0;
            end
            ALU_LOGIC_AND : begin
                adder_in1 = alu_inputs.in1 & alu_inputs.in2;
                adder_in2 = 0;
            end
            default : begin //ADD/SUB/SLT/SLTU
                adder_in1 = alu_inputs.in1;
                //Width follows the adder declaration instead of a literal 33
                adder_in2 = alu_inputs.in2 ^ {(XLEN+1){alu_inputs.subtract}};
            end
        endcase
    end

    //Add/Sub ops
    //An extra LSB is appended to both operands: 1 + subtract generates the
    //carry-in of one needed to complete the two's complement of in2 when
    //subtracting. That LSB of the sum is dropped into add_sub_carry_in.
    assign {add_sub_result, add_sub_carry_in} = {adder_in1, 1'b1} + {adder_in2, alu_inputs.subtract};

    //Shift ops
    barrel_shifter shifter (
        .shifter_input(alu_inputs.shifter_in),
        .shift_amount(alu_inputs.shift_amount),
        .arith(alu_inputs.arith),
        .lshift(alu_inputs.lshift),
        .shifted_result(shift_result)
    );

    //Final result selection
    always_comb begin
        case (alu_inputs.alu_op)
            ALU_CONSTANT : result = alu_inputs.constant_adder;
            ALU_ADD_SUB : result = add_sub_result[XLEN-1:0];
            //Set-less-than: top adder bit, zero extended to XLEN bits
            ALU_SLT : result = {{(XLEN-1){1'b0}}, add_sub_result[XLEN]};
            default : result = shift_result; //ALU_SHIFT
        endcase
    end

    ////////////////////////////////////////////////////
    //Output
    assign issue.ready = 1;
    assign wb.rd = result;
    assign wb.done = issue.possible_issue;
    assign wb.id = issue.id;

    ////////////////////////////////////////////////////
    //Assertions

endmodule

View file

@ -1,115 +0,0 @@
/*
* Copyright © 2017-2019 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Occupancy tracker for a queue of DEPTH entries.
//Maintains a binary entry count plus a registered "valid" (non-empty) flag,
//updated from the push/pop strobes. A simultaneous push and pop leaves the
//state unchanged.
//
//NOTE(review): full, almost_full and almost_empty are declared as outputs but
//are not driven anywhere in this module (their generation logic is disabled
//below), yet the overflow assertion still reads "full". Confirm whether any
//instantiation relies on these outputs.
module binary_occupancy

    import cva5_config::*;
    import cva5_types::*;

    #(parameter DEPTH = 4)
    (
        input logic clk,
        input logic rst,
        input logic push,
        input logic pop,
        output logic almost_full,
        output logic full,
        output logic empty,
        output logic almost_empty,
        output logic valid
    );

    logic[$clog2(DEPTH)-1:0] count;

    ////////////////////////////////////////////////////
    //Occupancy Tracking
    //count and valid share a single process; only an exclusive push or an
    //exclusive pop changes state.
    always_ff @ (posedge clk) begin
        if (rst) begin
            count <= 0;
            valid <= 0;
        end
        else if (push & ~pop) begin
            count <= count + 1;
            valid <= 1;
        end
        else if (pop & ~push) begin
            count <= count - 1;
            //Removing the last remaining entry clears valid
            valid <= !(count == 1);
        end
    end

    //Disabled status-flag generation, retained for reference:
    // always_ff @ (posedge clk) begin
    //     if (rst)
    //         full <= 0;
    //     else begin
    //         case ({push, pop})
    //             2'b10: full <= (count == DEPTH-2);
    //             2'b01: full <= 0;
    //             default : full <= full;
    //         endcase
    //     end
    // end

    // always_ff @ (posedge clk) begin
    //     if (rst)
    //         almost_full <= 0;
    //     else begin
    //         case ({push, pop})
    //             2'b10: almost_full <= (count == DEPTH-3);
    //             2'b01: almost_full <= (count == DEPTH-1);
    //             default : almost_full <= almost_full;
    //         endcase
    //     end
    // end

    // always_ff @ (posedge clk) begin
    //     if (rst)
    //         almost_empty <= 0;
    //     else begin
    //         case ({push, pop})
    //             2'b10: almost_empty <=(count == 0);
    //             2'b01: almost_empty <= (count == 2);
    //             default : almost_empty <= almost_empty;
    //         endcase
    //     end
    // end

    //Empty is simply the complement of the registered valid flag
    assign empty = ~valid;

    ////////////////////////////////////////////////////
    //Assertions
    always_ff @ (posedge clk) begin
        assert (!(~rst & full & push)) else $error("overflow");
        assert (!(~rst & empty & pop)) else $error("underflow");
    end

endmodule

View file

@ -1,64 +0,0 @@
/*
* Copyright © 2019 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Simple dual-port memory for the branch predictor: one synchronous write
//port and one synchronous read port with independent addresses and enables.
//The array is zero-initialised and tagged for block RAM implementation.
//A read of the address being written in the same cycle returns the old
//contents (non-blocking assignments). rst is not used by this module.
//
//Parameters:
//  C_DATA_WIDTH - width of each entry in bits
//  C_DEPTH      - number of entries
module branch_predictor_ram

    import cva5_config::*;
    import cva5_types::*;

    #(
        parameter C_DATA_WIDTH = 20,
        parameter C_DEPTH = 512
    )
    (
        input logic clk,
        input logic rst,
        input logic [$clog2(C_DEPTH)-1:0] write_addr,
        input logic write_en,
        input logic [$clog2(C_DEPTH)-1:0] read_addr,
        input logic read_en,
        input logic [C_DATA_WIDTH-1:0] write_data,
        output logic [C_DATA_WIDTH-1:0] read_data
    );

    (* ram_style = "block" *)logic [C_DATA_WIDTH-1:0] branch_ram [C_DEPTH-1:0];

    ////////////////////////////////////////////////////
    //Implementation
    initial branch_ram = '{default: 0};

    //Both ports are clocked together; each acts only when its enable is high.
    //read_data holds its previous value while read_en is low.
    always_ff @(posedge clk) begin
        if (write_en)
            branch_ram[write_addr] <= write_data;
        if (read_en)
            read_data <= branch_ram[read_addr];
    end

    ////////////////////////////////////////////////////
    //End of Implementation
    ////////////////////////////////////////////////////

    ////////////////////////////////////////////////////
    //Assertions

    ////////////////////////////////////////////////////
    //Trace Interface

endmodule

View file

@ -1,174 +0,0 @@
/*
* Copyright © 2017-2019 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Branch/jump execution unit.
//Resolves the branch condition and target in the issue cycle, registers the
//outcome, and once the following instruction's PC is available compares it
//against the resolved target to report the result to the predictor
//(br_results) and request a flush (branch_flush) on a mismatch.
//When CONFIG.INCLUDE_M_MODE is set it also raises an instruction address
//misaligned exception for taken branches whose target has bit 1 set.
//
//Ports:
//  issue         - unit issue interface (new_request/possible_issue/id read, ready driven)
//  branch_inputs - decoded operands, PCs and control bits for the branch
//  br_results    - resolved branch information for the predictor
//  branch_flush  - asserted when the next issued PC differs from the resolved target
//  exception     - exception interface, only driven inside the INCLUDE_M_MODE generate
//  tr_*          - trace outputs, only driven when ENABLE_TRACE_INTERFACE is set
module branch_unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
unit_issue_interface.unit issue,
input branch_inputs_t branch_inputs,
output branch_results_t br_results,
output logic branch_flush,
exception_interface.unit exception,
//Trace signals
output logic tr_branch_correct,
output logic tr_branch_misspredict,
output logic tr_return_correct,
output logic tr_return_misspredict
);
//High from the cycle after a branch issues until it has been checked or an exception is raised
logic branch_issued_r;
//Output of the branch comparator
logic result;
//Branch Predictor
logic branch_taken;
logic branch_taken_ex;
id_t id_ex;
logic [31:0] jump_pc;
logic [31:0] new_pc;
logic [31:0] new_pc_ex;
logic [31:0] pc_ex;
logic instruction_is_completing;
//NOTE(review): branch_complete is declared but not referenced in this module
logic branch_complete;
logic jal_jalr_ex;
////////////////////////////////////////////////////
//Implementation
//Only stall condition is if the following instruction is not valid for pc comparisons.
//If the next instruction isn't valid, no instruction can be issued anyways, so it
//is safe to hardcode this to one.
assign issue.ready = 1;
//Branch new request is held if the following instruction hasn't arrived at decode/issue yet
//NOTE(review): presumably SET_OVER_CLR(1) gives set priority over clr when both
//are asserted -- confirm against set_clr_reg_with_rst
set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE(0)) branch_issued_m (
.clk, .rst,
.set(issue.new_request),
.clr(branch_inputs.issue_pc_valid | exception.valid),
.result(branch_issued_r)
);
//To determine if the branch was predicted correctly we need to wait until the
//subsequent instruction has reached the issue stage
assign instruction_is_completing = branch_issued_r & branch_inputs.issue_pc_valid;
////////////////////////////////////////////////////
//Branch/Jump target determination
//Branch comparison and final address calculation
//are performed in the issue stage
//fn3[2] and fn3[0] configure the comparator (less-than select and result inversion inputs)
branch_comparator bc (
.less_than(branch_inputs.fn3[2]),
.a(branch_inputs.rs1),
.b(branch_inputs.rs2),
.xor_result(branch_inputs.fn3[0]),
.result(result)
);
//Jumps (jal/jalr) are always taken; conditional branches follow the comparator
assign branch_taken = result | branch_inputs.jal_jalr;
//Target base is rs1 for jalr, otherwise the PC of the branch; the offset is sign extended to 32 bits
assign jump_pc = (branch_inputs.jalr ? branch_inputs.rs1[31:0] : branch_inputs.issue_pc) + 32'(signed'(branch_inputs.pc_offset));
//Not-taken branches continue at pc_p4
assign new_pc = branch_taken ? jump_pc : branch_inputs.pc_p4;
//Register the resolved outcome when the branch issues
always_ff @(posedge clk) begin
if (issue.new_request) begin
branch_taken_ex <= branch_taken;
//Bit 0 of the target is forced to zero for jalr
new_pc_ex <= {new_pc[31:1], new_pc[0] & ~branch_inputs.jalr};
id_ex <= issue.id;
jal_jalr_ex <= branch_inputs.jal_jalr;
end
end
////////////////////////////////////////////////////
//Exception support
//Without INCLUDE_M_MODE none of the exception interface signals are driven by this module
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_branch_exception
logic new_exception;
//A taken branch whose target has bit 1 set is not 4-byte aligned
assign new_exception = new_pc[1] & branch_taken & issue.new_request;
//valid is held until acknowledged; a new exception sets it in the same expression
always_ff @(posedge clk) begin
if (rst)
exception.valid <= 0;
else
exception.valid <= (exception.valid & ~exception.ack) | new_exception;
end
always_ff @(posedge clk) begin
if (issue.new_request)
exception.id <= issue.id;
end
assign exception.code = INST_ADDR_MISSALIGNED;
//Reported trap value is the registered target address
assign exception.tval = new_pc_ex;
end
endgenerate
////////////////////////////////////////////////////
//Predictor support
logic is_return;
logic is_call;
//NOTE(review): these are captured on issue.possible_issue while the other
//registered branch state above is captured on issue.new_request -- confirm this is intentional
always_ff @(posedge clk) begin
if (issue.possible_issue) begin
is_return <= branch_inputs.is_return;
is_call <= branch_inputs.is_call;
pc_ex <= branch_inputs.issue_pc;
end
end
assign br_results.id = id_ex;
assign br_results.valid = instruction_is_completing;
assign br_results.pc = pc_ex;
assign br_results.target_pc = new_pc_ex;
assign br_results.branch_taken = branch_taken_ex;
assign br_results.is_branch = ~jal_jalr_ex;
assign br_results.is_return = is_return;
assign br_results.is_call = is_call;
//Flush when the PC now at issue differs from the resolved target (bit 0 ignored)
assign branch_flush = instruction_is_completing && (branch_inputs.issue_pc[31:1] != new_pc_ex[31:1]);
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
////////////////////////////////////////////////////
//Trace Interface
//Classifies each completing branch as return/non-return and correct/mispredicted.
//The tr_* outputs are not driven when ENABLE_TRACE_INTERFACE is false.
generate if (ENABLE_TRACE_INTERFACE) begin
assign tr_branch_correct = instruction_is_completing & ~is_return & ~branch_flush;
assign tr_branch_misspredict = instruction_is_completing & ~is_return & branch_flush;
assign tr_return_correct = instruction_is_completing & is_return & ~branch_flush;
assign tr_return_misspredict = instruction_is_completing & is_return & branch_flush;
end
endgenerate
endmodule

View file

@ -1,64 +0,0 @@
/*
* Copyright © 2018 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//32-bit count-leading-zeros.
//The input is split into eight 4-bit nibbles, numbered here from the most
//significant end (nibble 0 = bits 31:28, nibble 7 = bits 3:0).
//  - clz[4:2] are derived directly from which nibbles are all zero
//  - clz[1:0] come from a per-nibble lookup, muxed first between neighbouring
//    nibbles and then 4-to-1 using clz[4:3]
//An all-zero input produces 31 (the 5-bit output cannot represent 32).
module clz
    (
        input logic [31:0] clz_input,
        output logic [4:0] clz
    );

    logic [7:0] nibble_is_zero;     //one flag per nibble, index 0 = bits 31:28
    logic [1:0] nibble_low_bits [7:0]; //low two count bits if that nibble holds the leading one
    logic [1:0] pair_low_bits [3:0];   //low bits selected within each nibble pair

    //Indexed by the upper three bits of a nibble:
    //000 -> 3, 001 -> 2, 01x -> 1, 1xx -> 0
    const logic [1:0] low_bits_lut [8] = '{2'd3, 2'd2, 2'd1, 2'd1, 2'd0, 2'd0, 2'd0, 2'd0};

    //Terms of clz[2]: the leading non-zero nibble has an odd index
    //(or nibbles 0..6 are all zero)
    logic lead_in_nibble1;
    logic lead_in_nibble3;
    logic lead_in_nibble5;
    logic lead_in_nibble7_or_none;

    always_comb begin
        //Per-nibble zero detect and low-order lookup, MSB nibble first
        for (int n = 0; n < 8; n++) begin
            nibble_is_zero[n] = ~|clz_input[(31 - 4*n) -: 4];
            nibble_low_bits[n] = low_bits_lut[clz_input[(31 - 4*n) -: 3]];
        end

        //Upper 16 bits all zero
        clz[4] = &nibble_is_zero[3:0];
        //Upper 24 bits zero, or first 8 bits zero
        clz[3] = clz[4] ? &nibble_is_zero[5:4] : &nibble_is_zero[1:0];

        lead_in_nibble1 = nibble_is_zero[0] & ~nibble_is_zero[1];
        lead_in_nibble3 = &nibble_is_zero[2:0] & ~nibble_is_zero[3];
        lead_in_nibble5 = &nibble_is_zero[4:0] & ~nibble_is_zero[5];
        lead_in_nibble7_or_none = &nibble_is_zero[6:0];
        clz[2] = lead_in_nibble1 | lead_in_nibble3 | lead_in_nibble5 | lead_in_nibble7_or_none;

        //Within each pair use the odd nibble when the even one is all zero
        for (int p = 0; p < 4; p++) begin
            pair_low_bits[p] = nibble_low_bits[{p[1:0], nibble_is_zero[2*p]}];
        end

        //Final 4-to-1 selection using the two highest count bits
        clz[1:0] = pair_low_bits[clz[4:3]];
    end

endmodule

View file

@ -22,7 +22,7 @@
module byte_en_BRAM
module byte_en_bram
import cva5_config::*;
import cva5_types::*;

View file

@ -0,0 +1,78 @@
/*
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Parameterised count-leading-zeros.
//
//Parameters:
//  WIDTH - input width in bits; must be at least 2
//
//Ports:
//  clz_input - value to examine
//  clz       - number of leading zeros; only meaningful when zero is low
//  zero      - high when clz_input is all zero
//
//Widths that are not a power of two are handled by padding the input with
//zeros on the right up to the next power of two.
module clz

    #(
        parameter WIDTH = 32
    )
    (
        input logic[WIDTH-1:0] clz_input,
        output logic[$clog2(WIDTH)-1:0] clz,
        output logic zero
    );

    //Based on "Design of Leading Zero Counters on FPGAs" by Perri et al. 2022 (which is optimized for 6-input LUTs)
    //It is possible to unroll this and implement it without recursion
    //However, this significantly hurts readability especially with regards to the clz signal

    localparam TREE_WIDTH = 2**$clog2(WIDTH);        //WIDTH rounded up to a power of two
    localparam TREE_CLZ_WIDTH = $clog2(WIDTH)-1;     //count width of each half-tree
    localparam HALF_TREE_WIDTH = TREE_WIDTH/2;
    localparam WIDTH_DIFFERENCE = TREE_WIDTH - WIDTH; //number of padding bits

    generate if (WIDTH < 2) begin : gen_unsupported_width
        //Without this guard HALF_TREE_WIDTH becomes 0 and the module would
        //instantiate itself without ever reaching the base case.
        $fatal(1, "clz: WIDTH must be at least 2");
    end
    else if (WIDTH == 2) begin : gen_base_case
        //Base case: two input bits
        assign zero = ~(clz_input[1] | clz_input[0]);
        assign clz[0] = ~clz_input[1] & clz_input[0];
    end
    else begin : gen_recursive
        logic[TREE_WIDTH-1:0] padded_input;
        if (WIDTH_DIFFERENCE != 0) //Pad input on right if width is not a power of 2
            assign padded_input = {clz_input, {WIDTH_DIFFERENCE{1'b0}}};
        else
            assign padded_input = clz_input;

        logic[TREE_CLZ_WIDTH-1:0] upper_clz;
        logic[TREE_CLZ_WIDTH-1:0] lower_clz;
        logic upper_zero;
        logic lower_zero;

        //Whole input is zero only if both halves are
        assign zero = upper_zero & lower_zero;
        //Top count bit: the leading one (if any) lies in the lower half
        assign clz[$clog2(WIDTH)-1] = upper_zero;

        clz #(.WIDTH(HALF_TREE_WIDTH)) upper_tree (
            .clz_input(padded_input[TREE_WIDTH-1:HALF_TREE_WIDTH]),
            .clz(upper_clz),
            .zero(upper_zero)
        );
        clz #(.WIDTH(HALF_TREE_WIDTH)) lower_tree (
            .clz_input(padded_input[HALF_TREE_WIDTH-1:0]),
            .clz(lower_clz),
            .zero(lower_zero)
        );

        //Combine tree outputs: remaining count bits come from the lower half
        //when the upper half is all zero, otherwise from the upper half
        for (genvar i = 0; i < TREE_CLZ_WIDTH; i++)
            assign clz[i] = (~upper_zero & upper_clz[i]) | (upper_zero & lower_clz[i]);
    end
    endgenerate

endmodule

View file

@ -32,7 +32,7 @@ module cva5_fifo
import cva5_types::*;
#(
parameter DATA_WIDTH = 70,
parameter type DATA_TYPE = logic,
parameter FIFO_DEPTH = 4
)
(
@ -63,7 +63,7 @@ module cva5_fifo
//connected as a shift reg for the same resources as a LUTRAM FIFO
//but with better timing
else if (FIFO_DEPTH == 2) begin : gen_width_two
logic [DATA_WIDTH-1:0] shift_reg [FIFO_DEPTH];
DATA_TYPE shift_reg [FIFO_DEPTH];
logic [LOG2_FIFO_DEPTH:0] inflight_count;
////////////////////////////////////////////////////
//Occupancy Tracking
@ -87,8 +87,6 @@ module cva5_fifo
assign fifo.data_out = shift_reg[~inflight_count[0]];
end
else begin : gen_width_3_plus
//Force FIFO depth to next power of 2
(* ramstyle = "MLAB, no_rw_check" *) logic [DATA_WIDTH-1:0] lut_ram [(2**LOG2_FIFO_DEPTH)];
logic [LOG2_FIFO_DEPTH-1:0] write_index;
logic [LOG2_FIFO_DEPTH-1:0] read_index;
logic [LOG2_FIFO_DEPTH:0] inflight_count;
@ -102,7 +100,7 @@ module cva5_fifo
end
assign fifo.valid = inflight_count[LOG2_FIFO_DEPTH];
assign fifo.full = fifo.valid & ~|inflight_count[LOG2_FIFO_DEPTH-1:0];
assign fifo.full = inflight_count == (LOG2_FIFO_DEPTH+1)'(-FIFO_DEPTH);
lfsr #(.WIDTH(LOG2_FIFO_DEPTH), .NEEDS_RESET(1))
lfsr_read_index (
@ -116,8 +114,8 @@ module cva5_fifo
.en(fifo.push),
.value(write_index)
);
lutram_1w_1r #(.WIDTH(DATA_WIDTH), .DEPTH(FIFO_DEPTH))
//Force FIFO depth to next power of 2
lutram_1w_1r #(.DATA_TYPE(DATA_TYPE), .DEPTH(2**LOG2_FIFO_DEPTH))
write_port (
.clk(clk),
.waddr(write_index),
@ -138,4 +136,4 @@ module cva5_fifo
fifo_underflow_assertion:
assert property (@(posedge clk) disable iff (rst) fifo.pop |-> fifo.valid) else $error("underflow");
endmodule
endmodule

View file

@ -1,5 +1,5 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
* Copyright © 2023 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -20,46 +20,51 @@
* Eric Matthews <ematthew@sfu.ca>
*/
module tag_bank
#(
module dual_port_bram
import cva5_config::*;
import cva5_types::*;
import riscv_types::*;
#(
parameter WIDTH = 32,
parameter LINES = 512
parameter LINES = 4096
)
(
input logic clk,
input logic rst,
input logic[$clog2(LINES)-1:0] addr_a,
input logic[$clog2(LINES)-1:0] addr_b,
input logic en_a,
input logic en_b,
input logic wen_a,
input logic[$clog2(LINES)-1:0] addr_a,
input logic[WIDTH-1:0] data_in_a,
output logic[WIDTH-1:0] data_out_a,
input logic en_b,
input logic wen_b,
input logic [WIDTH-1:0] data_in_a,
input logic [WIDTH-1:0] data_in_b,
output logic [WIDTH-1:0] data_out_a,
output logic [WIDTH-1:0] data_out_b
input logic[$clog2(LINES)-1:0] addr_b,
input logic[WIDTH-1:0] data_in_b,
output logic[WIDTH-1:0] data_out_b
);
(* ram_style = "block", ramstyle = "no_rw_check" *) logic [WIDTH-1:0] tag_entry [LINES];
initial tag_entry = '{default: 0};
(* ram_style = "block", ramstyle = "no_rw_check" *) logic [WIDTH-1:0] ram [LINES];
initial ram = '{default: 0};
always_ff @ (posedge clk) begin
if (en_a) begin
if (wen_a)
tag_entry[addr_a] <= data_in_a;
else
data_out_a <= tag_entry[addr_a];
ram[addr_a] <= data_in_a;
data_out_a <= ram[addr_a];
end
end
always_ff @ (posedge clk) begin
if (en_b) begin
if (wen_b)
tag_entry[addr_b] <= data_in_b;
else
data_out_b <= tag_entry[addr_b];
ram[addr_b] <= data_in_b;
data_out_b <= ram[addr_b];
end
end
endmodule
endmodule

View file

@ -22,7 +22,7 @@
module lutram_1w_1r
#(
parameter WIDTH = 32,
parameter type DATA_TYPE = logic,
parameter DEPTH = 32
)
(
@ -32,11 +32,11 @@ module lutram_1w_1r
input logic[$clog2(DEPTH)-1:0] raddr,
input logic ram_write,
input logic[WIDTH-1:0] new_ram_data,
output logic[WIDTH-1:0] ram_data_out
input DATA_TYPE new_ram_data,
output DATA_TYPE ram_data_out
);
(* ramstyle = "MLAB, no_rw_check", ram_style = "distributed" *) logic [WIDTH-1:0] ram [DEPTH-1:0];
(* ramstyle = "MLAB, no_rw_check", ram_style = "distributed" *) logic [$bits(DATA_TYPE)-1:0] ram [DEPTH-1:0];
initial ram = '{default: 0};
always_ff @ (posedge clk) begin

View file

@ -25,7 +25,7 @@ module lutram_1w_mr
import cva5_config::*;
#(
parameter WIDTH = 32,
parameter type DATA_TYPE = logic,
parameter DEPTH = 32,
parameter NUM_READ_PORTS = 2
)
@ -36,14 +36,14 @@ module lutram_1w_mr
input logic[$clog2(DEPTH)-1:0] raddr [NUM_READ_PORTS],
input logic ram_write,
input logic[WIDTH-1:0] new_ram_data,
output logic[WIDTH-1:0] ram_data_out [NUM_READ_PORTS]
input DATA_TYPE new_ram_data,
output DATA_TYPE ram_data_out [NUM_READ_PORTS]
);
//For Xilinx with their wider selection of LUTRAMs, infer a multi-read port LUTRAM
//For Intel, build the multi-read port ram from simple-dual-port LUTRAMs
generate if (FPGA_VENDOR == XILINX) begin : xilinx_gen
logic [WIDTH-1:0] ram [DEPTH-1:0];
logic [$bits(DATA_TYPE)-1:0] ram [DEPTH-1:0];
initial ram = '{default: 0};
always_ff @ (posedge clk) begin
@ -61,7 +61,7 @@ end
else if (FPGA_VENDOR == INTEL) begin : intel_gen
for (genvar i = 0; i < NUM_READ_PORTS; i++) begin : lutrams
lutram_1w_1r #(.WIDTH(WIDTH), .DEPTH(DEPTH))
lutram_1w_1r #(.DATA_TYPE(DATA_TYPE), .DEPTH(DEPTH))
write_port (
.clk(clk),
.waddr(waddr),

View file

@ -51,7 +51,7 @@ module toggle_memory
assign new_ram_data = toggle ^ _read_data[0];
lutram_1w_mr #(
.WIDTH(1),
.DATA_TYPE(logic),
.DEPTH(DEPTH),
.NUM_READ_PORTS(NUM_READ_PORTS+1)
)

View file

@ -28,9 +28,7 @@ module toggle_memory_set
# (
parameter DEPTH = 64,
parameter NUM_WRITE_PORTS = 3,
parameter NUM_READ_PORTS = 2,
parameter WRITE_INDEX_FOR_RESET = 0,
parameter READ_INDEX_FOR_RESET = 0
parameter NUM_READ_PORTS = 2
)
(
input logic clk,

View file

@ -84,7 +84,6 @@ module cva5_wrapper_xilinx
avalon_interface m_avalon ();
wishbone_interface dwishbone ();
wishbone_interface iwishbone ();
trace_outputs_t tr;
logic timer_interrupt;
logic interrupt;

View file

@ -1,732 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
module csr_unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
import csr_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
//Unit Interfaces
unit_issue_interface.unit issue,
input csr_inputs_t csr_inputs,
unit_writeback_interface.unit wb,
//Privilege
output logic [1:0] current_privilege,
//GC
input logic interrupt_taken,
output logic interrupt_pending,
output logic processing_csr,
//TLB and MMU
output logic tlb_on,
output logic [ASIDLEN-1:0] asid,
//MMUs
mmu_interface.csr immu,
mmu_interface.csr dmmu,
//CSR exception interface
input exception_packet_t exception,
output logic [31:0] exception_target_pc,
//exception return
input logic mret,
input logic sret,
output logic [31:0] epc,
//Retire
input retire_packet_t retire,
input id_t retire_ids [RETIRE_PORTS],
//External
input interrupt_t s_interrupt,
input interrupt_t m_interrupt
);
logic busy;
logic commit;
logic commit_in_progress;
csr_inputs_t csr_inputs_r;
privilege_t privilege_level;
privilege_t next_privilege_level;
//write_logic
logic supervisor_write;
logic machine_write;
logic [XLEN-1:0] selected_csr;
logic [XLEN-1:0] selected_csr_r;
logic [31:0] updated_csr;
logic swrite;
logic mwrite;
function logic mwrite_en (input csr_addr_t addr);
return mwrite & (csr_inputs_r.addr.sub_addr == addr.sub_addr);
endfunction
function logic swrite_en (input csr_addr_t addr);
return swrite & (csr_inputs_r.addr.sub_addr == addr.sub_addr);
endfunction
////////////////////////////////////////////////////
//Implementation
assign processing_csr = busy | issue.new_request;
assign issue.ready = ~busy;
always_ff @(posedge clk) begin
if (rst)
busy <= 0;
else
busy <= (busy & ~wb.ack) | issue.new_request;
end
always_ff @(posedge clk) begin
if (issue.new_request)
csr_inputs_r <= csr_inputs;
end
always_ff @(posedge clk) begin
if (rst)
commit_in_progress <= 0;
else
commit_in_progress <= (commit_in_progress & ~issue.new_request) | commit;
end
//Waits until CSR instruction is the oldest issued instruction
assign commit = (retire_ids[0] == wb.id) & busy & (~commit_in_progress);
////////////////////////////////////////////////////
//Output
always_ff @(posedge clk) begin
if (rst)
wb.done <= 0;
else
wb.done <= (wb.done & ~wb.ack) | commit;
end
always_ff @(posedge clk) begin
if (issue.new_request)
wb.id <= issue.id;
end
assign wb.rd = selected_csr_r;
////////////////////////////////////////////////////
//Shared logic
always_ff @(posedge clk) begin
mwrite <= CONFIG.INCLUDE_M_MODE && commit && (csr_inputs_r.addr.rw_bits != CSR_READ_ONLY && csr_inputs_r.addr.privilege == MACHINE_PRIVILEGE);
swrite <= CONFIG.INCLUDE_S_MODE && commit && (csr_inputs_r.addr.rw_bits != CSR_READ_ONLY && csr_inputs_r.addr.privilege == SUPERVISOR_PRIVILEGE);
end
always_ff @(posedge clk) begin
if (commit) begin
case (csr_inputs_r.op)
CSR_RW : updated_csr = csr_inputs_r.data;
CSR_RS : updated_csr = selected_csr | csr_inputs_r.data;
CSR_RC : updated_csr = selected_csr & ~csr_inputs_r.data;
default : updated_csr = csr_inputs_r.data;
endcase
end
end
////////////////////////////////////////////////////
//Machine Mode Registers
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Constant Registers
////////////////////////////////////////////////////
//Machine ISA register
const misa_t misa = '{default:0, mxlen:1, A:(CONFIG.INCLUDE_AMO), I:1, M:(CONFIG.INCLUDE_MUL && CONFIG.INCLUDE_DIV), S:(CONFIG.INCLUDE_S_MODE), U:(CONFIG.INCLUDE_U_MODE)};
////////////////////////////////////////////////////
//Machine Version Registers
const logic [XLEN-1:0] mvendorid = 0;
const logic [XLEN-1:0] marchid = 0;
const logic [XLEN-1:0] mimpid = CONFIG.CSRS.MACHINE_IMPLEMENTATION_ID;
const logic [XLEN-1:0] mhartid = CONFIG.CSRS.CPU_ID;
////////////////////////////////////////////////////
//MSTATUS
const logic [XLEN-1:0] mstatush = 0; //Always little endian
////////////////////////////////////////////////////
//Non-Constant Registers
mstatus_t mstatus;
logic[XLEN-1:0] mtvec;
logic[XLEN-1:0] medeleg;
logic[XLEN-1:0] mideleg;
mip_t mip, mip_mask, mip_w_mask, mip_new;
mie_t mie, mie_mask;
mip_t sip_mask;
mie_t sie_mask;
logic[XLEN-1:0] mepc;
logic[XLEN-1:0] mtimecmp;
mcause_t mcause;
logic[XLEN-1:0] mtval;
logic[XLEN-1:0] mscratch;
//Virtualization support: TSR, TW, TVM unused
//Extension context status: SD, FS, XS unused
const mstatus_t mstatus_mask =
'{default:0, mprv:(CONFIG.INCLUDE_U_MODE | CONFIG.INCLUDE_S_MODE), mxr:(CONFIG.INCLUDE_S_MODE),
sum:(CONFIG.INCLUDE_U_MODE & CONFIG.INCLUDE_S_MODE), mpp:'1, spp:(CONFIG.INCLUDE_S_MODE),
mpie:1, spie:(CONFIG.INCLUDE_S_MODE), mie:1, sie:(CONFIG.INCLUDE_S_MODE)};
const mstatus_t sstatus_mask = '{default:0, mxr:1, sum:1, spp:1, spie:1, sie:1};
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_csr_m_mode
privilege_t trap_return_privilege_level;
privilege_t exception_privilege_level;
privilege_t interrupt_privilege_level;
mstatus_t mstatus_exception;
mstatus_t mstatus_return;
mstatus_t mstatus_new;
logic [ECODE_W-1:0] interrupt_cause_r;
//Interrupt and Exception Delegation
//Can delegate to supervisor if currently in supervisor or user modes
always_comb begin
exception_privilege_level = MACHINE_PRIVILEGE;
interrupt_privilege_level = MACHINE_PRIVILEGE;
if (CONFIG.INCLUDE_S_MODE && privilege_level inside {SUPERVISOR_PRIVILEGE, USER_PRIVILEGE}) begin
if (exception.valid & medeleg[exception.code])
exception_privilege_level = SUPERVISOR_PRIVILEGE;
if (interrupt_taken & mideleg[interrupt_cause_r])
interrupt_privilege_level = SUPERVISOR_PRIVILEGE;
end
end
//return from trap privilege determination
assign trap_return_privilege_level = mret ? privilege_t'(mstatus.mpp) : privilege_t'({1'b0,mstatus.spp});
always_comb begin
if(mret | sret)
next_privilege_level = trap_return_privilege_level;
else if (interrupt_taken)
next_privilege_level = interrupt_privilege_level;
else if (exception.valid)
next_privilege_level = exception_privilege_level;
else
next_privilege_level = privilege_level;
end
//Current privilege level
always_ff @(posedge clk) begin
if (rst)
privilege_level <= MACHINE_PRIVILEGE;
else
privilege_level <= next_privilege_level;
end
assign current_privilege = privilege_level;
always_comb begin
mstatus_exception = mstatus;
case (next_privilege_level)
SUPERVISOR_PRIVILEGE: begin
mstatus_exception.spie = (privilege_level == SUPERVISOR_PRIVILEGE) ? mstatus.sie : 0;
mstatus_exception.sie = 0;
mstatus_exception.spp = privilege_level[0]; //one if from supervisor-mode, zero if from user-mode
end
default: begin
mstatus_exception.mpie = (privilege_level == MACHINE_PRIVILEGE) ? mstatus.mie : ((privilege_level == SUPERVISOR_PRIVILEGE) ? mstatus.sie : 0);
mstatus_exception.mie = 0;
mstatus_exception.mpp = privilege_level; //machine,supervisor or user
end
endcase
end
//mstatus value after a trap return instruction.
//sret takes priority over mret. Every right-hand side reads the current
//mstatus (never mstatus_return), so the field updates are order independent.
always_comb begin
    mstatus_return = mstatus;
    if (sret) begin
        //Leaving supervisor trap handling always drops MPRV
        mstatus_return.mprv = 0;
        mstatus_return.spp = USER_PRIVILEGE[0];
        //Pop the supervisor interrupt-enable stack
        mstatus_return.spie = 1;
        mstatus_return.sie = mstatus.spie;
    end
    else if (mret) begin
        //MPRV is only cleared when returning to a mode below machine mode
        if (mstatus.mpp != MACHINE_PRIVILEGE)
            mstatus_return.mprv = 0;
        //MPP falls back to the least privileged supported mode
        if (CONFIG.INCLUDE_U_MODE)
            mstatus_return.mpp = USER_PRIVILEGE;
        else
            mstatus_return.mpp = MACHINE_PRIVILEGE;
        //Pop the machine interrupt-enable stack
        mstatus_return.mpie = 1;
        mstatus_return.mie = mstatus.mpie;
    end
end
//Bits of mstatus that the current CSR write may modify:
//the sstatus view exposes only a subset of the machine-level fields
mstatus_t mstatus_write_mask;
assign mstatus_write_mask = swrite ? sstatus_mask : mstatus_mask;

//Next mstatus value, written as a default followed by overrides.
//The last matching override wins, giving the priority:
//  CSR write > trap entry > trap return > hold
always_comb begin
    mstatus_new = mstatus;
    if (mret | sret)
        mstatus_new = mstatus_return;
    if (interrupt_taken | exception.valid)
        mstatus_new = mstatus_exception;
    if (mwrite_en(MSTATUS) | swrite_en(SSTATUS))
        mstatus_new = (mstatus & ~mstatus_write_mask) | (updated_csr & mstatus_write_mask);
end

//mstatus register: all fields zero out of reset except MPP = machine
always_ff @(posedge clk) begin
    if (rst)
        mstatus <= '{default:0, mpp:MACHINE_PRIVILEGE};
    else
        mstatus <= mstatus_new;
end
////////////////////////////////////////////////////
//MTVEC
//No vectored mode, mode hard-coded to zero
//The base is loaded from CONFIG.CSRS.RESET_MTVEC by an initial statement and
//is not touched by rst, so asserting rst does not undo a software write
initial mtvec[31:2] = CONFIG.CSRS.RESET_MTVEC[31:2];
always_ff @(posedge clk) begin
//mode field is rewritten to zero every cycle
mtvec[1:0] <= '0;
//base is only software-writable when MTVEC_WRITEABLE is configured
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.MTVEC_WRITEABLE & mwrite_en(MTVEC))
mtvec[XLEN-1:2] <= updated_csr[XLEN-1:2];
end
//The trap target exported from this block is always mtvec
assign exception_target_pc = mtvec;
////////////////////////////////////////////////////
//MEDELEG
//Mask of exception codes that software may delegate to supervisor mode.
//Empty when S-mode is not configured. ECALL_S and ECALL_M are never set
//here, so they cannot be delegated.
logic [31:0] medeleg_mask;
always_comb begin
    medeleg_mask = '0;
    if (CONFIG.INCLUDE_S_MODE) begin
        //Instruction fetch
        medeleg_mask[INST_ADDR_MISSALIGNED] = 1'b1;
        medeleg_mask[INST_ACCESS_FAULT] = 1'b1;
        medeleg_mask[INST_PAGE_FAULT] = 1'b1;
        //Decode / environment
        medeleg_mask[ILLEGAL_INST] = 1'b1;
        medeleg_mask[BREAK] = 1'b1;
        medeleg_mask[ECALL_U] = 1'b1;
        //Loads
        medeleg_mask[LOAD_ADDR_MISSALIGNED] = 1'b1;
        medeleg_mask[LOAD_FAULT] = 1'b1;
        medeleg_mask[LOAD_PAGE_FAULT] = 1'b1;
        //Stores and AMOs
        medeleg_mask[STORE_AMO_ADDR_MISSALIGNED] = 1'b1;
        medeleg_mask[STORE_AMO_FAULT] = 1'b1;
        medeleg_mask[STORE_OR_AMO_PAGE_FAULT] = 1'b1;
    end
end

//medeleg register: cleared on reset, writable only when S-mode is configured
always_ff @(posedge clk) begin
    if (rst)
        medeleg <= '0;
    else if (mwrite_en(MEDELEG) & CONFIG.INCLUDE_S_MODE)
        medeleg <= (updated_csr & medeleg_mask);
end
////////////////////////////////////////////////////
//MIDELEG
//Mask of interrupts that software may delegate to supervisor mode.
//Each supervisor interrupt bit directly takes the value of
//CONFIG.INCLUDE_S_MODE, so the mask is all zero without S-mode.
logic [31:0] mideleg_mask;
always_comb begin
    mideleg_mask = '0;
    mideleg_mask[S_SOFTWARE_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
    mideleg_mask[S_TIMER_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
    mideleg_mask[S_EXTERNAL_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
end

//mideleg register: cleared on reset, writable only when S-mode is configured
always_ff @(posedge clk) begin
    if (rst)
        mideleg <= '0;
    else if (mwrite_en(MIDELEG) & CONFIG.INCLUDE_S_MODE)
        mideleg <= (updated_csr & mideleg_mask);
end
////////////////////////////////////////////////////
//MIP
//mip_mask   : pending bits implemented by this configuration
//mip_w_mask : pending bits that software may write (supervisor bits only)
assign mip_mask = '{default:0, meip:1, seip:CONFIG.INCLUDE_S_MODE, mtip:1, stip:CONFIG.INCLUDE_S_MODE, msip:1, ssip:CONFIG.INCLUDE_S_MODE};
assign mip_w_mask = '{default:0, seip:CONFIG.INCLUDE_S_MODE, stip:CONFIG.INCLUDE_S_MODE, ssip:CONFIG.INCLUDE_S_MODE};

//Sample the interrupt request inputs, dropping unimplemented bits
always_comb begin
    mip_new = '0;
    mip_new.ssip = s_interrupt.software;
    mip_new.stip = s_interrupt.timer;
    mip_new.seip = s_interrupt.external;
    mip_new.msip = m_interrupt.software;
    mip_new.mtip = m_interrupt.timer;
    mip_new.meip = m_interrupt.external;
    mip_new &= mip_mask;
end

//mip register, updated every cycle.
//Machine-level bits are read-only to software and follow the request inputs,
//so they clear as soon as the source deasserts. Previously the register was
//only updated on a MIP write or while some request was high, which left stale
//pending bits set after the last request went away.
//Software-writable (supervisor) bits take the written value on a MIP write and
//otherwise hold their current value, rather than picking up whatever
//updated_csr happens to carry; the sampled requests are ORed in either way.
always_ff @(posedge clk) begin
    if (rst)
        mip <= 0;
    else if (mwrite_en(MIP))
        mip <= (updated_csr & mip_w_mask) | mip_new;
    else
        mip <= (mip & mip_w_mask) | mip_new;
end

//An interrupt is pending when any enabled interrupt is pending and interrupts
//are globally enabled. The reduction-OR binds tighter than the binary AND.
assign interrupt_pending = |(mip & mie) & mstatus.mie;
////////////////////////////////////////////////////
//MIE
//mie_mask : enable bits implemented by this configuration
//sie_mask : enable bits visible through the SIE view
assign mie_mask = '{default:0, meie:1, seie:CONFIG.INCLUDE_S_MODE, mtie:1, stie:CONFIG.INCLUDE_S_MODE, msie:1, ssie:CONFIG.INCLUDE_S_MODE};
assign sie_mask = '{default:0, seie:1, stie:1, ssie:1};

//mie register.
//A write through MIE replaces all implemented bits. A write through SIE is a
//read-modify-write restricted to sie_mask, so that it cannot clear the
//machine-level enables (same scheme as the mstatus/sstatus write path).
always_ff @(posedge clk) begin
    if (rst)
        mie <= '0;
    else if (mwrite_en(MIE) | swrite_en(SIE))
        mie <= swrite ? ((mie & ~sie_mask) | (updated_csr & sie_mask)) : (updated_csr & mie_mask);
end
////////////////////////////////////////////////////
//MEPC
//Can be software written, written on exception with
//exception causing PC. Lower two bits tied to zero.
//Interrupts capture exception.pc as well. A trap takes priority over a
//simultaneous software write. The register has no reset.
always_ff @(posedge clk) begin
mepc[1:0] <= '0;
if (mwrite_en(MEPC) | exception.valid | interrupt_taken)
mepc[XLEN-1:2] <= (exception.valid | interrupt_taken) ? exception.pc[XLEN-1:2] : updated_csr[XLEN-1:2];
end
//Return address exported from this block
assign epc = mepc;
////////////////////////////////////////////////////
//MCAUSE
//As the exception and interrupts codes are sparsely populated,
//to ensure that only legal values are written, a ROM lookup
//is used to validate the CSR write operation
//One entry per possible cause code; a set entry marks the code as legal
logic M_EXCEPTION_MASKING_ROM [2**ECODE_W];
logic M_INTERRUPT_MASKING_ROM [2**ECODE_W];
always_comb begin
//Exception codes: entries assigned CONFIG.INCLUDE_S_MODE are only legal
//when supervisor mode is configured
M_EXCEPTION_MASKING_ROM = '{default: 0};
M_EXCEPTION_MASKING_ROM[INST_ADDR_MISSALIGNED] = 1;
M_EXCEPTION_MASKING_ROM[INST_ACCESS_FAULT] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[ILLEGAL_INST] = 1;
M_EXCEPTION_MASKING_ROM[BREAK] = 1;
M_EXCEPTION_MASKING_ROM[LOAD_ADDR_MISSALIGNED] = 1;
M_EXCEPTION_MASKING_ROM[LOAD_FAULT] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[STORE_AMO_ADDR_MISSALIGNED] = 1;
M_EXCEPTION_MASKING_ROM[STORE_AMO_FAULT] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[ECALL_U] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[ECALL_S] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[ECALL_M] = 1;
M_EXCEPTION_MASKING_ROM[INST_PAGE_FAULT] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[LOAD_PAGE_FAULT] = CONFIG.INCLUDE_S_MODE;
M_EXCEPTION_MASKING_ROM[STORE_OR_AMO_PAGE_FAULT] = CONFIG.INCLUDE_S_MODE;
//Interrupt codes: machine interrupts are always legal, supervisor
//interrupts only when supervisor mode is configured
M_INTERRUPT_MASKING_ROM = '{default: 0};
M_INTERRUPT_MASKING_ROM[S_SOFTWARE_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
M_INTERRUPT_MASKING_ROM[M_SOFTWARE_INTERRUPT] = 1;
M_INTERRUPT_MASKING_ROM[S_TIMER_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
M_INTERRUPT_MASKING_ROM[M_TIMER_INTERRUPT] = 1;
M_INTERRUPT_MASKING_ROM[S_EXTERNAL_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
M_INTERRUPT_MASKING_ROM[M_EXTERNAL_INTERRUPT] = 1;
end
//Set when the value about to be written to mcause names a legal cause.
//The top bit of the write data selects the interrupt or exception ROM and
//the low ECODE_W bits index it; bits in between are not checked here.
logic mcause_write_valid;
always_comb begin
if (updated_csr[XLEN-1]) //interrupt
mcause_write_valid = M_INTERRUPT_MASKING_ROM[updated_csr[ECODE_W-1:0]];
else
mcause_write_valid = M_EXCEPTION_MASKING_ROM[updated_csr[ECODE_W-1:0]];
end
//Interrupt cause selection: pending-and-enabled interrupts are packed into a
//6-bit vector, encoded to an index, and the index is mapped to a cause code
mip_t mip_cause;
logic [5:0] mip_priority_vector;
logic [2:0] mip_cause_sel;
//Lookup from encoder index to cause code. Entries are listed from index 7 down
//to index 0, in the same order as the bits of mip_priority_vector (index 5 is
//the machine external interrupt, index 0 the supervisor software interrupt).
//Indices 6 and 7 have no corresponding vector bit and map to code 0.
const logic [ECODE_W-1:0] interruput_code_table [7:0] = '{ 0, 0,
M_EXTERNAL_INTERRUPT, M_TIMER_INTERRUPT, M_SOFTWARE_INTERRUPT,
S_EXTERNAL_INTERRUPT, S_TIMER_INTERRUPT, S_SOFTWARE_INTERRUPT
};
assign mip_cause = (mip & mie);
assign mip_priority_vector = '{mip_cause.meip, mip_cause.mtip, mip_cause.msip, mip_cause.seip, mip_cause.stip, mip_cause.ssip};
//NOTE(review): which end of priority_vector the encoder favours is defined in
//priority_encoder, not here -- confirm it selects the highest set index
priority_encoder #(.WIDTH(6))
interrupt_cause_encoder (
.priority_vector (mip_priority_vector),
.encoded_result (mip_cause_sel)
);
//The cause code is only captured while an interrupt is pending and holds its
//last value otherwise; this register has no reset
always_ff @(posedge clk) begin
if (interrupt_pending)
interrupt_cause_r <= interruput_code_table[mip_cause_sel];
end
//mcause register. Updated by a validated software write, an exception or a
//taken interrupt, and only when INCLUDE_MCAUSE is configured.
//Code priority: taken interrupt > exception > software write data.
always_ff @(posedge clk) begin
mcause.zeroes <= '0;
if (rst) begin
mcause.is_interrupt <= 0;
mcause.code <= 0;
end
else if (CONFIG.CSRS.NON_STANDARD_OPTIONS.INCLUDE_MCAUSE & ((mcause_write_valid & mwrite_en(MCAUSE)) | exception.valid | interrupt_taken)) begin
mcause.is_interrupt <= interrupt_taken | (mwrite_en(MCAUSE) & updated_csr[XLEN-1]);
mcause.code <= interrupt_taken ? interrupt_cause_r : exception.valid ? exception.code : updated_csr[ECODE_W-1:0];
end
end
////////////////////////////////////////////////////
//MTVAL
//Optional (INCLUDE_MTVAL). Loaded with exception.tval on an exception, which
//takes priority over a software write. Not updated by interrupts; no reset.
always_ff @(posedge clk) begin
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.INCLUDE_MTVAL & (mwrite_en(MTVAL) | exception.valid))
mtval <= exception.valid ? exception.tval : updated_csr;
end
////////////////////////////////////////////////////
//MSCRATCH
//Optional (INCLUDE_MSCRATCH) software scratch register; no reset
always_ff @(posedge clk) begin
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.INCLUDE_MSCRATCH & mwrite_en(MSCRATCH))
mscratch <= updated_csr;
end
end
endgenerate
////////////////////////////////////////////////////
//END OF MACHINE REGS
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//BEGIN OF SUPERVISOR REGS
////////////////////////////////////////////////////
//Supervisor CSR storage.
//NOTE(review): within the remainder of this module only stvec, satp and
//sscratch are assigned (inside gen_csr_s_mode); sepc, stime, stimecmp, scause,
//stval and sstatus have no visible driver -- confirm whether they are
//placeholders for unfinished supervisor-mode support
logic[XLEN-1:0] sepc;
logic[XLEN-1:0] stime;
logic[XLEN-1:0] stimecmp;
logic[XLEN-1:0] scause;
logic[XLEN-1:0] stval;
logic[XLEN-1:0] sstatus;
logic[XLEN-1:0] stvec;
satp_t satp;
logic[XLEN-1:0] sscratch;
//TLB status --- used to mux physical/virtual address
//Forced off when S-mode is not configured; otherwise follows satp.mode
assign tlb_on = CONFIG.INCLUDE_S_MODE & satp.mode;
//Address space ID taken directly from satp
assign asid = satp.asid;
//******************
//Supervisor-mode CSR logic; only elaborated when CONFIG.INCLUDE_S_MODE is set
generate if (CONFIG.INCLUDE_S_MODE) begin : gen_csr_s_mode
////////////////////////////////////////////////////
//MMU interface
//Both MMUs receive the mxr/sum bits from mstatus and the root page number from satp
assign immu.mxr = mstatus.mxr;
assign dmmu.mxr = mstatus.mxr;
assign immu.sum = mstatus.sum;
assign dmmu.sum = mstatus.sum;
//Instruction side always uses the current privilege level; the data side
//uses mstatus.mpp instead while mstatus.mprv is set
assign immu.privilege = privilege_level;
assign dmmu.privilege = mstatus.mprv ? mstatus.mpp : privilege_level;
assign immu.satp_ppn = satp.ppn;
assign dmmu.satp_ppn = satp.ppn;
////////////////////////////////////////////////////
//Supervisor view of mip: only the supervisor pending bits are visible
assign sip_mask = '{default:0, seip:1, stip:1, ssip:1};
////////////////////////////////////////////////////
//STVEC
//All bits are writable, including the two low bits
logic [31:0] stvec_mask = '1;
always_ff @(posedge clk) begin
//NOTE(review): reset value comes from CONFIG.CSRS.RESET_VEC, whereas mtvec
//uses CONFIG.CSRS.RESET_MTVEC -- confirm this is intended
if (rst)
stvec <= {CONFIG.CSRS.RESET_VEC[XLEN-1:2], 2'b00};
else if (swrite_en(STVEC))
stvec <= (updated_csr & stvec_mask);
end
////////////////////////////////////////////////////
//SATP
//All bits are writable; cleared on reset
logic[XLEN-1:0] satp_mask;
assign satp_mask = '1;
always_ff @(posedge clk) begin
if (rst)
satp <= 0;
else if (swrite_en(SATP))
satp <= (updated_csr & satp_mask);
end
////////////////////////////////////////////////////
//SSCRATCH
//Software scratch register; no reset
always_ff @(posedge clk) begin
if (swrite_en(SSCRATCH))
sscratch <= updated_csr;
end
end
endgenerate
////////////////////////////////////////////////////
//END OF SUPERVISOR REGS
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Timers and Counters
//Register increment for instructions completed
//Increments suppressed on writes to these registers
//Cycle and retired-instruction counters, COUNTER_W bits wide.
//The part selects below assume COUNTER_W is greater than 32.
//NOTE(review): mtime is declared but has no driver or reader in the visible
//part of this module
logic[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:0] mcycle;
logic[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:0] mtime;
logic[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:0] minst_ret;
//Counter values after applying any software write, before the increment
logic[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:0] mcycle_input_next;
logic[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:0] minst_ret_input_next;
//Increment amounts for this cycle
logic[LOG2_RETIRE_PORTS:0] minst_ret_inc;
logic mcycle_inc;
//mcycle: a write to MCYCLE replaces the low 32 bits, a write to MCYCLEH the
//upper bits; both only when MCYCLE_WRITEABLE is configured
always_comb begin
mcycle_input_next = mcycle;
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.MCYCLE_WRITEABLE & mwrite_en(MCYCLE))
mcycle_input_next[31:0] = updated_csr;
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.MCYCLE_WRITEABLE & mwrite_en(MCYCLEH))
mcycle_input_next[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:32] = updated_csr[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-33:0];
end
//Count one cycle unless the counter is being written this cycle
assign mcycle_inc = ~(CONFIG.CSRS.NON_STANDARD_OPTIONS.MCYCLE_WRITEABLE & (mwrite_en(MCYCLE) | mwrite_en(MCYCLEH)));
always_ff @(posedge clk) begin
if (rst)
mcycle <= 0;
else
mcycle <= mcycle_input_next + CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W'(mcycle_inc);
end
//minst_ret: same write scheme as mcycle, gated by MINSTR_WRITEABLE
always_comb begin
minst_ret_input_next = minst_ret;
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.MINSTR_WRITEABLE & mwrite_en(MINSTRET))
minst_ret_input_next[31:0] = updated_csr;
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.MINSTR_WRITEABLE & mwrite_en(MINSTRETH))
minst_ret_input_next[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:32] = updated_csr[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-33:0];
end
//Add retire.count, masked to zero while the counter is being written
assign minst_ret_inc = {(LOG2_RETIRE_PORTS+1){~(CONFIG.CSRS.NON_STANDARD_OPTIONS.MINSTR_WRITEABLE & (mwrite_en(MINSTRET) | mwrite_en(MINSTRETH)))}} & retire.count;
always_ff @(posedge clk) begin
if (rst)
minst_ret <= 0;
else
minst_ret <= minst_ret_input_next + CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W'(minst_ret_inc);
end
////////////////////////////////////////////////////
//CSR mux
//Selects the read value for the CSR addressed by csr_inputs_r.addr.
//Machine-level CSRs read as zero without M-mode, supervisor-level CSRs read as
//zero without S-mode, and every unimplemented address reads as zero.
always_comb begin
    case (csr_inputs_r.addr) inside
        //Machine info
        MISA : selected_csr = CONFIG.INCLUDE_M_MODE ? misa : 0;
        MVENDORID : selected_csr = CONFIG.INCLUDE_M_MODE ? mvendorid : 0;
        MARCHID : selected_csr = CONFIG.INCLUDE_M_MODE ? marchid : 0;
        MIMPID : selected_csr = CONFIG.INCLUDE_M_MODE ? mimpid : 0;
        MHARTID : selected_csr = CONFIG.INCLUDE_M_MODE ? mhartid : 0;
        //Machine trap setup
        MSTATUS : selected_csr = CONFIG.INCLUDE_M_MODE ? mstatus : 0;
        MEDELEG : selected_csr = CONFIG.INCLUDE_M_MODE ? medeleg : 0;
        MIDELEG : selected_csr = CONFIG.INCLUDE_M_MODE ? mideleg : 0;
        MIE : selected_csr = CONFIG.INCLUDE_M_MODE ? mie : 0;
        MTVEC : selected_csr = CONFIG.INCLUDE_M_MODE ? mtvec : 0;
        MCOUNTEREN : selected_csr = 0;
        //Machine trap handling
        MSCRATCH : selected_csr = CONFIG.INCLUDE_M_MODE ? mscratch : 0;
        MEPC : selected_csr = CONFIG.INCLUDE_M_MODE ? mepc : 0;
        MCAUSE : selected_csr = CONFIG.INCLUDE_M_MODE ? mcause : 0;
        MTVAL : selected_csr = CONFIG.INCLUDE_M_MODE ? mtval : 0;
        MIP : selected_csr = CONFIG.INCLUDE_M_MODE ? mip : 0;
        //Machine Memory Protection
        //Range bounds must be given low:high; a reversed range is empty
        [12'h3A0 : 12'h3EF] : selected_csr = 0;
        //Machine Timers and Counters
        MCYCLE : selected_csr = CONFIG.INCLUDE_M_MODE ? mcycle[XLEN-1:0] : 0;
        MINSTRET : selected_csr = CONFIG.INCLUDE_M_MODE ? minst_ret[XLEN-1:0] : 0;
        [12'hB03 : 12'hB1F] : selected_csr = 0;
        MCYCLEH : selected_csr = CONFIG.INCLUDE_M_MODE ? 32'(mcycle[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:XLEN]) : 0;
        MINSTRETH : selected_csr = CONFIG.INCLUDE_M_MODE ? 32'(minst_ret[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:XLEN]) : 0;
        [12'hB83 : 12'hB9F] : selected_csr = 0;
        //Machine Counter Setup
        [12'h320 : 12'h33F] : selected_csr = 0;
        //Supervisor Trap Setup
        //sstatus/sie/sip are masked views of the machine-level registers
        SSTATUS : selected_csr = CONFIG.INCLUDE_S_MODE ? (mstatus & sstatus_mask) : '0;
        SEDELEG : selected_csr = 0; //No user-level interrupts/exception handling
        SIDELEG : selected_csr = 0;
        SIE : selected_csr = CONFIG.INCLUDE_S_MODE ? (mie & sie_mask) : '0;
        STVEC : selected_csr = CONFIG.INCLUDE_S_MODE ? stvec : '0;
        SCOUNTEREN : selected_csr = 0;
        //Supervisor trap handling
        //Each address returns its own register (previously SEPC, SCAUSE and
        //STVAL all returned sscratch).
        //NOTE(review): sepc, scause and stval have no visible driver in this
        //module -- they must be written on delegated traps for these reads
        //to be meaningful
        SSCRATCH : selected_csr = CONFIG.INCLUDE_S_MODE ? sscratch : '0;
        SEPC : selected_csr = CONFIG.INCLUDE_S_MODE ? sepc : '0;
        SCAUSE : selected_csr = CONFIG.INCLUDE_S_MODE ? scause : '0;
        STVAL : selected_csr = CONFIG.INCLUDE_S_MODE ? stval : '0;
        SIP : selected_csr = CONFIG.INCLUDE_S_MODE ? (mip & sip_mask) : '0;
        //Supervisor Protection and Translation
        SATP : selected_csr = CONFIG.INCLUDE_S_MODE ? satp : '0;
        //User status
        //Floating point
        FFLAGS : selected_csr = 0;
        FRM : selected_csr = 0;
        FCSR : selected_csr = 0;
        //User Counter Timers
        //TIME/TIMEH return the cycle counter
        CYCLE : selected_csr = mcycle[XLEN-1:0];
        TIME : selected_csr = mcycle[XLEN-1:0];
        INSTRET : selected_csr = minst_ret[XLEN-1:0];
        [12'hC03 : 12'hC1F] : selected_csr = 0;
        CYCLEH : selected_csr = 32'(mcycle[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:XLEN]);
        TIMEH : selected_csr = 32'(mcycle[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:XLEN]);
        INSTRETH : selected_csr = 32'(minst_ret[CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W-1:XLEN]);
        [12'hC83 : 12'hC9F] : selected_csr = 0;
        default : selected_csr = 0;
    endcase
end

//Registered copy of the selected CSR value, captured when commit is set
always_ff @(posedge clk) begin
    if (commit)
        selected_csr_r <= selected_csr;
end
endmodule

View file

@ -28,6 +28,7 @@ module cva5
import l2_config_and_types::*;
import riscv_types::*;
import cva5_types::*;
import fpu_types::*;
#(
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
@ -45,48 +46,11 @@ module cva5
wishbone_interface.master dwishbone,
wishbone_interface.master iwishbone,
output trace_outputs_t tr,
l2_requester_interface.master l2,
input interrupt_t s_interrupt,
input interrupt_t m_interrupt
);
////////////////////////////////////////////////////
//Unit ID Assignment
//Generate Issue IDs based on configuration options
//Then assigned to a struct for ease in passing to sub modules
//Units with writeback
localparam int unsigned ALU_UNIT_ID = 32'd0;
localparam int unsigned LS_UNIT_ID = 32'd1;
localparam int unsigned CSR_UNIT_ID = LS_UNIT_ID + int'(CONFIG.INCLUDE_CSRS);
localparam int unsigned MUL_UNIT_ID = CSR_UNIT_ID + int'(CONFIG.INCLUDE_MUL);
localparam int unsigned DIV_UNIT_ID = MUL_UNIT_ID + int'(CONFIG.INCLUDE_DIV);
//Non-writeback units
localparam int unsigned BRANCH_UNIT_ID = DIV_UNIT_ID + 1;
localparam int unsigned IEC_UNIT_ID = BRANCH_UNIT_ID + 1;
//Total number of units
localparam int unsigned NUM_UNITS = IEC_UNIT_ID + 1;
localparam unit_id_param_t UNIT_IDS = '{
ALU : ALU_UNIT_ID,
LS : LS_UNIT_ID,
CSR : CSR_UNIT_ID,
MUL : MUL_UNIT_ID,
DIV : DIV_UNIT_ID,
BR : BRANCH_UNIT_ID,
IEC : IEC_UNIT_ID
};
////////////////////////////////////////////////////
//Writeback Port Assignment
//
localparam int unsigned NUM_WB_UNITS_GROUP_1 = 1;//ALU
localparam int unsigned NUM_WB_UNITS_GROUP_2 = 1 + int'(CONFIG.INCLUDE_CSRS) + int'(CONFIG.INCLUDE_MUL) + int'(CONFIG.INCLUDE_DIV);//LS
localparam int unsigned NUM_WB_UNITS = NUM_WB_UNITS_GROUP_1 + NUM_WB_UNITS_GROUP_2;
);
////////////////////////////////////////////////////
//Connecting Signals
@ -105,24 +69,22 @@ module cva5
ras_interface ras();
issue_packet_t issue;
register_file_issue_interface #(.NUM_WB_GROUPS(CONFIG.NUM_WB_GROUPS)) rf_issue();
register_file_issue_interface #(.NUM_WB_GROUPS(CONFIG.NUM_WB_GROUPS), .READ_PORTS(REGFILE_READ_PORTS), .DATA_WIDTH(32)) rf_issue();
register_file_issue_interface #(.NUM_WB_GROUPS(2), .READ_PORTS(3), .DATA_WIDTH(FLEN)) fp_rf_issue();
logic [MAX_NUM_UNITS-1:0] unit_needed;
logic [MAX_NUM_UNITS-1:0][REGFILE_READ_PORTS-1:0] unit_uses_rs;
logic [1:0][2:0] fp_unit_uses_rs;
logic [MAX_NUM_UNITS-1:0] unit_uses_rd;
logic [1:0] fp_unit_uses_rd;
alu_inputs_t alu_inputs;
load_store_inputs_t ls_inputs;
branch_inputs_t branch_inputs;
mul_inputs_t mul_inputs;
div_inputs_t div_inputs;
gc_inputs_t gc_inputs;
csr_inputs_t csr_inputs;
logic [31:0] constant_alu;
unit_issue_interface unit_issue [NUM_UNITS-1:0]();
unit_issue_interface unit_issue [MAX_NUM_UNITS-1:0]();
exception_packet_t ls_exception;
logic ls_exception_is_store;
unit_writeback_interface unit_wb [NUM_WB_UNITS]();
mmu_interface immu();
mmu_interface dmmu();
@ -148,24 +110,39 @@ module cva5
logic decode_advance;
decode_packet_t decode;
logic decode_uses_rd;
logic fp_decode_uses_rd;
rs_addr_t decode_rd_addr;
exception_sources_t decode_exception_unit;
logic decode_is_store;
phys_addr_t decode_phys_rd_addr;
phys_addr_t fp_decode_phys_rd_addr;
phys_addr_t decode_phys_rs_addr [REGFILE_READ_PORTS];
phys_addr_t fp_decode_phys_rs_addr [3];
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] decode_rs_wb_group [REGFILE_READ_PORTS];
logic fp_decode_rs_wb_group [3];
logic [2:0] dyn_rm;
//ID freeing
retire_packet_t retire;
retire_packet_t wb_retire;
retire_packet_t fp_wb_retire;
retire_packet_t store_retire;
id_t retire_ids [RETIRE_PORTS];
id_t retire_ids_next [RETIRE_PORTS];
logic retire_port_valid [RETIRE_PORTS];
logic [LOG2_RETIRE_PORTS : 0] retire_count;
//Writeback
unit_writeback_interface #(.DATA_WIDTH(32)) unit_wb [MAX_NUM_UNITS]();
unit_writeback_interface #(.DATA_WIDTH(FLEN)) fp_unit_wb [2]();
wb_packet_t wb_packet [CONFIG.NUM_WB_GROUPS];
commit_packet_t commit_packet [CONFIG.NUM_WB_GROUPS];
fp_wb_packet_t fp_wb_packet [2];
phys_addr_t wb_phys_addr [CONFIG.NUM_WB_GROUPS];
phys_addr_t fp_wb_phys_addr [2];
logic [4:0] fflag_wmask;
//Exception
logic [31:0] oldest_pc;
renamer_interface #(.NUM_WB_GROUPS(CONFIG.NUM_WB_GROUPS)) decode_rename_interface ();
renamer_interface #(.NUM_WB_GROUPS(CONFIG.NUM_WB_GROUPS), .READ_PORTS(REGFILE_READ_PORTS)) decode_rename_interface ();
renamer_interface #(.NUM_WB_GROUPS(2), .READ_PORTS(3)) fp_decode_rename_interface ();
//Global Control
exception_interface exception [NUM_EXCEPTION_SOURCES]();
@ -186,47 +163,16 @@ module cva5
logic processing_csr;
//Decode Unit and Fetch Unit
logic issue_stage_ready;
phys_addr_t issue_phys_rs_addr [REGFILE_READ_PORTS];
phys_addr_t fp_issue_phys_rs_addr [3];
rs_addr_t issue_rs_addr [REGFILE_READ_PORTS];
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] issue_rd_wb_group;
logic fp_issue_rd_wb_group;
logic illegal_instruction;
logic instruction_issued;
logic instruction_issued_with_rd;
//LS
wb_packet_t wb_snoop;
//Trace Interface Signals
logic tr_early_branch_correction;
logic tr_operand_stall;
logic tr_unit_stall;
logic tr_no_id_stall;
logic tr_no_instruction_stall;
logic tr_other_stall;
logic tr_branch_operand_stall;
logic tr_alu_operand_stall;
logic tr_ls_operand_stall;
logic tr_div_operand_stall;
logic tr_alu_op;
logic tr_branch_or_jump_op;
logic tr_load_op;
logic tr_store_op;
logic tr_mul_op;
logic tr_div_op;
logic tr_misc_op;
logic tr_instruction_issued_dec;
logic [31:0] tr_instruction_pc_dec;
logic [31:0] tr_instruction_data_dec;
logic tr_branch_correct;
logic tr_branch_misspredict;
logic tr_return_correct;
logic tr_return_misspredict;
logic tr_load_conflict_delay;
logic tr_rs1_forwarding_needed;
logic tr_rs2_forwarding_needed;
logic tr_rs1_and_rs2_forwarding_needed;
logic fp_instruction_issued_with_rd;
////////////////////////////////////////////////////
//Implementation
@ -267,18 +213,27 @@ module cva5
.decode (decode),
.decode_advance (decode_advance),
.decode_uses_rd (decode_uses_rd),
.fp_decode_uses_rd (fp_decode_uses_rd),
.decode_rd_addr (decode_rd_addr),
.decode_phys_rd_addr (decode_phys_rd_addr),
.fp_decode_phys_rd_addr (fp_decode_phys_rd_addr),
.decode_exception_unit (decode_exception_unit),
.decode_is_store (decode_is_store),
.issue (issue),
.instruction_issued (instruction_issued),
.instruction_issued_with_rd (instruction_issued_with_rd),
.fp_instruction_issued_with_rd (fp_instruction_issued_with_rd),
.wb_packet (wb_packet),
.commit_packet (commit_packet),
.retire (retire),
.fp_wb_packet (fp_wb_packet),
.wb_phys_addr (wb_phys_addr),
.fp_wb_phys_addr (fp_wb_phys_addr),
.wb_retire (wb_retire),
.fp_wb_retire (fp_wb_retire),
.store_retire (store_retire),
.retire_ids (retire_ids),
.retire_ids_next (retire_ids_next),
.retire_port_valid(retire_port_valid),
.retire_count (retire_count),
.post_issue_count(post_issue_count),
.oldest_pc (oldest_pc),
.current_exception_unit (current_exception_unit)
@ -307,11 +262,9 @@ module cva5
.iwishbone (iwishbone),
.icache_on ('1),
.tlb (itlb),
.tlb_on (tlb_on),
.l1_request (l1_request[L1_ICACHE_ID]),
.l1_response (l1_response[L1_ICACHE_ID]),
.exception (1'b0),
.tr_early_branch_correction (tr_early_branch_correction)
.exception (1'b0)
);
branch_predictor #(.CONFIG(CONFIG))
@ -364,7 +317,7 @@ module cva5
////////////////////////////////////////////////////
//Renamer
renamer #(.CONFIG(CONFIG))
renamer #(.NUM_WB_GROUPS(CONFIG.NUM_WB_GROUPS), .READ_PORTS(REGFILE_READ_PORTS), .RENAME_ZERO(0))
renamer_block (
.clk (clk),
.rst (rst),
@ -373,68 +326,57 @@ module cva5
.decode (decode_rename_interface),
.issue (issue), //packet
.instruction_issued_with_rd (instruction_issued_with_rd),
.retire (retire) //packet
.wb_retire (wb_retire)
);
////////////////////////////////////////////////////
//Decode/Issue
decode_and_issue #(
.CONFIG (CONFIG),
.NUM_UNITS (NUM_UNITS),
.UNIT_IDS (UNIT_IDS)
)
decode_and_issue #(.CONFIG(CONFIG))
decode_and_issue_block (
.clk (clk),
.rst (rst),
.pc_id_available (pc_id_available),
.decode (decode),
.decode_advance (decode_advance),
.unit_needed (unit_needed),
.unit_uses_rs (unit_uses_rs),
.fp_unit_uses_rs (fp_unit_uses_rs),
.unit_uses_rd (unit_uses_rd),
.fp_unit_uses_rd (fp_unit_uses_rd),
.renamer (decode_rename_interface),
.fp_renamer (fp_decode_rename_interface),
.decode_uses_rd (decode_uses_rd),
.fp_decode_uses_rd (fp_decode_uses_rd),
.decode_rd_addr (decode_rd_addr),
.decode_exception_unit (decode_exception_unit),
.decode_phys_rd_addr (decode_phys_rd_addr),
.fp_decode_phys_rd_addr (fp_decode_phys_rd_addr),
.decode_phys_rs_addr (decode_phys_rs_addr),
.fp_decode_phys_rs_addr (fp_decode_phys_rs_addr),
.decode_rs_wb_group (decode_rs_wb_group),
.fp_decode_rs_wb_group (fp_decode_rs_wb_group),
.instruction_issued (instruction_issued),
.instruction_issued_with_rd (instruction_issued_with_rd),
.fp_instruction_issued_with_rd (fp_instruction_issued_with_rd),
.issue (issue),
.issue_rs_addr (issue_rs_addr),
.issue_stage_ready (issue_stage_ready),
.issue_phys_rs_addr (issue_phys_rs_addr),
.fp_issue_phys_rs_addr (fp_issue_phys_rs_addr),
.issue_rd_wb_group (issue_rd_wb_group),
.fp_issue_rd_wb_group (fp_issue_rd_wb_group),
.rf (rf_issue),
.alu_inputs (alu_inputs),
.ls_inputs (ls_inputs),
.branch_inputs (branch_inputs),
.gc_inputs (gc_inputs),
.csr_inputs (csr_inputs),
.mul_inputs (mul_inputs),
.div_inputs (div_inputs),
.fp_rf (fp_rf_issue),
.constant_alu (constant_alu),
.unit_issue (unit_issue),
.gc (gc),
.current_privilege (current_privilege),
.exception (exception[PRE_ISSUE_EXCEPTION]),
.tr_operand_stall (tr_operand_stall),
.tr_unit_stall (tr_unit_stall),
.tr_no_id_stall (tr_no_id_stall),
.tr_no_instruction_stall (tr_no_instruction_stall),
.tr_other_stall (tr_other_stall),
.tr_branch_operand_stall (tr_branch_operand_stall),
.tr_alu_operand_stall (tr_alu_operand_stall),
.tr_ls_operand_stall (tr_ls_operand_stall),
.tr_div_operand_stall (tr_div_operand_stall),
.tr_alu_op (tr_alu_op),
.tr_branch_or_jump_op (tr_branch_or_jump_op),
.tr_load_op (tr_load_op),
.tr_store_op (tr_store_op),
.tr_mul_op (tr_mul_op),
.tr_div_op (tr_div_op),
.tr_misc_op (tr_misc_op),
.tr_instruction_issued_dec (tr_instruction_issued_dec),
.tr_instruction_pc_dec (tr_instruction_pc_dec),
.tr_instruction_data_dec (tr_instruction_data_dec)
.exception (exception[PRE_ISSUE_EXCEPTION])
);
////////////////////////////////////////////////////
//Register File
register_file #(.CONFIG(CONFIG))
register_file #(.NUM_WB_GROUPS(CONFIG.NUM_WB_GROUPS), .READ_PORTS(REGFILE_READ_PORTS), .PORT_ZERO_ABSENT(0), .USE_ZERO(0), .WB_PACKET_TYPE(wb_packet_t))
register_file_block (
.clk (clk),
.rst (rst),
@ -444,8 +386,10 @@ module cva5
.decode_rs_wb_group (decode_rs_wb_group),
.decode_advance (decode_advance),
.decode_uses_rd (decode_uses_rd),
.decode_rd_addr (decode_rd_addr),
.rf_issue (rf_issue),
.commit (commit_packet)
.commit (wb_packet),
.wb_phys_addr (wb_phys_addr)
);
////////////////////////////////////////////////////
@ -453,25 +397,36 @@ module cva5
branch_unit #(.CONFIG(CONFIG))
branch_unit_block (
.clk (clk),
.rst (rst),
.issue (unit_issue[UNIT_IDS.BR]),
.branch_inputs (branch_inputs),
.rst (rst),
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.unit_needed (unit_needed[BR_ID]),
.uses_rs (unit_uses_rs[BR_ID]),
.uses_rd (unit_uses_rd[BR_ID]),
.rf (rf_issue.data),
.constant_alu (constant_alu),
.issue (unit_issue[BR_ID]),
.br_results (br_results),
.branch_flush (branch_flush),
.exception (exception[BR_EXCEPTION]),
.tr_branch_correct (tr_branch_correct),
.tr_branch_misspredict (tr_branch_misspredict),
.tr_return_correct (tr_return_correct),
.tr_return_misspredict (tr_return_misspredict)
.exception (exception[BR_EXCEPTION])
);
alu_unit alu_unit_block (
.clk (clk),
.rst (rst),
.alu_inputs (alu_inputs),
.issue (unit_issue[UNIT_IDS.ALU]),
.wb (unit_wb[UNIT_IDS.ALU])
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.unit_needed (unit_needed[ALU_ID]),
.uses_rs (unit_uses_rs[ALU_ID]),
.uses_rd (unit_uses_rd[ALU_ID]),
.rf (rf_issue.data),
.constant_alu (constant_alu),
.issue_rs_addr (issue_rs_addr),
.issue (unit_issue[ALU_ID]),
.wb (unit_wb[ALU_ID])
);
load_store_unit #(.CONFIG(CONFIG))
@ -479,8 +434,25 @@ module cva5
.clk (clk),
.rst (rst),
.gc (gc),
.ls_inputs (ls_inputs),
.issue (unit_issue[UNIT_IDS.LS]),
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.unit_needed (unit_needed[LS_ID]),
.uses_rs (unit_uses_rs[LS_ID]),
.fp_uses_rs (fp_unit_uses_rs[0]),
.uses_rd (unit_uses_rd[LS_ID]),
.fp_uses_rd (fp_unit_uses_rd[0]),
.decode_is_store (decode_is_store),
.instruction_issued_with_rd (instruction_issued_with_rd),
.fp_instruction_issued_with_rd (fp_instruction_issued_with_rd),
.issue_rs_addr (issue_rs_addr),
.issue_rd_wb_group (issue_rd_wb_group),
.fp_issue_rd_wb_group (fp_issue_rd_wb_group),
.rs2_inuse (rf_issue.inuse[RS2]),
.fp_rs2_inuse (fp_rf_issue.inuse[RS2]),
.rf (rf_issue.data),
.fp_rf (fp_rf_issue.data),
.issue (unit_issue[LS_ID]),
.dcache_on (1'b1),
.clear_reservation (1'b0),
.tlb (dtlb),
@ -493,13 +465,13 @@ module cva5
.m_avalon (m_avalon),
.dwishbone (dwishbone),
.data_bram (data_bram),
.wb_snoop (wb_snoop),
.retire_ids (retire_ids),
.retire_port_valid(retire_port_valid),
.wb_packet (wb_packet),
.fp_wb_packet (fp_wb_packet),
.store_retire (store_retire),
.exception (exception[LS_EXCEPTION]),
.load_store_status(load_store_status),
.wb (unit_wb[UNIT_IDS.LS]),
.tr_load_conflict_delay (tr_load_conflict_delay)
.wb (unit_wb[LS_ID]),
.fp_wb (fp_unit_wb[0])
);
generate if (CONFIG.INCLUDE_S_MODE) begin : gen_dtlb_dmmu
@ -530,15 +502,24 @@ module cva5
end
endgenerate
generate if (CONFIG.INCLUDE_CSRS) begin : gen_csrs
generate if (CONFIG.INCLUDE_UNIT.CSR) begin : gen_csrs
csr_unit # (.CONFIG(CONFIG))
csr_unit_block (
.clk(clk),
.rst(rst),
.csr_inputs (csr_inputs),
.issue (unit_issue[UNIT_IDS.CSR]),
.wb (unit_wb[UNIT_IDS.CSR]),
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.issue_rs_addr (issue_rs_addr),
.unit_needed (unit_needed[CSR_ID]),
.uses_rs (unit_uses_rs[CSR_ID]),
.uses_rd (unit_uses_rd[CSR_ID]),
.rf (rf_issue.data),
.issue (unit_issue[CSR_ID]),
.wb (unit_wb[CSR_ID]),
.current_privilege(current_privilege),
.fflag_wmask (fflag_wmask),
.dyn_rm (dyn_rm),
.interrupt_taken(interrupt_taken),
.interrupt_pending(interrupt_pending),
.processing_csr(processing_csr),
@ -551,8 +532,8 @@ module cva5
.mret(mret),
.sret(sret),
.epc(epc),
.retire(retire),
.retire_ids(retire_ids),
.retire_count (retire_count),
.s_interrupt(s_interrupt),
.m_interrupt(m_interrupt)
);
@ -562,8 +543,15 @@ module cva5
gc_unit_block (
.clk (clk),
.rst (rst),
.issue (unit_issue[UNIT_IDS.IEC]),
.gc_inputs (gc_inputs),
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.unit_needed (unit_needed[IEC_ID]),
.uses_rs (unit_uses_rs[IEC_ID]),
.uses_rd (unit_uses_rd[IEC_ID]),
.constant_alu (constant_alu),
.rf (rf_issue.data),
.issue (unit_issue[IEC_ID]),
.branch_flush (branch_flush),
.exception (exception),
.exception_target_pc (exception_target_pc),
@ -573,8 +561,6 @@ module cva5
.mret(mret),
.sret(sret),
.epc(epc),
.retire (retire),
.retire_ids (retire_ids),
.retire_ids_next (retire_ids_next),
.interrupt_taken(interrupt_taken),
.interrupt_pending(interrupt_pending),
@ -583,43 +569,129 @@ module cva5
.post_issue_count (post_issue_count)
);
generate if (CONFIG.INCLUDE_MUL) begin : gen_mul
generate if (CONFIG.INCLUDE_UNIT.MUL) begin : gen_mul
mul_unit mul_unit_block (
.clk (clk),
.rst (rst),
.mul_inputs (mul_inputs),
.issue (unit_issue[UNIT_IDS.MUL]),
.wb (unit_wb[UNIT_IDS.MUL])
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.unit_needed (unit_needed[MUL_ID]),
.uses_rs (unit_uses_rs[MUL_ID]),
.uses_rd (unit_uses_rd[MUL_ID]),
.rf (rf_issue.data),
.issue (unit_issue[MUL_ID]),
.wb (unit_wb[MUL_ID])
);
end endgenerate
generate if (CONFIG.INCLUDE_DIV) begin : gen_div
generate if (CONFIG.INCLUDE_UNIT.DIV) begin : gen_div
div_unit div_unit_block (
.clk (clk),
.rst (rst),
.div_inputs (div_inputs),
.issue (unit_issue[UNIT_IDS.DIV]),
.wb (unit_wb[UNIT_IDS.DIV])
.gc (gc),
.instruction_issued_with_rd (instruction_issued_with_rd),
.decode_stage (decode),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.issue_rs_addr (issue_rs_addr),
.unit_needed (unit_needed[DIV_ID]),
.uses_rs (unit_uses_rs[DIV_ID]),
.uses_rd (unit_uses_rd[DIV_ID]),
.rf (rf_issue.data),
.issue (unit_issue[DIV_ID]),
.wb (unit_wb[DIV_ID])
);
end endgenerate
generate if (CONFIG.INCLUDE_UNIT.CUSTOM) begin : gen_custom
custom_unit custom_unit_block (
.clk (clk),
.rst (rst),
.decode_stage (decode),
.unit_needed (unit_needed[CUSTOM_ID]),
.uses_rs (unit_uses_rs[CUSTOM_ID]),
.uses_rd (unit_uses_rd[CUSTOM_ID]),
.issue_stage (issue),
.issue_stage_ready (issue_stage_ready),
.rf (rf_issue.data),
.issue (unit_issue[CUSTOM_ID]),
.wb (unit_wb[CUSTOM_ID])
);
end endgenerate
////////////////////////////////////////////////////
//Writeback
//First writeback port: ALU
//Second writeback port: LS, CSR, [MUL], [DIV]
localparam int unsigned NUM_UNITS_PER_PORT [CONFIG.NUM_WB_GROUPS] = '{NUM_WB_UNITS_GROUP_1, NUM_WB_UNITS_GROUP_2};
writeback #(
.CONFIG (CONFIG),
.NUM_UNITS (NUM_UNITS_PER_PORT),
.NUM_WB_UNITS (NUM_WB_UNITS)
)
writeback_block (
.clk (clk),
.rst (rst),
.wb_packet (wb_packet),
.unit_wb (unit_wb),
.wb_snoop (wb_snoop)
);
generate for (genvar i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : gen_wb
writeback #(
.NUM_WB_UNITS (get_num_wb_units(CONFIG.WB_GROUP[i])),
.WB_INDEX (CONFIG.WB_GROUP[i])
)
writeback_block (
.wb_packet (wb_packet[i]),
.unit_wb (unit_wb)
);
end endgenerate
////////////////////////////////////////////////////
//FPU
generate if (CONFIG.INCLUDE_UNIT.FPU) begin : gen_fpu
fp_writeback fp_writeback_block (
.unit_wb (fp_unit_wb),
.wb_packet (fp_wb_packet)
);
fpu_top #(.CONFIG(CONFIG))
fpu_block (
.clk (clk),
.rst (rst),
.decode_stage (decode),
.unit_needed (unit_needed[FPU_ID]),
.uses_rs (unit_uses_rs[FPU_ID]),
.fp_uses_rs (fp_unit_uses_rs[1]),
.uses_rd (unit_uses_rd[FPU_ID]),
.fp_uses_rd (fp_unit_uses_rd[1]),
.issue_stage_ready (issue_stage_ready),
.dyn_rm (dyn_rm),
.int_rf (rf_issue.data),
.fp_rf (fp_rf_issue.data),
.issue (unit_issue[FPU_ID]),
.int_wb (unit_wb[FPU_ID]),
.fp_wb (fp_unit_wb[1]),
.fflags (fflag_wmask)
);
register_file #(.NUM_WB_GROUPS(2), .READ_PORTS(3), .USE_ZERO(1), .PORT_ZERO_ABSENT(1), .WB_PACKET_TYPE(fp_wb_packet_t))
fp_register_file_block (
.clk (clk),
.rst (rst),
.gc (gc),
.decode_phys_rs_addr (fp_decode_phys_rs_addr),
.decode_phys_rd_addr (fp_decode_phys_rd_addr),
.decode_rs_wb_group (fp_decode_rs_wb_group),
.decode_advance (decode_advance),
.decode_uses_rd (fp_decode_uses_rd),
.decode_rd_addr ('x),
.rf_issue (fp_rf_issue),
.commit (fp_wb_packet),
.wb_phys_addr (fp_wb_phys_addr)
);
renamer #(.NUM_WB_GROUPS(2), .READ_PORTS(3), .RENAME_ZERO(1))
fp_renamer_block (
.clk (clk),
.rst (rst),
.gc (gc),
.decode_advance (decode_advance),
.decode (fp_decode_rename_interface),
.issue (issue),
.instruction_issued_with_rd (fp_instruction_issued_with_rd),
.wb_retire (fp_wb_retire)
);
end endgenerate
////////////////////////////////////////////////////
//End of Implementation
@ -635,40 +707,5 @@ module cva5
////////////////////////////////////////////////////
//Assertions
////////////////////////////////////////////////////
//Trace Interface
generate if (ENABLE_TRACE_INTERFACE) begin : gen_cva5_trace
always_ff @(posedge clk) begin
tr.events.early_branch_correction <= tr_early_branch_correction;
tr.events.operand_stall <= tr_operand_stall;
tr.events.unit_stall <= tr_unit_stall;
tr.events.no_id_stall <= tr_no_id_stall;
tr.events.no_instruction_stall <= tr_no_instruction_stall;
tr.events.other_stall <= tr_other_stall;
tr.events.instruction_issued_dec <= tr_instruction_issued_dec;
tr.events.branch_operand_stall <= tr_branch_operand_stall;
tr.events.alu_operand_stall <= tr_alu_operand_stall;
tr.events.ls_operand_stall <= tr_ls_operand_stall;
tr.events.div_operand_stall <= tr_div_operand_stall;
tr.events.alu_op <= tr_alu_op;
tr.events.branch_or_jump_op <= tr_branch_or_jump_op;
tr.events.load_op <= tr_load_op;
tr.events.store_op <= tr_store_op;
tr.events.mul_op <= tr_mul_op;
tr.events.div_op <= tr_div_op;
tr.events.misc_op <= tr_misc_op;
tr.events.branch_correct <= tr_branch_correct;
tr.events.branch_misspredict <= tr_branch_misspredict;
tr.events.return_correct <= tr_return_correct;
tr.events.return_misspredict <= tr_return_misspredict;
tr.events.load_conflict_delay <= tr_load_conflict_delay;
tr.events.rs1_forwarding_needed <= tr_rs1_forwarding_needed;
tr.events.rs2_forwarding_needed <= tr_rs2_forwarding_needed;
tr.events.rs1_and_rs2_forwarding_needed <= tr_rs1_and_rs2_forwarding_needed;
tr.instruction_pc_dec <= tr_instruction_pc_dec;
tr.instruction_data_dec <= tr_instruction_data_dec;
end
end
endgenerate
endmodule

View file

@ -1,345 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Data cache.
//Each request from the load-store interface (ls) is captured into stage2 and looked up in the
//tag banks. A read hit is answered from the data bank; every other request (read miss, store,
//LR/SC/AMO, uncacheable access, or any access while dcache_on is low) is forwarded to the
//L1 arbiter through l1_request, and returned data arrives on l1_response.
//
//Ports:
//  clk, rst           - clock and reset (rst is sampled inside the clocked blocks)
//  dcache_on          - gates read hits and tag updates
//  l1_request         - request channel to the L1 arbiter (ack is read back from it)
//  l1_response        - returned data (data, data_valid) and the invalidation signals handed to the tag banks
//  sc_complete        - store-conditional has finished
//  sc_success         - result of the store-conditional (returned to ls as bit 0 of the data word)
//  clear_reservation  - clears the LR reservation register
//  amo                - atomic details of the incoming request (is_lr, is_sc, is_amo, op)
//  uncacheable        - marks the incoming request as uncacheable
//  ls                 - load-store side: new_request/addr/be/re/we/data_in are read, data_out/data_valid/ready are driven
module dcache
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
input logic dcache_on,
l1_arbiter_request_interface.master l1_request,
l1_arbiter_return_interface.master l1_response,
input logic sc_complete,
input logic sc_success,
input logic clear_reservation,
input amo_details_t amo,
input logic uncacheable,
memory_sub_unit_interface.responder ls
);
//Depth of the data bank in 32-bit words (lines x words per line x ways)
localparam DCACHE_SIZE_IN_WORDS = CONFIG.DCACHE.LINES*CONFIG.DCACHE.LINE_W*CONFIG.DCACHE.WAYS;
localparam derived_cache_config_t SCONFIG = get_derived_cache_params(CONFIG, CONFIG.DCACHE, CONFIG.DCACHE_ADDR);
//Width of a binary way index; forced to 1 for a single way so the index vectors never have zero width
localparam LOG2_DCACHE_WAYS = (CONFIG.DCACHE.WAYS == 1) ? 1 : $clog2(CONFIG.DCACHE.WAYS);
//Request as held in the second stage; addr is a word address (ls.addr[31:2])
typedef struct packed{
logic [29:0] addr;
logic [3:0] be;
logic load;
logic store;
logic [31:0] data;
amo_details_t amo;
logic uncacheable;
} stage2_t;
//Data bank addresses: port a serves hits, port b is used for line fills and SC writes
logic [$clog2(DCACHE_SIZE_IN_WORDS)-1:0] data_bank_addr_a;
logic [$clog2(DCACHE_SIZE_IN_WORDS)-1:0] data_bank_addr_b;
//Tag lookup results and way selection (one-hot and binary forms)
logic tag_hit;
logic [CONFIG.DCACHE.WAYS-1:0] tag_hit_way;
logic [LOG2_DCACHE_WAYS-1:0] tag_hit_way_int;
logic tag_update;
logic [CONFIG.DCACHE.WAYS-1:0] tag_update_way;
logic [CONFIG.DCACHE.WAYS-1:0] replacement_way;
logic [LOG2_DCACHE_WAYS-1:0] replacement_way_int;
logic [LOG2_DCACHE_WAYS-1:0] tag_update_way_int;
//Word-in-line indices
logic [SCONFIG.SUB_LINE_ADDR_W-1:0] word_count;
logic [SCONFIG.SUB_LINE_ADDR_W-1:0] sc_write_index;
logic [SCONFIG.SUB_LINE_ADDR_W-1:0] update_word_index;
logic line_complete;
logic reservation;
stage2_t stage2;
logic [31:0] dbank_data_out;
logic [31:0] hit_data;
logic [31:0] miss_data;
logic [31:0] new_line_data;
logic [31:0] amo_result;
logic [31:0] amo_rs2;
logic[3:0] write_hit_be;
logic second_cycle;
logic new_arb_request;
logic arb_request_r;
logic is_target_word;
logic hit_allowed;
logic read_hit_allowed;
logic read_hit_data_valid;
logic read_hit;
logic address_range_valid;
logic idle;
logic read_miss_complete;
logic store_complete;
amo_alu_inputs_t amo_alu_inputs;
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//2nd Cycle Control Signals
//Captured only when a new request is accepted, so stage2 holds the request until the next one arrives
always_ff @ (posedge clk) begin
if (ls.new_request) begin
stage2.addr <= ls.addr[31:2];
stage2.be <= ls.be;
stage2.load <= ls.re;
stage2.store <= ls.we;
stage2.data <= ls.data_in;
stage2.amo <= amo;
stage2.uncacheable <= uncacheable;
end
end
////////////////////////////////////////////////////
//General Control Logic
//LR and AMO ops are forced misses (if there is a tag hit they will reuse the same way)
//Signal is valid for a single cycle, RAM enables are used to hold outputs in case of pipeline stalls
//read_hit_allowed and second_cycle are both registered from ls.new_request, so they are high in the same cycle
//read_hit_data_valid is one cycle later and lines up with ls.data_valid for the ls.data_out selection
always_ff @ (posedge clk) begin
read_hit_allowed <= ls.new_request & ls.re & dcache_on & ~(amo.is_lr | amo.is_amo) & ~uncacheable;
read_hit_data_valid <= read_hit_allowed;
second_cycle <= ls.new_request;
tag_update <= second_cycle & dcache_on & stage2.load & ~tag_hit & ~stage2.uncacheable;//Cache enabled, read miss
end
assign read_hit = tag_hit & read_hit_allowed;
//LR reservation, cleared on exceptions
//A second-cycle update takes priority over sc_complete/clear_reservation in the same cycle
//NOTE(review): reservation is written here but not read anywhere else in this module
always_ff @ (posedge clk) begin
if (rst)
reservation <= 0;
else if (second_cycle)
reservation <= stage2.amo.is_lr;
else if (sc_complete | clear_reservation)
reservation <= 0;
end
////////////////////////////////////////////////////
//L1 Arbiter Interface
assign l1_request.addr = {stage2.addr, 2'b0} ;//Memory interface aligns request to burst size (done there to support AMO line-read word-write)
assign l1_request.data = stage2.data;
assign l1_request.rnw = ~stage2.store;
assign l1_request.be = stage2.be;
//Cacheable loads request LINE_W-1 in the size field; all other requests use 0
assign l1_request.size = (stage2.load & ~stage2.uncacheable) ? 5'(CONFIG.DCACHE.LINE_W-1) : 0;//LR and AMO ops are included in load
assign l1_request.is_amo = (stage2.amo.is_amo | stage2.amo.is_lr | stage2.amo.is_sc);
assign l1_request.amo = stage2.amo.op;
//Counts the words returned by the L1 arbiter; returns to zero when the line (or single uncacheable word) completes
always_ff @ (posedge clk) begin
if (rst | line_complete)
word_count <= 0;
else if (l1_response.data_valid)
word_count <= word_count + 1;
end
//The returned word matching the low bits of the request address; always true for uncacheable requests
assign is_target_word = (stage2.addr[SCONFIG.SUB_LINE_ADDR_W-1:0] == word_count) | stage2.uncacheable;
//Anything that is not a read hit in the second cycle is sent to the arbiter
assign new_arb_request = second_cycle & (~read_hit);
//Holds the request asserted after the second cycle until the arbiter acknowledges it
always_ff @ (posedge clk) begin
if (rst)
arb_request_r <= 0;
else if (second_cycle & ~l1_request.ack)
arb_request_r <= new_arb_request;
else if (l1_request.ack)
arb_request_r <= 0;
end
assign l1_request.request = new_arb_request | arb_request_r;
////////////////////////////////////////////////////
//Replacement policy (free running one-hot cycler, i.e. pseudo random)
cycler #(CONFIG.DCACHE.WAYS) replacement_policy (
.clk (clk),
.rst (rst),
.en (1'b1),
.one_hot (replacement_way)
);
//One-hot tag hit / update logic to binary int
one_hot_to_integer #(CONFIG.DCACHE.WAYS)
hit_way_conv (
.one_hot(tag_hit_way),
.int_out(tag_hit_way_int)
);
one_hot_to_integer #(CONFIG.DCACHE.WAYS)
update_way_conv (
.one_hot (replacement_way),
.int_out (replacement_way_int)
);
//If atomic load (LR or AMO op) and there's a tag hit reuse same line
//The chosen way is latched in the second cycle so it stays fixed while the line is filled
logic stage2_amo_with_load;
assign stage2_amo_with_load = stage2.amo.is_amo | stage2.amo.is_lr;
always_ff @ (posedge clk) begin
if (second_cycle) begin
tag_update_way<= (stage2_amo_with_load & tag_hit) ? tag_hit_way : replacement_way;
tag_update_way_int <= (stage2_amo_with_load & tag_hit) ? tag_hit_way_int : replacement_way_int;
end
end
////////////////////////////////////////////////////
//Tag banks
//Looked up with the incoming address (stage1) and updated with the stage2 address on a read miss
dtag_banks #(.CONFIG(CONFIG), .SCONFIG(SCONFIG))
dcache_tag_banks (
.clk (clk),
.rst (rst),
.stage1_addr (ls.addr[31:2]),
.stage2_addr (stage2.addr),
.inv_addr (l1_response.inv_addr),
.update_way (tag_update_way),
.update (tag_update),
.stage1_adv (ls.new_request),
.stage1_inv (1'b0),//For software invalidation
.extern_inv (l1_response.inv_valid),
.extern_inv_complete (l1_response.inv_ack),
.tag_hit (tag_hit),
.tag_hit_way (tag_hit_way)
);
////////////////////////////////////////////////////
//AMO logic
//Registered copy of the stage2 store data, used as the rs2 operand of the AMO ALU
always_ff @ (posedge clk) begin
amo_rs2 <= stage2.data;
end
assign amo_alu_inputs.rs1_load = l1_response.data;
assign amo_alu_inputs.rs2 = amo_rs2;
assign amo_alu_inputs.op = stage2.amo.op;
//amo_result is only driven when the AMO ALU is included in the configuration
generate if (CONFIG.INCLUDE_AMO)
amo_alu amo_unit (
.amo_alu_inputs (amo_alu_inputs),
.result (amo_result)
);
endgenerate
//Data written through port b: AMO result for the target word, store data for SC, otherwise the returned word
always_comb begin
if (stage2.amo.is_amo & is_target_word)
new_line_data = amo_result;
else if (stage2.amo.is_sc)
new_line_data = stage2.data;
else
new_line_data = l1_response.data;
end
assign sc_write_index = stage2.addr[SCONFIG.SUB_LINE_ADDR_W-1:0];
////////////////////////////////////////////////////
//Data Bank(s)
//Tag bank selection done with upper address bits
//On miss, word index in line provided by: update_word_index
//Port a byte enables are only active on a tag hit
//NOTE(review): not qualified by stage2.store or by the SC outcome - presumably ls.be is zero for loads; confirm against the load-store unit
assign write_hit_be = stage2.be & {4{tag_hit}};
assign update_word_index = stage2.amo.is_sc ? sc_write_index : word_count;
//With more than one way the binary way index forms the upper address bits of the data bank
generate if (CONFIG.DCACHE.WAYS == 1) begin : bank_sel_gen
assign data_bank_addr_a = stage2.addr[SCONFIG.LINE_ADDR_W+SCONFIG.SUB_LINE_ADDR_W-1:0];
assign data_bank_addr_b = {stage2.addr[SCONFIG.LINE_ADDR_W+SCONFIG.SUB_LINE_ADDR_W-1:SCONFIG.SUB_LINE_ADDR_W], update_word_index};
end else begin
assign data_bank_addr_a = {tag_hit_way_int, stage2.addr[SCONFIG.LINE_ADDR_W+SCONFIG.SUB_LINE_ADDR_W-1:0]};
assign data_bank_addr_b = {tag_update_way_int, stage2.addr[SCONFIG.LINE_ADDR_W+SCONFIG.SUB_LINE_ADDR_W-1:SCONFIG.SUB_LINE_ADDR_W], update_word_index};
end endgenerate
//Port a: hit access in the second cycle. Port b: written for each returned cacheable word and for a successful SC
ddata_bank #(.LINES(DCACHE_SIZE_IN_WORDS)) data_bank (
.clk(clk),
.addr_a(data_bank_addr_a),
.addr_b(data_bank_addr_b),
.en_a(second_cycle),
.en_b((l1_response.data_valid & ~stage2.uncacheable) | (sc_complete & sc_success)),
.be_a(write_hit_be),
.data_in_a(stage2.data),
.data_in_b(new_line_data),
.data_out_a(dbank_data_out)
);
////////////////////////////////////////////////////
//Output
//miss_data holds the requested word from the arbiter, or the SC result in bit 0
always_ff @ (posedge clk) begin
if (l1_response.data_valid & is_target_word)
miss_data <= l1_response.data;
else if (sc_complete)
miss_data <= {31'b0, sc_success};
end
assign ls.data_out = read_hit_data_valid ? dbank_data_out : miss_data;
////////////////////////////////////////////////////
//Pipeline Advancement
assign line_complete = l1_response.data_valid & ((word_count == $clog2(CONFIG.DCACHE.LINE_W)'(CONFIG.DCACHE.LINE_W-1)) | stage2.uncacheable); //covers load, LR, AMO
//Plain stores complete as soon as the arbiter acknowledges them; SC completes through sc_complete instead
assign store_complete = l1_request.ack & stage2.store & ~stage2.amo.is_sc;
//read miss complete includes store conditional complete
always_ff @ (posedge clk) begin
if (rst)
read_miss_complete <= 0;
else
read_miss_complete <= line_complete | sc_complete;
end
//Registered, so data_valid is raised the cycle after the target word, read hit, or SC completion is seen
always_ff @ (posedge clk) begin
if (rst)
ls.data_valid <= 0;
else
ls.data_valid <= (l1_response.data_valid & is_target_word) | read_hit | sc_complete;
end
assign ls.ready = read_hit | store_complete | read_miss_complete | idle;
//idle is set out of reset, cleared when a request is accepted, and set again once ready is seen with no new request
always_ff @ (posedge clk) begin
if (rst)
idle <= 1;
else if (ls.new_request)
idle <= 0;
else if (ls.ready)
idle <= 1;
end
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
//A new request must only be presented while ls.ready is high
dcache_request_when_not_ready_assertion:
assert property (@(posedge clk) disable iff (rst) ls.new_request |-> ls.ready)
else $error("dcache received request when not ready");
endmodule

View file

@ -1,47 +0,0 @@
/*
* Copyright © 2017 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Data storage for the data cache: a thin wrapper around byte_en_BRAM.
//Port a has byte enables and read data; port b is write only and always writes whole words.
module ddata_bank
    import cva5_config::*;
    import cva5_types::*;
#(
    parameter LINES = 2048 //Number of 32-bit entries; sets the address width
)
(
    input logic clk,
    input logic[$clog2(LINES)-1:0] addr_a,
    input logic en_a,
    input logic[3:0] be_a,
    input logic[31:0] data_in_a,
    output logic[31:0] data_out_a,
    //write only port
    input logic[$clog2(LINES)-1:0] addr_b,
    input logic en_b,
    input logic[31:0] data_in_b
);
    //Port b has no per-byte control: its enable is replicated across all four byte lanes
    logic [3:0] port_b_byte_en;
    assign port_b_byte_en = {4{en_b}};

    //Remaining ports are connected by name to this module's ports; port b read data is left unconnected
    byte_en_BRAM #(LINES, "", 0) ram_block (
        .be_b (port_b_byte_en),
        .data_out_b (),
        .*
    );
endmodule

View file

@ -26,11 +26,10 @@ module decode_and_issue
import riscv_types::*;
import cva5_types::*;
import csr_types::*;
import opcodes::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG,
parameter NUM_UNITS = 7,
parameter unit_id_param_t UNIT_IDS = EXAMPLE_UNIT_IDS
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
@ -45,161 +44,148 @@ module decode_and_issue
//Renamer
renamer_interface.decode renamer,
renamer_interface.decode fp_renamer,
input logic [MAX_NUM_UNITS-1:0] unit_needed,
input logic [MAX_NUM_UNITS-1:0][REGFILE_READ_PORTS-1:0] unit_uses_rs,
input logic [1:0][2:0] fp_unit_uses_rs,
input logic [MAX_NUM_UNITS-1:0] unit_uses_rd,
input logic [1:0] fp_unit_uses_rd,
output logic decode_uses_rd,
output logic fp_decode_uses_rd,
output rs_addr_t decode_rd_addr,
output phys_addr_t decode_phys_rd_addr,
output phys_addr_t fp_decode_phys_rd_addr,
output phys_addr_t decode_phys_rs_addr [REGFILE_READ_PORTS],
output phys_addr_t fp_decode_phys_rs_addr [3],
output logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] decode_rs_wb_group [REGFILE_READ_PORTS],
output logic fp_decode_rs_wb_group [3],
output logic instruction_issued,
output logic instruction_issued_with_rd,
output logic fp_instruction_issued_with_rd,
output issue_packet_t issue,
output rs_addr_t issue_rs_addr [REGFILE_READ_PORTS],
output phys_addr_t issue_phys_rs_addr [REGFILE_READ_PORTS],
output phys_addr_t fp_issue_phys_rs_addr [3],
output logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] issue_rd_wb_group,
output logic fp_issue_rd_wb_group,
output logic issue_stage_ready,
//Register File
register_file_issue_interface.issue rf,
register_file_issue_interface.issue fp_rf,
output alu_inputs_t alu_inputs,
output load_store_inputs_t ls_inputs,
output branch_inputs_t branch_inputs,
output gc_inputs_t gc_inputs,
output csr_inputs_t csr_inputs,
output mul_inputs_t mul_inputs,
output div_inputs_t div_inputs,
output logic [31:0] constant_alu,
unit_issue_interface.decode unit_issue [NUM_UNITS-1:0],
unit_issue_interface.decode unit_issue [MAX_NUM_UNITS-1:0],
input gc_outputs_t gc,
input logic [1:0] current_privilege,
exception_interface.unit exception,
exception_interface.unit exception
);
//Trace signals
output logic tr_operand_stall,
output logic tr_unit_stall,
output logic tr_no_id_stall,
output logic tr_no_instruction_stall,
output logic tr_other_stall,
output logic tr_branch_operand_stall,
output logic tr_alu_operand_stall,
output logic tr_ls_operand_stall,
output logic tr_div_operand_stall,
output logic tr_alu_op,
output logic tr_branch_or_jump_op,
output logic tr_load_op,
output logic tr_store_op,
output logic tr_mul_op,
output logic tr_div_op,
output logic tr_misc_op,
common_instruction_t decode_instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode
output logic tr_instruction_issued_dec,
output logic [31:0] tr_instruction_pc_dec,
output logic [31:0] tr_instruction_data_dec
);
logic decode_uses_rs [REGFILE_READ_PORTS];
logic fp_decode_uses_rs [3];
logic [2:0] fn3;
logic [6:0] opcode;
logic [4:0] opcode_trim;
rs_addr_t decode_rs_addr [REGFILE_READ_PORTS];
rs_addr_t fp_decode_rs_addr [3];
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] decode_wb_group;
logic fp_decode_wb_group;
logic uses_rs [REGFILE_READ_PORTS];
logic uses_rd;
logic issue_hold;
logic [REGFILE_READ_PORTS-1:0] operand_ready;
logic [2:0] fp_operand_ready;
logic [MAX_NUM_UNITS-1:0] unit_needed_issue_stage;
logic [MAX_NUM_UNITS-1:0] issue_to;
rs_addr_t rs_addr [REGFILE_READ_PORTS];
rs_addr_t rd_addr;
logic is_csr;
logic is_fence;
logic is_ifence;
logic csr_imm_op;
logic environment_op;
logic issue_valid;
logic operands_ready;
logic mult_div_op;
logic [NUM_UNITS-1:0] unit_needed;
logic [NUM_UNITS-1:0] unit_needed_issue_stage;
logic [NUM_UNITS-1:0] unit_ready;
logic [NUM_UNITS-1:0] issue_ready;
logic [NUM_UNITS-1:0] issue_to;
rs_addr_t issue_rs_addr [REGFILE_READ_PORTS];
phys_addr_t issue_phys_rs_addr [REGFILE_READ_PORTS];
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] issue_rs_wb_group [REGFILE_READ_PORTS];
logic fp_issue_rs_wb_group [3];
logic issue_uses_rs [REGFILE_READ_PORTS];
logic fp_issue_uses_rs [3];
logic pre_issue_exception_pending;
logic illegal_instruction_pattern;
logic issue_stage_ready;
logic [REGFILE_READ_PORTS-1:0] rs_conflict;
genvar i;
////////////////////////////////////////////////////
//Implementation
//Can move data into issue stage if:
// there is no instruction currently in the issue stage, or
// an instruction could issue (issue_flush, issue_hold and whether the instruction is valid are not needed in this check)
assign issue_stage_ready = ((~issue.stage_valid) | (issue_valid & |issue_ready)) & ~gc.issue_hold;
// an instruction could issue (ignoring gc.fetch_flush)
assign issue_stage_ready = (~issue.stage_valid) | (|issue_to);
assign decode_advance = decode.valid & issue_stage_ready;
//Instruction aliases
assign opcode = decode.instruction[6:0];
assign opcode_trim = opcode[6:2];
assign fn3 = decode.instruction[14:12];
assign rs_addr[RS1] = decode.instruction[19:15];
assign rs_addr[RS2] = decode.instruction[24:20];
assign rd_addr = decode.instruction[11:7];
assign is_csr = CONFIG.INCLUDE_CSRS & (opcode_trim == SYSTEM_T) & (fn3 != 0);
assign is_fence = (opcode_trim == FENCE_T) & ~fn3[0];
assign is_ifence = CONFIG.INCLUDE_IFENCE & (opcode_trim == FENCE_T) & fn3[0];
assign csr_imm_op = (opcode_trim == SYSTEM_T) & fn3[2];
assign environment_op = (opcode_trim == SYSTEM_T) & (fn3 == 0);
assign decode_instruction = decode.instruction;
always_comb begin
decode_rs_addr = '{default: '0};
decode_rs_addr[RS1] = decode_instruction.rs1_addr;
decode_rs_addr[RS2] = decode_instruction.rs2_addr;
fp_decode_rs_addr = '{default: '0};
fp_decode_rs_addr[RS1] = decode_instruction.rs1_addr;
fp_decode_rs_addr[RS2] = decode_instruction.rs2_addr;
fp_decode_rs_addr[RS3] = decode_instruction.fn7[6:2];
end
////////////////////////////////////////////////////
//Register File Support
assign uses_rs[RS1] = opcode_trim inside {JALR_T, BRANCH_T, LOAD_T, STORE_T, ARITH_IMM_T, ARITH_T, AMO_T} | is_csr;
assign uses_rs[RS2] = opcode_trim inside {BRANCH_T, ARITH_T, AMO_T};//Stores are exempted due to store forwarding
assign uses_rd = opcode_trim inside {LUI_T, AUIPC_T, JAL_T, JALR_T, LOAD_T, ARITH_IMM_T, ARITH_T} | is_csr;
always_comb begin
decode_uses_rd = |unit_uses_rd;
fp_decode_uses_rd = |fp_unit_uses_rd;
decode_uses_rs = '{default: 0};
for (int i = 0; i < MAX_NUM_UNITS; i++)
for (int j = 0; j < REGFILE_READ_PORTS; j++)
decode_uses_rs[j] |= unit_uses_rs[i][j];
fp_decode_uses_rs = '{default: 0};
for (int i = 0; i < 2; i++)
for (int j = 0; j < 3; j++)
fp_decode_uses_rs[j] |= fp_unit_uses_rs[i][j];
end
////////////////////////////////////////////////////
//Unit Determination
assign unit_needed[UNIT_IDS.BR] = opcode_trim inside {BRANCH_T, JAL_T, JALR_T};
assign unit_needed[UNIT_IDS.ALU] = (opcode_trim inside {ARITH_T, ARITH_IMM_T, AUIPC_T, LUI_T, JAL_T, JALR_T}) & ~mult_div_op;
assign unit_needed[UNIT_IDS.LS] = opcode_trim inside {LOAD_T, STORE_T, AMO_T} | is_fence;
generate if (CONFIG.INCLUDE_CSRS)
assign unit_needed[UNIT_IDS.CSR] = is_csr;
endgenerate
assign unit_needed[UNIT_IDS.IEC] = (opcode_trim inside {SYSTEM_T} & ~is_csr & CONFIG.INCLUDE_M_MODE) | is_ifence;
//WB Group Determination
localparam units_t [MAX_NUM_UNITS-1:0] WB_UNITS_TYPE_REP = get_wb_units_type_representation(CONFIG.WB_GROUP);
logic [CONFIG.NUM_WB_GROUPS-1:0] uses_wb_group;
always_comb begin
for (int i = 0; i < CONFIG.NUM_WB_GROUPS; i++)
uses_wb_group[i] = |(unit_needed & WB_UNITS_TYPE_REP[i]);
end
assign mult_div_op = (opcode_trim == ARITH_T) && decode.instruction[25];
generate if (CONFIG.INCLUDE_MUL)
assign unit_needed[UNIT_IDS.MUL] = mult_div_op && ~fn3[2];
endgenerate
one_hot_to_integer #(.C_WIDTH(CONFIG.NUM_WB_GROUPS))
wb_group_one_hot_block (
.one_hot (uses_wb_group),
.int_out (decode_wb_group)
);
generate if (CONFIG.INCLUDE_DIV)
assign unit_needed[UNIT_IDS.DIV] = mult_div_op && fn3[2];
endgenerate
assign fp_decode_wb_group = unit_needed[FPU_ID];
////////////////////////////////////////////////////
//Renamer Support
assign renamer.rd_addr = rd_addr;
assign renamer.rs_addr = rs_addr;
assign renamer.uses_rd = uses_rd;
assign renamer.rd_wb_group = ~unit_needed[UNIT_IDS.ALU];//TODO: automate generation of wb group logic
assign renamer.rd_addr = decode_instruction.rd_addr;
assign fp_renamer.rd_addr = decode_instruction.rd_addr;
assign renamer.rs_addr = decode_rs_addr;
assign fp_renamer.rs_addr = fp_decode_rs_addr;
assign renamer.uses_rd = decode_uses_rd;
assign fp_renamer.uses_rd = fp_decode_uses_rd;
assign renamer.rd_wb_group = decode_wb_group;
assign fp_renamer.rd_wb_group = fp_decode_wb_group;
assign renamer.id = decode.id;
assign fp_renamer.id = decode.id;
////////////////////////////////////////////////////
//Decode ID Support
assign decode_uses_rd = uses_rd;
assign decode_rd_addr = rd_addr;
assign decode_rd_addr = decode_instruction.rd_addr;
assign decode_phys_rd_addr = renamer.phys_rd_addr;
assign fp_decode_phys_rd_addr = fp_renamer.phys_rd_addr;
assign decode_phys_rs_addr = renamer.phys_rs_addr;
assign fp_decode_phys_rs_addr = fp_renamer.phys_rs_addr;
assign decode_rs_wb_group = renamer.rs_wb_group;
assign fp_decode_rs_wb_group = fp_renamer.rs_wb_group;
////////////////////////////////////////////////////
//Issue
@ -208,18 +194,25 @@ module decode_and_issue
issue.pc <= decode.pc;
issue.instruction <= decode.instruction;
issue.fetch_metadata <= decode.fetch_metadata;
issue.fn3 <= fn3;
issue.opcode <= opcode;
issue_rs_addr <= rs_addr;
issue.fn3 <= decode_instruction.fn3;
issue.opcode <= decode.instruction[6:0];
issue_rs_addr <= decode_rs_addr;
issue_phys_rs_addr <= renamer.phys_rs_addr;
fp_issue_phys_rs_addr <= fp_renamer.phys_rs_addr;
issue_rs_wb_group <= renamer.rs_wb_group;
issue.rd_addr <= rd_addr;
fp_issue_rs_wb_group <= fp_renamer.rs_wb_group;
issue.rd_addr <= decode_instruction.rd_addr;
issue.phys_rd_addr <= renamer.phys_rd_addr;
issue.is_multicycle <= ~unit_needed[UNIT_IDS.ALU];
issue.fp_phys_rd_addr <= fp_renamer.phys_rd_addr;
issue_rd_wb_group <= decode_wb_group;
fp_issue_rd_wb_group <= fp_decode_wb_group;
issue.is_multicycle <= ~unit_needed[ALU_ID];
issue.id <= decode.id;
issue.exception_unit <= decode_exception_unit;
issue_uses_rs <= uses_rs;
issue.uses_rd <= uses_rd;
issue_uses_rs <= decode_uses_rs;
fp_issue_uses_rs <= fp_decode_uses_rs;
issue.uses_rd <= decode_uses_rd;
issue.fp_uses_rd <= fp_decode_uses_rd;
end
end
@ -236,379 +229,105 @@ module decode_and_issue
end
////////////////////////////////////////////////////
//Unit ready
generate for (i=0; i<NUM_UNITS; i++)
assign unit_ready[i] = unit_issue[i].ready;
//Issue Determination
assign issue_hold = gc.issue_hold | pre_issue_exception_pending;
generate for (genvar i=0; i<REGFILE_READ_PORTS; i++)
assign operand_ready[i] = ~rf.inuse[i] | (rf.inuse[i] & ~issue_uses_rs[i]);
endgenerate
generate for (genvar i=0; i<3; i++)
assign fp_operand_ready[i] = ~fp_rf.inuse[i] | (fp_rf.inuse[i] & ~fp_issue_uses_rs[i]);
endgenerate
////////////////////////////////////////////////////
//Issue Determination
generate for (i=0; i<REGFILE_READ_PORTS; i++)
assign rs_conflict[i] = rf.inuse[i] & issue_uses_rs[i];
endgenerate
assign operands_ready = ~|rs_conflict;
//Unit EX signals
generate for (genvar i = 0; i < MAX_NUM_UNITS; i++) begin : gen_unit_issue_signals
assign unit_issue[i].possible_issue = issue.stage_valid & unit_needed_issue_stage[i] & unit_issue[i].ready;
assign issue_to[i] = unit_issue[i].possible_issue & (&operand_ready) & (&fp_operand_ready) & ~issue_hold;
assign unit_issue[i].new_request = issue_to[i] & ~gc.fetch_flush;
assign unit_issue[i].id = issue.id;
end endgenerate
assign issue_ready = unit_needed_issue_stage & unit_ready;
assign issue_valid = issue.stage_valid & operands_ready & ~gc.issue_hold & ~pre_issue_exception_pending;
assign issue_to = {NUM_UNITS{issue_valid & ~gc.fetch_flush}} & issue_ready;
assign instruction_issued = issue_valid & ~gc.fetch_flush & |issue_ready;
assign instruction_issued = |issue_to & ~gc.fetch_flush;
assign instruction_issued_with_rd = instruction_issued & issue.uses_rd;
assign fp_instruction_issued_with_rd = instruction_issued & issue.fp_uses_rd;
////////////////////////////////////////////////////
//Register File Issue Interface
assign rf.phys_rs_addr = issue_phys_rs_addr;
assign fp_rf.phys_rs_addr = fp_issue_phys_rs_addr;
assign rf.phys_rd_addr = issue.phys_rd_addr;
assign fp_rf.phys_rd_addr = issue.fp_phys_rd_addr;
assign rf.rs_wb_group = issue_rs_wb_group;
assign fp_rf.rs_wb_group = fp_issue_rs_wb_group;
assign rf.single_cycle_or_flush = (instruction_issued_with_rd & |issue.rd_addr & ~issue.is_multicycle) | (issue.stage_valid & issue.uses_rd & |issue.rd_addr & gc.fetch_flush);
assign fp_rf.single_cycle_or_flush = issue.stage_valid & issue.fp_uses_rd & gc.fetch_flush;
////////////////////////////////////////////////////
//ALU unit inputs
logic [XLEN-1:0] alu_rs2_data;
logic alu_imm_type;
logic [31:0] constant_alu;
alu_op_t alu_op;
alu_op_t alu_op_r;
alu_logic_op_t alu_logic_op;
alu_logic_op_t alu_logic_op_r;
logic alu_subtract;
logic sub_instruction;
always_comb begin
case (opcode_trim) inside
LUI_T, AUIPC_T, JAL_T, JALR_T : alu_op = ALU_CONSTANT;
default :
case (fn3) inside
SLTU_fn3, SLT_fn3 : alu_op = ALU_SLT;
SLL_fn3, SRA_fn3 : alu_op = ALU_SHIFT;
default : alu_op = ALU_ADD_SUB;
endcase
endcase
end
always_comb begin
case (fn3)
XOR_fn3 : alu_logic_op = ALU_LOGIC_XOR;
OR_fn3 : alu_logic_op = ALU_LOGIC_OR;
AND_fn3 : alu_logic_op = ALU_LOGIC_AND;
default : alu_logic_op = ALU_LOGIC_ADD;//ADD/SUB/SLT/SLTU
endcase
end
assign sub_instruction = (fn3 == ADD_SUB_fn3) && decode.instruction[30] && opcode[5];//If ARITH instruction
//Constant ALU:
// provides LUI, AUIPC, JAL, JALR results for ALU
// provides PC+4 for BRANCH unit and ifence in GC unit
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
constant_alu <= ((opcode_trim inside {LUI_T}) ? '0 : decode.pc) + ((opcode_trim inside {LUI_T, AUIPC_T}) ? {decode.instruction[31:12], 12'b0} : 4);
alu_imm_type <= opcode_trim inside {ARITH_IMM_T};
alu_op_r <= alu_op;
alu_subtract <= (fn3 inside {SLTU_fn3, SLT_fn3}) || sub_instruction;
alu_logic_op_r <= alu_logic_op;
end
if (issue_stage_ready)
constant_alu <= ((decode_instruction.upper_opcode inside {LUI_T}) ? '0 : decode.pc) + ((decode_instruction.upper_opcode inside {LUI_T, AUIPC_T}) ? {decode.instruction[31:12], 12'b0} : 4);
end
//Shifter related
assign alu_inputs.lshift = ~issue.fn3[2];
assign alu_inputs.shift_amount = alu_imm_type ? issue_rs_addr[RS2] : rf.data[RS2][4:0];
assign alu_inputs.arith = rf.data[RS1][XLEN-1] & issue.instruction[30];//shift in bit
assign alu_inputs.shifter_in = rf.data[RS1];
//LUI, AUIPC, JAL, JALR
assign alu_inputs.constant_adder = constant_alu;
//logic and adder
assign alu_inputs.subtract = alu_subtract;
assign alu_inputs.logic_op = alu_logic_op_r;
assign alu_inputs.in1 = {(rf.data[RS1][XLEN-1] & ~issue.fn3[0]), rf.data[RS1]};//(fn3[0] is SLTU_fn3);
assign alu_rs2_data = alu_imm_type ? 32'(signed'(issue.instruction[31:20])) : rf.data[RS2];
assign alu_inputs.in2 = {(alu_rs2_data[XLEN-1] & ~issue.fn3[0]), alu_rs2_data};
assign alu_inputs.alu_op = alu_op_r;
////////////////////////////////////////////////////
//Load Store unit inputs
logic is_load;
logic is_store;
logic amo_op;
logic store_conditional;
logic load_reserve;
logic [4:0] amo_type;
assign amo_op = CONFIG.INCLUDE_AMO ? (opcode_trim == AMO_T) : 1'b0;
assign amo_type = decode.instruction[31:27];
assign store_conditional = (amo_type == AMO_SC_FN5);
assign load_reserve = (amo_type == AMO_LR_FN5);
generate if (CONFIG.INCLUDE_AMO) begin : gen_decode_ls_amo
assign ls_inputs.amo.is_lr = load_reserve;
assign ls_inputs.amo.is_sc = store_conditional;
assign ls_inputs.amo.is_amo = amo_op & ~(load_reserve | store_conditional);
assign ls_inputs.amo.op = amo_type;
end
else begin
assign ls_inputs.amo = '0;
end
endgenerate
assign is_load = (opcode_trim inside {LOAD_T, AMO_T}) && !(amo_op & store_conditional); //LR and AMO_ops perform a read operation as well
assign is_store = (opcode_trim == STORE_T) || (amo_op && store_conditional);//Used for LS unit and for ID tracking
logic [11:0] ls_offset;
logic is_load_r;
logic is_store_r;
logic is_fence_r;
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
ls_offset <= opcode[5] ? {decode.instruction[31:25], decode.instruction[11:7]} : decode.instruction[31:20];
is_load_r <= is_load;
is_store_r <= is_store;
is_fence_r <= is_fence;
end
end
(* ramstyle = "MLAB, no_rw_check" *) id_t rd_to_id_table [32];
always_ff @ (posedge clk) begin
if (instruction_issued_with_rd)
rd_to_id_table[issue.rd_addr] <= issue.id;
end
assign ls_inputs.offset = ls_offset;
assign ls_inputs.load = is_load_r;
assign ls_inputs.store = is_store_r;
assign ls_inputs.fence = is_fence_r;
assign ls_inputs.fn3 = amo_op ? LS_W_fn3 : issue.fn3;
assign ls_inputs.rs1 = rf.data[RS1];
assign ls_inputs.rs2 = rf.data[RS2];
assign ls_inputs.forwarded_store = rf.inuse[RS2];
assign ls_inputs.store_forward_id = rd_to_id_table[issue_rs_addr[RS2]];
////////////////////////////////////////////////////
//Branch unit inputs
////////////////////////////////////////////////////
//RAS Support
logic rs1_link;
logic rd_link;
logic rs1_eq_rd;
logic is_return;
logic is_call;
assign rs1_link = (rs_addr[RS1] inside {1,5});
assign rd_link = (rd_addr inside {1,5});
assign rs1_eq_rd = (rs_addr[RS1] == rd_addr);
logic br_use_signed;
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
is_return <= (opcode_trim == JALR_T) && ((rs1_link & ~rd_link) | (rs1_link & rd_link & ~rs1_eq_rd));
is_call <= (opcode_trim inside {JAL_T, JALR_T}) && rd_link;
br_use_signed <= !(fn3 inside {BLTU_fn3, BGEU_fn3});
end
end
logic[19:0] jal_imm;
logic[11:0] jalr_imm;
logic[11:0] br_imm;
logic [20:0] pc_offset;
logic [20:0] pc_offset_r;
assign jal_imm = {decode.instruction[31], decode.instruction[19:12], decode.instruction[20], decode.instruction[30:21]};
assign jalr_imm = decode.instruction[31:20];
assign br_imm = {decode.instruction[31], decode.instruction[7], decode.instruction[30:25], decode.instruction[11:8]};
always_comb begin
case (opcode[3:2])
2'b11 : pc_offset = 21'(signed'({jal_imm, 1'b0}));
2'b01 : pc_offset = 21'(signed'(jalr_imm));
default : pc_offset = 21'(signed'({br_imm, 1'b0}));
endcase
end
logic jalr;
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
pc_offset_r <= pc_offset;
jalr <= (~opcode[3] & opcode[2]);
end
end
assign branch_inputs.is_return = is_return;
assign branch_inputs.is_call = is_call;
assign branch_inputs.fn3 = issue.fn3;
assign branch_inputs.pc_offset = pc_offset_r;
assign branch_inputs.jal = issue.opcode[3];//(opcode == JAL);
assign branch_inputs.jalr = jalr;
assign branch_inputs.jal_jalr = issue.opcode[2];
assign branch_inputs.issue_pc = issue.pc;
assign branch_inputs.issue_pc_valid = issue.stage_valid;
assign branch_inputs.rs1 = {(rf.data[RS1][31] & br_use_signed), rf.data[RS1]};
assign branch_inputs.rs2 = {(rf.data[RS2][31] & br_use_signed), rf.data[RS2]};
assign branch_inputs.pc_p4 = constant_alu;
////////////////////////////////////////////////////
//Global Control unit inputs
logic is_ecall_r;
logic is_ebreak_r;
logic is_mret_r;
logic is_sret_r;
logic is_ifence_r;
logic [7:0] sys_op_match;
typedef enum logic [2:0] {
ECALL_i = 0,
EBREAK_i = 1,
URET_i = 2,
SRET_i = 3,
MRET_i = 4,
SFENCE_i = 5
} sys_op_index_t;
always_comb begin
sys_op_match = '0;
case (decode.instruction[31:20]) inside
ECALL_imm : sys_op_match[ECALL_i] = CONFIG.INCLUDE_M_MODE;
EBREAK_imm : sys_op_match[EBREAK_i] = CONFIG.INCLUDE_M_MODE;
SRET_imm : sys_op_match[SRET_i] = CONFIG.INCLUDE_S_MODE;
MRET_imm : sys_op_match[MRET_i] = CONFIG.INCLUDE_M_MODE;
SFENCE_imm : sys_op_match[SFENCE_i] = CONFIG.INCLUDE_S_MODE;
default : sys_op_match = '0;
endcase
end
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
is_ecall_r <= sys_op_match[ECALL_i];
is_ebreak_r <= sys_op_match[EBREAK_i];
is_mret_r <= sys_op_match[MRET_i];
is_sret_r <= sys_op_match[SRET_i];
is_ifence_r <= is_ifence;
end
end
assign gc_inputs.pc_p4 = constant_alu;
assign gc_inputs.is_ifence = is_ifence_r;
assign gc_inputs.is_mret = is_mret_r;
assign gc_inputs.is_sret = is_sret_r;
////////////////////////////////////////////////////
//CSR unit inputs
generate if (CONFIG.INCLUDE_CSRS) begin : gen_decode_csr_inputs
assign csr_inputs.addr = issue.instruction[31:20];
assign csr_inputs.op = issue.fn3[1:0];
assign csr_inputs.data = issue.fn3[2] ? {27'b0, issue_rs_addr[RS1]} : rf.data[RS1];
assign csr_inputs.reads = ~((issue.fn3[1:0] == CSR_RW) && (issue.rd_addr == 0));
assign csr_inputs.writes = ~((issue.fn3[1:0] == CSR_RC) && (issue_rs_addr[RS1] == 0));
end endgenerate
////////////////////////////////////////////////////
//Mul unit inputs
generate if (CONFIG.INCLUDE_MUL) begin : gen_decode_mul_inputs
assign mul_inputs.rs1 = rf.data[RS1];
assign mul_inputs.rs2 = rf.data[RS2];
assign mul_inputs.op = issue.fn3[1:0];
end endgenerate
////////////////////////////////////////////////////
//Div unit inputs
generate if (CONFIG.INCLUDE_DIV) begin : gen_decode_div_inputs
phys_addr_t prev_div_rs_addr [2];
logic [1:0] div_rd_match;
logic prev_div_result_valid;
logic div_rs_overwrite;
logic div_op_reuse;
always_ff @(posedge clk) begin
if (issue_to[UNIT_IDS.DIV])
prev_div_rs_addr <= issue_phys_rs_addr[RS1:RS2];
end
assign div_op_reuse = {prev_div_result_valid, prev_div_rs_addr[RS1], prev_div_rs_addr[RS2]} == {1'b1, issue_phys_rs_addr[RS1],issue_phys_rs_addr[RS2]};
//Clear if prev div inputs are overwritten by another instruction
assign div_rd_match[RS1] = (issue.phys_rd_addr == prev_div_rs_addr[RS1]);
assign div_rd_match[RS2] = (issue.phys_rd_addr == prev_div_rs_addr[RS2]);
assign div_rs_overwrite = |div_rd_match;
set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE(0)) prev_div_result_valid_m (
.clk, .rst,
.set(instruction_issued & unit_needed_issue_stage[UNIT_IDS.DIV]),
.clr((instruction_issued & issue.uses_rd & div_rs_overwrite) | gc.writeback_supress), //No instructions will be issued while gc.writeback_supress is asserted
.result(prev_div_result_valid)
);
assign div_inputs.rs1 = rf.data[RS1];
assign div_inputs.rs2 = rf.data[RS2];
assign div_inputs.op = issue.fn3[1:0];
assign div_inputs.reuse_result = div_op_reuse;
end endgenerate
////////////////////////////////////////////////////
//Unit EX signals
generate for (i = 0; i < NUM_UNITS; i++) begin : gen_unit_issue_signals
assign unit_issue[i].possible_issue = issue.stage_valid & unit_needed_issue_stage[i] & unit_issue[i].ready;
assign unit_issue[i].new_request = issue_to[i];
assign unit_issue[i].id = issue.id;
end endgenerate
////////////////////////////////////////////////////
//Illegal Instruction check
logic illegal_instruction_pattern_r;
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_decode_exceptions
illegal_instruction_checker # (.CONFIG(CONFIG))
illegal_op_check (
.instruction(decode.instruction), .illegal_instruction(illegal_instruction_pattern)
);
always_ff @(posedge clk) begin
if (rst)
illegal_instruction_pattern_r <= 0;
else if (issue_stage_ready)
illegal_instruction_pattern_r <= illegal_instruction_pattern;
end
logic new_exception;
exception_code_t ecode;
exception_code_t ecall_code;
//ECALL and EBREAK captured here, but separated out when ecode is set
assign illegal_instruction_pattern = ~|unit_needed;
//TODO: Consider ways of parameterizing so that any exception generating unit
//can be automatically added to this expression
always_comb begin
unique case (1'b1)
unit_needed[UNIT_IDS.LS] : decode_exception_unit = LS_EXCEPTION;
unit_needed[UNIT_IDS.BR] : decode_exception_unit = BR_EXCEPTION;
unit_needed[LS_ID] : decode_exception_unit = LS_EXCEPTION;
unit_needed[BR_ID] : decode_exception_unit = BR_EXCEPTION;
default : decode_exception_unit = PRE_ISSUE_EXCEPTION;
endcase
if (illegal_instruction_pattern)
if (~decode.fetch_metadata.ok)
decode_exception_unit = PRE_ISSUE_EXCEPTION;
end
////////////////////////////////////////////////////
//ECALL/EBREAK
//The type of call instruction is dependent on the current privilege level
exception_code_t ecall_code;
always_comb begin
case (current_privilege)
USER_PRIVILEGE : ecall_code = ECALL_U;
SUPERVISOR_PRIVILEGE : ecall_code = ECALL_S;
SUPERVISOR_PRIVILEGE : ecall_code = ECALL_S;
MACHINE_PRIVILEGE : ecall_code = ECALL_M;
default : ecall_code = ECALL_U;
endcase
end
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
ecode <=
decode.instruction inside {ECALL} ? ecall_code :
decode.instruction inside {EBREAK} ? BREAK :
illegal_instruction_pattern ? ILLEGAL_INST :
decode.fetch_metadata.error_code; //(~decode.fetch_metadata.ok)
end
end
////////////////////////////////////////////////////
//Exception generation (ecall/ebreak/illegal instruction/propagated fetch error)
logic new_exception;
exception_code_t ecode;
always_ff @(posedge clk) begin
if (rst)
pre_issue_exception_pending <= 0;
else if (issue_stage_ready)
pre_issue_exception_pending <= illegal_instruction_pattern | (opcode_trim inside {SYSTEM_T} & ~is_csr & (sys_op_match[ECALL_i] | sys_op_match[EBREAK_i])) | ~decode.fetch_metadata.ok;
pre_issue_exception_pending <= illegal_instruction_pattern | (~decode.fetch_metadata.ok);
end
assign new_exception = issue.stage_valid & pre_issue_exception_pending & ~(gc.issue_hold | gc.fetch_flush);
assign new_exception = issue.stage_valid & pre_issue_exception_pending & ~(gc.issue_hold | gc.fetch_flush | exception.valid);
always_ff @(posedge clk) begin
if (rst)
@ -617,12 +336,6 @@ module decode_and_issue
exception.valid <= (exception.valid | new_exception) & ~exception.ack;
end
assign ecode =
illegal_instruction_pattern_r ? ILLEGAL_INST :
is_ecall_r ? ecall_code :
~issue.fetch_metadata.ok ? issue.fetch_metadata.error_code :
BREAK;
always_ff @(posedge clk) begin
if (new_exception) begin
exception.code <= ecode;
@ -639,33 +352,4 @@ module decode_and_issue
////////////////////////////////////////////////////
//Assertions
////////////////////////////////////////////////////
//Trace Interface
generate if (ENABLE_TRACE_INTERFACE) begin : gen_decode_trace
assign tr_operand_stall = issue.stage_valid & ~gc.fetch_flush & ~gc.issue_hold & ~pre_issue_exception_pending & ~operands_ready & |issue_ready;
assign tr_unit_stall = issue_valid & ~gc.fetch_flush & ~|issue_ready;
assign tr_no_id_stall = (~issue.stage_valid & ~pc_id_available & ~gc.fetch_flush); //All instructions in execution pipeline
assign tr_no_instruction_stall = (pc_id_available & ~issue.stage_valid) | gc.fetch_flush;
assign tr_other_stall = issue.stage_valid & ~instruction_issued & ~(tr_operand_stall | tr_unit_stall | tr_no_id_stall | tr_no_instruction_stall);
assign tr_branch_operand_stall = tr_operand_stall & unit_needed_issue_stage[UNIT_IDS.BR];
assign tr_alu_operand_stall = tr_operand_stall & unit_needed_issue_stage[UNIT_IDS.ALU] & ~unit_needed_issue_stage[UNIT_IDS.BR];
assign tr_ls_operand_stall = tr_operand_stall & unit_needed_issue_stage[UNIT_IDS.LS];
assign tr_div_operand_stall = tr_operand_stall & unit_needed_issue_stage[UNIT_IDS.DIV];
//Instruction Mix
always_ff @(posedge clk) begin
tr_alu_op <= issue_to[UNIT_IDS.ALU];
tr_branch_or_jump_op <= issue_to[UNIT_IDS.BR];
tr_load_op <= issue_to[UNIT_IDS.LS] & is_load_r;
tr_store_op <= issue_to[UNIT_IDS.LS] & is_store_r;
tr_mul_op <= issue_to[UNIT_IDS.MUL];
tr_div_op <= issue_to[UNIT_IDS.DIV];
tr_misc_op <= issue_to[UNIT_IDS.CSR] | issue_to[UNIT_IDS.IEC];
end
assign tr_instruction_issued_dec = instruction_issued;
assign tr_instruction_pc_dec = issue.pc;
assign tr_instruction_data_dec = issue.instruction;
end endgenerate
endmodule

View file

@ -1,137 +0,0 @@
/*
* Copyright © 2017 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
module dtag_banks

    import cva5_config::*;
    import cva5_types::*;

    # (
        parameter cpu_config_t CONFIG = EXAMPLE_CONFIG,
        parameter derived_cache_config_t SCONFIG = '{default: 0}
    )
    (
        input logic clk,
        input logic rst,

        //Word addresses used to index and compare tags
        input logic[29:0] stage1_addr,  //Port A lookup / invalidate address
        input logic[29:0] stage2_addr,  //Address compared against the port A read data and written on update
        input logic[29:0] inv_addr,     //Address of an external invalidation request

        //Tag update (port B write)
        input logic[CONFIG.DCACHE.WAYS-1:0] update_way,
        input logic update,

        //Port A control
        input logic stage1_adv,  //Enables port A
        input logic stage1_inv,  //Port A write enable; the written entry is all zeros (valid cleared)

        //External invalidation handshake
        input logic extern_inv,
        output logic extern_inv_complete,

        //Hit results for stage2_addr
        output logic tag_hit,
        output logic[CONFIG.DCACHE.WAYS-1:0] tag_hit_way
    );

    //One entry per line per way: a valid bit plus the tag
    typedef struct packed{
        logic valid;
        logic [SCONFIG.TAG_W-1:0] tag;
    } dtag_entry_t;

    //Extracts the TAG_W tag bits that sit above the line and sub-line address fields
    function logic[SCONFIG.TAG_W-1:0] getTag(logic[29:0] addr);
        return addr[SCONFIG.SUB_LINE_ADDR_W + SCONFIG.LINE_ADDR_W +: SCONFIG.TAG_W];
    endfunction

    //Extracts the LINE_ADDR_W line index that sits directly above the sub-line address field
    function logic[SCONFIG.LINE_ADDR_W-1:0] getLineAddr(logic[29:0] addr);
        return addr[SCONFIG.LINE_ADDR_W + SCONFIG.SUB_LINE_ADDR_W - 1 : SCONFIG.SUB_LINE_ADDR_W];
    endfunction

    dtag_entry_t tag_line [CONFIG.DCACHE.WAYS-1:0];      //Port A read data, per way
    dtag_entry_t inv_tag_line [CONFIG.DCACHE.WAYS-1:0];  //Port B read data, per way
    dtag_entry_t new_tagline;                            //Port B write data

    logic [CONFIG.DCACHE.WAYS-1:0] update_tag_way;       //Port B write enable, per way
    logic inv_tags_accessed;
    logic[CONFIG.DCACHE.WAYS-1:0] inv_hit_way;

    logic [SCONFIG.LINE_ADDR_W-1:0] update_port_addr;    //Port B address

    dtag_entry_t stage2_hit_comparison_tagline;
    dtag_entry_t inv_hit_comparison_tagline;
    ////////////////////////////////////////////////////
    //Implementation

    ////////////////////////////////////////////////////
    //Muxing of cache miss or invalidation control logic and tags
    //Port B is shared: an update takes priority over an external invalidation.
    //Without external invalidations the port is always addressed by stage2_addr.
    assign update_port_addr =
        CONFIG.DCACHE.USE_EXTERNAL_INVALIDATIONS ?
        ((update) ? getLineAddr(stage2_addr) : getLineAddr(inv_addr)) :
        getLineAddr(stage2_addr);

    //valid follows update, so a port B write that is not an update stores an invalid entry
    assign new_tagline = '{valid: update, tag: getTag(stage2_addr)};

    //Records that an invalidation (with no competing update) owned port B in the previous cycle
    always_ff @ (posedge clk) begin
        if (rst)
            inv_tags_accessed <= 0;
        else
            inv_tags_accessed <= extern_inv & ~update;
    end

    //Asserted when the invalidation also owns port B this cycle, i.e. on the
    //second consecutive cycle of (extern_inv & ~update)
    assign extern_inv_complete = (extern_inv & ~update) & inv_tags_accessed;

    ////////////////////////////////////////////////////
    //Memory instantiation and hit detection
    //A hit requires the stored entry to equal {valid = 1, tag of the compared address}
    assign stage2_hit_comparison_tagline = '{valid: 1, tag: getTag(stage2_addr)};
    assign inv_hit_comparison_tagline = '{valid: 1, tag: getTag(inv_addr)};

    generate for (genvar i=0; i < CONFIG.DCACHE.WAYS; i++) begin : dtag_bank_gen
        //Write a way on a cache update, or to clear the way that matched a completing invalidation
        assign update_tag_way[i] = update_way[i] | (inv_hit_way[i] & extern_inv_complete);

        tag_bank #($bits(dtag_entry_t), CONFIG.DCACHE.LINES) dtag_bank (
            .clk (clk),
            .rst (rst),
            .en_a (stage1_adv),
            .wen_a (stage1_inv),
            .addr_a (getLineAddr(stage1_addr)),
            .data_in_a ('0),
            .data_out_a (tag_line[i]),
            .en_b (update | extern_inv),
            .wen_b (update_tag_way[i]),
            .addr_b (update_port_addr),
            .data_in_b (new_tagline),
            .data_out_b(inv_tag_line[i])
        );

        assign inv_hit_way[i] = (inv_hit_comparison_tagline == inv_tag_line[i]);
        assign tag_hit_way[i] = (stage2_hit_comparison_tagline == tag_line[i]);
    end endgenerate

    assign tag_hit = |tag_hit_way;

    ////////////////////////////////////////////////////
    //Assertions

endmodule

184
core/execution_units/alu_unit.sv Executable file
View file

@ -0,0 +1,184 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
module alu_unit

    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;
    import opcodes::*;

    (
        input logic clk,
        input logic rst,

        //Decode stage: instruction classification
        input decode_packet_t decode_stage,
        output logic unit_needed,
        output logic [REGFILE_READ_PORTS-1:0] uses_rs,
        output logic uses_rd,

        //Issue stage: operands for the registered instruction
        input issue_packet_t issue_stage,
        input logic issue_stage_ready,
        input logic [31:0] constant_alu,
        input rs_addr_t issue_rs_addr [REGFILE_READ_PORTS],
        input logic [31:0] rf [REGFILE_READ_PORTS],

        unit_issue_interface.unit issue,
        unit_writeback_interface.unit wb
    );

    typedef enum logic [1:0] {
        LOGIC_XOR = 2'b00,
        LOGIC_OR = 2'b01,
        LOGIC_AND = 2'b10,
        LOGIC_OTHER = 2'b11
    } logic_op_t;

    common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode

    logic [31:0] rs2_data;
    logic imm_type;
    alu_op_t alu_op;
    alu_op_t alu_op_r;
    logic_op_t logic_op;
    logic_op_t logic_op_r;
    logic subtract;
    logic is_slt;

    logic[32:0] add_sub_result;
    logic add_sub_carry_in; //Sink for the LSB of the widened adder; never read
    logic[31:0] logic_and_upper_slt;
    logic[32:0] sign_ext_adder1;
    logic[32:0] sign_ext_adder2;
    logic[31:0] shift_result;
    logic[31:0] result;
    ////////////////////////////////////////////////////
    //Implementation

    ////////////////////////////////////////////////////
    //Decode
    assign instruction = decode_stage.instruction;

    assign unit_needed = decode_stage.instruction inside {
        LUI, AUIPC, JAL, JALR,
        ADDI, SLLI, SLTI, SLTIU, XORI, SRLI, SRAI, ORI, ANDI,
        ADD, SUB, SLL, SLT, SLTU, XOR, SRL, SRA, OR, AND
    };
    always_comb begin
        uses_rs = '0;
        uses_rs[RS1] = decode_stage.instruction inside {
            JALR,
            ADDI, SLLI, SLTI, SLTIU, XORI, SRLI, SRAI, ORI, ANDI,
            ADD, SUB, SLL, SLT, SLTU, XOR, SRL, SRA, OR, AND
        };
        uses_rs[RS2] = decode_stage.instruction inside {
            ADD, SUB, SLL, SLT, SLTU, XOR, SRL, SRA, OR, AND
        };
        //Every instruction handled by this unit writes rd
        uses_rd = unit_needed;
    end

    //Select which result path the instruction uses.
    //Bitwise logic ops share the ALU_SLT path with the set-less-than ops (see the result mux).
    always_comb begin
        case (instruction.upper_opcode) inside
            LUI_T, AUIPC_T, JAL_T, JALR_T : alu_op = ALU_CONSTANT;
            default :
                case (instruction.fn3) inside
                    XOR_fn3, OR_fn3, AND_fn3, SLTU_fn3, SLT_fn3 : alu_op = ALU_SLT;
                    SLL_fn3, SRA_fn3 : alu_op = ALU_SHIFT;
                    default : alu_op = ALU_ADD_SUB;
                endcase
        endcase
    end

    always_comb begin
        case (instruction.fn3) inside
            XOR_fn3 : logic_op = LOGIC_XOR;
            OR_fn3 : logic_op = LOGIC_OR;
            AND_fn3 : logic_op = LOGIC_AND;
            default : logic_op = LOGIC_OTHER;
        endcase
    end

    //Register the decoded controls as the instruction advances to the issue stage
    always_ff @(posedge clk) begin
        if (issue_stage_ready) begin
            imm_type <= instruction.upper_opcode inside {ARITH_IMM_T};
            alu_op_r <= alu_op;
            logic_op_r <= logic_op;
            subtract <= decode_stage.instruction inside {SUB, SLTI, SLTIU, SLT, SLTU};
            is_slt <= instruction.fn3 inside {SLT_fn3, SLTU_fn3};
        end
    end

    ////////////////////////////////////////////////////
    //Issue
    //Logic ops put through the adder carry chain to reduce resources
    //TODO: explore moving this mux into the regfile bypass mux
    //Second operand: sign-extended I-type immediate or rs2
    assign rs2_data = imm_type ? 32'(signed'(issue_stage.instruction[31:20])) : rf[RS2];

    always_comb begin
        case (logic_op_r)
            LOGIC_XOR : logic_and_upper_slt = rf[RS1] ^ rs2_data;
            LOGIC_OR : logic_and_upper_slt = rf[RS1] | rs2_data;
            LOGIC_AND : logic_and_upper_slt = rf[RS1] & rs2_data;
            default : logic_and_upper_slt = 0; //ADD/SUB/SLT/SLTU
        endcase
    end

    //Add/Sub ops
    //Operands are widened to 33 bits; the extra bit is the sign bit unless fn3[0] is set.
    //Subtraction is a + ~b + 1: the operand is inverted here and the +1 comes from the
    //carry generated by the appended {1'b1} + {subtract} LSB pair.
    assign sign_ext_adder1 = {(rf[RS1][31] & ~issue_stage.fn3[0]), rf[RS1]};
    assign sign_ext_adder2 = {(rs2_data[31] & ~issue_stage.fn3[0]) ^ subtract, rs2_data ^ {32{subtract}}};
    assign {add_sub_result, add_sub_carry_in} = {sign_ext_adder1, 1'b1} + {sign_ext_adder2, subtract};

    //Shift ops
    barrel_shifter shifter (
        .shifter_input(rf[RS1]),
        .shift_amount(imm_type ? issue_rs_addr[RS2] : rf[RS2][4:0]),
        .arith(rf[RS1][31] & issue_stage.instruction[30]),
        .lshift(~issue_stage.fn3[2]),
        .shifted_result(shift_result)
    );

    //Result mux
    //constant_alu is computed outside this unit and supplies the LUI, AUIPC, JAL and JALR results.
    //(Original note: it also provides PC+4 for the BRANCH unit and ifence in the GC unit.)
    //On the ALU_SLT path bit 0 is the 33rd bit of the subtraction for SLT/SLTU and the
    //logic result otherwise; the upper bits are zero for SLT/SLTU because logic_op_r is LOGIC_OTHER.
    always_comb begin
        case (alu_op_r)
            ALU_CONSTANT : result = constant_alu;//LUI, AUIPC, JAL, JALR
            ALU_ADD_SUB : result = add_sub_result[31:0];
            ALU_SLT : result = {logic_and_upper_slt[31:1], is_slt ? add_sub_result[32] : logic_and_upper_slt[0]};
            default : result = shift_result; //ALU_SHIFT
        endcase
    end

    ////////////////////////////////////////////////////
    //Output
    //Always ready; the result is presented in the same cycle the instruction may issue
    assign issue.ready = 1;
    assign wb.rd = result;
    assign wb.done = issue.possible_issue;
    assign wb.id = issue.id;

    ////////////////////////////////////////////////////
    //Assertions

endmodule

View file

@ -0,0 +1,253 @@
/*
* Copyright © 2017-2019 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
module branch_unit

    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;
    import opcodes::*;

    # (
        parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
    )
    (
        input logic clk,
        input logic rst,

        //Decode stage
        input decode_packet_t decode_stage,
        output logic unit_needed,
        output logic [REGFILE_READ_PORTS-1:0] uses_rs,
        output logic uses_rd,

        //Issue stage
        input issue_packet_t issue_stage,
        input logic issue_stage_ready,
        input logic [31:0] constant_alu,
        input logic [31:0] rf [REGFILE_READ_PORTS],

        //Unit interfaces and results
        unit_issue_interface.unit issue,
        output branch_results_t br_results,
        output logic branch_flush,

        exception_interface.unit exception
    );

    ////////////////////////////////////////////////////
    //Declarations
    common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode

    //Decode-time signals
    logic rs1_link;
    logic rd_link;
    logic rs1_eq_rd;
    logic[19:0] jal_imm;
    logic[11:0] jalr_imm;
    logic[11:0] br_imm;
    logic [20:0] pc_offset;

    //Registered for the issue stage
    logic is_return;
    logic is_call;
    logic [20:0] pc_offset_r;
    logic jalr;
    logic jal_or_jalr;
    logic br_use_signed;

    //Issue-stage evaluation
    logic [32:0] rs1;
    logic [32:0] rs2;
    logic result;
    logic branch_taken;
    logic [31:0] jump_pc;
    logic [31:0] new_pc;

    //Held after issue until the following instruction arrives
    logic branch_issued_r;
    logic instruction_is_completing;
    logic branch_complete; //NOTE(review): declared but neither driven nor read in this module
    logic branch_taken_ex;
    logic jal_or_jalr_ex;
    logic is_return_ex;
    logic is_call_ex;
    id_t id_ex;
    logic [31:0] new_pc_ex;
    logic [31:0] pc_ex;

    ////////////////////////////////////////////////////
    //Implementation

    ////////////////////////////////////////////////////
    //Decode
    assign instruction = decode_stage.instruction;

    assign unit_needed = decode_stage.instruction inside {
        BEQ, BNE, BLT, BGE, BLTU, BGEU, JALR, JAL
    };

    always_comb begin
        uses_rs = '0;
        uses_rs[RS1] = decode_stage.instruction inside {
            BEQ, BNE, BLT, BGE, BLTU, BGEU, JALR
        };
        uses_rs[RS2] = decode_stage.instruction inside {
            BEQ, BNE, BLT, BGE, BLTU, BGEU
        };
        uses_rd = 0;//JALR/JAL writeback handled by ALU
    end

    //RAS support: registers 1 and 5 are treated as link registers
    assign rs1_link = instruction.rs1_addr inside {1,5};
    assign rd_link = instruction.rd_addr inside {1,5};
    assign rs1_eq_rd = (instruction.rs1_addr == instruction.rd_addr);

    //PC offset: reassemble the J-, I- and B-type immediates
    assign jal_imm = {decode_stage.instruction[31], decode_stage.instruction[19:12], decode_stage.instruction[20], decode_stage.instruction[30:21]};
    assign jalr_imm = decode_stage.instruction[31:20];
    assign br_imm = {decode_stage.instruction[31], decode_stage.instruction[7], decode_stage.instruction[30:25], decode_stage.instruction[11:8]};

    //Instruction bits [3:2] pick the immediate: 11 selects JAL, 01 selects JALR, anything else the branch form
    always_comb begin
        if (decode_stage.instruction[3:2] == 2'b11)
            pc_offset = 21'(signed'({jal_imm, 1'b0}));
        else if (decode_stage.instruction[3:2] == 2'b01)
            pc_offset = 21'(signed'(jalr_imm));
        else
            pc_offset = 21'(signed'({br_imm, 1'b0}));
    end

    //Capture all decode-time results when the instruction advances to the issue stage
    always_ff @(posedge clk) begin
        if (issue_stage_ready) begin
            //Return-address-stack hints
            is_return <= (instruction.upper_opcode inside {JALR_T}) & ((rs1_link & ~rd_link) | (rs1_link & rd_link & ~rs1_eq_rd));
            is_call <= (instruction.upper_opcode inside {JAL_T, JALR_T}) & rd_link;
            //Target offset
            pc_offset_r <= pc_offset;
            //Instruction class and comparison signedness
            jalr <= (~decode_stage.instruction[3] & decode_stage.instruction[2]);
            jal_or_jalr <= decode_stage.instruction[2];
            br_use_signed <= !(instruction.fn3 inside {BLTU_fn3, BGEU_fn3});
        end
    end

    ////////////////////////////////////////////////////
    //Issue
    //The unit never stalls issue. The only hazard would be a following instruction
    //that is not yet valid for the pc comparison, and in that case nothing can
    //issue anyway, so ready is tied high.
    assign issue.ready = 1;

    //Set when a branch issues; cleared once the next instruction reaches the issue
    //stage or an exception is flagged (set has priority over clear)
    set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE(0)) branch_issued_m (
        .clk, .rst,
        .set(issue.new_request),
        .clr(issue_stage.stage_valid | exception.valid),
        .result(branch_issued_r)
    );

    //Prediction correctness can only be judged once the subsequent instruction
    //is present in the issue stage
    assign instruction_is_completing = branch_issued_r & issue_stage.stage_valid;

    //Operands widened to 33 bits: sign-extended for signed compares, zero-extended otherwise
    assign rs1 = {(rf[RS1][31] & br_use_signed), rf[RS1]};
    assign rs2 = {(rf[RS2][31] & br_use_signed), rf[RS2]};

    ////////////////////////////////////////////////////
    //Branch/Jump target determination
    //Both the comparison and the final address are computed in the issue stage
    branch_comparator bc (
        .less_than(issue_stage.fn3[2]),
        .a(rs1),
        .b(rs2),
        .xor_result(issue_stage.fn3[0]),
        .result(result)
    );

    assign branch_taken = result | jal_or_jalr;
    //JALR is relative to rs1, everything else to the instruction's pc
    assign jump_pc = (jalr ? rs1[31:0] : issue_stage.pc) + 32'(signed'(pc_offset_r));
    //constant_alu supplies the not-taken address
    assign new_pc = branch_taken ? jump_pc : constant_alu;

    //Results held from the issue cycle
    always_ff @(posedge clk) begin
        if (issue.new_request) begin
            id_ex <= issue.id;
            branch_taken_ex <= branch_taken;
            jal_or_jalr_ex <= jal_or_jalr;
            new_pc_ex <= {new_pc[31:1], new_pc[0] & ~jalr}; //bit 0 is cleared for JALR targets
        end
    end

    ////////////////////////////////////////////////////
    //Exception support
    //A taken branch or jump whose target has bit 1 set raises a misaligned-address exception.
    //NOTE(review): the exception outputs are not driven in this module when
    //CONFIG.INCLUDE_M_MODE is 0, although exception.valid is read above — confirm
    //how the parent handles that configuration.
    generate if (CONFIG.INCLUDE_M_MODE) begin : gen_branch_exception
        logic new_exception;

        assign new_exception = new_pc[1] & branch_taken & issue.new_request;

        //Held until acknowledged
        always_ff @(posedge clk) begin
            if (rst)
                exception.valid <= 0;
            else
                exception.valid <= (exception.valid & ~exception.ack) | new_exception;
        end

        always_ff @(posedge clk) begin
            if (issue.new_request)
                exception.id <= issue.id;
        end

        assign exception.code = INST_ADDR_MISSALIGNED;
        assign exception.tval = new_pc_ex;
    end
    endgenerate

    ////////////////////////////////////////////////////
    //Predictor support
    always_ff @(posedge clk) begin
        if (issue.possible_issue) begin
            pc_ex <= issue_stage.pc;
            is_return_ex <= is_return;
            is_call_ex <= is_call;
        end
    end

    assign br_results.valid = instruction_is_completing;
    assign br_results.id = id_ex;
    assign br_results.pc = pc_ex;
    assign br_results.target_pc = new_pc_ex;
    assign br_results.branch_taken = branch_taken_ex;
    assign br_results.is_branch = ~jal_or_jalr_ex;
    assign br_results.is_return = is_return_ex;
    assign br_results.is_call = is_call_ex;

    //Flush when the instruction that followed the branch is not at the resolved target
    assign branch_flush = instruction_is_completing & (issue_stage.pc[31:1] != new_pc_ex[31:1]);

    ////////////////////////////////////////////////////
    //End of Implementation
    ////////////////////////////////////////////////////

    ////////////////////////////////////////////////////
    //Assertions

endmodule

842
core/execution_units/csr_unit.sv Executable file
View file

@ -0,0 +1,842 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
module csr_unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
import csr_types::*;
import opcodes::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
input decode_packet_t decode_stage,
output logic unit_needed,
output logic [REGFILE_READ_PORTS-1:0] uses_rs,
output logic uses_rd,
input issue_packet_t issue_stage,
input logic issue_stage_ready,
input rs_addr_t issue_rs_addr [REGFILE_READ_PORTS],
input logic [31:0] rf [REGFILE_READ_PORTS],
//Unit Interfaces
unit_issue_interface.unit issue,
unit_writeback_interface.unit wb,
//Privilege
output logic [1:0] current_privilege,
//FP
input logic [4:0] fflag_wmask, //Always valid
output logic [2:0] dyn_rm,
//GC
input logic interrupt_taken,
output logic interrupt_pending,
output logic processing_csr,
//TLB and MMU
output logic tlb_on,
output logic [ASIDLEN-1:0] asid,
//MMUs
mmu_interface.csr immu,
mmu_interface.csr dmmu,
//CSR exception interface
input exception_packet_t exception,
output logic [31:0] exception_target_pc,
//exception return
input logic mret,
input logic sret,
output logic [31:0] epc,
//Retire
input id_t retire_ids [RETIRE_PORTS],
input logic [LOG2_RETIRE_PORTS : 0] retire_count,
//External
input interrupt_t s_interrupt,
input interrupt_t m_interrupt
);
typedef struct packed{
csr_addr_t addr;
logic[1:0] op;
logic reads;
logic writes;
logic [31:0] data;
} csr_inputs_t;
typedef enum logic [2:0] {
MSTATUS_UNCHANGED = 0,
MSTATUS_WRITE = 1,
MSTATUS_INTERRUPT = 2,
MSTATUS_EXCEPTION = 3,
MSTATUS_MRET = 4,
MSTATUS_SRET = 5
} mstatus_cases_t;
mstatus_cases_t mstatus_case;
logic busy;
logic commit;
logic commit_in_progress;
csr_inputs_t csr_inputs;
csr_inputs_t csr_inputs_r;
privilege_t privilege_level;
privilege_t next_privilege_level;
//write_logic
logic swrite;
logic mwrite;
logic [255:0] sub_write_en;
logic [31:0] selected_csr;
logic [31:0] selected_csr_r;
logic [31:0] updated_csr;
logic [31:0] next_csr;
function logic mwrite_en (input csr_addr_t addr);
return mwrite & sub_write_en[addr.sub_addr];
endfunction
function logic swrite_en (input csr_addr_t addr);
return swrite & sub_write_en[addr.sub_addr];
endfunction
////////////////////////////////////////////////////
//Legalization Functions
function logic [31:0] init_medeleg_mask();
init_medeleg_mask = 0;
if (CONFIG.INCLUDE_S_MODE) begin
init_medeleg_mask[INST_ADDR_MISSALIGNED] = 1;
init_medeleg_mask[INST_ACCESS_FAULT] = 1;
init_medeleg_mask[ILLEGAL_INST] = 1;
init_medeleg_mask[BREAK] = 1;
init_medeleg_mask[LOAD_ADDR_MISSALIGNED] = 1;
init_medeleg_mask[LOAD_FAULT] = 1;
init_medeleg_mask[STORE_AMO_ADDR_MISSALIGNED] = 1;
init_medeleg_mask[STORE_AMO_FAULT] = 1;
init_medeleg_mask[ECALL_U] = 1;
init_medeleg_mask[INST_PAGE_FAULT] = 1;
init_medeleg_mask[LOAD_PAGE_FAULT] = 1;
init_medeleg_mask[STORE_OR_AMO_PAGE_FAULT] = 1;
end
endfunction
function logic [31:0] init_mideleg_mask();
init_mideleg_mask = 0;
if (CONFIG.INCLUDE_S_MODE) begin
init_mideleg_mask[S_SOFTWARE_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
init_mideleg_mask[S_TIMER_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
init_mideleg_mask[S_EXTERNAL_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
end
endfunction
function logic [2**ECODE_W-1:0] init_exception_masking_rom();
init_exception_masking_rom = '{default: 0};
init_exception_masking_rom[INST_ADDR_MISSALIGNED] = 1;
init_exception_masking_rom[INST_ACCESS_FAULT] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[ILLEGAL_INST] = 1;
init_exception_masking_rom[BREAK] = 1;
init_exception_masking_rom[LOAD_ADDR_MISSALIGNED] = 1;
init_exception_masking_rom[LOAD_FAULT] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[STORE_AMO_ADDR_MISSALIGNED] = 1;
init_exception_masking_rom[STORE_AMO_FAULT] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[ECALL_U] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[ECALL_S] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[ECALL_M] = 1;
init_exception_masking_rom[INST_PAGE_FAULT] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[LOAD_PAGE_FAULT] = CONFIG.INCLUDE_S_MODE;
init_exception_masking_rom[STORE_OR_AMO_PAGE_FAULT] = CONFIG.INCLUDE_S_MODE;
endfunction
function logic [2**ECODE_W-1:0] init_interrupt_masking_rom();
init_interrupt_masking_rom = '{default: 0};
init_interrupt_masking_rom[S_SOFTWARE_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
init_interrupt_masking_rom[M_SOFTWARE_INTERRUPT] = 1;
init_interrupt_masking_rom[S_TIMER_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
init_interrupt_masking_rom[M_TIMER_INTERRUPT] = 1;
init_interrupt_masking_rom[S_EXTERNAL_INTERRUPT] = CONFIG.INCLUDE_S_MODE;
init_interrupt_masking_rom[M_EXTERNAL_INTERRUPT] = 1;
endfunction
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//Decode
//This unit claims all six CSR instruction forms
assign unit_needed = decode_stage.instruction inside {CSRRW, CSRRS, CSRRC, CSRRWI, CSRRSI, CSRRCI};

//Register usage reported to the decoder: every CSR instruction is flagged as
//using rd, while uses_rs[RS1] is set only for the register-operand forms
always_comb begin
    uses_rd = unit_needed;
    uses_rs = '0;
    uses_rs[RS1] = decode_stage.instruction inside {CSRRW, CSRRS, CSRRC};
end
////////////////////////////////////////////////////
//Issue
//Operand and control bundle for the CSR instruction in the issue stage:
//  addr   - CSR address, instruction[31:20]
//  op     - fn3[1:0], compared against CSR_RW / CSR_RS / CSR_RC
//  data   - the rs1 field zero-extended to 32 bits when fn3[2] is set
//           (immediate forms), otherwise the rs1 register value
//  reads  - cleared only for CSR_RW with rd == x0
//  writes - cleared for CSR_RS and CSR_RC when the rs1 field is zero. The
//           RISC-V Zicsr rules state that neither the set nor the clear form
//           performs a write in that case (previously only CSR_RC was handled,
//           so a CSRRS with rs1 == x0 was reported as a write)
assign csr_inputs = '{
    addr : issue_stage.instruction[31:20],
    op : issue_stage.fn3[1:0],
    data : issue_stage.fn3[2] ? {27'b0, issue_rs_addr[RS1]} : rf[RS1],
    reads : ~((issue_stage.fn3[1:0] == CSR_RW) && (issue_stage.rd_addr == 0)),
    writes : ~(((issue_stage.fn3[1:0] == CSR_RS) || (issue_stage.fn3[1:0] == CSR_RC)) && (issue_rs_addr[RS1] == 0))
};

//A CSR operation is in flight from the issue cycle until its writeback is acknowledged
assign processing_csr = busy | issue.new_request;
//Only one CSR instruction is accepted at a time
assign issue.ready = ~busy;

//busy is set on issue and cleared by the writeback acknowledge (set wins)
always_ff @(posedge clk) begin
    if (rst)
        busy <= 0;
    else
        busy <= (busy & ~wb.ack) | issue.new_request;
end

//Hold the issued operands for the duration of the operation
always_ff @(posedge clk) begin
    if (issue.new_request)
        csr_inputs_r <= csr_inputs;
end

//Set by commit and cleared by the next issue, so that commit is not
//re-asserted for the same instruction
always_ff @(posedge clk) begin
    if (rst)
        commit_in_progress <= 0;
    else
        commit_in_progress <= (commit_in_progress & ~issue.new_request) | commit;
end

//Waits until CSR instruction is the oldest issued instruction
assign commit = (retire_ids[0] == wb.id) & busy & (~commit_in_progress);
////////////////////////////////////////////////////
//Output
//Writeback handshake: done is raised the cycle after commit and then held
//until the writeback stage acknowledges it
always_ff @(posedge clk) begin
    if (rst)
        wb.done <= '0;
    else
        wb.done <= commit | (wb.done & ~wb.ack);
end

//The instruction ID is captured at issue and presented with the result
always_ff @(posedge clk) begin
    if (issue.new_request)
        wb.id <= issue.id;
end

//Result is the registered (and read-masked) CSR value
assign wb.rd = selected_csr_r;
////////////////////////////////////////////////////
//Shared logic
//Registered write strobes, valid the cycle after commit:
//  sub_write_en - one-hot decode of the CSR sub-address
//  mwrite       - committed write to a non read-only machine-level CSR
//  swrite       - committed write to a non read-only supervisor-level CSR
always_ff @(posedge clk) begin
    sub_write_en <= (1 << csr_inputs_r.addr.sub_addr);
    mwrite <= commit && CONFIG.INCLUDE_M_MODE &&
        (csr_inputs_r.addr.privilege == MACHINE_PRIVILEGE) &&
        (csr_inputs_r.addr.rw_bits != CSR_READ_ONLY);
    swrite <= commit && CONFIG.INCLUDE_S_MODE &&
        (csr_inputs_r.addr.privilege == SUPERVISOR_PRIVILEGE) &&
        (csr_inputs_r.addr.rw_bits != CSR_READ_ONLY);
end

//Value to be written back to the selected CSR. Set and clear operate on the
//current CSR value; a plain write (and any other op encoding) uses the
//operand directly.
always_comb begin
    next_csr = csr_inputs_r.data;
    case (csr_inputs_r.op)
        CSR_RS : next_csr = selected_csr | csr_inputs_r.data;
        CSR_RC : next_csr = selected_csr & ~csr_inputs_r.data;
        default : ;
    endcase
end

//Captured at commit so the individual registers can use it the following
//cycle, alongside the registered write strobes
always_ff @(posedge clk) begin
    if (commit)
        updated_csr <= next_csr;
end
////////////////////////////////////////////////////
//Machine Mode Registers
////////////////////////////////////////////////////

////////////////////////////////////////////////////
//Constant Registers

////////////////////////////////////////////////////
//Machine ISA register
//Extension bits follow the build configuration; any field not listed is zero.
//M is reported only when both the multiplier and the divider are included,
//and F and D are both tied to the FPU option.
localparam misa_t misa = '{
    default : 0,
    mxlen : 1,
    I : 1,
    M : (CONFIG.INCLUDE_UNIT.MUL && CONFIG.INCLUDE_UNIT.DIV),
    A : (CONFIG.INCLUDE_AMO),
    F : (CONFIG.INCLUDE_UNIT.FPU),
    D : (CONFIG.INCLUDE_UNIT.FPU),
    S : (CONFIG.INCLUDE_S_MODE),
    U : (CONFIG.INCLUDE_U_MODE)
};

////////////////////////////////////////////////////
//Machine Version Registers
//Vendor and architecture IDs read as zero; implementation and hart IDs come
//from the configuration
localparam logic [31:0] mvendorid = '0;
localparam logic [31:0] marchid = '0;
localparam logic [31:0] mimpid = CONFIG.CSRS.MACHINE_IMPLEMENTATION_ID;
localparam logic [31:0] mhartid = CONFIG.CSRS.CPU_ID;

////////////////////////////////////////////////////
//MSTATUS
localparam logic [31:0] mstatush = '0; //Always little endian
//Non-Constant Registers
//Machine-level CSR storage. Declared at module scope so the read mux can
//reference these signals regardless of which modes are configured.
mstatus_t mstatus;
mie_t mie;
mip_t mip;
mip_t mip_new;
mcause_t mcause;
logic [31:0] mtvec;
logic [31:0] mepc;
logic [31:0] mtval;
logic [31:0] mscratch;
logic [31:0] medeleg;
logic [31:0] mideleg;

//Writable MSTATUS fields for machine-level writes. Fields not listed are
//masked to zero, which covers:
//Virtualization support: TSR, TW, TVM unused
//Extension context status: SD, FS, XS unused
localparam mstatus_t mstatus_mask = '{
    default : 0,
    mie : 1,
    mpie : 1,
    mpp : '1,
    sie : (CONFIG.INCLUDE_S_MODE),
    spie : (CONFIG.INCLUDE_S_MODE),
    spp : (CONFIG.INCLUDE_S_MODE),
    mxr : (CONFIG.INCLUDE_S_MODE),
    sum : (CONFIG.INCLUDE_U_MODE & CONFIG.INCLUDE_S_MODE),
    mprv : (CONFIG.INCLUDE_U_MODE | CONFIG.INCLUDE_S_MODE)
};

//Fields of MSTATUS / MIP / MIE that are exposed through the supervisor views
//(SSTATUS / SIP / SIE)
localparam mstatus_t sstatus_mask = '{default : 0, sie : 1, spie : 1, spp : 1, sum : 1, mxr : 1};
localparam mip_t sip_mask = '{default : 0, ssip : CONFIG.INCLUDE_S_MODE, stip : CONFIG.INCLUDE_S_MODE, seip : CONFIG.INCLUDE_S_MODE};
localparam mie_t sie_mask = '{default : 0, ssie : CONFIG.INCLUDE_S_MODE, stie : CONFIG.INCLUDE_S_MODE, seie : CONFIG.INCLUDE_S_MODE};
//Machine-mode CSR state, elaborated only when CONFIG.INCLUDE_M_MODE is set:
//privilege-level tracking, MSTATUS, MTVEC, MEDELEG/MIDELEG, MIP/MIE, MEPC,
//MCAUSE, MTVAL and MSCRATCH. Software writes use updated_csr together with
//the mwrite_en()/swrite_en() strobes; traps and returns update the state
//directly.
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_csr_m_mode
mstatus_t mstatus_new;
mstatus_t mstatus_write_mask;
logic [ECODE_W-1:0] interrupt_cause_r;
//Interrupt and Exception Delegation
//Can delegate to supervisor if currently in supervisor or user modes
logic can_delegate;
logic exception_delegated;
logic interrupt_delegated;
assign can_delegate = CONFIG.INCLUDE_S_MODE & privilege_level inside {SUPERVISOR_PRIVILEGE, USER_PRIVILEGE};
//A trap is delegated when delegation is possible and the matching bit of
//medeleg (indexed by exception code) or mideleg (indexed by the registered
//interrupt cause) is set
assign exception_delegated = can_delegate & exception.valid & medeleg[exception.code];
assign interrupt_delegated = can_delegate & interrupt_taken & mideleg[interrupt_cause_r];
//Encodes which event updates mstatus this cycle into mstatus_case. The
//events are asserted to be mutually exclusive by mstatus_update_assertion.
//NOTE(review): the mapping from bit position to the MSTATUS_* constants is
//defined outside this section; the trailing 1'b0 presumably reserves
//index zero for "no event" - confirm against one_hot_to_integer
one_hot_to_integer #(6)
mstatus_case_one_hot (
.one_hot ({sret, mret, exception.valid, interrupt_taken, (mwrite_en(MSTATUS) | swrite_en(SSTATUS)), 1'b0}),
.int_out (mstatus_case)
);
//Next privilege level: returns restore the saved previous privilege, traps
//go to supervisor when delegated and machine otherwise, anything else holds
always_comb begin
case (mstatus_case) inside
MSTATUS_MRET : next_privilege_level = privilege_t'(mstatus.mpp);
MSTATUS_SRET : next_privilege_level = privilege_t'({1'b0,mstatus.spp});
MSTATUS_INTERRUPT : next_privilege_level = interrupt_delegated ? SUPERVISOR_PRIVILEGE : MACHINE_PRIVILEGE;
MSTATUS_EXCEPTION : next_privilege_level = exception_delegated ? SUPERVISOR_PRIVILEGE : MACHINE_PRIVILEGE;
default : next_privilege_level = privilege_level;
endcase
end
//Current privilege level
always_ff @(posedge clk) begin
if (rst)
privilege_level <= MACHINE_PRIVILEGE;
else
privilege_level <= next_privilege_level;
end
assign current_privilege = privilege_level;
//Writes through SSTATUS may only touch the supervisor-visible fields
assign mstatus_write_mask = swrite ? sstatus_mask : mstatus_mask;
//Next MSTATUS value for each update event
always_comb begin
mstatus_new = mstatus;
case (mstatus_case) inside
//Software write: only masked fields take the new value
MSTATUS_WRITE : mstatus_new = (mstatus & ~mstatus_write_mask) | (updated_csr & mstatus_write_mask);
//MRET: restore mie from mpie, set mpie, and reset mpp to user mode when
//it exists (machine otherwise). mprv is cleared unless returning to machine mode
MSTATUS_MRET : begin
mstatus_new.mie = mstatus.mpie;
mstatus_new.mpie = 1;
mstatus_new.mpp = CONFIG.INCLUDE_U_MODE ? USER_PRIVILEGE : MACHINE_PRIVILEGE;
if (mstatus.mpp != MACHINE_PRIVILEGE)
mstatus_new.mprv = 0;
end
//SRET: restore sie from spie, set spie, reset spp to user and clear mprv
MSTATUS_SRET : begin
mstatus_new.sie = mstatus.spie;
mstatus_new.spie = 1;
mstatus_new.spp = USER_PRIVILEGE[0];
mstatus_new.mprv = 0;
end
//Trap entry: save the previous enable and privilege into the stack fields
//of the mode taking the trap, then disable that mode's interrupts.
//NOTE(review): spie/mpie are loaded from the enable bit of the mode the
//trap came from (or zero) rather than unconditionally from sie/mie -
//confirm this matches the privileged spec version being targeted
MSTATUS_INTERRUPT, MSTATUS_EXCEPTION : begin
if (next_privilege_level == SUPERVISOR_PRIVILEGE) begin
mstatus_new.spie = (privilege_level == SUPERVISOR_PRIVILEGE) ? mstatus.sie : 0;
mstatus_new.sie = 0;
mstatus_new.spp = privilege_level[0]; //one if from supervisor-mode, zero if from user-mode
end
else begin
mstatus_new.mpie = (privilege_level == MACHINE_PRIVILEGE) ? mstatus.mie : ((privilege_level == SUPERVISOR_PRIVILEGE) ? mstatus.sie : 0);
mstatus_new.mie = 0;
mstatus_new.mpp = privilege_level; //machine,supervisor or user
end
end
default : mstatus_new = mstatus;
endcase
end
//MSTATUS resets to all zeroes except mpp, which resets to machine
always_ff @(posedge clk) begin
if (rst)
mstatus <= '{default:0, mpp:MACHINE_PRIVILEGE};
else
mstatus <= mstatus_new;
end
////////////////////////////////////////////////////
//MTVEC
//No vectored mode, mode hard-coded to zero
//The base is set by an initial statement rather than by rst, and is only
//software-writable when the MTVEC_WRITEABLE option is enabled
initial mtvec[31:2] = CONFIG.CSRS.RESET_MTVEC[31:2];
always_ff @(posedge clk) begin
mtvec[1:0] <= '0;
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.MTVEC_WRITEABLE & mwrite_en(MTVEC))
mtvec[31:2] <= updated_csr[31:2];
end
assign exception_target_pc = mtvec;
////////////////////////////////////////////////////
//MEDELEG
//Writable only when supervisor mode is included; unsupported bits are masked off
localparam logic [31:0] medeleg_mask = init_medeleg_mask();
always_ff @(posedge clk) begin
if (rst)
medeleg <= '0;
else if (mwrite_en(MEDELEG) & CONFIG.INCLUDE_S_MODE)
medeleg <= (updated_csr & medeleg_mask);
end
////////////////////////////////////////////////////
//MIDELEG
//Writable only when supervisor mode is included; unsupported bits are masked off
localparam logic [31:0] mideleg_mask = init_mideleg_mask();
always_ff @(posedge clk) begin
if (rst)
mideleg <= '0;
else if (mwrite_en(MIDELEG) & CONFIG.INCLUDE_S_MODE)
mideleg <= (updated_csr & mideleg_mask);
end
////////////////////////////////////////////////////
//MIP
//mip_mask: implemented pending bits. mip_w_mask: bits software can set (supervisor bits only)
localparam mip_t mip_mask = '{default:0, meip:1, seip:CONFIG.INCLUDE_S_MODE, mtip:1, stip:CONFIG.INCLUDE_S_MODE, msip:1, ssip:CONFIG.INCLUDE_S_MODE};
localparam mip_t mip_w_mask = '{default:0, seip:CONFIG.INCLUDE_S_MODE, stip:CONFIG.INCLUDE_S_MODE, ssip:CONFIG.INCLUDE_S_MODE};
//Current state of the interrupt input lines, restricted to implemented bits
always_comb begin
mip_new = '0;
mip_new.ssip = s_interrupt.software;
mip_new.stip = s_interrupt.timer;
mip_new.seip = s_interrupt.external;
mip_new.msip = m_interrupt.software;
mip_new.mtip = m_interrupt.timer;
mip_new.meip = m_interrupt.external;
mip_new &= mip_mask;
end
//NOTE(review): mip only updates when MIP is written or at least one
//interrupt input is high, so it holds its previous value once every input
//has dropped. When the update is triggered by an input alone, updated_csr
//(the most recently committed CSR write value) is still merged in through
//mip_w_mask. Confirm both behaviours are intended
always_ff @(posedge clk) begin
if (rst)
mip <= 0;
else if (mwrite_en(MIP) | (|mip_new))
mip <= (updated_csr & mip_w_mask) | mip_new;
end
//An interrupt is pending when any enabled interrupt is set and machine
//interrupts are globally enabled
assign interrupt_pending = |(mip & mie) & mstatus.mie;
////////////////////////////////////////////////////
//MIE
//Written through either MIE or SIE; the supervisor view is limited by sie_mask
localparam mie_t mie_mask = '{default:0, meie:1, seie:CONFIG.INCLUDE_S_MODE, mtie:1, stie:CONFIG.INCLUDE_S_MODE, msie:1, ssie:CONFIG.INCLUDE_S_MODE};
always_ff @(posedge clk) begin
if (rst)
mie <= '0;
else if (mwrite_en(MIE) | swrite_en(SIE))
mie <= updated_csr & (swrite ? sie_mask : mie_mask);
end
////////////////////////////////////////////////////
//MEPC
//Can be software written, written on exception with
//exception causing PC. Lower two bits tied to zero.
//exception.pc is also the value captured when an interrupt is taken; a trap
//takes priority over a simultaneous software write. Not cleared by rst
always_ff @(posedge clk) begin
mepc[1:0] <= '0;
if (mwrite_en(MEPC) | exception.valid | interrupt_taken)
mepc[31:2] <= (exception.valid | interrupt_taken) ? exception.pc[31:2] : updated_csr[31:2];
end
assign epc = mepc;
////////////////////////////////////////////////////
//MCAUSE
//As the exception and interrupts codes are sparsely populated,
//to ensure that only legal values are written, a ROM lookup
//is used to validate the CSR write operation
localparam logic [2**ECODE_W-1:0] M_EXCEPTION_MASKING_ROM = init_exception_masking_rom();
localparam logic [2**ECODE_W-1:0] M_INTERRUPT_MASKING_ROM = init_interrupt_masking_rom();
logic mcause_write_valid;
//Bit 31 of the written value selects which legality ROM is consulted
always_comb begin
if (updated_csr[31]) //interrupt
mcause_write_valid = M_INTERRUPT_MASKING_ROM[updated_csr[ECODE_W-1:0]];
else
mcause_write_valid = M_EXCEPTION_MASKING_ROM[updated_csr[ECODE_W-1:0]];
end
mip_t mip_cause;
logic [5:0] mip_priority_vector;
logic [2:0] mip_cause_sel;
//Maps a position in mip_priority_vector to its interrupt code. Entries are
//listed from index 7 down to index 0, so index 5 is the machine external
//interrupt and index 0 the supervisor software interrupt, matching the bit
//order of mip_priority_vector below. Indices 6 and 7 are unused
localparam logic [ECODE_W-1:0] interruput_code_table [7:0] = '{ 0, 0,
M_EXTERNAL_INTERRUPT, M_TIMER_INTERRUPT, M_SOFTWARE_INTERRUPT,
S_EXTERNAL_INTERRUPT, S_TIMER_INTERRUPT, S_SOFTWARE_INTERRUPT
};
//Interrupts that are both pending and enabled
assign mip_cause = (mip & mie);
assign mip_priority_vector = '{mip_cause.meip, mip_cause.mtip, mip_cause.msip, mip_cause.seip, mip_cause.stip, mip_cause.ssip};
//NOTE(review): which end of priority_vector the encoder favours is defined
//by priority_encoder, outside this section - confirm the resulting ordering
priority_encoder #(.WIDTH(6))
interrupt_cause_encoder (
.priority_vector (mip_priority_vector),
.encoded_result (mip_cause_sel)
);
//The selected cause is registered while an interrupt is pending and used
//later for delegation and for the MCAUSE update
always_ff @(posedge clk) begin
if (interrupt_pending)
interrupt_cause_r <= interruput_code_table[mip_cause_sel];
end
//MCAUSE is only implemented when the INCLUDE_MCAUSE option is set. Update
//priority: interrupt taken, then exception, then a validated software write
always_ff @(posedge clk) begin
mcause.zeroes <= '0;
if (rst) begin
mcause.is_interrupt <= 0;
mcause.code <= 0;
end
else if (CONFIG.CSRS.NON_STANDARD_OPTIONS.INCLUDE_MCAUSE & ((mcause_write_valid & mwrite_en(MCAUSE)) | exception.valid | interrupt_taken)) begin
mcause.is_interrupt <= interrupt_taken | (mwrite_en(MCAUSE) & updated_csr[31]);
mcause.code <= interrupt_taken ? interrupt_cause_r : exception.valid ? exception.code : updated_csr[ECODE_W-1:0];
end
end
////////////////////////////////////////////////////
//MTVAL
//Only implemented when the INCLUDE_MTVAL option is set. An exception takes
//priority over a software write. Not cleared by rst
always_ff @(posedge clk) begin
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.INCLUDE_MTVAL & (mwrite_en(MTVAL) | exception.valid))
mtval <= exception.valid ? exception.tval : updated_csr;
end
////////////////////////////////////////////////////
//MSCRATCH
//Only implemented when the INCLUDE_MSCRATCH option is set. Not cleared by rst
always_ff @(posedge clk) begin
if (CONFIG.CSRS.NON_STANDARD_OPTIONS.INCLUDE_MSCRATCH & mwrite_en(MSCRATCH))
mscratch <= updated_csr;
end
end
endgenerate
////////////////////////////////////////////////////
//END OF MACHINE REGS
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//BEGIN OF SUPERVISOR REGS
////////////////////////////////////////////////////
//Supervisor-level CSR storage. Declared at module scope so the read mux can
//reference these signals whether or not supervisor mode is configured.
satp_t satp;
logic [31:0] sstatus;
logic [31:0] stvec;
logic [31:0] sscratch;
logic [31:0] sepc;
logic [31:0] scause;
logic [31:0] stval;
logic [31:0] stime;
logic [31:0] stimecmp;

//TLB status --- used to mux physical/virtual address
//Translation is reported as on only when S-mode is configured and satp.mode is set
assign tlb_on = CONFIG.INCLUDE_S_MODE & satp.mode;
assign asid = satp.asid;
//******************
//Supervisor-mode CSR state, elaborated only when CONFIG.INCLUDE_S_MODE is
//set: the MMU control outputs plus the STVEC, SATP and SSCRATCH registers.
generate if (CONFIG.INCLUDE_S_MODE) begin : gen_csr_s_mode
    ////////////////////////////////////////////////////
    //MMU interface
    //Instruction side: always translated at the current privilege level
    assign immu.privilege = privilege_level;
    assign immu.satp_ppn = satp.ppn;
    assign immu.mxr = mstatus.mxr;
    assign immu.sum = mstatus.sum;

    //Data side: mstatus.mprv substitutes the saved mpp for the current privilege level
    assign dmmu.privilege = mstatus.mprv ? mstatus.mpp : privilege_level;
    assign dmmu.satp_ppn = satp.ppn;
    assign dmmu.mxr = mstatus.mxr;
    assign dmmu.sum = mstatus.sum;

    ////////////////////////////////////////////////////
    //STVEC
    //Resets to the word-aligned reset vector; all bits are software writable
    logic [31:0] stvec_mask = '1;
    always_ff @(posedge clk) begin
        if (rst)
            stvec <= {CONFIG.CSRS.RESET_VEC[31:2], 2'b00};
        else if (swrite_en(STVEC))
            stvec <= (stvec_mask & updated_csr);
    end

    ////////////////////////////////////////////////////
    //SATP
    //Cleared on reset; all bits are software writable
    logic [31:0] satp_mask;
    assign satp_mask = '1;
    always_ff @(posedge clk) begin
        if (rst)
            satp <= '0;
        else if (swrite_en(SATP))
            satp <= (satp_mask & updated_csr);
    end

    ////////////////////////////////////////////////////
    //SSCRATCH
    //Plain software-writable scratch register, not cleared by rst
    always_ff @(posedge clk) begin
        if (swrite_en(SSCRATCH))
            sscratch <= updated_csr;
    end
end
endgenerate
////////////////////////////////////////////////////
//END OF SUPERVISOR REGS
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Timers and Counters
//Register increment for instructions completed
//Increments suppressed on writes to these registers
//
//mcycle and minst_ret are COUNTER_W bits wide. When the corresponding
//*_WRITEABLE option is set, bits [31:0] are written through MCYCLE/MINSTRET
//and the bits above 31 through MCYCLEH/MINSTRETH. The upper bits were
//previously selected by the low-word address as well, so a write to the
//high-word CSR had no effect other than suppressing the increment and a
//write to the low word overwrote the upper bits with the same value.
//NOTE(review): the [COUNTER_W-1:32] part selects assume COUNTER_W > 32 - confirm
localparam COUNTER_W = CONFIG.CSRS.NON_STANDARD_OPTIONS.COUNTER_W;
localparam MCYCLE_WRITEABLE = CONFIG.CSRS.NON_STANDARD_OPTIONS.MCYCLE_WRITEABLE;
localparam MINSTR_WRITEABLE = CONFIG.CSRS.NON_STANDARD_OPTIONS.MINSTR_WRITEABLE;

logic[COUNTER_W-1:0] mcycle;
logic[COUNTER_W-1:0] mtime;
logic[COUNTER_W-1:0] minst_ret;

logic[COUNTER_W-1:0] mcycle_input_next;
logic[COUNTER_W-1:0] minst_ret_input_next;
logic[LOG2_RETIRE_PORTS:0] minst_ret_inc;
logic mcycle_inc;

//Cycle counter: each half takes the written value on a write to its own CSR
//address and otherwise holds; the increment is skipped on either write
assign mcycle_input_next[31:0] = (MCYCLE_WRITEABLE & mwrite_en(MCYCLE)) ? updated_csr : mcycle[31:0];
assign mcycle_input_next[COUNTER_W-1:32] = (MCYCLE_WRITEABLE & mwrite_en(MCYCLEH)) ? updated_csr[COUNTER_W-33:0] : mcycle[COUNTER_W-1:32];
assign mcycle_inc = ~(MCYCLE_WRITEABLE & (mwrite_en(MCYCLE) | mwrite_en(MCYCLEH)));

always_ff @(posedge clk) begin
    if (rst)
        mcycle <= 0;
    else
        mcycle <= mcycle_input_next + COUNTER_W'(mcycle_inc);
end

//Retired instruction counter: same write scheme, incremented by the number
//of instructions retired this cycle
assign minst_ret_input_next[31:0] = (MINSTR_WRITEABLE & mwrite_en(MINSTRET)) ? updated_csr : minst_ret[31:0];
assign minst_ret_input_next[COUNTER_W-1:32] = (MINSTR_WRITEABLE & mwrite_en(MINSTRETH)) ? updated_csr[COUNTER_W-33:0] : minst_ret[COUNTER_W-1:32];
assign minst_ret_inc = (MINSTR_WRITEABLE & (mwrite_en(MINSTRET) | mwrite_en(MINSTRETH))) ? '0 : retire_count;

always_ff @(posedge clk) begin
    if (rst)
        minst_ret <= 0;
    else
        minst_ret <= minst_ret_input_next + COUNTER_W'(minst_ret_inc);
end
////////////////////////////////////////////////////
//Floating-Point status register
//Holds the five accrued exception flags (fflags) and the dynamic rounding
//mode (frm). Software can reach the two fields separately through FFLAGS and
//FRM, or together through FCSR.
logic [2:0] frm;
logic [4:0] fflags;

assign dyn_rm = frm;

generate if (CONFIG.INCLUDE_UNIT.FPU) begin : gen_csr_fp
    //Which of the two fields the addressed CSR covers
    typedef enum logic[1:0] {
        WRITE_NONE = 2'b00,
        WRITE_FFLAGS = 2'b01,
        WRITE_FRM = 2'b10,
        WRITE_BOTH = 2'b11
    } fcsr_write_t;
    fcsr_write_t fcsr_write_type;

    always_comb begin
        fcsr_write_type = WRITE_NONE;
        case (csr_inputs_r.addr) inside
            FCSR : fcsr_write_type = WRITE_BOTH;
            FRM : fcsr_write_type = WRITE_FRM;
            FFLAGS : fcsr_write_type = WRITE_FFLAGS;
            default : ;
        endcase
    end

    //Older versions of the spec mandated an illegal instruction exception if an instruction
    //with the dynamic rounding mode was issued and the frm register contained an invalid
    //rounding mode. This has since been changed to "reserved" behaviour, meaning we do not
    //have to do anything special. In this case, fp_roundup would default to rne
    always_ff @(posedge clk) begin
        if (rst) begin
            fflags <= '0;
            frm <= '0;
        end
        else if (commit) begin
            //Explicit writes: FRM and FFLAGS take the low bits of the new
            //value, FCSR carries frm in [7:5] and fflags in [4:0]
            case (fcsr_write_type)
                WRITE_FRM : frm <= next_csr[2:0];
                WRITE_FFLAGS : fflags <= next_csr[4:0];
                WRITE_BOTH : begin
                    frm <= next_csr[7:5];
                    fflags <= next_csr[4:0];
                end
                default : ;
            endcase
        end
        else begin
            //Implicit writes (can never overlap explicit writes): newly
            //raised flags accumulate into fflags
            fflags <= fflags | fflag_wmask;
        end
    end
end endgenerate
////////////////////////////////////////////////////
//CSR mux
//Read mask applied to the selected CSR value. The supervisor views of the
//shared machine registers expose only their supervisor fields; every other
//CSR, and every CSR when S-mode is not configured, is read unmasked.
logic [31:0] read_mask;
always_comb begin
    read_mask = '1;
    if (CONFIG.INCLUDE_S_MODE) begin
        case (csr_inputs_r.addr) inside
            SSTATUS : read_mask = sstatus_mask;
            SIE : read_mask = sie_mask;
            SIP : read_mask = sip_mask;
            default : ;
        endcase
    end
end
//Read mux: selects the current value of the CSR addressed by the registered
//CSR instruction. Registers belonging to a mode or unit that is not
//configured read as zero, as does any address without an entry.
//
//The physical memory protection range is written low bound first. With the
//bounds reversed the range is empty under the SystemVerilog set-membership
//rules, so the entry could never match and those addresses only read as
//zero because of the default.
//
//NOTE(review): SEPC, SCAUSE and STVAL currently return sscratch. The
//sepc/scause/stval signals declared in this module have no driver in the
//section reviewed, so this looks like a placeholder - confirm before relying
//on these reads.
always_comb begin
    case (csr_inputs_r.addr) inside
        //Machine info
        MISA : selected_csr = CONFIG.INCLUDE_M_MODE ? misa : '0;
        MVENDORID : selected_csr = CONFIG.INCLUDE_M_MODE ? mvendorid : '0;
        MARCHID : selected_csr = CONFIG.INCLUDE_M_MODE ? marchid : '0;
        MIMPID : selected_csr = CONFIG.INCLUDE_M_MODE ? mimpid : '0;
        MHARTID : selected_csr = CONFIG.INCLUDE_M_MODE ? mhartid : '0;
        //Machine trap setup
        MSTATUS : selected_csr = CONFIG.INCLUDE_M_MODE ? mstatus : '0;
        MEDELEG : selected_csr = CONFIG.INCLUDE_M_MODE ? medeleg : '0;
        MIDELEG : selected_csr = CONFIG.INCLUDE_M_MODE ? mideleg : '0;
        MIE : selected_csr = CONFIG.INCLUDE_M_MODE ? mie : '0;
        MTVEC : selected_csr = CONFIG.INCLUDE_M_MODE ? mtvec : '0;
        MCOUNTEREN : selected_csr = '0;
        //Machine trap handling
        MSCRATCH : selected_csr = CONFIG.INCLUDE_M_MODE ? mscratch : '0;
        MEPC : selected_csr = CONFIG.INCLUDE_M_MODE ? mepc : '0;
        MCAUSE : selected_csr = CONFIG.INCLUDE_M_MODE ? mcause : '0;
        MTVAL : selected_csr = CONFIG.INCLUDE_M_MODE ? mtval : '0;
        MIP : selected_csr = CONFIG.INCLUDE_M_MODE ? mip : '0;
        //Machine Memory Protection
        [12'h3A0 : 12'h3EF] : selected_csr = '0;
        //Machine Timers and Counters
        MCYCLE : selected_csr = CONFIG.INCLUDE_M_MODE ? mcycle[31:0] : '0;
        MINSTRET : selected_csr = CONFIG.INCLUDE_M_MODE ? minst_ret[31:0] : '0;
        [12'hB03 : 12'hB1F] : selected_csr = '0;
        MCYCLEH : selected_csr = CONFIG.INCLUDE_M_MODE ? 32'(mcycle[COUNTER_W-1:32]) : '0;
        MINSTRETH : selected_csr = CONFIG.INCLUDE_M_MODE ? 32'(minst_ret[COUNTER_W-1:32]) : '0;
        [12'hB83 : 12'hB9F] : selected_csr = '0;
        //Machine Counter Setup
        [12'h320 : 12'h33F] : selected_csr = '0;
        //Supervisor Trap Setup
        //SSTATUS, SIE and SIP are views of the machine registers; the
        //supervisor-only fields are selected later by read_mask
        SSTATUS : selected_csr = CONFIG.INCLUDE_S_MODE ? mstatus : '0;
        SEDELEG : selected_csr = '0; //No user-level interrupts/exception handling
        SIDELEG : selected_csr = '0;
        SIE : selected_csr = CONFIG.INCLUDE_S_MODE ? mie : '0;
        STVEC : selected_csr = CONFIG.INCLUDE_S_MODE ? stvec : '0;
        SCOUNTEREN : selected_csr = '0;
        //Supervisor trap handling
        SSCRATCH : selected_csr = CONFIG.INCLUDE_S_MODE ? sscratch : '0;
        SEPC : selected_csr = CONFIG.INCLUDE_S_MODE ? sscratch : '0;
        SCAUSE : selected_csr = CONFIG.INCLUDE_S_MODE ? sscratch : '0;
        STVAL : selected_csr = CONFIG.INCLUDE_S_MODE ? sscratch : '0;
        SIP : selected_csr = CONFIG.INCLUDE_S_MODE ? mip : '0;
        //Supervisor Protection and Translation
        SATP : selected_csr = CONFIG.INCLUDE_S_MODE ? satp : '0;
        //User status
        //Floating point
        FFLAGS : selected_csr = CONFIG.INCLUDE_UNIT.FPU ? {27'b0, fflags} : '0;
        FRM : selected_csr = CONFIG.INCLUDE_UNIT.FPU ? {29'b0, frm} : '0;
        FCSR : selected_csr = CONFIG.INCLUDE_UNIT.FPU ? {24'b0, frm, fflags} : '0;
        //User Counter Timers
        //TIME and TIMEH return the cycle counter
        CYCLE : selected_csr = mcycle[31:0];
        TIME : selected_csr = mcycle[31:0];
        INSTRET : selected_csr = minst_ret[31:0];
        [12'hC03 : 12'hC1F] : selected_csr = '0;
        CYCLEH : selected_csr = 32'(mcycle[COUNTER_W-1:32]);
        TIMEH : selected_csr = 32'(mcycle[COUNTER_W-1:32]);
        INSTRETH : selected_csr = 32'(minst_ret[COUNTER_W-1:32]);
        [12'hC83 : 12'hC9F] : selected_csr = '0;
        default : selected_csr = '0;
    endcase
end

//The masked read value is captured at commit and driven out as the writeback result
always_ff @(posedge clk) begin
    if (commit)
        selected_csr_r <= selected_csr & read_mask;
end
////////////////////////////////////////////////////
//Assertions
//At most one of the sources that update mstatus (trap return, trap entry or
//a software write through MSTATUS/SSTATUS) may be active in any cycle
mstatus_update_assertion:
    assert property (
        @(posedge clk) disable iff (rst)
        $onehot0({mret, sret, interrupt_taken, exception.valid, (mwrite_en(MSTATUS) | swrite_en(SSTATUS))})
    )
    else $error("multiple write to mstatus");
endmodule

View file

@ -0,0 +1,93 @@
/*
* Copyright © 2022 Eric Matthews
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Example execution unit for the CUSTOM instruction: adds rs1 and rs2 and
//returns the sum through the writeback interface. The sum and the instruction
//ID are registered at issue, and no new request is accepted while a result is
//still waiting to be acknowledged.
module custom_unit

    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;
    import opcodes::*;

    (
        input logic clk,
        input logic rst,

        input decode_packet_t decode_stage,
        output logic unit_needed,
        output logic [REGFILE_READ_PORTS-1:0] uses_rs,
        output logic uses_rd,

        input issue_packet_t issue_stage,
        input logic issue_stage_ready,
        input logic [31:0] rf [REGFILE_READ_PORTS],

        unit_issue_interface.unit issue,
        unit_writeback_interface.unit wb
    );

    common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode
    logic [31:0] result;
    logic done;
    id_t id;

    ////////////////////////////////////////////////////
    //Decode
    //Field view of the instruction being decoded
    assign instruction = decode_stage.instruction;

    //Asserted when the decoded instruction is handled by this unit
    assign unit_needed = decode_stage.instruction inside {CUSTOM};

    //A CUSTOM instruction reads both source registers and writes rd
    always_comb begin
        uses_rs = '0;
        uses_rs[RS1] = unit_needed;
        uses_rs[RS2] = unit_needed;
        uses_rd = unit_needed;
    end

    ////////////////////////////////////////////////////
    //Issue
    //Stall further requests until the pending result has been accepted
    assign issue.ready = ~wb.done;

    //Compute the sum and remember the instruction ID on issue
    always_ff @(posedge clk) begin
        if (issue.new_request) begin
            result <= rf[RS1] + rf[RS2];
            id <= issue.id;
        end
    end

    ////////////////////////////////////////////////////
    //Write-back
    //done is raised the cycle after issue and held until acknowledged
    always_ff @(posedge clk) begin
        if (rst)
            wb.done <= '0;
        else
            wb.done <= issue.new_request | (wb.done & ~wb.ack);
    end

    assign wb.id = id;
    assign wb.rd = result;

endmodule

View file

@ -25,15 +25,30 @@ module div_unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
import opcodes::*;
(
input logic clk,
input logic rst,
input gc_outputs_t gc,
input logic instruction_issued_with_rd,
input decode_packet_t decode_stage,
output logic unit_needed,
output logic [REGFILE_READ_PORTS-1:0] uses_rs,
output logic uses_rd,
input issue_packet_t issue_stage,
input logic issue_stage_ready,
input rs_addr_t issue_rs_addr [REGFILE_READ_PORTS],
input logic [31:0] rf [REGFILE_READ_PORTS],
input div_inputs_t div_inputs,
unit_issue_interface.unit issue,
unit_writeback_interface.unit wb
);
common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode
logic mult_div_op;
logic signed_divop;
logic negate_quotient;
@ -52,74 +67,106 @@ module div_unit
typedef struct packed{
logic remainder_op;
logic negate_result;
logic divisor_is_zero;
logic reuse_result;
id_t id;
} div_attributes_t;
div_attributes_t wb_attr;
typedef struct packed{
logic [XLEN-1:0] unsigned_dividend;
logic [XLEN-1:0] unsigned_divisor;
logic [$clog2(32)-1:0] dividend_CLZ;
logic [$clog2(32)-1:0] divisor_CLZ;
logic divisor_is_zero;
logic reuse_result;
div_attributes_t attr;
} div_fifo_inputs_t;
div_fifo_inputs_t issue_fifo_inputs;
div_fifo_inputs_t div_stage;
div_attributes_t wb_attr;
unsigned_division_interface #(.DATA_WIDTH(32)) div_core();
logic in_progress;
logic div_done;
fifo_interface #(.DATA_WIDTH($bits(div_fifo_inputs_t))) input_fifo();
fifo_interface #(.DATA_WIDTH(XLEN)) wb_fifo();
////////////////////////////////////////////////////
//Implementation
fifo_interface #(.DATA_TYPE(div_fifo_inputs_t)) input_fifo();
function logic [31:0] negate_if (input logic [31:0] a, logic b);
return ({32{b}} ^ a) + 32'(b);
endfunction
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//Decode
assign unit_needed = decode_stage.instruction inside {DIV, DIVU, REM, REMU};
always_comb begin
uses_rs = '0;
uses_rs[RS1] = unit_needed;
uses_rs[RS2] = unit_needed;
uses_rd = unit_needed;
end
////////////////////////////////////////////////////
//Issue
////////////////////////////////////////////////////
//Result reuse (for div/rem pairs)
rs_addr_t prev_div_rs_addr [2];
logic [1:0] div_rd_match;
logic prev_div_result_valid;
logic div_rs_overwrite;
logic div_op_reuse;
always_ff @(posedge clk) begin
if (issue.new_request)
prev_div_rs_addr <= issue_rs_addr[RS1:RS2];
end
assign div_op_reuse = {prev_div_result_valid, prev_div_rs_addr[RS1], prev_div_rs_addr[RS2]} == {1'b1, issue_rs_addr[RS1],issue_rs_addr[RS2]};
//Clear if prev div inputs are overwritten by another instruction
assign div_rd_match[RS1] = (issue_stage.rd_addr == prev_div_rs_addr[RS1]);
assign div_rd_match[RS2] = (issue_stage.rd_addr == prev_div_rs_addr[RS2]);
assign div_rs_overwrite = |div_rd_match;
set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE(0)) prev_div_result_valid_m (
.clk, .rst,
.set(issue.new_request & ~((issue_stage.rd_addr == issue_rs_addr[RS1]) | (issue_stage.rd_addr == issue_rs_addr[RS2]))),
.clr((instruction_issued_with_rd & div_rs_overwrite) | gc.writeback_supress), //No instructions will be issued while gc.writeback_supress is asserted
.result(prev_div_result_valid)
);
////////////////////////////////////////////////////
//Input and output sign determination
assign signed_divop = ~div_inputs.op[0];
assign signed_divop = ~ issue_stage.fn3[0];
assign negate_dividend = signed_divop & div_inputs.rs1[31];
assign negate_divisor = signed_divop & div_inputs.rs2[31];
assign negate_dividend = signed_divop & rf[RS1][31];
assign negate_divisor = signed_divop & rf[RS2][31];
assign negate_quotient = signed_divop & (div_inputs.rs1[31] ^ div_inputs.rs2[31]);
assign negate_remainder = signed_divop & (div_inputs.rs1[31]);
assign negate_quotient = signed_divop & (rf[RS1][31] ^ rf[RS2][31]);
assign negate_remainder = signed_divop & (rf[RS1][31]);
////////////////////////////////////////////////////
//Input Processing
assign unsigned_dividend = negate_if (div_inputs.rs1, negate_dividend);
assign unsigned_divisor = negate_if (div_inputs.rs2, negate_divisor);
assign unsigned_dividend = negate_if (rf[RS1], negate_dividend);
assign unsigned_divisor = negate_if (rf[RS2], negate_divisor);
//Note: If this becomes the critical path, we can use the one's complemented input instead.
//It will potentially overestimate (only when the input is a negative power-of-two), and
//the divisor width will need to be increased by one to safely handle the case where the divisor CLZ is overestimated
clz dividend_clz_block (.clz_input(unsigned_dividend), .clz(dividend_CLZ));
clz divisor_clz_block (.clz_input(unsigned_divisor), .clz(divisor_CLZ));
assign divisor_is_zero = (&divisor_CLZ) & ~div_inputs.rs2[0];
assign issue_fifo_inputs.unsigned_dividend = unsigned_dividend;
assign issue_fifo_inputs.unsigned_divisor = unsigned_divisor;
assign issue_fifo_inputs.dividend_CLZ = divisor_is_zero ? '0 : dividend_CLZ;
assign issue_fifo_inputs.divisor_CLZ = divisor_CLZ;
assign issue_fifo_inputs.attr.remainder_op = div_inputs.op[1];
assign issue_fifo_inputs.attr.negate_result = div_inputs.op[1] ? negate_remainder : (negate_quotient & ~divisor_is_zero);
assign issue_fifo_inputs.attr.divisor_is_zero = divisor_is_zero;
assign issue_fifo_inputs.attr.reuse_result = div_inputs.reuse_result;
assign issue_fifo_inputs.attr.id = issue.id;
clz #(.WIDTH(32)) dividend_clz_block (
.clz_input(unsigned_dividend),
.clz(dividend_CLZ),
.zero()
);
clz #(.WIDTH(32)) divisor_clz_block (
.clz_input(unsigned_divisor),
.clz(divisor_CLZ),
.zero(divisor_is_zero)
);
////////////////////////////////////////////////////
//Input FIFO
//Currently just a register (DEPTH=1). As one div instruction can be in-progress
//and one in this input "fifo," we can support two in-flight div ops.
cva5_fifo #(.DATA_WIDTH($bits(div_fifo_inputs_t)), .FIFO_DEPTH(1))
cva5_fifo #(.DATA_TYPE(div_fifo_inputs_t), .FIFO_DEPTH(1))
div_input_fifo (
.clk (clk),
.rst (rst),
@ -129,17 +176,28 @@ module div_unit
logic div_ready;
assign div_ready = (~in_progress) | wb.ack;
assign input_fifo.data_in = issue_fifo_inputs;
assign input_fifo.data_in = '{
unsigned_dividend : unsigned_dividend,
unsigned_divisor : unsigned_divisor,
dividend_CLZ : divisor_is_zero ? '0 : dividend_CLZ,
divisor_CLZ : divisor_CLZ,
divisor_is_zero : divisor_is_zero,
reuse_result : div_op_reuse,
attr : '{
remainder_op : issue_stage.fn3[1],
negate_result : (issue_stage.fn3[1] ? negate_remainder : (negate_quotient & ~divisor_is_zero)),
id : issue.id
}
};
assign input_fifo.push = issue.new_request;
assign input_fifo.potential_push = issue.possible_issue;
assign issue.ready = ~input_fifo.full | (~in_progress);
assign input_fifo.pop = input_fifo.valid & div_ready;
assign div_stage = input_fifo.data_out;
////////////////////////////////////////////////////
//Control Signals
assign div_core.start = input_fifo.pop & ~div_stage.attr.reuse_result;
assign div_done = div_core.done | (input_fifo.pop & div_stage.attr.reuse_result);
assign div_core.start = input_fifo.pop & ~input_fifo.data_out.reuse_result;
assign div_done = div_core.done | (input_fifo.pop & input_fifo.data_out.reuse_result);
//If more than one cycle, set in_progress so that multiple div.start signals are not sent to the div unit.
set_clr_reg_with_rst #(.SET_OVER_CLR(1), .WIDTH(1), .RST_VALUE('0))
@ -151,16 +209,16 @@ module div_unit
);
always_ff @ (posedge clk) begin
if (input_fifo.pop)
wb_attr <= div_stage.attr;
wb_attr <= input_fifo.data_out.attr;
end
////////////////////////////////////////////////////
//Div core
assign div_core.dividend = div_stage.unsigned_dividend;
assign div_core.divisor = div_stage.unsigned_divisor;
assign div_core.dividend_CLZ = div_stage.dividend_CLZ;
assign div_core.divisor_CLZ = div_stage.divisor_CLZ;
assign div_core.divisor_is_zero = div_stage.attr.divisor_is_zero;
assign div_core.dividend = input_fifo.data_out.unsigned_dividend;
assign div_core.divisor = input_fifo.data_out.unsigned_divisor;
assign div_core.dividend_CLZ = input_fifo.data_out.dividend_CLZ;
assign div_core.divisor_CLZ = input_fifo.data_out.divisor_CLZ;
assign div_core.divisor_is_zero = input_fifo.data_out.divisor_is_zero;
div_core #(.DIV_WIDTH(32))
divider_block (

View file

@ -0,0 +1,73 @@
/*
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module carry_save_shift
    import fpu_types::*;

    #(
        parameter WIDTH = 32 //Includes the integer bit
    )(
        input logic[WIDTH-1:0] four_wsum, //Already shifted left twice by the CALLER (because of special initialization)
        input logic[WIDTH-3:0] wcarry, //Unshifted; the shift by two happens inside this module
        input logic[WIDTH-4:0] divisor,
        output logic[WIDTH-3:0] next_wsum,
        output logic[WIDTH-3:0] next_wcarry,
        output q_t next_q,
        output logic not_in_table //Only used for assertion
    );

    ////////////////////////////////////////////////////
    //One carry-save residual update step
    //The next quotient digit is looked up from the leading bits of the shifted sum and carry words
    //-(next_q * divisor) is then added to both words using a row of independent full adders

    //Positive digits subtract the divisor multiple: invert it and supply a carry in of 1
    logic carry_in;
    assign carry_in = (next_q == POS_ONE || next_q == POS_TWO);

    //Shift the carry word left twice; the vacated LSB holds the 2s complement carry in
    logic[WIDTH-1:0] four_wcarry;
    assign four_wcarry = {wcarry, 1'b0, carry_in};

    //Divisor multiple selected by the digit, already inverted for positive digits
    logic[WIDTH-3:0] neg_q_d;
    always_comb begin
        unique case (next_q)
            POS_TWO: neg_q_d = ~{divisor, 1'b0};
            NEG_TWO: neg_q_d = {divisor, 1'b0};
            POS_ONE: neg_q_d = ~{1'b0, divisor};
            ZERO: neg_q_d = '0;
            default: neg_q_d = {1'b0, divisor}; //NEG_ONE and any remaining encoding
        endcase
    end

    //Digit selection; the slice into the divisor starts one bit below its MSB
    q_lookup lut (
        .d(divisor[WIDTH-5 -: 3]),
        .ws(four_wsum[WIDTH-1 -: 7]),
        .wc(four_wcarry[WIDTH-1 -: 7]),
        .q(next_q),
        .not_in_table(not_in_table)
    );

    //Nothing carries into the lowest position
    assign next_wcarry[0] = 1'b0;

    //Full adder per bit position; each carry out lands one position higher in the carry word
    generate for (genvar i = 0; i < WIDTH-3; i++) begin : gen_carry_save_adder
        logic[1:0] column;
        assign column = four_wsum[i] + four_wcarry[i] + neg_q_d[i];
        assign next_wsum[i] = column[0];
        assign next_wcarry[i+1] = column[1];
    end endgenerate

    //Top position - only the sum bit is kept, the carry out is ignored
    assign next_wsum[WIDTH-3] = four_wsum[WIDTH-3] ^ four_wcarry[WIDTH-3] ^ neg_q_d[WIDTH-3];

endmodule

View file

@ -0,0 +1,174 @@
/*
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fp_div_core
    import fpu_types::*;

    (
        input logic clk,
        input logic rst,
        unsigned_division_interface.divider div
    );

    localparam DIV_WIDTH = div.DATA_WIDTH;
    localparam NUM_STEPS = (1+DIV_WIDTH)/2; //Two quotient bits are produced per step
    localparam COUNTER_WIDTH = $clog2(NUM_STEPS+3);
    localparam QUOTIENT_WIDTH = 2*NUM_STEPS+2;
    localparam DECIMAL_WIDTH = DIV_WIDTH+1;
    localparam RESIDUE_WIDTH = DIV_WIDTH+3;

    ////////////////////////////////////////////////////
    //Radix 4 divider
    //Follows the design in "Digital Arithmetic" by Ercegovac and Lang
    //Uses the digit set {-2, -1, 0, 1, 2}
    //The residual is held in carry-save form (four_wsum + wcarry) and the quotient is converted on the fly
    logic[RESIDUE_WIDTH-1:0] four_wsum; //Shifted left twice
    logic[DECIMAL_WIDTH-1:0] wcarry;
    logic[DECIMAL_WIDTH-2:0] divisor_r;
    logic[DECIMAL_WIDTH-1:0] next_wsum;
    logic[DECIMAL_WIDTH-1:0] next_wcarry;

    q_t current_q;
    q_t next_q;
    q_t muxed_q;

    logic[QUOTIENT_WIDTH-1:0] quotient;
    logic[QUOTIENT_WIDTH-1:0] quotient_m;
    logic[QUOTIENT_WIDTH-1:0] next_quotient;
    logic[QUOTIENT_WIDTH-1:0] next_quotient_m;

    //Assertions
    logic decremented_invalid;
    logic bad_quotient_digit;
    logic not_in_table;

    ////////////////////////////////////////////////////
    //Control logic
    //The counter leaves zero on start, counts up to NUM_STEPS+2 and then wraps back to zero
    logic [COUNTER_WIDTH-1:0] counter;
    logic counter_full;
    logic iterating;

    assign iterating = |counter;
    assign counter_full = counter == COUNTER_WIDTH'(NUM_STEPS+2);

    //done is asserted in the cycle after the counter reaches its final value
    always_ff @(posedge clk) begin
        if (rst)
            div.done <= 0;
        else
            div.done <= counter_full;
    end

    always_ff @(posedge clk) begin
        if (rst | counter_full)
            counter <= '0;
        else if (div.start | iterating)
            counter <= counter + 1;
    end

    ////////////////////////////////////////////////////
    //Residual registers
    //A start always takes priority over an iteration in progress
    always_ff @(posedge clk) begin
        if (rst) begin
            divisor_r <= '0;
            four_wsum <= '0;
            wcarry <= '0;
            current_q <= ZERO;
        end
        else if (div.start) begin
            divisor_r <= div.divisor;
            four_wsum <= {3'b0, div.dividend}; //First iteration doesn't shift the inputs
            wcarry <= '0;
            current_q <= ZERO;
        end
        else if (iterating) begin
            four_wsum <= {next_wsum, 2'b0};
            wcarry <= next_wcarry;
            current_q <= next_q;
        end
    end

    //Quotient registers, cleared by reset and by every start
    always_ff @(posedge clk) begin
        if (rst | div.start) begin
            quotient <= '0;
            quotient_m <= '0;
        end
        else if (iterating) begin
            quotient <= next_quotient;
            quotient_m <= next_quotient_m;
        end
    end

    assign div.quotient = quotient[QUOTIENT_WIDTH-2 -: DIV_WIDTH]; //Shift only once instead of twice because inputs are in the range 0.1X but the output can be X.XX

    //Carry save adder operating on shifted input
    carry_save_shift #(.WIDTH(RESIDUE_WIDTH)) partial_sum (
        .four_wsum(four_wsum),
        .wcarry(wcarry),
        .divisor(divisor_r),
        .next_wsum(next_wsum),
        .next_wcarry(next_wcarry),
        .next_q(next_q),
        .not_in_table(not_in_table)
    );

    //Digit conversion
    //Consumes the registered digit, which is adjusted on the last step when the residual is negative
    on_the_fly #(.WIDTH(QUOTIENT_WIDTH)) quotient_conv (
        .current_Q(quotient),
        .current_QM(quotient_m),
        .q(muxed_q),
        .next_Q(next_quotient),
        .next_QM(next_quotient_m),
        .bad_quotient_digit(bad_quotient_digit)
    );

    ////////////////////////////////////////////////////
    //Sign/zero detection using an adder
    //The alternative is a tree of generate/propagate blocks (see page 265 of "Digital Arithmetic" by Ercegovac and Lang)
    //For a 55 bit width, both have very similar delays but the tree uses slightly more resources
    logic is_negative;
    logic decrement_digit;
    logic[DECIMAL_WIDTH-1:0] sz_sum;

    assign sz_sum = four_wsum[RESIDUE_WIDTH-1:2] + wcarry;
    assign is_negative = sz_sum[DECIMAL_WIDTH-1];
    assign div.remainder = sz_sum[DIV_WIDTH-1:0];

    //A negative residual on the final step means the last digit must be lowered by one
    assign decrement_digit = counter_full & is_negative;

    always_comb begin
        muxed_q = current_q;
        decremented_invalid = 0;
        if (decrement_digit) begin //Subtract 1
            unique case (current_q)
                POS_TWO: muxed_q = POS_ONE;
                POS_ONE: muxed_q = ZERO;
                ZERO: muxed_q = NEG_ONE;
                NEG_ONE: muxed_q = NEG_TWO;
                NEG_TWO: muxed_q = NEG_THREE;
                default: decremented_invalid = 1; //For assertions only
            endcase
        end
    end

    ////////////////////////////////////////////////////
    //Assertions
    decrement_bad_quotient_assertion:
        assert property (@(posedge clk) disable iff (rst) (!(decremented_invalid)))
        else $error("Invalid decrement of quotient digit");

    decoding_bad_digit_assertion:
        assert property (@(posedge clk) disable iff (rst) (!(|counter & bad_quotient_digit)))
        else $error("Bad quotient digit for decoding");

    missed_lut_assertion:
        assert property (@(posedge clk) disable iff (rst) (!(|counter & not_in_table)))
        else $error("Sum out of range of quotient lookup");

endmodule

View file

@ -0,0 +1,81 @@
/*
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module on_the_fly
    import fpu_types::*;

    #(
        parameter WIDTH = 32
    )(
        input logic[WIDTH-1:0] current_Q, //Quotient accumulated so far
        input logic[WIDTH-1:0] current_QM, //Companion value; the update rules below keep it at current_Q - 1
        input q_t q, //Signed radix 4 digit to append
        output logic[WIDTH-1:0] next_Q,
        output logic[WIDTH-1:0] next_QM,
        output logic bad_quotient_digit //Only used for assertion
    );

    ////////////////////////////////////////////////////
    //On the fly conversion of signed radix 4 digits into a conventional binary quotient
    //Appending a digit never needs a carry propagation because both Q and Q-1 are tracked:
    //  next_Q  = 4*Q + q     = {Q,  q}     when q >= 0, otherwise {QM, 4-|q|}
    //  next_QM = 4*Q + q - 1 = {Q,  q-1}   when q >  0, otherwise {QM, 3-|q|}

    logic[1:0] qin; //Two bits appended to form next_Q
    logic[1:0] qmin; //Two bits appended to form next_QM

    always_comb begin
        bad_quotient_digit = 0;
        unique case (q)
            POS_TWO,
            NEG_TWO: begin
                qin = 2'b10;
                qmin = 2'b01;
            end
            NEG_ONE: begin
                qin = 2'b11;
                qmin = 2'b10;
            end
            ZERO: begin
                qin = 2'b00;
                qmin = 2'b11;
            end
            NEG_THREE,
            POS_ONE: begin
                qin = 2'b01;
                qmin = 2'b00;
            end
            default: begin
                qin = 2'bXX;
                qmin = 2'bXX;
                bad_quotient_digit = 1;
            end
        endcase
    end

    always_comb begin
        //Negative digits borrow from the upper bits, so they extend QM
        if (q == NEG_TWO || q == NEG_ONE || q == NEG_THREE)
            next_Q = {current_QM[WIDTH-3:0], qin};
        else
            next_Q = {current_Q[WIDTH-3:0], qin};

        //Every digit <= 0 extends QM here. This includes NEG_THREE: 4*Q - 3 - 1 = 4*(Q-1) + 0,
        //which previously selected current_Q and so produced a next_QM that was too large by 4
        if (q == NEG_TWO || q == NEG_ONE || q == ZERO || q == NEG_THREE)
            next_QM = {current_QM[WIDTH-3:0], qmin};
        else
            next_QM = {current_Q[WIDTH-3:0], qmin};
    end

endmodule

View file

@ -0,0 +1,745 @@
/*
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module q_lookup
    import fpu_types::*;

    (
        input logic[2:0] d,
        input logic[6:0] ws,
        input logic[6:0] wc,
        output q_t q,
        output logic not_in_table //Only used for assertion
    );

    //Residual estimate: 7 bit sum of the two input words (any carry out is dropped)
    logic[6:0] combined;
    assign combined = ws + wc;

    ////////////////////////////////////////////////////
    //Quotient digit selection
    //Table contents from "Digital Arithmetic" by Ercegovac and Lang
    //Each entry is a range over {d, combined}; the 3 bits before the underscore are d
    //Read as 7 bit 2s complement the covered estimates run from -44 (1010100) to +42 (0101010)
    //The ZERO region straddles the wrap from 1111111 to 0000000 and is therefore given as two ranges
    always_comb begin
        not_in_table = 0;
        unique case ({d, combined}) inside
            //d = 000
            [10'b000_1010100 : 10'b000_1110010]: q = NEG_TWO;
            [10'b000_1110011 : 10'b000_1111011]: q = NEG_ONE;
            [10'b000_1111100 : 10'b000_1111111],
            [10'b000_0000000 : 10'b000_0000011]: q = ZERO;
            [10'b000_0000100 : 10'b000_0001011]: q = POS_ONE;
            [10'b000_0001100 : 10'b000_0101010]: q = POS_TWO;

            //d = 001
            [10'b001_1010100 : 10'b001_1110000]: q = NEG_TWO;
            [10'b001_1110001 : 10'b001_1111001]: q = NEG_ONE;
            [10'b001_1111010 : 10'b001_1111111],
            [10'b001_0000000 : 10'b001_0000011]: q = ZERO;
            [10'b001_0000100 : 10'b001_0001101]: q = POS_ONE;
            [10'b001_0001110 : 10'b001_0101010]: q = POS_TWO;

            //d = 010
            [10'b010_1010100 : 10'b010_1101111]: q = NEG_TWO;
            [10'b010_1110000 : 10'b010_1111001]: q = NEG_ONE;
            [10'b010_1111010 : 10'b010_1111111],
            [10'b010_0000000 : 10'b010_0000011]: q = ZERO;
            [10'b010_0000100 : 10'b010_0001110]: q = POS_ONE;
            [10'b010_0001111 : 10'b010_0101010]: q = POS_TWO;

            //d = 011
            [10'b011_1010100 : 10'b011_1101101]: q = NEG_TWO;
            [10'b011_1101110 : 10'b011_1111001]: q = NEG_ONE;
            [10'b011_1111010 : 10'b011_1111111],
            [10'b011_0000000 : 10'b011_0000011]: q = ZERO;
            [10'b011_0000100 : 10'b011_0001111]: q = POS_ONE;
            [10'b011_0010000 : 10'b011_0101010]: q = POS_TWO;

            //d = 100
            [10'b100_1010100 : 10'b100_1101011]: q = NEG_TWO;
            [10'b100_1101100 : 10'b100_1110111]: q = NEG_ONE;
            [10'b100_1111000 : 10'b100_1111111],
            [10'b100_0000000 : 10'b100_0000101]: q = ZERO;
            [10'b100_0000110 : 10'b100_0010001]: q = POS_ONE;
            [10'b100_0010010 : 10'b100_0101010]: q = POS_TWO;

            //d = 101
            [10'b101_1010100 : 10'b101_1101011]: q = NEG_TWO;
            [10'b101_1101100 : 10'b101_1110111]: q = NEG_ONE;
            [10'b101_1111000 : 10'b101_1111111],
            [10'b101_0000000 : 10'b101_0000101]: q = ZERO;
            [10'b101_0000110 : 10'b101_0010011]: q = POS_ONE;
            [10'b101_0010100 : 10'b101_0101010]: q = POS_TWO;

            //d = 110
            [10'b110_1010100 : 10'b110_1101001]: q = NEG_TWO;
            [10'b110_1101010 : 10'b110_1110111]: q = NEG_ONE;
            [10'b110_1111000 : 10'b110_1111111],
            [10'b110_0000000 : 10'b110_0000111]: q = ZERO;
            [10'b110_0001000 : 10'b110_0010011]: q = POS_ONE;
            [10'b110_0010100 : 10'b110_0101010]: q = POS_TWO;

            //d = 111
            [10'b111_1010100 : 10'b111_1100111]: q = NEG_TWO;
            [10'b111_1101000 : 10'b111_1110111]: q = NEG_ONE;
            [10'b111_1111000 : 10'b111_1111111],
            [10'b111_0000000 : 10'b111_0000111]: q = ZERO;
            [10'b111_0001000 : 10'b111_0010111]: q = POS_ONE;
            [10'b111_0011000 : 10'b111_0101010]: q = POS_TWO;

            default: begin
                q = q_t'(3'bXXX); //This prevents the tool from creating potentially costly default behaviour
                not_in_table = 1; //For assertions only
            end
        endcase
    end

endmodule

View file

@ -0,0 +1,293 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fp_add
import cva5_config::*;
import cva5_types::*;
import fpu_types::*;
(
input logic clk,
input logic rst,
input fp_add_inputs_t args,
unit_issue_interface.unit issue,
fp_intermediate_wb_interface.unit wb
);
logic advance_to_add;
logic advance_to_final;
/////////////////////////////////////////////
//Cycle 1
//Swap and align arguments
//Also detect special cases
logic temp_rs2_sign;
assign temp_rs2_sign = args.add ? args.rs2.d.sign : ~args.rs2.d.sign;
//Special case handling
logic nv[2:0];
logic inf[2:0];
logic qnan[1:0];
logic subtract[2:0];
logic zero_result_sign[2:0];
logic inf_sign[2:0];
//SNAN or "magnitude subtraction of infinities"
assign nv[0] = args.rs1_special_case.snan | args.rs2_special_case.snan | (args.rs1_special_case.inf & args.rs2_special_case.inf & (args.rs1.d.sign ^ temp_rs2_sign));
assign qnan[0] = args.rs1_special_case.snan | args.rs1_special_case.qnan | args.rs2_special_case.snan | args.rs2_special_case.qnan | nv[0];
assign inf[0] = (args.rs1_special_case.inf | args.rs2_special_case.inf) & ~qnan[0];
assign inf_sign[0] = args.rs1_special_case.inf ? args.rs1.d.sign : temp_rs2_sign;
assign subtract[0] = args.rs1.d.sign ^ temp_rs2_sign;
assign zero_result_sign[0] = args.rm == 3'b010;
//Swap arguments, moving input with larger expo to rs1
logic rs1_sign[2:0];
expo_d_t rs1_expo[2:0];
logic rs1_expo_overflow[2:0];
logic[FRAC_WIDTH+1:0] rs1_frac[1:0];
logic[FRAC_WIDTH+1:0] rs2_frac[0:0];
grs_t rs1_grs[1:0];
grs_t temp_rs2_grs;
always_comb begin
if (~args.swap) begin
rs1_sign[0] = args.rs1.d.sign;
rs1_expo_overflow[0] = args.rs1_expo_overflow;
rs1_expo[0] = args.rs1.d.expo;
rs1_frac[0] = {args.rs1_safe, args.rs1_hidden, args.rs1.d.frac};
rs1_grs[0] = args.fp_add_grs;
rs2_frac[0] = {args.rs2_safe, args.rs2_hidden, args.rs2.d.frac};
temp_rs2_grs = '0;
end else begin
rs1_sign[0] = temp_rs2_sign;
rs1_expo_overflow[0] = 1'b0;
rs1_expo[0] = args.rs2.d.expo;
rs1_frac[0] = {args.rs2_safe, args.rs2_hidden, args.rs2.d.frac};
rs1_grs[0] = '0;
rs2_frac[0] = {args.rs1_safe, args.rs1_hidden, args.rs1.d.frac};
temp_rs2_grs = args.fp_add_grs;
end
end
//Alignment through shifting
logic shift_sticky[1:0];
logic[FRAC_WIDTH+1:0] rs2_frac_aligned[1:0];
grs_t rs2_grs[1:0];
logic[FRAC_WIDTH+GRS_WIDTH+1:0] shifter_input;
assign shifter_input = {rs2_frac[0], temp_rs2_grs};
assign {rs2_frac_aligned[0], rs2_grs[0]} = shifter_input >> args.expo_diff;
//If the shift amount is too large, bits might get shifted out so this checks for them
fp_sticky_tracking #(.INPUT_WIDTH($bits(shifter_input)), .SHIFT_WIDTH(EXPO_WIDTH+1)) sticky_tracking (
.shifter_input(shifter_input),
.shift_amount(args.expo_diff),
.sticky_bit(shift_sticky[0])
);
//Pipeline to next stage
logic valid_r;
rm_t rm_r;
id_t id_r;
logic d2s_r;
assign advance_to_add = ~valid_r | advance_to_final;
always_ff @(posedge clk) begin
if (rst)
valid_r <= 0;
else if (advance_to_add)
valid_r <= issue.new_request;
if (advance_to_add) begin
d2s_r <= args.single;
id_r <= issue.id;
rm_r <= args.rm;
nv[1] <= nv[0];
qnan[1] <= qnan[0];
inf[1] <= inf[0];
inf_sign[1] <= inf_sign[0];
subtract[1] <= subtract[0];
zero_result_sign[1] <= zero_result_sign[0];
rs1_sign[1] <= rs1_sign[0];
rs1_expo[1] <= rs1_expo[0];
rs1_expo_overflow[1] <= rs1_expo_overflow[0];
rs1_frac[1] <= rs1_frac[0];
rs1_grs[1] <= rs1_grs[0];
rs2_grs[1] <= rs2_grs[0];
rs2_frac_aligned[1] <= rs2_frac_aligned[0];
shift_sticky[1] <= shift_sticky[0];
end
end
/////////////////////////////////////////////
//Cycle 2
//Perform the sign-magnitude mantissa addition
//Coded as an adder followed by negation, but the tools will transform this into two parallel additions with a muxing of the result
//Negation is only required for different sign addition that returns a negative result
logic[FRAC_WIDTH+GRS_WIDTH+2:0] adder_in1;
logic[FRAC_WIDTH+GRS_WIDTH+2:0] adder_in2, adder_in2_1s;
logic carry_add;
grs_t grs_add;
logic[FRAC_WIDTH+1:0] frac_add;
logic sticky_add;
logic[1+GRS_WIDTH+FRAC_WIDTH+2-1:0] sum;
logic[1+GRS_WIDTH+FRAC_WIDTH+2-1:0] sum_final;
assign adder_in2 = {rs2_frac_aligned[1], rs2_grs[1], shift_sticky[1]};
assign adder_in2_1s = adder_in2 ^ {(FRAC_WIDTH+GRS_WIDTH+3){subtract[1]}};
assign adder_in1 = {rs1_frac[1], rs1_grs[1], 1'b0};
assign {carry_add, sum} = adder_in1 + adder_in2_1s + {{(FRAC_WIDTH+GRS_WIDTH+2){1'b0}}, subtract[1]};
//subtract & ~carry_add = 1 if subtract and adder_in1 > adder_in2_1s, 0 if adder_in1 < adder_in2_1s
assign sum_final = ~carry_add & subtract[1] ? -sum : sum;
assign {frac_add, grs_add, sticky_add} = sum_final;
//Pipeline to next stage
logic[FRAC_WIDTH+1:0] result_frac;
grs_t result_grs;
logic result_carry_out;
logic output_special;
logic result_expo_zero;
assign advance_to_final = wb.ack | ~wb.done;
always_ff @ (posedge clk) begin
if (rst)
wb.done <= 0;
else if (advance_to_final)
wb.done <= valid_r;
if (advance_to_final) begin
wb.d2s <= d2s_r;
wb.id <= id_r;
wb.rm <= rm_r;
nv[2] <= nv[1];
inf[2] <= inf[1];
inf_sign[2] <= inf_sign[1];
output_special <= inf[1] | qnan[1];
subtract[2] <= subtract[1];
zero_result_sign[2] <= zero_result_sign[1];
rs1_sign[2] <= rs1_sign[1];
rs1_expo[2] <= rs1_expo[1];
result_expo_zero <= ~|rs1_expo[1];
rs1_expo_overflow[2] <= rs1_expo_overflow[1];
result_frac <= frac_add;
result_carry_out <= carry_add;
result_grs[GRS_WIDTH-1:1] <= grs_add[GRS_WIDTH-1:1];
result_grs[0] <= grs_add[0] | sticky_add; //Don't lose the sticky
end
end
/////////////////////////////////////////////
//Cycle 3
//Find CLZ and determine shift amount
//Override on special case and drive outputs
logic result_zero;
logic[$clog2(FRAC_WIDTH+1+GRS_WIDTH)-1:0] clz_count;
clz #(.WIDTH(FRAC_WIDTH+1+GRS_WIDTH)) shift_clz (
.clz_input({result_frac[FRAC_WIDTH:0], result_grs}),
.clz(clz_count),
.zero(result_zero)
);
//Determine exponent and sign
logic carry_set;
logic output_zero;
logic result_sign;
logic result_expo_overflow;
expo_d_t result_expo;
fp_shift_amt_t clz_shift_amt;
assign carry_set = ~subtract[2] & result_carry_out;
assign output_zero = result_zero & ~result_frac[FRAC_WIDTH+1] & ~carry_set;
assign result_sign = output_zero & subtract[2] ? zero_result_sign[2] : (~result_carry_out & subtract[2]) ^ rs1_sign[2];
assign result_expo_overflow = ~output_zero & rs1_expo_overflow[2];
always_comb begin
clz_shift_amt = '0;
clz_shift_amt[$bits(clz_count)-1:0] = clz_count;
if (output_zero)
result_expo = '0;
else if (result_expo_zero & (result_frac[FRAC_WIDTH] | carry_set | result_frac[FRAC_WIDTH+1])) //Subnormal promotion
result_expo = 1; //Will be added to the right shift amount to get the correct exponent
else if (clz_shift_amt >= rs1_expo[2] & ~result_expo_zero & ~result_frac[FRAC_WIDTH+1] & ~carry_set) //Subnormal demotion
result_expo = rs1_expo[2] - 1;
else
result_expo = rs1_expo[2];
end
fp_t special_result;
always_comb begin
if (inf[2]) begin
special_result.d.sign = inf_sign[2];
special_result.d.expo = '1;
special_result.d.frac = '0;
end
else //qnan
special_result.raw = CANONICAL_NAN;
end
//Writeback
assign issue.ready = advance_to_add;
assign wb.fflags.nv = nv[2];
assign wb.fflags.of = 0;
assign wb.fflags.uf = 0;
assign wb.fflags.dz = 0;
assign wb.fflags.nx = 0; //Will be set by normalization
assign wb.carry = ~output_special & carry_set;
assign wb.safe = result_frac[FRAC_WIDTH+1] & ~output_special;
assign wb.hidden = result_frac[FRAC_WIDTH] | output_special;
assign wb.grs = output_special ? '0 : result_grs;
always_comb begin
wb.clz = '0;
if (~output_zero & ~output_special)
wb.clz[$bits(clz_count)-1:0] = clz_count;
if (output_special)
wb.rd = special_result;
else begin
wb.rd.d.sign = result_sign;
wb.rd.d.expo = result_expo;
wb.rd.d.frac = result_frac[FRAC_WIDTH-1:0];
end
end
assign wb.expo_overflow = result_expo_overflow & ~output_special;
assign wb.subnormal = ~|result_expo & ~output_special & ~wb.right_shift & ~result_expo_overflow;
assign wb.right_shift = ~output_special & (result_frac[FRAC_WIDTH+1] | carry_set);
assign wb.right_shift_amt = {{(EXPO_WIDTH-2){1'b0}}, carry_set, result_frac[FRAC_WIDTH+1] & ~carry_set}; //Either 1 or 2
assign wb.ignore_max_expo = output_special;
endmodule

View file

@ -0,0 +1,203 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Floating point division unit
//Holds a single operation at a time: a new request is accepted when idle or in the cycle the previous result is acknowledged
//Special cases (NaN, infinity, zero) skip the iterative divider core and are flagged by early_exit from the cycle after issue until acknowledged
//Regular operations are flagged done by the divider core (div.done), held by div_hold until acknowledged
//The of/uf/nx flags are driven to 0 here
module fp_div
    import cva5_config::*;
    import fpu_types::*;
(
    input logic clk,
    input logic rst,
    input fp_div_inputs_t args,
    unit_issue_interface.unit issue,
    fp_intermediate_wb_interface.unit wb
);

    unsigned_division_interface #(.DATA_WIDTH(FRAC_WIDTH+3)) div();

    ////////////////////////////////////////////////////
    //Implementation
    //Iterative divider core, bypassed on special cases
    logic result_sign;
    logic busy;

    //Back-to-back operation: a new request may arrive in the same cycle as the ack of the previous result
    assign issue.ready = ~busy | wb.ack;

    always_ff @(posedge clk) begin
        if (rst)
            busy <= 0;
        else begin
            if (wb.ack)
                busy <= 0;
            if (issue.new_request) //Last assignment wins, so a new request overrides a simultaneous ack
                busy <= 1;
        end

        if (issue.new_request)
            result_sign <= args.rs1.d.sign ^ args.rs2.d.sign;
    end

    ////////////////////////////////////////////////////
    //Special cases
    //Edge cases like NaN, infinity, and zero don't require division so return immediately
    logic nv, nv_r;
    logic dz, dz_r;
    logic qnan, qnan_r;
    logic inf;
    logic zero, zero_r;
    logic early_exit;
    fp_t special_result;

    //Special case handling
    //Invalid: 0/0, inf/inf, or any signalling NaN operand
    assign nv = (args.rs1_special_case.zero & args.rs2_special_case.zero) | (args.rs1_special_case.inf & args.rs2_special_case.inf) | args.rs1_special_case.snan | args.rs2_special_case.snan;
    //Divide by zero: rs2 is zero and rs1 has no special case bit set
    assign dz = ~|args.rs1_special_case & args.rs2_special_case.zero;
    assign qnan = nv | args.rs1_special_case.qnan | args.rs2_special_case.qnan;
    assign inf = ~qnan & (dz | args.rs1_special_case.inf);
    assign zero = ~qnan & (args.rs1_special_case.zero | args.rs2_special_case.inf);

    always_ff @(posedge clk) begin
        //A new request must take priority over the ack: issue.ready permits both in the same cycle,
        //and the divider core is not started for special cases, so early_exit is the only source of wb.done for them
        //Clearing on ack alone is safe because no request can be accepted while a result is pending without an ack
        if (rst)
            early_exit <= 0;
        else if (issue.new_request)
            early_exit <= qnan | inf | zero;
        else if (wb.ack)
            early_exit <= 0;

        if (issue.new_request) begin
            nv_r <= nv;
            dz_r <= dz;
            qnan_r <= qnan;
            zero_r <= zero;
        end
    end

    //Priority: zero, then canonical NaN, otherwise infinity
    always_comb begin
        if (zero_r) begin
            special_result.d.sign = result_sign;
            special_result.raw[FLEN-2:0] = '0;
        end
        else if (qnan_r)
            special_result.raw = CANONICAL_NAN;
        else begin
            special_result.d.sign = result_sign;
            special_result.d.expo = '1;
            special_result.d.frac = '0;
        end
    end

    ////////////////////////////////////////////////////
    //Mantissa division core
    //Designed to be swappable (though note that only a subset of the division interface ports are used)
    //Operates on normalized values and width is extended to compute guard/round/sticky
    logic result_hidden;
    frac_d_t result_frac;
    logic[1:0] result_gr;
    fp_shift_amt_t left_shift_amt;

    //Hidden bit is forced to 1 and two zero bits are appended below the fraction
    assign div.dividend = {1'b1, args.rs1.d.frac, 2'b0};
    assign div.divisor = {1'b1, args.rs2.d.frac, 2'b0};
    assign div.start = issue.new_request & ~(qnan | inf | zero); //start div only if no special cases
    assign {result_hidden, result_frac, result_gr} = div.quotient;

    fp_div_core div_core (
        .div(div),
    .*);

    //Calculate CLZ: because 0.5 < result < 2, the shift amount is either 0 or 1
    assign left_shift_amt[EXPO_WIDTH-1:1] = '0;
    assign left_shift_amt[0] = ~result_hidden;

    ////////////////////////////////////////////////////
    //Exponent handling
    //Subtract exponents
    //Special considerations for subnormal numbers
    logic right_shift;
    fp_shift_amt_t right_shift_amt;
    logic[EXPO_WIDTH+1:0] expo_intermediate;
    logic[EXPO_WIDTH+1:0] expo_intermediate_r;

    //Each operand exponent is incremented when its hidden bit is clear and reduced by its prenormalization shift
    //The difference is rebiased; the two extra MSBs leave room for a negative or too-large result
    assign expo_intermediate =
        ({1'b0, args.rs1.d.expo} + {{EXPO_WIDTH{1'b0}}, ~args.rs1_hidden} - {1'b0, args.rs1_prenormalize_shift_amt}) -
        ({1'b0, args.rs2.d.expo} + {{EXPO_WIDTH{1'b0}}, ~args.rs2_hidden} - {1'b0, args.rs2_prenormalize_shift_amt})
        + BIAS;

    //Right shift when the exponent is negative (MSB set), zero, or exactly 1 with a clear quotient hidden bit
    assign right_shift = expo_intermediate_r[EXPO_WIDTH+1] | (~|expo_intermediate_r[EXPO_WIDTH:1] & ((~result_hidden & expo_intermediate_r[0]) | ~expo_intermediate_r[0]));
    //Equivalent to (1 - exponent) truncated to the shift amount width
    assign right_shift_amt = ~expo_intermediate_r[EXPO_WIDTH-1:0] + 2;

    always_ff @(posedge clk) begin
        if (issue.new_request)
            expo_intermediate_r <= expo_intermediate;
    end

    ////////////////////////////////////////////////////
    //Output management
    //Either return the early exit values (held until acknowledged), or the regular values once the divider finishes
    logic div_hold;

    assign wb.done = div.done | div_hold | early_exit;

    //Remember a divider completion that was not acknowledged in the cycle it was signalled
    always_ff @(posedge clk) begin
        if (rst)
            div_hold <= 0;
        else
            div_hold <= ~wb.ack & (div.done | div_hold);
    end

    always_ff @(posedge clk) begin
        if (issue.new_request) begin
            wb.id <= issue.id;
            wb.rm <= args.rm;
            wb.d2s <= args.single;
        end
    end

    //early_exit (rather than a one cycle pulse after issue) selects the special result so that
    //the outputs stay valid for as long as wb.done is held waiting for the ack
    always_comb begin
        if (early_exit)
            wb.rd = special_result;
        else begin
            wb.rd.d.sign = result_sign;
            wb.rd.d.expo = expo_intermediate_r[EXPO_WIDTH-1:0];
            wb.rd.d.frac = result_frac;
        end
    end

    //Note that this overflow detection also captures subnormal numbers but they are ignored when subnormal is set
    assign wb.expo_overflow = expo_intermediate_r[EXPO_WIDTH] & ~early_exit;
    assign wb.fflags.nv = nv_r;
    assign wb.fflags.dz = dz_r;
    //Set in writeback
    assign wb.fflags.of = 0;
    assign wb.fflags.uf = 0;
    assign wb.fflags.nx = 0;
    assign wb.carry = 0;
    assign wb.safe = 0;
    //Special results have the hidden bit set unless the result is zero
    assign wb.hidden = (early_exit & ~zero_r) | (~early_exit & result_hidden);
    //The remainder is appended below guard/round so any nonzero remainder contributes to sticky
    assign wb.grs = early_exit ? '0 : {result_gr, div.remainder, {(GRS_WIDTH-FRAC_WIDTH-5){1'b0}}};
    assign wb.clz = early_exit ? '0 : left_shift_amt;
    assign wb.subnormal = ~early_exit & right_shift;
    assign wb.right_shift = ~early_exit & right_shift;
    assign wb.right_shift_amt = right_shift_amt;
    assign wb.ignore_max_expo = early_exit;

endmodule

View file

@ -0,0 +1,106 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Combines the divider and square root units behind one intermediate writeback port
//Each unit keeps its own issue interface; only the writeback side is shared
module fp_div_sqrt_wrapper
    import cva5_config::*;
    import fpu_types::*;
(
    input logic clk,
    input logic rst,
    input fp_div_inputs_t div_inputs,
    input fp_sqrt_inputs_t sqrt_inputs,
    unit_issue_interface.unit div_issue,
    unit_issue_interface.unit sqrt_issue,
    fp_intermediate_wb_interface.unit wb
);

    fp_intermediate_wb_interface div_wb();
    fp_intermediate_wb_interface sqrt_wb();

    ////////////////////////////////////////////////////
    //Implementation
    //Two independent units, one shared writeback port
    fp_sqrt sqrt (
        .args(sqrt_inputs),
        .issue(sqrt_issue),
        .wb(sqrt_wb),
    .*);

    fp_div div (
        .args(div_inputs),
        .issue(div_issue),
        .wb(div_wb),
    .*);

    //Writeback arbitration
    //The divider drives the shared port by default and is overridden whenever the square root unit is done
    //(SQRT wins ties because of its longer latency)
    always_comb begin
        //Only the unit currently driving the port sees the acknowledgement
        div_wb.ack = wb.ack & ~sqrt_wb.done;
        sqrt_wb.ack = wb.ack & sqrt_wb.done;

        //Default: forward the divider
        wb.done = div_wb.done;
        wb.id = div_wb.id;
        wb.rm = div_wb.rm;
        wb.d2s = div_wb.d2s;
        wb.fflags = div_wb.fflags;
        wb.rd = div_wb.rd;
        wb.expo_overflow = div_wb.expo_overflow;
        wb.carry = div_wb.carry;
        wb.safe = div_wb.safe;
        wb.hidden = div_wb.hidden;
        wb.clz = div_wb.clz;
        wb.subnormal = div_wb.subnormal;
        wb.right_shift = div_wb.right_shift;
        wb.right_shift_amt = div_wb.right_shift_amt;
        wb.ignore_max_expo = div_wb.ignore_max_expo;
        //Collapsing the low bits into one sticky avoids a wide 2:1 mux
        //The divider keeps its top three bits because its result can still be left shifted by 1
        wb.grs = '0;
        wb.grs[GRS_WIDTH-1-:3] = div_wb.grs[GRS_WIDTH-1-:3];
        wb.grs[GRS_WIDTH-4] = |div_wb.grs[GRS_WIDTH-4:0];

        //Override: forward the square root unit
        if (sqrt_wb.done) begin
            wb.done = 1;
            wb.id = sqrt_wb.id;
            wb.rm = sqrt_wb.rm;
            wb.d2s = sqrt_wb.d2s;
            wb.fflags = sqrt_wb.fflags;
            wb.rd = sqrt_wb.rd;
            wb.expo_overflow = sqrt_wb.expo_overflow;
            wb.carry = sqrt_wb.carry;
            wb.safe = sqrt_wb.safe;
            wb.hidden = sqrt_wb.hidden;
            wb.clz = sqrt_wb.clz;
            wb.subnormal = sqrt_wb.subnormal;
            wb.right_shift = sqrt_wb.right_shift;
            wb.right_shift_amt = sqrt_wb.right_shift_amt;
            wb.ignore_max_expo = sqrt_wb.ignore_max_expo;
            //Only the top two bits are kept separately; everything below becomes the sticky
            wb.grs = '0;
            wb.grs[GRS_WIDTH-1-:2] = sqrt_wb.grs[GRS_WIDTH-1-:2];
            wb.grs[GRS_WIDTH-3] = |sqrt_wb.grs[GRS_WIDTH-3:0];
        end
    end

endmodule

View file

@ -0,0 +1,101 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Couples the multiplier and the adder so that together they execute multiply, add, and fused multiply-add
//Multiplies write back on mul_wb; adds and the second half of an FMA write back on madd_wb
module fp_madd_wrapper
    import cva5_config::*;
    import fpu_types::*;
    import cva5_types::*;
#(
    parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
    input logic clk,
    input logic rst,
    input fp_madd_inputs_t args,
    unit_issue_interface.unit issue,
    fp_intermediate_wb_interface.unit madd_wb,
    fp_intermediate_wb_interface.unit mul_wb
);

    unit_issue_interface mul_issue();
    unit_issue_interface add_issue();

    //Adder operands produced by the multiplier for an FMA, before and after the holding register
    fp_add_inputs_t mul_to_add_args;
    fp_add_inputs_t held_add_args;
    id_t mul_to_add_id;
    id_t held_add_id;
    logic mul_to_add_valid;
    logic held_valid;
    logic hold_advance;

    /////////////////////////////////////////////
    //Issue routing
    //Anything that is not a plain add enters the multiplier
    //A plain add is only accepted when the adder is ready and no FMA is waiting in the holding register
    assign mul_issue.id = issue.id;
    assign mul_issue.new_request = ~args.add & issue.new_request;
    assign issue.ready = (~args.add & mul_issue.ready) | (args.add & add_issue.ready & ~held_valid);

    /////////////////////////////////////////////
    //Multiplication unit
    //Plain multiplies leave through mul_wb; FMA operands leave through the add_* ports
    fp_mul #(.CONFIG(CONFIG)) mul_core (
        .mul_args(args.mul_args),
        .fma(args.fma),
        .fma_args(args.fma_args),
        .issue(mul_issue),
        .wb(mul_wb),
        .add_ready(hold_advance),
        .add_valid(mul_to_add_valid),
        .add_id(mul_to_add_id),
        .add_args(mul_to_add_args),
    .*);

    /////////////////////////////////////////////
    //Holding register between multiplier and adder
    //It would probably be possible to use the multiplier outputs directly without registering if some of the exponent logic in the multiplier was pushed to an earlier cycle
    //The register may be reloaded when it is empty or when the adder takes its contents
    assign hold_advance = ~held_valid | add_issue.ready;

    always_ff @(posedge clk) begin
        if (rst)
            held_valid <= 0;
        else if (hold_advance)
            held_valid <= mul_to_add_valid;
    end

    //Payload is not reset
    always_ff @(posedge clk) begin
        if (hold_advance) begin
            held_add_args <= mul_to_add_args;
            held_add_id <= mul_to_add_id;
        end
    end

    /////////////////////////////////////////////
    //Addition unit
    //A held FMA always wins over a newly issued add instruction
    fp_add_inputs_t adder_args;

    assign add_issue.new_request = held_valid | (issue.new_request & args.add);
    assign add_issue.id = held_valid ? held_add_id : issue.id;
    assign adder_args = held_valid ? held_add_args : args.add_args;

    fp_add add_core (
        .args(adder_args),
        .issue(add_issue),
        .wb(madd_wb),
    .*);

endmodule

View file

@ -0,0 +1,298 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Floating point multiplier, also producing the adder operands for fused multiply-add (FMA)
//Signals carried through the pipeline are arrays: index [0] is the issue cycle value,
//[1] is registered when advance_to_mul2 is set and [2] is registered when advance_to_final is set
//An operation issued with fma clear completes on wb; one issued with fma set is instead offered on add_valid/add_id/add_args
module fp_mul
import cva5_config::*;
import cva5_types::*;
import fpu_types::*;
#(
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
//Multiplication operands and, when fma is set, the extra information needed for the following addition
input fp_mul_inputs_t mul_args,
input logic fma,
input fp_fma_inputs_t fma_args,
unit_issue_interface.unit issue,
//Writeback port for plain multiplications
fp_intermediate_wb_interface.unit wb,
//Valid/ready handshake handing the product and rs3 to the adder for FMA
input logic add_ready,
output logic add_valid,
output id_t add_id,
output fp_add_inputs_t add_args
);
//Number of product bits below the fraction that are forwarded in the upper half of wb.grs
localparam HALF_GRS_WIDTH = GRS_WIDTH/2;
logic advance_to_mul2;
logic advance_to_final;
/////////////////////////////////////////////
//Cycle 1
//Half of the multiplication
//Special case detection
logic nv[2:0];
logic inf[2:0];
logic qnan[2:0];
logic true_zero[2:0];
logic subnormal_zero[2:0];
//Invalid: 0*inf, inf*0, or any signalling NaN operand
assign nv[0] = (mul_args.rs1_special_case.zero & mul_args.rs2_special_case.inf) | (mul_args.rs1_special_case.inf & mul_args.rs2_special_case.zero) | mul_args.rs1_special_case.snan | mul_args.rs2_special_case.snan;
assign qnan[0] = nv[0] | mul_args.rs1_special_case.snan | mul_args.rs1_special_case.qnan | mul_args.rs2_special_case.snan | mul_args.rs2_special_case.qnan;
assign inf[0] = ((mul_args.rs1_special_case.inf & ~mul_args.rs2_special_case.zero) | (~mul_args.rs1_special_case.zero & mul_args.rs2_special_case.inf)) & ~qnan[0];
assign true_zero[0] = (mul_args.rs1_special_case.zero | mul_args.rs2_special_case.zero) & ~qnan[0];
//The exponent logic can only handle 1 subnormal argument. 2 subnormals produces 0 mantissa but a set sticky bit
assign subnormal_zero[0] = ~mul_args.rs1_hidden & ~mul_args.rs2_hidden & ~true_zero[0];
//Unpacking
id_t id[2:0];
rm_t rm[2:0];
logic d2s[2:0];
logic sign_xor[2:0];
expo_d_t rs1_expo[1:0];
expo_d_t rs2_expo[1:0];
fp_shift_amt_t rs2_prenormalize_shift_amt[1:0];
fp_fma_inputs_t fma_info[2:0];
assign id[0] = issue.id;
assign rm[0] = mul_args.rm;
assign d2s[0] = mul_args.single;
assign sign_xor[0] = mul_args.rs1.d.sign ^ mul_args.rs2.d.sign;
assign rs1_expo[0] = mul_args.rs1.d.expo;
//rs2 exponent is incremented when its hidden bit is clear
assign rs2_expo[0] = mul_args.rs2.d.expo + {{(EXPO_WIDTH-1){1'b0}}, ~mul_args.rs2_hidden};
assign rs2_prenormalize_shift_amt[0] = mul_args.rs2_prenormalize_shift_amt;
assign fma_info[0] = fma_args;
//Pipelining
logic valid_r;
logic fma_r;
//The first stage register can load when it is empty or when its contents move on
assign advance_to_mul2 = ~valid_r | advance_to_final;
always_ff @(posedge clk) begin
if (rst)
valid_r <= 0;
else if (advance_to_mul2)
valid_r <= issue.new_request;
if (advance_to_mul2) begin
fma_r <= fma;
id[1] <= id[0];
rm[1] <= rm[0];
d2s[1] <= d2s[0];
sign_xor[1] <= sign_xor[0];
rs1_expo[1] <= rs1_expo[0];
rs2_expo[1] <= rs2_expo[0];
rs2_prenormalize_shift_amt[1] <= rs2_prenormalize_shift_amt[0];
fma_info[1] <= fma_info[0];
nv[1] <= nv[0];
qnan[1] <= qnan[0];
inf[1] <= inf[0];
true_zero[1] <= true_zero[0];
subnormal_zero[1] <= subnormal_zero[0];
end
end
////////////////////////////////////////////////////
//Multiplication itself
//Pipelined over 2 cycles
logic[FRAC_WIDTH:0] mul_in1;
logic[FRAC_WIDTH:0] mul_in2;
logic[2*FRAC_WIDTH+2-1:0] intermediate_frac;
always_ff @(posedge clk) begin
if (advance_to_mul2) begin
//The hidden bit is forced to 1 for both operands regardless of rs1_hidden/rs2_hidden
//NOTE(review): presumably the fractions arrive prenormalized (see rs2_prenormalize_shift_amt) - confirm against the issue logic
mul_in1 <= {1'b1, mul_args.rs1.d.frac};
mul_in2 <= {1'b1, mul_args.rs2.d.frac};
end
if (advance_to_final)
intermediate_frac <= mul_in1 * mul_in2;
end
/////////////////////////////////////////////
//Cycle 2
//Second half of the multiplication
//Exponent logic depends on the presence of subnormal numbers
logic[EXPO_WIDTH+1:0] signed_expo;
logic[EXPO_WIDTH:0] neg_signed_expo;
logic[EXPO_WIDTH:0] intermediate_expo;
logic intermediate_expo_is_zero;
//Negative intermediate expo -> subnormal result
//To normalize a subnormal result, the exponent is set to abs(intermediate expo), and the frac is right shifted for the same amount. Normalization handles driving the expo_norm to 0
//Sum of the exponents, minus the rs2 prenormalization shift, minus a constant of EXPO_WIDTH-1 ones; the MSB flags a negative result
assign signed_expo = {1'b0, rs1_expo[1]} + ({1'b0, rs2_expo[1]} - {1'b0, rs2_prenormalize_shift_amt[1]}) - {2'b0, {(EXPO_WIDTH-1){1'b1}}};
assign neg_signed_expo = -signed_expo[EXPO_WIDTH:0];
//Magnitude of the exponent
assign intermediate_expo = signed_expo[EXPO_WIDTH+1] ? neg_signed_expo : signed_expo[EXPO_WIDTH:0];
assign intermediate_expo_is_zero = ~|signed_expo;
//Pipelining
logic result_expo_overflow;
expo_d_t result_expo;
logic[EXPO_WIDTH+1:0] result_expo_diff;
logic result_expo_is_negative;
logic result_expo_is_zero;
logic output_special;
//The final stage can load when its multiply result is acknowledged, when it holds nothing, or when the adder accepts its FMA operands
assign advance_to_final = (wb.done & wb.ack) | (~wb.done & ~add_valid) | (add_valid & add_ready);
always_ff @ (posedge clk) begin
if (rst) begin
wb.done <= 0;
add_valid <= 0;
end
else if (advance_to_final) begin
//The operation leaves through exactly one of the two outputs depending on fma
wb.done <= valid_r & ~fma_r;
add_valid <= valid_r & fma_r;
end
if (advance_to_final) begin
id[2] <= id[1];
d2s[2] <= d2s[1];
rm[2] <= rm[1];
sign_xor[2] <= sign_xor[1];
nv[2] <= nv[1];
qnan[2] <= qnan[1];
inf[2] <= inf[1];
true_zero[2] <= true_zero[1];
subnormal_zero[2] <= subnormal_zero[1];
output_special <= inf[1] | qnan[1] | true_zero[1] | subnormal_zero[1];
fma_info[2] <= fma_info[1];
result_expo_overflow <= intermediate_expo[EXPO_WIDTH];
result_expo_is_negative <= signed_expo[EXPO_WIDTH+1];
result_expo_is_zero <= intermediate_expo_is_zero;
result_expo <= intermediate_expo[EXPO_WIDTH-1:0];
result_expo_diff <= signed_expo;
end
end
/////////////////////////////////////////////
//Output
//Finalize multiplication outputs
//Create FMA arguments
logic result_safe;
logic result_hidden;
frac_d_t result_frac;
logic[HALF_GRS_WIDTH-1:0] result_grs;
logic result_is_subnormal;
//The top FRAC_WIDTH+2 bits of the product: safe bit, hidden bit, then the fraction
assign {result_safe, result_hidden, result_frac} = intermediate_frac[2*FRAC_WIDTH+2-1-:2+FRAC_WIDTH];
//There is no reduction for the full grs, but this accommodates optional intermediate rounding
//The remaining low product bits are ORed into the last bit
assign result_grs = {intermediate_frac[FRAC_WIDTH-1-:HALF_GRS_WIDTH-1], |intermediate_frac[FRAC_WIDTH-HALF_GRS_WIDTH:0]};
assign result_is_subnormal = result_expo_is_negative | (result_expo_is_zero & ~result_safe);
//Special case handling
//Priority: infinity, then canonical NaN, otherwise signed zero
fp_t special_result;
always_comb begin
if (inf[2]) begin
special_result.d.sign = sign_xor[2];
special_result.d.expo = '1;
special_result.d.frac = '0;
end
else if (qnan[2])
special_result.raw = CANONICAL_NAN;
else begin //Zero
special_result.d.sign = sign_xor[2];
special_result.d.expo = '0;
special_result.d.frac = '0;
end
end
assign issue.ready = advance_to_mul2;
//Writeback
assign wb.id = id[2];
assign wb.d2s = d2s[2];
assign wb.fflags.nv = nv[2];
assign wb.fflags.of = 0;
assign wb.fflags.uf = 0;
assign wb.fflags.dz = 0;
assign wb.fflags.nx = 0; //Will be set by normalization
assign wb.carry = 0;
assign wb.safe = result_safe;
//For special results the hidden bit is set only for NaN and infinity
assign wb.hidden = output_special ? qnan[2] | inf[2] : result_hidden;
assign wb.clz = '0;
assign wb.ignore_max_expo = output_special;
always_comb begin
wb.grs = '0;
if (subnormal_zero[2])
wb.grs[0] = 1'b1; //Result is some nonzero number - set sticky
else if (~output_special)
wb.grs[GRS_WIDTH-1-:HALF_GRS_WIDTH] = result_grs;
if (output_special)
wb.rd = special_result;
else begin
wb.rd.d.sign = sign_xor[2];
wb.rd.d.expo = result_expo;
wb.rd.d.frac = result_frac;
end
end
assign wb.rm = rm[2];
assign wb.expo_overflow = result_expo_overflow & ~output_special;
assign wb.subnormal = result_is_subnormal & ~output_special;
assign wb.right_shift = (result_is_subnormal | result_safe) & ~output_special;
//If the result is subnormal, right shift frac by 1 extra position
assign wb.right_shift_amt = result_is_subnormal ? result_expo+1 : 1;
//FMA args
//The product becomes rs1 of the addition and rs3 becomes rs2; most fields reuse the values driven on wb
assign add_id = id[2];
assign add_args.rm = rm[2];
assign add_args.single = d2s[2];
assign add_args.add = fma_info[2].add_sign;
assign add_args.rs1_expo_overflow = wb.expo_overflow;
assign add_args.fp_add_grs = wb.grs;
assign add_args.rs1.d.sign = wb.rd.d.sign ^ fma_info[2].mul_sign;
//A negative product exponent is passed on as 0
assign add_args.rs1.d.expo = result_expo_is_negative ? '0 : wb.rd.d.expo;
assign add_args.rs1.d.frac = wb.rd.d.frac;
assign add_args.rs1_hidden = wb.hidden;
assign add_args.rs1_safe = wb.safe & ~subnormal_zero[2];
assign add_args.rs1_special_case.zero = true_zero[2] | subnormal_zero[2];
assign add_args.rs1_special_case.inf = inf[2];
assign add_args.rs1_special_case.qnan = qnan[2];
//Every invalid product, including 0*inf, is presented to the adder as a signalling NaN
assign add_args.rs1_special_case.snan = nv[2];
assign add_args.rs2 = fma_info[2].rs3;
assign add_args.rs2_hidden = fma_info[2].rs3_hidden;
assign add_args.rs2_safe = 0;
assign add_args.rs2_special_case = fma_info[2].rs3_special_case;
//Compare exponents for swapping
logic rs3_add;
logic[EXPO_WIDTH+1:0] expo_diff;
logic[EXPO_WIDTH:0] expo_diff_negate;
logic[EXPO_WIDTH+1:0] expo_diff_rs1;
//rs3 exponent is incremented when its hidden bit is clear
assign rs3_add = ~fma_info[2].rs3_hidden;
//Use the signed exponent for a negative, non-special product; otherwise the writeback exponent extended by the overflow bit
assign expo_diff_rs1 = result_expo_is_negative & ~output_special ? result_expo_diff : {1'b0, wb.expo_overflow, wb.rd.d.expo};
assign expo_diff = expo_diff_rs1 - ({2'b0, fma_info[2].rs3.d.expo} + {{(EXPO_WIDTH){1'b0}}, 1'b0, rs3_add});
assign expo_diff_negate = -expo_diff[EXPO_WIDTH:0];
//The adder receives the magnitude of the difference, and swap is set when the difference is negative
assign add_args.expo_diff = expo_diff[EXPO_WIDTH+1] ? expo_diff_negate : expo_diff[EXPO_WIDTH:0];
assign add_args.swap = expo_diff[EXPO_WIDTH+1];
endmodule

View file

@ -0,0 +1,410 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fp_normalize_rounding_top
import cva5_config::*;
import fpu_types::*;
import cva5_types::*;
#(
parameter int unsigned NUM_WB_UNITS = 4
)(
input logic clk,
input logic rst,
fp_intermediate_wb_interface.wb intermediate_wb[NUM_WB_UNITS-1:0], //Priority order highest to lowest
unit_writeback_interface.unit wb,
output fflags_t fflags
);
localparam SHIFT_WIDTH = 3+FRAC_WIDTH+GRS_WIDTH;
function logic[SHIFT_WIDTH-1:0] reverse(input logic[SHIFT_WIDTH-1:0] in);
foreach(in[i])
reverse[i] = in[SHIFT_WIDTH-1-i];
endfunction
typedef struct packed {
id_t id;
logic valid;
fp_t data;
logic expo_overflow;
fflags_t fflags;
rm_t rm;
logic d2s;
logic carry;
logic safe;
logic hidden;
grs_t grs;
fp_shift_amt_t clz;
logic subnormal;
logic right_shift;
fp_shift_amt_t right_shift_amt;
logic ignore_max_expo;
} fp_normalize_packet_t;
typedef struct packed {
logic valid;
id_t id;
fflags_t fflags;
rm_t rm;
logic d2s;
logic sign_norm;
expo_d_t expo_norm;
logic expo_overflow_norm;
logic right_shift;
fp_shift_amt_t shift_amt;
logic sp_overflow;
logic[EXPO_WIDTH_F-1:0] sp_expo;
logic[SHIFT_WIDTH-1:0] shifter_in;
} fp_shift_packet_t;
typedef struct packed {
id_t id;
logic valid;
fp_t data;
logic expo_overflow;
logic hidden;
rm_t rm;
fflags_t fflags;
logic d2s;
logic round_lsb;
logic[2:0] round_grs;
logic[1:0] tiny_rs;
} fp_round_packet_t;
////////////////////////////////////////////////////
//Implementation
//First chooses a writeback request
//Then normalizes through shifting and rounds
logic advance_norm;
logic advance_shift;
logic advance_round;
fp_normalize_packet_t normalize_packet;
fp_shift_packet_t shift_packet;
fp_round_packet_t round_packet;
////////////////////////////////////////////////////
//Writeback
//Chooses a writeback request in descending priority order
//First unpacks interface signals so they can be dynamically indexed
logic[$clog2(NUM_WB_UNITS)-1:0] unit_sel;
//TODO: false circular dependency because misc_wb2fp uses ack as ready
//unit_done[2:0] -> unit_ack[3] -> wb2fp.ack -> wb2fp.ready -> issue_to[4] -> wb2fp.new_request -> unit_done[3]
/* verilator lint_off UNOPTFLAT */
logic[NUM_WB_UNITS-1:0] unit_ack;
/* verilator lint_on UNOPTFLAT */
id_t[NUM_WB_UNITS-1:0] unit_instruction_id;
logic[NUM_WB_UNITS-1:0] unit_done;
fp_t[NUM_WB_UNITS-1:0] unit_rd;
logic[NUM_WB_UNITS-1:0] unit_expo_overflow;
fflags_t[NUM_WB_UNITS-1:0] unit_fflags;
rm_t[NUM_WB_UNITS-1:0] unit_rm;
logic[NUM_WB_UNITS-1:0] unit_carry;
logic[NUM_WB_UNITS-1:0] unit_safe;
logic[NUM_WB_UNITS-1:0] unit_hidden;
grs_t[NUM_WB_UNITS-1:0] unit_grs;
fp_shift_amt_t[NUM_WB_UNITS-1:0] unit_clz;
logic[NUM_WB_UNITS-1:0] unit_right_shift;
fp_shift_amt_t[NUM_WB_UNITS-1:0] unit_right_shift_amt;
logic[NUM_WB_UNITS-1:0] unit_subnormal;
logic[NUM_WB_UNITS-1:0] unit_ignore_max_expo;
logic[NUM_WB_UNITS-1:0] unit_d2s;
generate for (genvar i = 0; i < NUM_WB_UNITS; i++) begin : gen_unpack
assign intermediate_wb[i].ack = unit_ack[i];
assign unit_instruction_id[i] = intermediate_wb[i].id;
assign unit_done[i] = intermediate_wb[i].done;
assign unit_rd[i] = intermediate_wb[i].rd;
assign unit_expo_overflow[i] = intermediate_wb[i].expo_overflow;
assign unit_fflags[i] = intermediate_wb[i].fflags;
assign unit_rm[i] = intermediate_wb[i].rm;
assign unit_carry[i] = intermediate_wb[i].carry;
assign unit_safe[i] = intermediate_wb[i].safe;
assign unit_hidden[i] = intermediate_wb[i].hidden;
assign unit_grs[i] = intermediate_wb[i].grs;
assign unit_clz[i] = intermediate_wb[i].clz;
assign unit_right_shift[i] = intermediate_wb[i].right_shift;
assign unit_right_shift_amt[i] = intermediate_wb[i].right_shift_amt;
assign unit_subnormal[i] = intermediate_wb[i].subnormal;
assign unit_ignore_max_expo[i] = intermediate_wb[i].ignore_max_expo;
assign unit_d2s[i] = intermediate_wb[i].d2s;
end endgenerate
//Per-ID muxes for commit buffer
always_comb begin
unit_sel = $bits(unit_sel)'(NUM_WB_UNITS-1); //Must default to lowest priority because any other units override
for (int i = NUM_WB_UNITS-2; i >= 0; i--) begin
if (unit_done[i])
unit_sel = $bits(unit_sel)'(i);
end
unit_ack = '0;
unit_ack[unit_sel] = advance_norm;
end
//Advance logic
assign advance_norm = advance_shift | ~normalize_packet.valid;
always_ff @(posedge clk) begin
if (rst)
normalize_packet.valid <= 0;
else if (advance_norm)
normalize_packet.valid <= |unit_done;
if (advance_norm) begin
normalize_packet.id <= unit_instruction_id[unit_sel];
normalize_packet.data <= unit_rd[unit_sel];
normalize_packet.expo_overflow <= unit_expo_overflow[unit_sel];
normalize_packet.fflags <= unit_fflags[unit_sel];
normalize_packet.rm <= unit_rm[unit_sel];
normalize_packet.d2s <= unit_d2s[unit_sel];
normalize_packet.carry <= unit_carry[unit_sel];
normalize_packet.safe <= unit_safe[unit_sel];
normalize_packet.hidden <= unit_hidden[unit_sel];
normalize_packet.grs <= unit_grs[unit_sel];
normalize_packet.clz <= unit_clz[unit_sel];
normalize_packet.subnormal <= unit_subnormal[unit_sel];
normalize_packet.right_shift <= unit_right_shift[unit_sel];
normalize_packet.right_shift_amt <= unit_right_shift_amt[unit_sel];
normalize_packet.ignore_max_expo <= unit_ignore_max_expo[unit_sel];
end
end
////////////////////////////////////////////////////
//Normalization
//Determine the shift amount and direction according to the exponent
//Potentially flip the mantissa
logic right_shift;
logic dp_overflow;
logic sp_overflow;
fp_shift_amt_t shift_amt;
expo_d_t dp_expo;
expo_s_t sp_expo;
logic[SHIFT_WIDTH-1:0] in_left;
logic[SHIFT_WIDTH-1:0] in_right;
fp_prenormalize normalize_inst(
.single(normalize_packet.d2s),
.right_shift_in(normalize_packet.right_shift),
.overflow_in(normalize_packet.expo_overflow),
.subnormal(normalize_packet.subnormal),
.expo_in(normalize_packet.data.d.expo),
.ignore_max_expo(normalize_packet.ignore_max_expo),
.left_shift_amt(normalize_packet.clz),
.right_shift_amt(normalize_packet.right_shift_amt),
.right_shift_out(right_shift),
.dp_overflow_out(dp_overflow),
.sp_overflow_out(sp_overflow),
.shift_amt_out(shift_amt),
.dp_expo_out(dp_expo),
.sp_expo_out(sp_expo)
);
//Shifter input
assign in_right = {normalize_packet.carry, normalize_packet.safe, normalize_packet.hidden, normalize_packet.data.d.frac, normalize_packet.grs};
assign in_left = reverse(in_right);
//Advance logic
assign advance_shift = advance_round | ~shift_packet.valid;
always_ff @(posedge clk) begin
if (rst)
shift_packet.valid <= 0;
else if (advance_shift)
shift_packet.valid <= normalize_packet.valid;
if (advance_shift) begin
shift_packet.sign_norm <= normalize_packet.data.d.sign;
shift_packet.rm <= normalize_packet.rm;
shift_packet.id <= normalize_packet.id;
shift_packet.fflags <= normalize_packet.fflags;
shift_packet.d2s <= normalize_packet.d2s;
shift_packet.right_shift <= right_shift;
shift_packet.expo_overflow_norm <= dp_overflow;
shift_packet.sp_overflow <= sp_overflow;
shift_packet.shift_amt <= shift_amt;
shift_packet.expo_norm <= dp_expo;
shift_packet.sp_expo <= sp_expo;
shift_packet.shifter_in <= right_shift ? in_right : in_left;
end
end
////////////////////////////////////////////////////
//Shifting and Roundup
//Extremely wide right shifter, output is flipped for left shifts
//Extracts the bits used for determining rounding
logic[SHIFT_WIDTH-1:0] shift_intermediate;
logic[SHIFT_WIDTH-1:0] shift_final;
grs_t grs_norm;
frac_d_t frac_norm;
logic round_lsb;
logic[2:0] round_grs;
logic[1:0] tiny_rs;
assign shift_intermediate = shift_packet.shifter_in >> shift_packet.shift_amt;
assign shift_final = shift_packet.right_shift ? shift_intermediate : reverse(shift_intermediate);
assign grs_norm = shift_final[GRS_WIDTH-1:0];
assign frac_norm = shift_final[GRS_WIDTH+FRAC_WIDTH-1:GRS_WIDTH];
//Right shifts may lose sticky bits - keep track
logic sticky;
logic set_sticky;
assign set_sticky = sticky & shift_packet.right_shift;
fp_sticky_tracking #(.INPUT_WIDTH(SHIFT_WIDTH), .SHIFT_WIDTH(EXPO_WIDTH)) right_sticky (
.shifter_input(shift_packet.shifter_in),
.shift_amount(shift_packet.shift_amt),
.sticky_bit(sticky)
);
//GRS extraction for rounding
//RISC-V specifies that tininess must be detected after rounding, as opposed to before. They only differ on underflow for +-2^-EMIN.
//IEEE 754 states that we must therefore determine tininess as if the exponent range was unbounded (but not the fraction)
//Therefore, we must undo the right shift of 1 to fit the exponent range when determining the roundup
always_comb begin
if (shift_packet.d2s) begin
round_lsb = frac_norm[FRAC_WIDTH-FRAC_WIDTH_F];
round_grs[2:1] = frac_norm[FRAC_WIDTH-FRAC_WIDTH_F-1-:2];
round_grs[0] = |frac_norm[FRAC_WIDTH-FRAC_WIDTH_F-3:0] | |grs_norm | set_sticky;
tiny_rs[1] = frac_norm[FRAC_WIDTH-FRAC_WIDTH_F-3];
tiny_rs[0] = |frac_norm[FRAC_WIDTH-FRAC_WIDTH_F-4:0] | |grs_norm | set_sticky;
end
else begin
round_lsb = frac_norm[0];
round_grs[2:1] = grs_norm[GRS_WIDTH-1-:2];
round_grs[0] = |grs_norm[GRS_WIDTH-3:0] | set_sticky;
tiny_rs[1] = grs_norm[GRS_WIDTH-3];
tiny_rs[0] = |grs_norm[GRS_WIDTH-4:0] | set_sticky;
end
end
//Advance logic
assign advance_round = wb.ack | ~round_packet.valid;
always_ff @ (posedge clk) begin
if (rst)
round_packet.valid <= 0;
else if (advance_round)
round_packet.valid <= shift_packet.valid;
if (advance_round) begin
round_packet.hidden <= shift_final[GRS_WIDTH+FRAC_WIDTH];
round_packet.id <= shift_packet.id;
round_packet.rm <= shift_packet.rm;
round_packet.d2s <= shift_packet.d2s;
round_packet.round_lsb <= round_lsb;
round_packet.round_grs <= round_grs;
round_packet.tiny_rs <= tiny_rs;
round_packet.data.d.sign <= shift_packet.sign_norm;
round_packet.fflags.nv <= shift_packet.fflags.nv;
round_packet.fflags.dz <= shift_packet.fflags.dz;
round_packet.fflags.of <= shift_packet.fflags.of;
round_packet.fflags.uf <= shift_packet.fflags.uf;
round_packet.fflags.nx <= shift_packet.fflags.nx | |round_grs;
if (shift_packet.d2s) begin
round_packet.expo_overflow <= shift_packet.sp_overflow;
round_packet.data.d.expo <= {{(EXPO_WIDTH-EXPO_WIDTH_F){1'b1}}, shift_packet.sp_expo}; //Allow the roundup to propagate to overflow
round_packet.data.d.frac <= {frac_norm[FRAC_WIDTH-1-:FRAC_WIDTH_F], {(FRAC_WIDTH-FRAC_WIDTH_F){1'b1}}};
end
else begin
round_packet.expo_overflow <= shift_packet.expo_overflow_norm;
round_packet.data.d.expo <= shift_packet.expo_norm;
round_packet.data.d.frac <= frac_norm;
end
end
end
////////////////////////////////////////////////////
//Rounding
//Perform the rounding by adding based on the saved bits from the previous cycle
//Also detects overflow
logic frac_overflow;
frac_d_t frac_out;
expo_d_t expo_out;
logic overflow_exp;
fp_t rd;
logic roundup;
logic roundup_tiny;
fp_t result_if_overflow;
fp_roundup real_round (
.sign(round_packet.data.d.sign),
.rm(round_packet.rm),
.grs(round_packet.round_grs),
.lsb(round_packet.round_lsb),
.roundup(roundup),
.result_if_overflow(result_if_overflow)
);
fp_roundup tininess_round (
.sign(round_packet.data.d.sign),
.rm(round_packet.rm),
.grs({round_packet.round_grs[1], round_packet.tiny_rs}),
.lsb(round_packet.round_grs[2]),
.roundup(roundup_tiny),
.result_if_overflow()
);
assign {frac_overflow, frac_out} = round_packet.data.d.frac + (FRAC_WIDTH)'(roundup);
assign expo_out = round_packet.data.d.expo + EXPO_WIDTH'(frac_overflow);
//Compute exponent overflow due to rounding in parallel with roundup addition
assign overflow_exp = (frac_overflow & &round_packet.data.d.expo[EXPO_WIDTH-1:1]) | round_packet.expo_overflow;
//Output
assign wb.id = round_packet.id;
assign wb.done = round_packet.valid;
assign wb.rd = rd.raw;
always_comb begin
if (overflow_exp) begin
//Convert dp overflow value to sp
if (round_packet.d2s) begin
rd.s.box = '1;
rd.s.sign = result_if_overflow.d.sign;
rd.s.expo = result_if_overflow.d.expo[EXPO_WIDTH_F-1:0];
rd.s.frac = result_if_overflow.d.frac[FRAC_WIDTH_F-1:0];
end
else
rd = result_if_overflow;
end
else if (round_packet.d2s) begin
rd.s.box = '1;
rd.s.sign = round_packet.data.d.sign;
rd.s.expo = expo_out[EXPO_WIDTH_F-1:0];
rd.s.frac = frac_out[FRAC_WIDTH-1-:FRAC_WIDTH_F];
end
else begin
rd.d.sign = round_packet.data.d.sign;
rd.d.expo = expo_out;
rd.d.frac = frac_out;
end
end
assign fflags.nv = round_packet.fflags.nv;
assign fflags.dz = round_packet.fflags.dz;
assign fflags.of = round_packet.fflags.of | ~round_packet.fflags.nv & overflow_exp;
//Underflow only occurs if inexact
assign fflags.uf = round_packet.fflags.uf | (~round_packet.fflags.nv & round_packet.fflags.nx & ~round_packet.hidden & (~frac_overflow | ~(round_packet.round_grs[2] & roundup_tiny)));
//Overflow is inexact
assign fflags.nx = round_packet.fflags.nx | ~round_packet.fflags.nv & overflow_exp;
endmodule

View file

@ -0,0 +1,109 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fp_prenormalize
    import cva5_config::*;
    import fpu_types::*;
(
    input logic single,
    input logic right_shift_in,
    input logic overflow_in,
    input logic subnormal,
    input expo_d_t expo_in,
    input logic ignore_max_expo,
    input fp_shift_amt_t left_shift_amt,
    input fp_shift_amt_t right_shift_amt,
    output logic right_shift_out,
    output logic dp_overflow_out,
    output logic sp_overflow_out,
    output fp_shift_amt_t shift_amt_out,
    output expo_d_t dp_expo_out,
    output expo_s_t sp_expo_out
);
    //Purely combinational: from the requested shift and the incoming exponent, derives
    // - the shift direction and amount to apply (right_shift_out, shift_amt_out)
    // - the resulting exponent and overflow flag for both the double and single formats
    //The `single` input only selects which shift direction/amount is output;
    //both exponent/overflow pairs are always produced

    //Incoming exponent extended by its overflow bit
    logic[EXPO_WIDTH:0] starting_expo;
    assign starting_expo = {overflow_in, expo_in};

    ////////////////////////////////////////////////////
    //Double precision
    //A left shift lowers the exponent but may not take it below zero
    //A right shift raises the exponent unless the value is subnormal
    logic[EXPO_WIDTH:0] left_shift_base;
    logic expo_less_than_left_shift_amt;
    logic[EXPO_WIDTH:0] expo_norm_left_shift_intermediate;
    logic[EXPO_WIDTH:0] expo_norm_left_shift;
    fp_shift_amt_t left_shift_amt_adjusted;
    logic[EXPO_WIDTH:0] expo_norm_right_shift;
    logic dp_overflow_intermediate;
    logic dp_expo_is_max;

    //Left shift: subnormal inputs start from an exponent of zero
    assign left_shift_base = starting_expo & {(EXPO_WIDTH+1){~subnormal}};
    //The extra MSB of the difference is the borrow, i.e. the shift exceeds the exponent
    assign {expo_less_than_left_shift_amt, expo_norm_left_shift_intermediate} = left_shift_base - (EXPO_WIDTH+1)'(left_shift_amt);
    //On a borrow the exponent saturates at zero and the shift is limited to the input exponent
    assign expo_norm_left_shift = expo_less_than_left_shift_amt ? '0 : expo_norm_left_shift_intermediate;
    assign left_shift_amt_adjusted = expo_less_than_left_shift_amt ? expo_in : left_shift_amt;

    //Right shift: the exponent stays at zero when subnormal
    assign expo_norm_right_shift = subnormal ? '0 : starting_expo + (EXPO_WIDTH+1)'(right_shift_amt);

    //Pick the exponent matching the requested direction; its MSB is the overflow bit
    assign {dp_overflow_intermediate, dp_expo_out} = right_shift_in ? expo_norm_right_shift : expo_norm_left_shift;
    //An all-ones exponent also counts as overflow unless the caller suppresses the check
    assign dp_expo_is_max = &dp_expo_out;
    assign dp_overflow_out = dp_overflow_intermediate | (dp_expo_is_max & ~ignore_max_expo);

    ////////////////////////////////////////////////////
    //Single precision
    //Doubles that are normal can land in the single subnormal range,
    //so a requested left shift can end up being a right shift
    localparam SP_EXPO_CEIL = BIAS+BIAS_F; //Above this the single exponent range is exceeded
    localparam SP_EXPO_FLOOR = BIAS-BIAS_F; //At or below this the result is subnormal as a single
    localparam SP_EXPO_CUTOFF = BIAS-BIAS_F-FRAC_WIDTH_F-3; //At or below this the extra shift is capped

    logic[EXPO_WIDTH-1:0] shift_signed; //Two's complement; negative means a left shift
    logic[EXPO_WIDTH-1:0] expo_sum;
    logic[EXPO_WIDTH-1:0] shift_rescaled;
    logic[EXPO_WIDTH-1:0] single_shift_amt;
    logic shift_sign;

    assign shift_signed = right_shift_in ? right_shift_amt : -left_shift_amt;
    assign expo_sum = expo_in + shift_signed;

    //All 1 = NaN/infinity but not an overflow
    assign sp_overflow_out = overflow_in | ((expo_sum > SP_EXPO_CEIL) & ~(&expo_sum));

    //Exponent and additional shift for the single format
    always_comb begin
        shift_rescaled = shift_signed;
        sp_expo_out = '0;
        if (expo_sum <= SP_EXPO_FLOOR && expo_sum > SP_EXPO_CUTOFF) //Subnormal single: shift further right
            shift_rescaled = shift_signed + ((BIAS-BIAS_F+1) - expo_sum);
        else if (expo_sum <= SP_EXPO_CUTOFF) //Cap the extra shift so the sticky bit is not lost entirely
            shift_rescaled = shift_signed + (FRAC_WIDTH_F+3);
        else //Regular range: keep the MSB and the low bits of the double exponent
            sp_expo_out = {expo_sum[EXPO_WIDTH-1], expo_sum[EXPO_WIDTH_F-2:0]};
    end

    //Split the signed shift into a direction and a magnitude
    assign shift_sign = shift_rescaled[EXPO_WIDTH-1];
    always_comb begin
        single_shift_amt = shift_rescaled;
        if (shift_sign)
            single_shift_amt = -shift_rescaled;
    end

    ////////////////////////////////////////////////////
    //Output selection
    always_comb begin
        if (single) begin
            right_shift_out = ~shift_sign;
            shift_amt_out = single_shift_amt;
        end
        else begin
            right_shift_out = right_shift_in;
            shift_amt_out = right_shift_in ? right_shift_amt : left_shift_amt_adjusted;
        end
    end

endmodule

View file

@ -0,0 +1,446 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fp_preprocessing
    import cva5_config::*;
    import cva5_types::*;
    import fpu_types::*;
#(
    parameter cpu_config_t CONFIG = EXAMPLE_CONFIG,
    parameter FP_NUM_UNITS = 5
)
(
    input logic clk,
    input logic rst,
    unit_issue_interface.decode unit_issue[FP_NUM_UNITS-1:0],

    //Unit Inputs
    input fp_preprocessing_packet_t pkt,
    output logic ready,
    output fp_madd_inputs_t madd_args,
    output fp_div_inputs_t div_args,
    output fp_sqrt_inputs_t sqrt_args,
    output fp_wb2fp_misc_inputs_t wb2fp_args,
    output fp_wb2int_misc_inputs_t wb2int_args
);
    //Two step front end shared by the floating point units:
    // Cycle 0 - combinational preprocessing of the incoming packet, captured when a request is accepted
    // Cycle 1 - remaining preprocessing on the captured values and issue to the targeted unit

    /////////////////////////////////////////////
    //Control Logic
    id_t id_r;
    rm_t rm_r;
    logic single;
    logic single_r;
    logic[FP_NUM_UNITS-1:0] target_unit;
    logic[FP_NUM_UNITS-1:0] issue_to;
    logic[FP_NUM_UNITS-1:0] unit_ready;
    logic accept_request;
    logic stage2_valid;
    logic stage2_advance;

    //Interface arrays cannot be indexed dynamically, so flatten them into vectors
    generate for (genvar i = 0; i < FP_NUM_UNITS; i++) begin : gen_interface_unpack
        assign unit_ready[i] = unit_issue[i].ready;
        assign unit_issue[i].new_request = issue_to[i];
        assign unit_issue[i].id = id_r;
    end endgenerate

    assign single = pkt.is_single;

    //The held request leaves once a targeted unit is ready
    assign stage2_advance = stage2_valid & |(unit_ready & target_unit);
    assign issue_to = target_unit & {FP_NUM_UNITS{stage2_advance}};

    //A new request can be taken when nothing is held or the held one is leaving this cycle
    assign ready = ~stage2_valid | stage2_advance;
    assign accept_request = ready & pkt.valid;

    always_ff @(posedge clk) begin
        if (rst) begin
            target_unit <= '0;
            stage2_valid <= 0;
        end
        else if (accept_request) begin
            target_unit <= pkt.unit;
            stage2_valid <= 1;
        end
        else if (stage2_advance)
            stage2_valid <= 0;
    end

    /////////////////////////////////////////////
    //Cycle 0 preprocessing
    //Single to double conversion, normalization, and special case detection
    fp_t rs1, rs1_r;
    fp_t rs2, rs2_r;
    fp_t rs3, rs3_r;
    special_case_t[2:0] special_case, special_case_r;
    logic[2:0] hidden, hidden_r;
    logic[0:0] hidden_single;
    logic[1:0] hidden_double;
    fp_t[2:0] rs_converted;
    logic rs1_boxed, rs2_boxed;
    fp_shift_amt_t rs1_norm_shift, rs1_norm_shift_r;
    fp_shift_amt_t rs2_norm_shift, rs2_norm_shift_r;
    frac_d_t rs1_norm_frac, rs1_norm_frac_r;
    frac_d_t rs2_norm_frac, rs2_norm_frac_r;

    assign rs1 = pkt.rs1;
    assign rs2 = pkt.rs2;
    assign rs3 = pkt.rs3;

    //One preprocessor per source operand; outputs that no unit consumes are left open
    fp_rs_preprocess #(.CONFIG(CONFIG)) rs1_pre (
        .in(rs1),
        .single(single),
        .double(rs_converted[0]),
        .special(special_case[0]),
        .is_boxed(rs1_boxed),
        .hidden(hidden[0]),
        .hidden_double(hidden_double[0]),
        .hidden_single(hidden_single[0]),
        .prenormalize_shift(rs1_norm_shift),
        .prenormalize_frac(rs1_norm_frac)
    );

    fp_rs_preprocess #(.CONFIG(CONFIG)) rs2_pre (
        .in(rs2),
        .single(single),
        .double(rs_converted[1]),
        .special(special_case[1]),
        .is_boxed(rs2_boxed),
        .hidden(hidden[1]),
        .hidden_double(hidden_double[1]),
        .hidden_single(),
        .prenormalize_shift(rs2_norm_shift),
        .prenormalize_frac(rs2_norm_frac)
    );

    fp_rs_preprocess #(.CONFIG(CONFIG)) rs3_pre (
        .in(rs3),
        .single(single),
        .double(rs_converted[2]),
        .special(special_case[2]),
        .is_boxed(),
        .hidden(hidden[2]),
        .hidden_double(),
        .hidden_single(),
        .prenormalize_shift(),
        .prenormalize_frac()
    );

    /////////////////////////////////////////////
    //Cycle 0 swap calculation
    //Decides whether rs1 compares below rs2 (exponent first, then mantissa)
    logic[EXPO_WIDTH:0] expo_diff;
    logic swap, swap_r;
    logic rs1_smaller_mantissa;
    expo_d_t rs1_expo_padded;
    expo_d_t rs2_expo_padded;
    logic[FRAC_WIDTH_F-1:0] rs1_mant;
    logic[FRAC_WIDTH_F-1:0] rs2_mant;

    //Single exponents zero extended to the double exponent width
    //The exponent comparison checks boxing because the minmax instruction assumes NaNs are the larger operand
    assign rs1_expo_padded[EXPO_WIDTH-1:EXPO_WIDTH_F] = '0;
    assign rs1_expo_padded[EXPO_WIDTH_F-1:0] = rs1_boxed ? rs1.s.expo : '1;
    assign rs2_expo_padded[EXPO_WIDTH-1:EXPO_WIDTH_F] = '0;
    assign rs2_expo_padded[EXPO_WIDTH_F-1:0] = rs2_boxed ? rs2.s.expo : '1;

    //For the mantissa, all that is required is inf < snan/qnan
    assign rs1_mant = {~rs1_boxed | rs1.s.frac[FRAC_WIDTH_F-1], rs1.s.frac[FRAC_WIDTH_F-2:0]};
    assign rs2_mant = {~rs2_boxed | rs2.s.frac[FRAC_WIDTH_F-1], rs2.s.frac[FRAC_WIDTH_F-2:0]};

    always_comb begin
        //Double precision view unless the operation is single precision
        rs1_smaller_mantissa = rs1.d.frac < rs2.d.frac;
        expo_diff = rs1.d.expo - rs2.d.expo;
        if (single) begin
            rs1_smaller_mantissa = rs1_mant < rs2_mant;
            expo_diff = rs1_expo_padded - rs2_expo_padded;
        end
    end

    //MSB of the difference set: rs1 has the smaller exponent
    //Exponents equal: fall back to the mantissa comparison
    assign swap = expo_diff[EXPO_WIDTH] ? 1'b1 : (|expo_diff[EXPO_WIDTH-1:0] ? 1'b0 : rs1_smaller_mantissa);

    /////////////////////////////////////////////
    //Cycle 0 -> Cycle 1 operand registers
    always_ff @(posedge clk) begin
        if (accept_request) begin
            id_r <= pkt.id;
            rm_r <= pkt.rm;
            single_r <= single;
            //Single precision operands are stored in their converted double form
            rs1_r <= single ? rs_converted[0] : rs1;
            rs2_r <= single ? rs_converted[1] : rs2;
            rs3_r <= single ? rs_converted[2] : rs3;
            special_case_r <= special_case;
            hidden_r <= hidden;
            rs1_norm_shift_r <= rs1_norm_shift;
            rs2_norm_shift_r <= rs2_norm_shift;
            rs1_norm_frac_r <= rs1_norm_frac;
            rs2_norm_frac_r <= rs2_norm_frac;
            swap_r <= swap;
        end
    end

    /////////////////////////////////////////////
    //Cycle 1 swap
    //After the swap RS1 will hold the larger argument
    fp_t rs1_norm;
    fp_t rs2_norm;
    fp_t rs1_swapped;
    fp_t rs2_swapped;
    fp_shift_amt_t rs2_swapped_shift;
    logic rs1_swapped_hidden;
    logic rs2_swapped_hidden;

    //Registered sign and exponent combined with the prenormalized fraction
    always_comb begin
        rs1_norm.d.sign = rs1_r.d.sign;
        rs1_norm.d.expo = rs1_r.d.expo;
        rs1_norm.d.frac = rs1_norm_frac_r;

        rs2_norm.d.sign = rs2_r.d.sign;
        rs2_norm.d.expo = rs2_r.d.expo;
        rs2_norm.d.frac = rs2_norm_frac_r;
    end

    //Do not need to swap special case, because multiplication is the only unit that needs it and the order doesn't matter there
    always_comb begin
        //Operands pass straight through unless a swap was recorded
        rs1_swapped = rs1_norm;
        rs2_swapped = rs2_norm;
        rs1_swapped_hidden = hidden_r[0];
        rs2_swapped_hidden = hidden_r[1];
        rs2_swapped_shift = rs2_norm_shift_r;
        if (swap_r) begin
            rs1_swapped = rs2_norm;
            rs2_swapped = rs1_norm;
            rs1_swapped_hidden = hidden_r[1];
            rs2_swapped_hidden = hidden_r[0];
            rs2_swapped_shift = rs1_norm_shift_r;
        end
    end

    /////////////////////////////////////////////
    //FMA Unit
    logic is_fma_r;
    logic is_fadd_r;
    logic add_r;
    logic neg_mul_r;
    logic[EXPO_WIDTH:0] expo_diff_issued;
    logic[EXPO_WIDTH:0] double_expo_diff;
    logic[EXPO_WIDTH:0] double_expo_diff_r;

    //Precalculate the double exponent difference, saves time in the next cycle (because the hidden bits don't need to be included)
    assign double_expo_diff = (rs1.d.expo + {{(EXPO_WIDTH-1){1'b0}}, ~hidden_double[0]}) - (rs2.d.expo + {{(EXPO_WIDTH-1){1'b0}}, ~hidden_double[1]});

    always_ff @(posedge clk) begin
        if (accept_request) begin
            is_fma_r <= pkt.is_fma;
            is_fadd_r <= pkt.is_fadd;
            add_r <= pkt.add;
            neg_mul_r <= pkt.neg_mul;
            double_expo_diff_r <= double_expo_diff;
        end
    end

    //Exponent difference handed to the adder, negated when the operands were swapped
    always_comb begin
        expo_diff_issued = double_expo_diff_r;
        if (single_r)
            expo_diff_issued = rs1_r.d.expo - rs2_r.d.expo;
        if (swap_r)
            expo_diff_issued = -expo_diff_issued;
    end

    //FMA
    assign madd_args.fma = is_fma_r;
    assign madd_args.fma_args.mul_sign = neg_mul_r;
    assign madd_args.fma_args.add_sign = add_r;
    assign madd_args.fma_args.rs3 = rs3_r;
    assign madd_args.fma_args.rs3_hidden = hidden_r[2];
    assign madd_args.fma_args.rs3_special_case = special_case_r[2];

    //FMUL - receives the swapped, prenormalized operands
    assign madd_args.mul_args.rs1 = rs1_swapped;
    assign madd_args.mul_args.rs2 = rs2_swapped;
    assign madd_args.mul_args.rs1_hidden = rs1_swapped_hidden;
    assign madd_args.mul_args.rs2_hidden = rs2_swapped_hidden;
    assign madd_args.mul_args.rs1_special_case = special_case_r[0];
    assign madd_args.mul_args.rs2_special_case = special_case_r[1];
    assign madd_args.mul_args.rs2_prenormalize_shift_amt = rs2_swapped_shift;
    assign madd_args.mul_args.rm = rm_r;
    assign madd_args.mul_args.single = single_r;

    //FADD - receives the registered operands in their original order plus the swap flag
    assign madd_args.add = is_fadd_r;
    assign madd_args.add_args.rs1 = rs1_r;
    assign madd_args.add_args.rs2 = rs2_r;
    assign madd_args.add_args.rs1_hidden = hidden_r[0];
    assign madd_args.add_args.rs2_hidden = hidden_r[1];
    assign madd_args.add_args.rs1_safe = 0;
    assign madd_args.add_args.rs2_safe = 0;
    assign madd_args.add_args.rs1_special_case = special_case_r[0];
    assign madd_args.add_args.rs2_special_case = special_case_r[1];
    assign madd_args.add_args.rs1_expo_overflow = 0;
    assign madd_args.add_args.expo_diff = expo_diff_issued;
    assign madd_args.add_args.add = add_r;
    assign madd_args.add_args.swap = swap_r;
    assign madd_args.add_args.fp_add_grs = '0;
    assign madd_args.add_args.rm = rm_r;
    assign madd_args.add_args.single = single_r;

    /////////////////////////////////////////////
    //FDIV
    assign div_args.rs1 = rs1_norm;
    assign div_args.rs2 = rs2_norm;
    assign div_args.rs1_hidden = hidden_r[0];
    assign div_args.rs2_hidden = hidden_r[1];
    assign div_args.rs1_prenormalize_shift_amt = rs1_norm_shift_r;
    assign div_args.rs2_prenormalize_shift_amt = rs2_norm_shift_r;
    assign div_args.rs1_special_case = special_case_r[0];
    assign div_args.rs2_special_case = special_case_r[1];
    assign div_args.rm = rm_r;
    assign div_args.single = single_r;

    /////////////////////////////////////////////
    //FSQRT
    assign sqrt_args.rs1 = rs1_norm;
    assign sqrt_args.rs1_hidden = hidden_r[0];
    assign sqrt_args.rs1_prenormalize_shift_amt = rs1_norm_shift_r;
    assign sqrt_args.special_case = special_case_r[0];
    assign sqrt_args.rm = rm_r;
    assign sqrt_args.single = single_r;

    /////////////////////////////////////////////
    //WB2FP
    logic rs1_boxed_r;
    logic rs2_boxed_r;
    logic[31:0] int_rs_abs;
    logic[31:0] int_rs_abs_r;
    logic[31:0] int_rs1_r;
    logic i2f_sign;
    logic i2f_sign_r;
    logic is_i2f_r;
    logic is_minmax_r;
    logic is_sign_inj_r;
    logic is_sign_inj_single_r;
    logic is_mv_i2f_r;
    logic is_d2s_r;

    //Cycle 0 - sign and magnitude of the integer source for signed conversions
    assign i2f_sign = pkt.conv_signed & pkt.int_rs1[31];
    assign int_rs_abs = i2f_sign ? -pkt.int_rs1 : pkt.int_rs1;

    always_ff @(posedge clk) begin
        if (accept_request) begin
            rs1_boxed_r <= rs1_boxed;
            rs2_boxed_r <= rs2_boxed;
            int_rs1_r <= pkt.int_rs1;
            int_rs_abs_r <= int_rs_abs;
            i2f_sign_r <= i2f_sign;
            is_i2f_r <= pkt.is_i2f;
            is_minmax_r <= pkt.is_minmax;
            is_sign_inj_r <= pkt.is_sign_inj;
            is_sign_inj_single_r <= pkt.is_sign_inj_single;
            is_mv_i2f_r <= pkt.is_mv_i2f;
            is_d2s_r <= pkt.is_d2s;
        end
    end

    //Cycle 1 - operation selects
    assign wb2fp_args.i2f = is_i2f_r;
    assign wb2fp_args.fminmax = is_minmax_r;
    assign wb2fp_args.fsgnj = is_sign_inj_r;
    assign wb2fp_args.fsgnj_single = is_sign_inj_single_r;
    assign wb2fp_args.fmv = is_mv_i2f_r;
    assign wb2fp_args.d2s = is_d2s_r;
    //Cycle 1 - operands
    assign wb2fp_args.int_rs = int_rs1_r;
    assign wb2fp_args.int_rs_abs = int_rs_abs_r;
    assign wb2fp_args.i2f_sign = i2f_sign_r;
    assign wb2fp_args.rs1 = rs1_r;
    assign wb2fp_args.rs2 = rs2_r;
    assign wb2fp_args.rs1_hidden = hidden_r[0];
    assign wb2fp_args.rs1_special_case = special_case_r[0];
    assign wb2fp_args.rs2_special_case = special_case_r[1];
    assign wb2fp_args.rs1_boxed = rs1_boxed_r;
    assign wb2fp_args.rs2_boxed = rs2_boxed_r;
    assign wb2fp_args.swap = swap_r;
    assign wb2fp_args.single = single_r;
    assign wb2fp_args.rm = rm_r;

    /////////////////////////////////////////////
    //WB2INT
    logic f2i_is_signed_r;
    logic is_class_r;
    logic is_fcmp_r;
    logic is_f2i_r;
    logic rs1_hidden_single_r;
    expo_d_t rs1_expo_unbiased;
    expo_d_t rs1_expo_unbiased_r;
    logic int_less_than_1;
    logic int_less_than_1_r;
    expo_d_t expo_amt;
    expo_d_t bias_amt;

    //Cycle 0 F2I preprocessing - remove the bias from the rs1 exponent
    //The borrow of the subtraction flags an exponent below the bias
    assign expo_amt = single ? {{(EXPO_WIDTH-EXPO_WIDTH_F){1'b0}}, rs1.s.expo} : rs1.d.expo;
    assign bias_amt = single ? BIAS_F : BIAS;
    assign {int_less_than_1, rs1_expo_unbiased} = expo_amt - bias_amt;

    always_ff @(posedge clk) begin
        if (accept_request) begin
            f2i_is_signed_r <= pkt.conv_signed;
            is_class_r <= pkt.is_class;
            is_fcmp_r <= pkt.is_fcmp;
            is_f2i_r <= pkt.is_f2i;
            rs1_hidden_single_r <= hidden_single[0];
            int_less_than_1_r <= int_less_than_1;
            rs1_expo_unbiased_r <= rs1_expo_unbiased;
        end
    end

    //Cycle 1 - operation selects
    assign wb2int_args.fclass = is_class_r;
    assign wb2int_args.fcmp = is_fcmp_r;
    assign wb2int_args.f2i = is_f2i_r;
    //Cycle 1 - operands
    assign wb2int_args.rs1 = rs1_r;
    assign wb2int_args.rs2 = rs2_r;
    assign wb2int_args.rs1_hidden = hidden_r[0];
    assign wb2int_args.rs1_original_hidden_bit = single_r ? rs1_hidden_single_r : hidden_r[0];
    assign wb2int_args.rs1_special_case = special_case_r[0];
    assign wb2int_args.rs2_special_case = special_case_r[1];
    assign wb2int_args.rs1_expo_unbiased = rs1_expo_unbiased_r;
    assign wb2int_args.int_less_than_1 = int_less_than_1_r;
    assign wb2int_args.is_signed = f2i_is_signed_r;
    assign wb2int_args.swap = swap_r;
    assign wb2int_args.rm = rm_r;

endmodule

View file

@ -0,0 +1,71 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
*/
module fp_roundup
    import cva5_config::*;
    import fpu_types::*;
(
    input logic sign,
    input rm_t rm,
    input logic[2:0] grs,
    input logic lsb,
    output logic roundup,
    output fp_t result_if_overflow
);
    //Purely combinational rounding decision for one result
    // roundup            - set when the kept magnitude must be incremented
    // result_if_overflow - value to substitute when the result overflows, which depends on rm and sign
    //grs[2] is the first discarded bit; grs[1:0] summarise everything below it

    always_comb begin
        result_if_overflow.d.sign = sign;

        //Baseline: nearest ties to even, overflowing to infinity
        //Any rm value without its own entry below keeps this behaviour
        result_if_overflow.d.expo = '1;
        result_if_overflow.d.frac = '0;
        roundup = grs[2] & (lsb | |grs[1:0]);

        unique case (rm)
            3'b001: begin //round to zero
                //never increments; overflow gives an all-ones fraction with the exponent LSB cleared
                result_if_overflow.d.expo = {{(EXPO_WIDTH-1){1'b1}}, 1'b0};
                result_if_overflow.d.frac = '1;
                roundup = 0;
            end
            3'b010: begin //round to negative inf
                //increments only for negative values with any discarded bit set
                result_if_overflow.d.expo = {{(EXPO_WIDTH-1){1'b1}}, sign};
                result_if_overflow.d.frac = {FRAC_WIDTH{~sign}};
                roundup = sign & |grs;
            end
            3'b011: begin //round to positive inf
                //increments only for positive values with any discarded bit set
                result_if_overflow.d.expo = {{(EXPO_WIDTH-1){1'b1}}, ~sign};
                result_if_overflow.d.frac = {FRAC_WIDTH{sign}};
                roundup = ~sign & |grs;
            end
            3'b100: begin //nearest ties to away
                //overflow value stays at the baseline; the first discarded bit alone decides
                roundup = grs[2];
            end
            default: ; //nearest ties to even, already set above
        endcase
    end

endmodule

View file

@ -0,0 +1,173 @@
/*
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fp_rs_preprocess
    import cva5_config::*;
    import fpu_types::*;
#(
    parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
    input fp_t in, //Packed form
    input logic single,
    output fp_t double, //Only valid if input was single

    //Special cases
    output special_case_t special,
    output logic is_boxed,
    output logic hidden,
    output logic hidden_double,
    output logic hidden_single,

    //Pre normalization
    output fp_shift_amt_t prenormalize_shift,
    output frac_d_t prenormalize_frac
);
    //Purely combinational preprocessing of one source register:
    // - classifies the value under both the single and double interpretation
    // - left-aligns the fraction and reports how far it was shifted
    // - widens a single precision value into the double format

    localparam CLZ_W = $clog2(FRAC_WIDTH+2); //Width of the leading zero count
    localparam SHIFT_W = $clog2(FRAC_WIDTH); //Bits of the count reported as the shift amount

    ////////////////////////////////////////////////////
    //Special case detection
    //Both interpretations are always classified; `single` picks which one is reported
    //The single interpretation also requires the upper bits to be NaN boxed
    logic inf_d, inf_s;
    logic snan_d, snan_s;
    logic qnan_d, qnan_s;
    logic zero_d, zero_s;
    logic canonical_nan;

    fp_special_case_detection #(.FRAC_W(FRAC_WIDTH_F), .EXPO_W(EXPO_WIDTH_F), .SUBNORMAL(1)) input_case_s (
        .expo(in.s.expo),
        .frac(in.s.frac),
        .is_inf(inf_s),
        .is_SNaN(snan_s),
        .is_QNaN(qnan_s),
        .is_zero(zero_s),
        .hidden(hidden_single)
    );

    fp_special_case_detection #(.FRAC_W(FRAC_WIDTH), .EXPO_W(EXPO_WIDTH), .SUBNORMAL(1)) input_case_d (
        .expo(in.d.expo),
        .frac(in.d.frac),
        .is_inf(inf_d),
        .is_SNaN(snan_d),
        .is_QNaN(qnan_d),
        .is_zero(zero_d),
        .hidden(hidden_double)
    );

    assign is_boxed = &in.s.box;

    //An improperly boxed single is reported as a quiet NaN and nothing else
    assign special.inf = single ? inf_s & is_boxed : inf_d;
    assign special.snan = single ? snan_s & is_boxed : snan_d;
    assign special.qnan = single ? qnan_s | ~is_boxed : qnan_d;
    assign special.zero = single ? zero_s & is_boxed : zero_d;
    assign hidden = single ? ~zero_s : hidden_double; //TODO: singles sharing subnormal range with doubles

    //Single inputs that are converted to the canonical NaN
    assign canonical_nan = snan_s | qnan_s | ~is_boxed;

    ////////////////////////////////////////////////////
    //Normalization
    //Shifts the fraction left until the implicit leading 1 is set (required by many execution units for subnormal numbers)
    //The leading zero count and the shift are both done combinationally
    logic[EXPO_WIDTH_F-1:0] exponent_add;
    logic[FRAC_WIDTH-1:0] shift_arr;
    logic clz_hidden;
    logic[FRAC_WIDTH+1:0] clz_arr;
    logic[CLZ_W-1:0] clz_count;

    //Fraction to normalize; a single fraction is placed in the upper bits
    always_comb begin
        clz_hidden = hidden_double;
        shift_arr = in.d.frac;
        if (single) begin
            clz_hidden = hidden_single;
            shift_arr = '0;
            shift_arr[FRAC_WIDTH-1 -: FRAC_WIDTH_F] = in.s.frac;
        end
    end

    //Count from the hidden bit downwards; the trailing 1 pads the input to ensure the count is always accurate
    assign clz_arr = {clz_hidden, shift_arr, 1'b1};

    clz #(.WIDTH(FRAC_WIDTH+2)) frac_clz (
        .clz_input(clz_arr),
        .clz(clz_count),
        .zero()
    );

    assign prenormalize_frac = shift_arr << clz_count;

    always_comb begin
        //The shift amount is only reported for double inputs
        prenormalize_shift = '0;
        if (~single)
            prenormalize_shift[SHIFT_W-1:0] = clz_count[SHIFT_W-1:0];

        //Zero extended count, used to correct the exponent of converted singles
        exponent_add = '0;
        exponent_add[CLZ_W-1:0] = clz_count;
    end

    ////////////////////////////////////////////////////
    //Single to Double
    //Rebiases the exponent and accounts for the normalization shift
    //Uses the normalized mantissa
    //Input case -> Output
    //Not NaN boxed -> CNaN
    //sNaN -> CNaN
    //qNaN -> CNaN
    //+-0 -> +-0
    //+-infty -> +-infty
    //subnormal -> not subnormal (this depends on relative widths)
    //regular -> regular
    expo_d_t add_amt;
    expo_d_t bias_amt;
    expo_d_t expo_out;

    //With the hidden bit set the single exponent is added; otherwise the normalization shift is subtracted
    assign add_amt = hidden_single ? {{(EXPO_WIDTH-EXPO_WIDTH_F){1'b0}}, in.s.expo} : -{{(EXPO_WIDTH-EXPO_WIDTH_F){1'b0}}, exponent_add};

    //Difference between the two biases, with its LSB forced to 1 when the hidden bit is clear
    always_comb begin
        bias_amt = BIAS - BIAS_F;
        if (~hidden_single)
            bias_amt[0] = 1;
    end

    assign expo_out = bias_amt + add_amt;

    always_comb begin
        //Sign - cleared for the canonical NaN
        double.d.sign = canonical_nan ? 1'b0 : in.s.sign;

        //Exponent
        if (inf_s | canonical_nan)
            double.d.expo = '1;
        else if (zero_s)
            double.d.expo = '0;
        else
            double.d.expo = expo_out;

        //Mantissa - NaNs get canonicalized from s->d
        if (canonical_nan)
            double.d.frac = {1'b1, {(FRAC_WIDTH-1){1'b0}}};
        else
            double.d.frac = prenormalize_frac;
    end

endmodule

View file

@ -0,0 +1,51 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Classifies a floating point operand from its raw exponent and fraction fields
//Parameters:
//  FRAC_W    - width of the fraction field
//  EXPO_W    - width of the exponent field
//  SUBNORMAL - when nonzero, only an all-zero fraction with a zero exponent is reported as zero;
//              when zero, every operand with a zero exponent is reported as zero (flush to zero)
//Outputs:
//  is_inf / is_SNaN / is_QNaN / is_zero - classification flags
//  hidden - value of the implicit leading bit (set whenever the exponent is nonzero)
module fp_special_case_detection
    #(
        parameter FRAC_W = 52,
        parameter EXPO_W = 11,
        parameter SUBNORMAL = 1
    )(
        input logic[EXPO_W-1:0] expo,
        input logic[FRAC_W-1:0] frac,
        output logic is_inf,
        output logic is_SNaN,
        output logic is_QNaN,
        output logic is_zero,
        output logic hidden
    );

    //Field summaries shared by every classification
    logic expo_max;      //Exponent is all ones (infinity or NaN encoding)
    logic expo_nonzero;  //Exponent has at least one bit set
    logic frac_msb;      //Top fraction bit, distinguishes quiet from signalling NaNs
    logic frac_rest_set; //Any fraction bit below the top one is set
    logic frac_clear;    //Entire fraction is zero

    assign expo_max = &expo;
    assign expo_nonzero = |expo;
    assign frac_msb = frac[FRAC_W-1];
    assign frac_rest_set = |frac[FRAC_W-2:0];
    assign frac_clear = ~(frac_msb | frac_rest_set);

    assign hidden = expo_nonzero;

    always_comb begin
        is_inf = expo_max & frac_clear; //Fraction fully 0
        is_SNaN = expo_max & ~frac_msb & frac_rest_set; //Leading 0 but not fully 0
        is_QNaN = expo_max & frac_msb; //Leading 1
        if (SUBNORMAL)
            is_zero = ~expo_nonzero & frac_clear;
        else
            is_zero = ~expo_nonzero; //Flush to 0 when subnormals are not enabled
    end

endmodule

View file

@ -0,0 +1,193 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Floating point square root unit
//Wraps an iterative mantissa square root core and bypasses it for special operands
//(NaN, zero, +infinity, and invalid negative inputs), which complete on the cycle after issue
//Ports:
//  args  - operand, its special case flags, rounding mode and precision selector
//  issue - issue handshake; a new request is accepted when idle or while the previous result is acknowledged
//  wb    - intermediate (pre-rounding) writeback; wb.done is held until wb.ack
module fp_sqrt
    import cva5_config::*;
    import fpu_types::*;
    (
        input logic clk,
        input logic rst,
        input fp_sqrt_inputs_t args,
        unit_issue_interface.unit issue,
        fp_intermediate_wb_interface.unit wb
    );

    //Hidden + GRS + 1 (because without the +1 it gave the wrong sticky bit in certain cases)
    unsigned_sqrt_interface #(.DATA_WIDTH(FRAC_WIDTH+5)) sqrt();

    ////////////////////////////////////////////////////
    //Implementation
    //Iterative square root core, bypassed on special cases
    logic busy;

    assign issue.ready = ~busy | wb.ack;

    always_ff @(posedge clk) begin
        if (rst)
            busy <= 0;
        else begin
            if (wb.ack)
                busy <= 0;
            if (issue.new_request) //A back-to-back request keeps the unit busy
                busy <= 1;
        end
    end

    ////////////////////////////////////////////////////
    //Special cases
    //Handle edge cases like negative numbers, infinity, NaN, zero, and powers of 2
    //Don't require mantissa calculation and bypass the core
    logic nv, nv_r;
    logic inf; //Default if not qnan_r or zero_r
    logic qnan, qnan_r;
    logic zero, zero_r;
    logic early_exit;
    logic result_sign;
    fp_t special_result;
    expo_d_t result_expo;

    //Invalid for signalling NaNs and for negative operands that are neither zero nor a quiet NaN
    assign nv = (args.rs1.d.sign & ~args.special_case.qnan & ~args.special_case.zero) | args.special_case.snan;
    assign qnan = args.special_case.qnan | nv;
    assign zero = args.special_case.zero;
    assign inf = args.special_case.inf & ~args.rs1.d.sign;

    always_ff @(posedge clk) begin
        //issue.ready includes wb.ack, so a new request can arrive in the same cycle the previous
        //result is acknowledged. The new request must take priority (as it does for busy),
        //otherwise a special-case operand issued back-to-back would lose its early exit flag and,
        //because sqrt.start is suppressed for it, would never assert wb.done
        if (rst)
            early_exit <= 0;
        else if (issue.new_request)
            early_exit <= inf | zero | qnan;
        else if (wb.ack)
            early_exit <= 0;

        if (issue.new_request) begin
            result_sign <= args.rs1.d.sign;
            nv_r <= nv;
            qnan_r <= qnan;
            zero_r <= zero;
        end
    end

    always_comb begin
        if (qnan_r)
            special_result.raw = CANONICAL_NAN;
        else if (zero_r) begin //sqrt(+-0) = +-0
            special_result.d.sign = result_sign;
            special_result.d.expo = '0;
            special_result.d.frac = '0;
        end
        else begin //Inf
            special_result.d.sign = 0;
            special_result.d.expo = '1;
            special_result.d.frac = '0;
        end
    end

    ////////////////////////////////////////////////////
    //Exponent logic
    //Normalized for subnormal inputs
    //Halved for positive exponents and doubled for negative exponents
    logic[EXPO_WIDTH:0] norm_expo;
    logic[EXPO_WIDTH:0] norm_expo_r;
    logic[EXPO_WIDTH:0] unbiased_expo;

    assign norm_expo = args.rs1.d.expo + {{(EXPO_WIDTH-1){1'b0}}, ~args.rs1_hidden} - args.rs1_prenormalize_shift_amt;
    assign unbiased_expo = norm_expo_r - {{(EXPO_WIDTH-1){1'b0}}, ~norm_expo_r[0]} - BIAS;
    //Right shift by 1 halves both positive and negative numbers
    assign result_expo = unbiased_expo[EXPO_WIDTH:1] + BIAS;

    always_ff @(posedge clk) begin
        if (issue.new_request)
            norm_expo_r <= norm_expo;
    end

    ////////////////////////////////////////////////////
    //Mantissa square root core
    //Designed to be swappable
    //Operates on normalized values shifted for alignment
    logic result_hidden;
    logic[3:0] result_grs;
    frac_d_t result_frac;

    //The radicand alignment depends on the parity of the normalized exponent
    assign sqrt.radicand = norm_expo[0] ? {2'b01, args.rs1.d.frac, 3'b0} : {1'b1, args.rs1.d.frac, 4'b0};
    assign sqrt.start = issue.new_request & ~(inf | zero | qnan);
    assign {result_hidden, result_frac, result_grs} = sqrt.result;

    fp_sqrt_core sqrt_core (
        .sqrt(sqrt),
    .*);

    ////////////////////////////////////////////////////
    //Output management
    //Either return the early execute values starting on cycle 1, or the regular values once the square root finishes
    //In both cases the outputs are held until acknowledged
    logic sqrt_hold;

    assign wb.done = sqrt.done | sqrt_hold | early_exit;

    always_ff @(posedge clk) begin
        if (rst)
            sqrt_hold <= 0;
        else
            sqrt_hold <= ~wb.ack & (sqrt.done | sqrt_hold);
    end

    always_ff @(posedge clk) begin
        if (issue.new_request) begin
            wb.id <= issue.id;
            wb.rm <= args.rm;
            wb.d2s <= args.single;
        end
    end

    //The result selection is keyed on early_exit rather than a one cycle pulse following the request:
    //early_exit stays asserted for as long as wb.done is driven by a special case, so the special
    //result remains on the port even if wb.ack is delayed
    always_comb begin
        if (early_exit)
            wb.rd = special_result;
        else begin
            wb.rd.d.sign = 0;
            wb.rd.d.expo = result_expo;
            wb.rd.d.frac = result_frac;
        end

        wb.grs = '0;
        if (~early_exit) begin
            wb.grs[GRS_WIDTH-1-:4] = result_grs;
            wb.grs[GRS_WIDTH-5-:FRAC_WIDTH+5] = sqrt.remainder; //A nonzero remainder contributes to the sticky information
        end
    end

    assign wb.expo_overflow = 0;
    assign wb.fflags.nv = nv_r;
    assign wb.fflags.dz = 0;
    assign wb.fflags.of = 0;
    assign wb.fflags.uf = 0;
    assign wb.fflags.nx = 0; //Set in writeback
    assign wb.carry = 0;
    assign wb.safe = 0;
    assign wb.subnormal = 0;
    assign wb.hidden = (early_exit & ~zero_r) | (~early_exit & result_hidden);
    assign wb.clz = '0;
    assign wb.right_shift = 0;
    assign wb.right_shift_amt = 'x;
    assign wb.ignore_max_expo = 1;

endmodule

View file

@ -0,0 +1,109 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Radix 2 restoring square root core operating on an unsigned fixed width radicand
//Ports (through the unsigned_sqrt_interface modport):
//  sqrt.start     - loads sqrt.radicand and begins a new computation
//  sqrt.result    - root, built up one bit per cycle
//  sqrt.remainder - working remainder after the most recent iteration
//  sqrt.done      - registered pulse, asserted the cycle after the iteration counter reaches DATA_WIDTH
module fp_sqrt_core
    (
        input logic clk,
        input logic rst,
        unsigned_sqrt_interface.sqrt sqrt
    );

    //The counter must be able to represent the value DATA_WIDTH itself, which needs
    //$clog2(DATA_WIDTH+1) bits. With only $clog2(DATA_WIDTH) bits a power-of-two DATA_WIDTH
    //truncates to 0 in the comparison below, making counter_full true while the core is idle
    typedef logic[$clog2(sqrt.DATA_WIDTH+1)-1:0] counter_t;
    typedef logic[sqrt.DATA_WIDTH-1:0] frac_t;

    ////////////////////////////////////////////////////
    //Radix 2 square root
    //Fixed latency generating one result bit per cycle

    //Control logic
    logic counter_full;
    counter_t counter;

    assign counter_full = counter == counter_t'(sqrt.DATA_WIDTH);

    always_ff @(posedge clk) begin
        if (rst) begin
            counter <= '0;
            sqrt.done <= 0;
        end
        else begin
            sqrt.done <= counter_full;
            if (counter_full)
                counter <= '0;
            else if (sqrt.start | |counter) //Keep counting once started
                counter <= counter + 1;
        end
    end

    ////////////////////////////////////////////////////
    //Subtraction
    frac_t rad;
    frac_t current_subtractend;
    frac_t next_subtractend;
    frac_t subtractor;
    frac_t subtraction;
    logic overflow;

    //Trial value is the partial root with 01 appended
    assign subtractor = {sqrt.result[sqrt.DATA_WIDTH-3:0], 2'b01};
    //overflow captures the borrow: set when the trial value does not fit
    assign {overflow, subtraction} = current_subtractend - subtractor;

    ////////////////////////////////////////////////////
    //Next Working subtractend Determination
    //Restore the previous value on a borrow, otherwise keep the difference
    //Either way the next two radicand bits are shifted in
    always_comb begin
        if (overflow)
            next_subtractend = {current_subtractend[sqrt.DATA_WIDTH-3:0], rad[sqrt.DATA_WIDTH-1-:2]};
        else
            next_subtractend = {subtraction[sqrt.DATA_WIDTH-3:0], rad[sqrt.DATA_WIDTH-1-:2]};
    end

    always_ff @(posedge clk) begin
        if (sqrt.start) //First working subtractend extracts the upper 2 bits of the radicand
            current_subtractend <= {{(sqrt.DATA_WIDTH-2){1'b0}}, sqrt.radicand[sqrt.DATA_WIDTH-1-:2]};
        else
            current_subtractend <= next_subtractend;
    end

    ////////////////////////////////////////////////////
    //Update remaining radicand digits
    always_ff @(posedge clk) begin
        if (sqrt.start) //The upper two bits are pushed to the working subtractend register
            rad <= {sqrt.radicand[sqrt.DATA_WIDTH-3:0], 2'b00};
        else
            rad <= rad << 2;
    end

    ////////////////////////////////////////////////////
    //Quotient Determination
    always_ff @(posedge clk) begin
        if (sqrt.start) begin
            sqrt.result <= '0;
            sqrt.remainder <= '0;
        end
        else if (|counter) begin
            //Shift in new quotient bit
            sqrt.result <= {sqrt.result[sqrt.DATA_WIDTH-2:0], ~overflow};
            sqrt.remainder <= next_subtractend;
        end
    end

endmodule

View file

@ -0,0 +1,70 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Combinational sticky bit generator for a right shifter
//Reduces the low bits of shifter_input selected by shift_amount with a tree of 4:1 OR stages
//Parameters:
//  INPUT_WIDTH - width of shifter_input
//  SHIFT_WIDTH - width of shift_amount; the input is zero padded to 2**SHIFT_WIDTH bits internally
//Ports:
//  shifter_input - value that is being right shifted elsewhere
//  shift_amount  - right shift distance
//  sticky_bit    - OR of the selected low bits
module fp_sticky_tracking
#(
parameter INPUT_WIDTH = 24,
parameter SHIFT_WIDTH = 11
)(
input logic[INPUT_WIDTH-1:0] shifter_input,
input logic[SHIFT_WIDTH-1:0] shift_amount,
output logic sticky_bit
);
//This unit returns a single bit which indicates whether a 1 got right shifted out of the input
//ORs all shifted
//shift_reduce collapses one group of 4 bits into 1:
//  fully_shifted = 1 -> OR of the whole group (case value >= 4 falls into default)
//  otherwise         -> OR of a[sel:0]
//NOTE(review): with sel = 0 the function still returns a[0], so the overall reduction covers
//bits [shift_amount:0] inclusive rather than [shift_amount-1:0] - confirm this is what callers expect
function logic shift_reduce(input logic[3:0] a, input logic[1:0] sel, input logic fully_shifted);
case({fully_shifted, sel})
0 : shift_reduce = a[0];
1 : shift_reduce = |a[1:0];
2 : shift_reduce = |a[2:0];
default : shift_reduce = |a;
endcase
endfunction
localparam PADDED_WIDTH = 2**SHIFT_WIDTH;
localparam NUM_TIERS = (SHIFT_WIDTH+1)/2; //log4 - each level reduces width by a factor of 4
//tier[0] holds the padded input; only the low PADDED_WIDTH/4**i bits of tier[i] are written, the rest stay 0
logic[PADDED_WIDTH-1:0] tier[NUM_TIERS];
////////////////////////////////////////////////////
//Implementation
//Procedural temporaries used only while evaluating the loops below
int tier_width;
int curr_shift_amount;
always_comb begin
tier = '{default: '0};
//Pad with 0s to ensure that shift amounts larger than INPUT_WIDTH generate the correct sticky
tier[0] = {{(PADDED_WIDTH-INPUT_WIDTH){1'b0}}, shifter_input};
tier_width = PADDED_WIDTH/4;
for (int i = 1; i < NUM_TIERS; i++) begin
//Number of groups in the previous tier that lie entirely below the shift boundary
curr_shift_amount = 32'(shift_amount) >> 2*i;
//Each output bit reduces 4 bits of the previous tier using the next 2 bits of shift_amount as the select
for (int j = 0; j < tier_width; j++)
tier[i][j] = shift_reduce(tier[i-1][j*4 +: 4], shift_amount[(i-1)*2 +: 2], j < curr_shift_amount);
tier_width = tier_width/4;
end
//Final stage uses the top two bits of shift_amount ($clog2(PADDED_WIDTH) equals SHIFT_WIDTH)
//NOTE(review): for odd SHIFT_WIDTH (including the default 11) the lower of these two bits was already
//consumed by the last tier above; this appears harmless while the upper half of the padded input is
//all zero padding (INPUT_WIDTH <= PADDED_WIDTH/2) - confirm before using wider inputs
sticky_bit = shift_reduce(tier[NUM_TIERS-1][3:0], shift_amount[$clog2(PADDED_WIDTH)-1 -: 2], 1'b0);
end
endmodule

View file

@ -0,0 +1,229 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Single cycle unit for the miscellaneous operations that write a floating point register:
//moves from the integer register file, s2d/d2s conversions, sign injection, min/max and integer to float
//All operations are computed combinationally in parallel; the decoded flags in args pick which
//one drives the shared intermediate writeback port
module fp_wb2fp_misc
    import fpu_types::*;
    import cva5_config::*;
    (
        input fp_wb2fp_misc_inputs_t args,
        unit_issue_interface.unit issue,
        fp_intermediate_wb_interface.unit wb
    );

    ////////////////////////////////////////////////////
    //Handshake
    //The unit holds no state: a request is reported as done in the cycle it is issued
    assign wb.done = issue.new_request;
    assign wb.id = issue.id;
    assign wb.rm = args.rm; //Only used for i2f
    assign issue.ready = wb.ack; //ACK functions as READY here

    ////////////////////////////////////////////////////
    //Operand classification shared by d2s and min/max
    logic a_is_nan;
    logic b_is_nan;
    assign a_is_nan = args.rs1_special_case.snan | args.rs1_special_case.qnan;
    assign b_is_nan = args.rs2_special_case.snan | args.rs2_special_case.qnan;

    ////////////////////////////////////////////////////
    //Move from integer register
    //Copies the low FLEN_F bits of the integer source unchanged and boxes them
    fp_t mv_result;
    assign {mv_result.s.sign, mv_result.s.expo, mv_result.s.frac} = args.int_rs[FLEN_F-1:0];
    assign mv_result.s.box = '1;

    ////////////////////////////////////////////////////
    //Double to single
    //The actual conversion is done in postprocessing
    //NaNs are replaced by the canonical NaN here
    fp_t narrow_result;
    assign narrow_result.raw = a_is_nan ? CANONICAL_NAN : args.rs1.raw;

    ////////////////////////////////////////////////////
    //Sign injection
    //Replaces the sign of rs1, leaving the rest of the value untouched (NaNs are not canonicalized)
    //For single precision an unboxed operand contributes a sign of 0
    logic sign_a;
    logic sign_b;
    logic injected_sign;
    fp_t inject_result;

    assign sign_a = args.fsgnj_single ? args.rs1_boxed & args.rs1.s.sign : args.rs1.d.sign;
    assign sign_b = args.fsgnj_single ? args.rs2_boxed & args.rs2.s.sign : args.rs2.d.sign;

    //rm selects the flavour of the injection
    always_comb begin
        if (args.rm[1]) //JX
            injected_sign = sign_a ^ sign_b;
        else if (args.rm[0]) //JN
            injected_sign = ~sign_b;
        else //J
            injected_sign = sign_b;
    end

    always_comb begin
        if (args.fsgnj_single) begin
            inject_result.s.box = '1;
            inject_result.s.sign = injected_sign;
            //If rs1 is unboxed it is treated as the canonical NaN
            if (args.rs1_boxed)
                inject_result.raw[FLEN_F-2:0] = args.rs1.raw[FLEN_F-2:0];
            else
                inject_result.raw[FLEN_F-2:0] = {{EXPO_WIDTH_F{1'b1}}, 1'b1, {FRAC_WIDTH_F-1{1'b0}}};
        end
        else begin
            inject_result.d.sign = injected_sign;
            inject_result.d.expo = args.rs1.d.expo;
            inject_result.d.frac = args.rs1.d.frac;
        end
    end

    ////////////////////////////////////////////////////
    //Minimum / maximum
    //Picks one of the operands, or the canonical NaN when both are NaN
    //args.rm[0] = MAX, args.swap means rs2 > rs1
    fp_t minmax_result;
    logic minmax_hidden;

    always_comb begin
        case({a_is_nan, b_is_nan, args.rs1.d.sign, args.rs2.d.sign, args.rm[0], args.swap}) inside
            6'b11????: begin
                minmax_result = CANONICAL_NAN;
                minmax_hidden = 1;
            end
            //Every combination for which rs1 is the answer
            6'b01????,
            6'b00100?,
            6'b00011?,
            6'b000010,
            6'b000001,
            6'b001111,
            6'b001100: begin
                minmax_result = args.rs1;
                minmax_hidden = ~args.rs1_special_case.zero;
            end
            default: begin
                minmax_result = args.rs2;
                minmax_hidden = ~args.rs2_special_case.zero;
            end
        endcase
    end

    ////////////////////////////////////////////////////
    //Integer to float
    //Places the integer magnitude in the mantissa with a fixed exponent
    //The actual shifting is done in postprocessing using the reported leading zero count
    fp_t int_result;
    grs_t int_grs;
    logic[4:0] lead_zeros;
    fp_shift_amt_t int_shift;
    logic int_is_zero;

    clz #(.WIDTH(32)) clz_inst (
        .clz_input(args.int_rs_abs),
        .clz(lead_zeros),
        .zero(int_is_zero)
    );

    assign int_result.d.sign = args.i2f_sign;

    always_comb begin
        int_shift = '0;
        if (int_is_zero)
            int_result.d.expo = '0;
        else begin
            int_shift[5:0] = lead_zeros + 1;
            int_result.d.expo = BIAS+32;
        end
    end

    //When the mantissa shrinks sufficiently, the integer can no longer fit in the mantissa so it spills into the grs bits
    generate if (FRAC_WIDTH >= 32) begin : gen_int_fits
        always_comb begin
            int_grs = '0;
            int_result.d.frac = '0;
            int_result.d.frac[FRAC_WIDTH-1-:32] = args.int_rs_abs;
        end
    end else begin : gen_int_in_grs
        always_comb begin
            int_result.d.frac[FRAC_WIDTH-1:0] = args.int_rs_abs[31-:FRAC_WIDTH];
            int_grs = '0;
            int_grs[GRS_WIDTH-1-:32-FRAC_WIDTH] = args.int_rs_abs[31-FRAC_WIDTH:0];
        end
    end endgenerate

    ////////////////////////////////////////////////////
    //Output selection
    //Defaults first, then the selected operation overrides what it needs
    //When no other flag is set the operation is single to double, whose conversion is done in preprocessing
    always_comb begin
        wb.fflags = '0;
        wb.grs = '0;
        wb.clz = '0;
        wb.hidden = 0;
        wb.d2s = 0;
        wb.carry = 0;
        wb.safe = 0;
        wb.subnormal = 0;
        wb.expo_overflow = 0;
        wb.right_shift = 0;
        wb.right_shift_amt = 'x;
        wb.ignore_max_expo = 1;

        if (args.fmv)
            wb.rd = mv_result;
        else if (args.d2s) begin
            wb.rd = narrow_result;
            wb.d2s = 1;
            wb.hidden = args.rs1_hidden;
            wb.fflags.nv = args.rs1_special_case.snan;
        end
        else if (args.fsgnj) begin
            wb.rd = inject_result;
            wb.hidden = 1;
        end
        else if (args.fminmax) begin
            wb.rd = minmax_result;
            wb.d2s = args.single;
            wb.hidden = minmax_hidden;
            wb.fflags.nv = args.rs1_special_case.snan | args.rs2_special_case.snan;
        end
        else if (args.i2f) begin
            wb.rd = int_result;
            wb.d2s = args.single;
            wb.grs = int_grs;
            wb.clz = int_shift;
        end
        else begin
            wb.rd = args.rs1;
            wb.hidden = args.rs1_hidden;
            wb.fflags.nv = args.rs1_special_case.snan;
        end
    end

endmodule

View file

@ -0,0 +1,281 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Unit for the floating point operations that write an integer register:
//comparisons (FEQ/FLT/FLE), FCLASS, moves to the integer register file and float to integer conversion
//Ports:
//  args   - preprocessed operands, special case flags and decoded operation selects
//  issue  - issue handshake
//  wb     - integer writeback port; wb.done is registered and held until wb.ack
//  fflags - exception flags accompanying the result currently presented on wb
module fp_wb2int_misc
import cva5_config::*;
import cva5_types::*;
import fpu_types::*;
(
input logic clk,
input logic rst,
input fp_wb2int_misc_inputs_t args,
unit_issue_interface.unit issue,
unit_writeback_interface.unit wb,
output fflags_t fflags
);
////////////////////////////////////////////////////
//Implementation
//Comparisons, classifications, conversions to integer, and moves sharing a single writeback port
//Implemented as a 2 cycle pipeline (though only the conversion needs the second cycle)
logic advance;
//A new request can be accepted when no result is pending or the pending one is being acknowledged
assign advance = wb.ack | ~wb.done;
assign issue.ready = advance;
always_ff @(posedge clk) begin
if (rst)
wb.done <= 0;
else begin
//A new request takes priority over the acknowledgement of the previous result
if (issue.new_request)
wb.done <= 1;
else if (wb.ack)
wb.done <= 0;
end
if (issue.new_request)
wb.id <= issue.id;
end
////////////////////////////////////////////////////
//FMV
//Transfers bits unchanged from an FP register to an INT register
//This instruction is meant to transfer single precision numbers, so in reduced precision only the single precision bits are used
logic[31:0] fmv_rd;
always_comb begin
//Upper bits stay 0 when FLEN_F is narrower than 32
fmv_rd = '0;
fmv_rd[FLEN_F-1:0] = args.rs1.raw[FLEN_F-1:0];
end
////////////////////////////////////////////////////
//FCLASS
//Outputs a number indicating the type of the operand
//Encoded one hot
//Bit 0: -inf, 1: negative normal, 2: negative subnormal, 3: -0, 4: +0,
//    5: positive subnormal, 6: positive normal, 7: +inf, 8: signalling NaN, 9: quiet NaN
logic[31:0] fclass_rd;
always_comb begin
fclass_rd = '0;
fclass_rd[0] = args.rs1.d.sign & args.rs1_special_case.inf;
fclass_rd[1] = args.rs1.d.sign & args.rs1_original_hidden_bit & ~|args.rs1_special_case;
fclass_rd[2] = args.rs1.d.sign & ~args.rs1_original_hidden_bit & ~args.rs1_special_case.zero;
fclass_rd[3] = args.rs1.d.sign & args.rs1_special_case.zero;
fclass_rd[4] = ~args.rs1.d.sign & args.rs1_special_case.zero;
fclass_rd[5] = ~args.rs1.d.sign & ~args.rs1_original_hidden_bit & ~args.rs1_special_case.zero;
fclass_rd[6] = ~args.rs1.d.sign & args.rs1_original_hidden_bit & ~|args.rs1_special_case;
fclass_rd[7] = ~args.rs1.d.sign & args.rs1_special_case.inf;
fclass_rd[8] = args.rs1_special_case.snan;
fclass_rd[9] = args.rs1_special_case.qnan;
end
////////////////////////////////////////////////////
//FCMP
//Implements equal, less than, and less than or equal
//For these instructions, +-0 are identical and flags can be raised for NaN operands
logic[31:0] fcmp_rd;
logic invalid_cmp;
logic unordered;
logic sign_eq;
logic expo_eq;
logic frac_eq;
logic feq;
logic flt;
logic fle;
//FLT/FLE are signalling (raise on NaN)
//Signalling NaNs always raise invalid; quiet NaNs only do so when rm[1] is clear (FLT/FLE)
assign invalid_cmp = (~args.rm[1] & (args.rs1_special_case.qnan | args.rs2_special_case.qnan)) | args.rs1_special_case.snan | args.rs2_special_case.snan;
assign unordered = args.rs1_special_case.qnan | args.rs1_special_case.snan | args.rs2_special_case.qnan | args.rs2_special_case.snan;
assign sign_eq = args.rs1.d.sign == args.rs2.d.sign;
assign expo_eq = args.rs1.d.expo == args.rs2.d.expo;
assign frac_eq = args.rs1.d.frac == args.rs2.d.frac;
assign feq = (args.rs1_special_case.zero & args.rs2_special_case.zero) | (sign_eq & expo_eq & frac_eq);
//Same sign: args.swap decides, inverted for negative operands; different signs: the negative operand is smaller unless both are zero
//NOTE(review): args.swap is computed outside this module - presumably it indicates that the magnitude of rs2 exceeds that of rs1; confirm against the producer
assign flt = sign_eq ? (args.swap ^ args.rs1.d.sign) & ~(sign_eq & expo_eq & frac_eq) : args.rs1.d.sign & ~(args.rs1_special_case.zero & args.rs2_special_case.zero);
assign fle = flt | feq;
//rm selects the comparison: rm[1] -> equal, rm[0] -> less than, otherwise less than or equal
//Any NaN operand forces the result to 0
always_comb begin
fcmp_rd = '0;
if (args.rm[1])
fcmp_rd[0] = feq & ~unordered;
else if (args.rm[0])
fcmp_rd[0] = flt & ~unordered;
else
fcmp_rd[0] = fle & ~unordered;
end
//Choose between the three single cycle operations
logic[31:0] single_rd;
logic single_valid;
logic single_invalid;
always_ff @(posedge clk) begin
if (issue.new_request) begin
//single_valid marks that the registered single cycle result, not the f2i result, is to be returned
single_valid <= ~args.f2i;
single_invalid <= args.fcmp & invalid_cmp;
if (args.fcmp)
single_rd <= fcmp_rd;
else if (args.fclass)
single_rd <= fclass_rd;
else
single_rd <= fmv_rd;
end
end
////////////////////////////////////////////////////
//F2I
//First cycle detects edge cases and shifts args
//Second cycle rounds
logic[2:0] grs;
logic[FRAC_WIDTH:0] rs1_frac;
logic[FRAC_WIDTH:0] f2i_frac;
logic[31:0] f2i_int;
logic[32+FRAC_WIDTH-1:0] shift_in;
logic[32+FRAC_WIDTH-1:0] f2i_int_dot_frac;
logic rs1_expo_unbiased_greater_than_31;
logic rs1_expo_unbiased_greater_than_30;
logic subtract;
logic roundup;
//Cycle 1 - calculate roundup and detect special and edge cases
assign rs1_expo_unbiased_greater_than_31 = args.rs1.d.expo > (BIAS+31);
assign rs1_expo_unbiased_greater_than_30 = args.rs1.d.expo > (BIAS+30);
//Mantissa including the hidden bit
assign rs1_frac = {args.rs1_hidden, args.rs1.d.frac};
assign shift_in = {{31{1'b0}}, rs1_frac};
//Left shift according to exponent
assign f2i_int_dot_frac = shift_in << args.rs1_expo_unbiased;
//Split into integer and fractional parts; magnitudes below 1 have no integer part and keep the unshifted mantissa as the fraction
always_comb begin
if (args.int_less_than_1) begin
f2i_int = '0;
f2i_frac = rs1_frac;
end else
{f2i_int, f2i_frac} = {f2i_int_dot_frac, 1'b0};
end
//Calculate rounding bits and -roundup or +roundup
logic sticky;
assign sticky = |f2i_frac[FRAC_WIDTH-2:0];
//For magnitudes below 1 the guard/round positions depend on how far below 1 the value lies
always_comb begin
if (args.int_less_than_1) begin
if (args.rs1.d.expo == expo_d_t'(BIAS-1))
grs = {f2i_frac[FRAC_WIDTH-:2], sticky};
else if (args.rs1.d.expo == expo_d_t'(BIAS-2))
grs = {1'b0, f2i_frac[FRAC_WIDTH], f2i_frac[FRAC_WIDTH-1] | sticky};
else
grs = {2'b0, (|f2i_frac[FRAC_WIDTH-:2] | sticky)};
end else
grs = {f2i_frac[FRAC_WIDTH-:2], sticky};
end
fp_roundup f2i_int_roundup (
.sign(args.rs1.d.sign),
.rm(args.rm),
.grs(grs),
.lsb(f2i_int[0]),
.roundup(roundup),
.result_if_overflow()
);
//Only signed conversions of negative inputs negate the magnitude
assign subtract = args.rs1.d.sign & args.is_signed;
//Special case handling - this is sometimes the critical path in the FPU
//This special case detection can be done in the second cycle, which may make that a new critical path
//However, calculating the roundup takes approximately the same amount of time as these special cases
logic inexact;
logic all_frac;
logic greater_than_largest_unsigned_int;
logic smaller_than_smallest_unsigned_int;
logic greater_than_largest_signed_int;
logic smaller_than_smallest_signed_int;
logic special;
assign inexact = |grs;
//Lower 31 integer bits all set: a roundup would carry into bit 31
assign all_frac = &f2i_int[30:0];
//NaNs are included in the "greater than largest" cases regardless of their sign bit
assign greater_than_largest_unsigned_int = ~args.is_signed & (~args.rs1.d.sign | args.rs1_special_case.snan | args.rs1_special_case.qnan) & ((f2i_int[31] & all_frac & roundup) | rs1_expo_unbiased_greater_than_31);
//Negative inputs are out of range for unsigned conversions unless they are zero or have magnitude below 1 with no roundup
assign smaller_than_smallest_unsigned_int = ~args.is_signed & args.rs1.d.sign & ~args.rs1_special_case.zero & ~(args.int_less_than_1 & ~roundup);
assign greater_than_largest_signed_int = args.is_signed & ((args.rs1_special_case.snan | args.rs1_special_case.qnan | ~args.rs1.d.sign) & ((~f2i_int[31] & all_frac & roundup) | rs1_expo_unbiased_greater_than_30));
assign smaller_than_smallest_signed_int = args.is_signed & args.rs1.d.sign & ((f2i_int[31] & (|f2i_int[30:0] | roundup)) | rs1_expo_unbiased_greater_than_31);
assign special = (~args.is_signed & (greater_than_largest_unsigned_int | smaller_than_smallest_unsigned_int)) | (args.is_signed & (greater_than_largest_signed_int | smaller_than_smallest_signed_int));
//Cycle 2 - do the rounding and override special cases
//Input negative -> -roundup - f2i_int
//Input positive -> roundup + f2i_int
logic r_greater_than_largest_unsigned_int;
logic r_greater_than_largest_signed_int;
logic r_smaller_than_smallest_signed_int;
logic r_inexact;
logic r_special;
logic r_subtract;
logic r_roundup;
logic[31:0] r_f2i_int;
logic[31:0] in1;
logic[31:0] in2;
logic[31:0] f2i_int_rounded;
logic[31:0] special_case_result;
logic carry_in;
assign in1 = r_subtract ? -(32'(r_roundup)) : 32'(r_roundup);
//Inverting the integer and adding 1 through the extra LSB below forms its two's complement when subtracting
assign in2 = r_f2i_int ^ {32{r_subtract}};
//The appended LSBs (1 + r_subtract) inject r_subtract as the carry in of the 32 bit addition; carry_in only absorbs the discarded LSB
assign {f2i_int_rounded, carry_in} = {in1, 1'b1} + {in2, r_subtract};
//Saturated value returned for out of range inputs; the remaining special case (below the unsigned range) returns 0
always_comb begin
if (r_greater_than_largest_unsigned_int)
special_case_result = 32'hffffffff; //2^32 - 1;
else if (r_greater_than_largest_signed_int)
special_case_result = 32'h7fffffff; //2^31 - 1;
else if (r_smaller_than_smallest_signed_int)
special_case_result = 32'h80000000; //-2^31;
else
special_case_result = 0;
end
//F2I pipeline
always_ff @ (posedge clk) begin
if (issue.new_request) begin
r_greater_than_largest_unsigned_int <= greater_than_largest_unsigned_int;
r_greater_than_largest_signed_int <= greater_than_largest_signed_int;
r_smaller_than_smallest_signed_int <= smaller_than_smallest_signed_int;
r_inexact <= inexact;
r_special <= special;
r_f2i_int <= f2i_int;
r_subtract <= subtract;
r_roundup <= roundup;
end
end
//Multiplex the outputs from f2i and the single cycle units
//Out of range conversions raise invalid and suppress inexact
always_comb begin
fflags = '0;
if (single_valid) begin
wb.rd = single_rd;
fflags.nv = single_invalid;
end
else begin
wb.rd = r_special ? special_case_result : f2i_int_rounded;
fflags.nv = r_special;
fflags.nx = r_inexact & ~r_special;
end
end
endmodule

View file

@ -0,0 +1,277 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module fpu_top
import cva5_config::*;
import cva5_types::*;
import fpu_types::*;
import opcodes::*;
#(
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
input decode_packet_t decode_stage,
output logic unit_needed,
output logic[REGFILE_READ_PORTS-1:0] uses_rs,
output logic[2:0] fp_uses_rs,
output logic uses_rd,
output logic fp_uses_rd,
input logic issue_stage_ready,
input logic[2:0] dyn_rm,
input logic[31:0] int_rf[REGFILE_READ_PORTS],
input logic[FLEN-1:0] fp_rf[3],
unit_issue_interface.unit issue,
unit_writeback_interface.unit int_wb,
unit_writeback_interface.unit fp_wb,
output fflags_t fflags
);
fp_madd_inputs_t madd_inputs;
fp_div_inputs_t div_inputs;
fp_sqrt_inputs_t sqrt_inputs;
fp_wb2fp_misc_inputs_t wb2fp_inputs;
fp_wb2int_misc_inputs_t wb2int_inputs;
fflags_t int_fflags;
fflags_t fp_fflags;
unit_issue_interface intermediate_issue[4:0](); //FMA, FDIV, FSQRT, WB2FP, WB2INT
fp_intermediate_wb_interface intermediate_unit_wb[3:0](); //FMADD, FMUL, FDIV/FSQRT, WB2FP
////////////////////////////////////////////////////
//Implementation
//This unit instantiates the internal FPU components and connects them
//It is also responsible for instruction decoding
////////////////////////////////////////////////////
//Decode
always_comb begin
uses_rs = '0;
uses_rs[RS1] = decode_stage.instruction inside {
SP_FCVT_S_W, SP_FCVT_S_WU, SP_FMV_W_X,
DP_FCVT_D_W, DP_FCVT_D_WU
};
fp_uses_rs = '0;
fp_uses_rs[RS1] = decode_stage.instruction inside {
SP_FMADD, SP_FMSUB, SP_FNMSUB, SP_FNMADD, SP_FADD, SP_FSUB, SP_FMUL,
SP_FDIV, SP_FSQRT, SP_FSGNJ, SP_FSGNJN, SP_FSGNJX, SP_FMIN, SP_FMAX,
SP_FCVT_W_S, SP_FCVT_WU_S, SP_FMV_X_W, SP_FEQ, SP_FLT, SP_FLE, SP_FCLASS,
DP_FMADD, DP_FMSUB, DP_FNMSUB, DP_FNMADD, DP_FADD, DP_FSUB, DP_FMUL,
DP_FDIV, DP_FSQRT, DP_FSGNJ, DP_FSGNJN, DP_FSGNJX, DP_FMIN, DP_FMAX,
DP_FCVT_S_D, DP_FCVT_D_S, DP_FEQ, DP_FLT, DP_FLE, DP_FCLASS, DP_FCVT_W_D, DP_FCVT_WU_D
};
fp_uses_rs[RS2] = decode_stage.instruction inside {
SP_FMADD, SP_FMSUB, SP_FNMSUB, SP_FNMADD, SP_FADD, SP_FSUB, SP_FMUL,
SP_FDIV, SP_FSQRT, SP_FSGNJ, SP_FSGNJN, SP_FSGNJX, SP_FMIN, SP_FMAX,
SP_FEQ, SP_FLT, SP_FLE,
DP_FMADD, DP_FMSUB, DP_FNMSUB, DP_FNMADD, DP_FADD, DP_FSUB, DP_FMUL,
DP_FDIV, DP_FSQRT, DP_FSGNJ, DP_FSGNJN, DP_FSGNJX, DP_FMIN, DP_FMAX,
DP_FEQ, DP_FLT, DP_FLE
};
fp_uses_rs[RS3] = decode_stage.instruction inside {
SP_FMADD, SP_FMSUB, SP_FNMSUB, SP_FNMADD,
DP_FMADD, DP_FMSUB, DP_FNMSUB, DP_FNMADD
};
uses_rd = decode_stage.instruction inside {
SP_FCVT_W_S, SP_FCVT_WU_S, SP_FMV_X_W, SP_FEQ, SP_FLT, SP_FLE, SP_FCLASS,
DP_FEQ, DP_FLT, DP_FLE, DP_FCLASS, DP_FCVT_W_D, DP_FCVT_WU_D
};
fp_uses_rd = decode_stage.instruction inside {
SP_FMADD, SP_FMSUB, SP_FNMSUB, SP_FNMADD, SP_FADD, SP_FSUB, SP_FMUL,
SP_FDIV, SP_FSQRT, SP_FSGNJ, SP_FSGNJN, SP_FSGNJX, SP_FMIN, SP_FMAX,
SP_FCVT_S_W, SP_FCVT_S_WU, SP_FMV_W_X,
DP_FMADD, DP_FMSUB, DP_FNMSUB, DP_FNMADD, DP_FADD, DP_FSUB, DP_FMUL,
DP_FDIV, DP_FSQRT, DP_FSGNJ, DP_FSGNJN, DP_FSGNJX, DP_FMIN, DP_FMAX,
DP_FCVT_S_D, DP_FCVT_D_S, DP_FCVT_D_W, DP_FCVT_D_WU
};
unit_needed = decode_stage.instruction inside {
SP_FMADD, SP_FMSUB, SP_FNMSUB, SP_FNMADD, SP_FADD, SP_FSUB, SP_FMUL,
DP_FMADD, DP_FMSUB, DP_FNMSUB, DP_FNMADD, DP_FADD, DP_FSUB, DP_FMUL,
SP_FDIV, SP_FSQRT,
DP_FDIV, DP_FSQRT,
SP_FSGNJ, SP_FSGNJN, SP_FSGNJX, SP_FMIN, SP_FMAX, SP_FCVT_S_W, SP_FCVT_S_WU, SP_FMV_W_X,
DP_FSGNJ, DP_FSGNJN, DP_FSGNJX, DP_FMIN, DP_FMAX, DP_FCVT_S_D, DP_FCVT_D_S, DP_FCVT_D_W, DP_FCVT_D_WU,
SP_FCVT_W_S, SP_FCVT_WU_S, SP_FMV_X_W, SP_FEQ, SP_FLT, SP_FLE, SP_FCLASS,
DP_FEQ, DP_FLT, DP_FLE, DP_FCLASS, DP_FCVT_W_D, DP_FCVT_WU_D
};
end
////////////////////////////////////////////////////
//Shared preprocessing
logic is_single;
//Instruction families
logic is_fma;
logic is_fmul;
logic is_fadd;
logic is_div;
logic is_sqrt;
logic is_i2f;
logic is_mv_i2f;
logic is_s2d;
logic is_d2s;
logic is_minmax;
logic is_sign_inj;
logic is_f2i;
logic is_mv_f2i;
logic is_fcmp;
logic is_class;
//Used to distinguish between instructions in a family
logic add;
logic neg_mul;
logic conv_signed;
logic is_sign_inj_single;
rm_t rm_r;
fp_preprocessing_packet_t pkt;
assign pkt.valid = issue.new_request;
assign pkt.unit[0] = is_fma | is_fmul | is_fadd;
assign pkt.unit[1] = is_div;
assign pkt.unit[2] = is_sqrt;
assign pkt.unit[3] = is_i2f | is_mv_i2f | is_minmax | is_sign_inj | is_s2d | is_d2s;
assign pkt.unit[4] = is_f2i | is_mv_f2i | is_fcmp | is_class;
assign pkt.rs1 = fp_rf[RS1];
assign pkt.rs2 = fp_rf[RS2];
assign pkt.rs3 = fp_rf[RS3];
assign pkt.int_rs1 = int_rf[RS1];
assign pkt.id = issue.id;
assign pkt.is_single = is_single;
assign pkt.is_fma = is_fma;
assign pkt.is_fadd = is_fadd;
assign pkt.is_i2f = is_i2f;
assign pkt.is_d2s = is_d2s;
assign pkt.is_minmax = is_minmax;
assign pkt.is_sign_inj = is_sign_inj;
assign pkt.is_sign_inj_single = is_sign_inj_single;
assign pkt.is_f2i = is_f2i;
assign pkt.is_mv_i2f = is_mv_i2f;
assign pkt.is_fcmp = is_fcmp;
assign pkt.is_class = is_class;
assign pkt.add = add;
assign pkt.neg_mul = neg_mul;
assign pkt.conv_signed = conv_signed;
assign pkt.rm = &rm_r ? dyn_rm : rm_r;
//Decode-stage classification registers.
//All flags are captured only while issue_stage_ready is high, so they describe the
//instruction that is moving from decode into the issue stage.
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
//Only the instructions that convert their arguments from s->d
is_single <= decode_stage.instruction inside {SP_FMADD, SP_FMSUB, SP_FNMSUB, SP_FNMADD, SP_FADD, SP_FSUB, SP_FMUL, SP_FDIV, SP_FSQRT, SP_FMIN, SP_FMAX, SP_FCVT_S_W, SP_FCVT_S_WU, DP_FCVT_D_S, SP_FCVT_W_S, SP_FCVT_WU_S, SP_FEQ, SP_FLT, SP_FLE, SP_FCLASS};
//Partial decoding to distinguish instructions from each other
//instruction[4] low selects the FMA family; every other family requires instruction[4]
//high plus a wildcard match on instruction[31:27].
//NOTE(review): several wildcard patterns overlap (e.g. 5'b10??? for is_fcmp also covers
//encodings matched by is_f2i and is_i2f), so this presumably relies on only valid FP
//encodings being issued to this unit - confirm before tightening or reusing a pattern.
is_fma <= ~decode_stage.instruction[4];
is_fmul <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b0??10};
is_fadd <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b0000?};
is_div <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b00?11};
is_sqrt <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b01?1?};
is_i2f <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b1?01?};
//is_mv_f2i and is_class share the same [31:27] pattern and are separated by instruction[12]
is_mv_f2i <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b1110?} & ~decode_stage.instruction[12];
//is_s2d and is_d2s share the same [31:27] pattern and are separated by instruction[20]
is_s2d <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b01?0?} & ~decode_stage.instruction[20];
is_d2s <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b01?0?} & decode_stage.instruction[20];
is_minmax <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b0?1?1};
is_sign_inj <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b0?1?0};
//Not qualified by any family match: only meaningful together with is_sign_inj
is_sign_inj_single <= ~decode_stage.instruction[25];
is_f2i <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b1?00?};
is_mv_i2f <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b1?11?};
is_fcmp <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b10???};
is_class <= decode_stage.instruction[4] & decode_stage.instruction[31:27] inside {5'b1110?} & decode_stage.instruction[12];
//Double duty for both FADD and FMA
//instruction[4] high: add is the inverse of instruction[27]
//instruction[4] low: add is set when instruction[3:2] is 2'b00 or 2'b10
add <= decode_stage.instruction[4] ? ~decode_stage.instruction[27] : decode_stage.instruction[3:2] inside {2'b00, 2'b10};
neg_mul <= decode_stage.instruction[3];
conv_signed <= ~decode_stage.instruction[20];
//Raw rounding-mode field; pkt.rm substitutes dyn_rm when all three bits are set
rm_r <= decode_stage.instruction[14:12];
end
end
fp_preprocessing #(.CONFIG(CONFIG), .FP_NUM_UNITS(5)) fp_preprocessing_inst (
.unit_issue(intermediate_issue),
.pkt(pkt),
.ready(issue.ready),
.madd_args(madd_inputs),
.div_args(div_inputs),
.sqrt_args(sqrt_inputs),
.wb2fp_args(wb2fp_inputs),
.wb2int_args(wb2int_inputs),
.*);
////////////////////////////////////////////////////
//Execution Units
fp_madd_wrapper #(.CONFIG(CONFIG)) fp_madd_inst (
.args(madd_inputs),
.issue(intermediate_issue[0]),
.madd_wb(intermediate_unit_wb[1]),
.mul_wb(intermediate_unit_wb[2]),
.*);
fp_div_sqrt_wrapper div_sqrt_inst (
.div_inputs(div_inputs),
.sqrt_inputs(sqrt_inputs),
.div_issue(intermediate_issue[1]),
.sqrt_issue(intermediate_issue[2]),
.wb(intermediate_unit_wb[0]),
.*);
fp_wb2fp_misc wb2fp_misc_inst (
.args(wb2fp_inputs),
.issue(intermediate_issue[3]),
.wb(intermediate_unit_wb[3])
);
fp_wb2int_misc wb2int_misc_inst (
.args(wb2int_inputs),
.issue(intermediate_issue[4]),
.wb(int_wb),
.fflags(int_fflags),
.*);
////////////////////////////////////////////////////
//Normalization and rounding
fp_normalize_rounding_top #(.NUM_WB_UNITS(4)) norm_round_inst (
.intermediate_wb(intermediate_unit_wb),
.wb(fp_wb),
.fflags(fp_fflags),
.*);
////////////////////////////////////////////////////
//Updating flags
//Combine both wb2int and wb2fp in one because they can writeback simultaneously
logic fp_accepted;
logic int_accepted;
assign fp_accepted = fp_wb.done & fp_wb.ack;
assign int_accepted = int_wb.done & int_wb.ack;
//Merge the exception flags of every writeback accepted this cycle.
//Starting from zero and OR-ing in each accepted source covers the
//"both", "fp only", "int only" and "neither" cases with two independent checks.
always_comb begin
    fflags = '0;
    if (fp_accepted)
        fflags = fflags | fp_fflags;
    if (int_accepted)
        fflags = fflags | int_fflags;
end
endmodule

View file

@ -26,6 +26,7 @@ module gc_unit
import riscv_types::*;
import cva5_types::*;
import csr_types::*;
import opcodes::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
@ -36,8 +37,17 @@ module gc_unit
input logic rst,
//Decode
input decode_packet_t decode_stage,
output logic unit_needed,
output logic [REGFILE_READ_PORTS-1:0] uses_rs,
output logic uses_rd,
input issue_packet_t issue_stage,
input logic issue_stage_ready,
input logic [31:0] constant_alu,
input logic [31:0] rf [REGFILE_READ_PORTS],
unit_issue_interface.unit issue,
input gc_inputs_t gc_inputs,
//Branch miss predict
input logic branch_flush,
@ -54,8 +64,6 @@ module gc_unit
input logic [31:0] epc,
//Retire
input retire_packet_t retire,
input id_t retire_ids [RETIRE_PORTS],
input id_t retire_ids_next [RETIRE_PORTS],
input logic [$clog2(NUM_EXCEPTION_SOURCES)-1:0] current_exception_unit,
@ -110,6 +118,7 @@ module gc_unit
//LS exceptions (miss-aligned, TLB and MMU) (issue stage)
//fetch flush, take exception. If execute or later exception occurs first, exception is overridden
common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode
typedef enum {RST_STATE, PRE_CLEAR_STATE, INIT_CLEAR_STATE, IDLE_STATE, TLB_CLEAR_STATE, POST_ISSUE_DRAIN, PRE_ISSUE_FLUSH, POST_ISSUE_DISCARD} gc_state;
gc_state state;
@ -118,7 +127,6 @@ module gc_unit
logic init_clear_done;
logic tlb_clear_done;
gc_inputs_t gc_inputs_r;
logic post_issue_idle;
logic ifence_in_progress;
logic ret_in_progress;
@ -135,8 +143,52 @@ module gc_unit
logic gc_pc_override;
logic [31:0] gc_pc;
typedef struct packed{
logic [31:0] pc_p4;
logic is_ifence;
logic is_mret;
logic is_sret;
} gc_inputs_t;
gc_inputs_t gc_inputs;
gc_inputs_t gc_inputs_r;
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//Decode
logic is_ifence;
logic is_mret;
logic is_sret;
assign instruction = decode_stage.instruction;
assign unit_needed =
(CONFIG.INCLUDE_M_MODE & decode_stage.instruction inside {MRET}) |
(CONFIG.INCLUDE_S_MODE & decode_stage.instruction inside {SRET, SFENCE_VMA}) |
(CONFIG.INCLUDE_IFENCE & decode_stage.instruction inside {FENCE_I});
always_comb begin
uses_rs = '0;
uses_rs[RS1] = CONFIG.INCLUDE_S_MODE & decode_stage.instruction inside {SFENCE_VMA};
uses_rd = 0;
end
//Registers which GC operation (fence.i / mret / sret) is entering the issue stage.
//Captured only while issue_stage_ready is high, i.e. as the decoded instruction advances.
//Non-blocking assignments are required: these flags drive gc_inputs, which is read by other
//clocked logic on the same edge. Blocking assignments in a clocked block make the value seen
//by those readers depend on simulator process ordering (simulation/synthesis mismatch).
always_ff @(posedge clk) begin
    if (issue_stage_ready) begin
        is_ifence <= (instruction.upper_opcode == FENCE_T) & CONFIG.INCLUDE_IFENCE;
        is_mret <= (instruction.upper_opcode == SYSTEM_T) & (decode_stage.instruction[31:20] == MRET_imm) & CONFIG.INCLUDE_M_MODE;
        is_sret <= (instruction.upper_opcode == SYSTEM_T) & (decode_stage.instruction[31:20] == SRET_imm) & CONFIG.INCLUDE_S_MODE;
    end
end
assign gc_inputs.pc_p4 = constant_alu;
assign gc_inputs.is_ifence = is_ifence;
assign gc_inputs.is_mret = is_mret;
assign gc_inputs.is_sret = is_sret;
////////////////////////////////////////////////////
//Issue
//Input registering
always_ff @(posedge clk) begin
if (issue.new_request)

View file

@ -22,13 +22,12 @@
module addr_hash
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
#(
parameter logic USE_BIT_3 = 1
)
(
input logic clk,
input logic rst,
input logic [31:0] addr,
output addr_hash_t addr_hash
);
@ -36,9 +35,9 @@ module addr_hash
////////////////////////////////////////////////////
//Implementation
//Xor addr in groups of 4-bits, truncating to the virtual/physical address invariant bits (11:0)
//lower two bits are not used due to complications in determining overlap between byte
//halfword and word operations.
assign addr_hash[0] = addr[2] ^ addr[6] ^ addr[10];
//The lower two bits (and also the third bit, addr[2], when USE_BIT_3 is 0 to support doubleword
//accesses) are not used due to complications in determining overlap between byte, halfword, word and doubleword operations.
assign addr_hash[0] = (USE_BIT_3 & addr[2]) ^ addr[6] ^ addr[10];
assign addr_hash[1] = addr[3] ^ addr[7] ^ addr[11];
assign addr_hash[2] = addr[4] ^ addr[8];
assign addr_hash[3] = addr[5] ^ addr[9];

View file

@ -0,0 +1,327 @@
/*
* Copyright © 2022 Eric Matthews
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Data cache with separate load and store paths sharing one L1 arbiter request port.
//Loads: tags are checked the cycle after load_request (LOAD_HIT_CHECK). Without a hit the
//load issues an L1 read (LOAD_L1_REQUEST) and then waits for the returned data (LOAD_FILL),
//which is written into the way captured from the replacement cycler.
//Stores: every store that is not a cache_op is sent to the L1 arbiter (STORE_L1_REQUEST) and
//is additionally written into the data bank of any way reporting a store tag hit.
//NOTE(review): this looks like write-through without allocation on store misses - confirm.
//NOTE(review): sc_complete, sc_success, clear_reservation and amo are not referenced in this
//module, and l1_request.is_amo / l1_request.amo are tied to 0.
module dcache
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
input logic dcache_on,
l1_arbiter_request_interface.master l1_request,
l1_arbiter_return_interface.master l1_response,
input logic sc_complete,
input logic sc_success,
input logic clear_reservation,
input amo_details_t amo,
input logic uncacheable_load,
input logic uncacheable_store,
input logic is_load,
input logic load_request,
input logic store_request,
output logic load_ready,
output logic store_ready,
input data_access_shared_inputs_t ls_load,
input data_access_shared_inputs_t ls_store,
memory_sub_unit_interface.responder ls
);
localparam derived_cache_config_t SCONFIG = get_derived_cache_params(CONFIG, CONFIG.DCACHE, CONFIG.DCACHE_ADDR);
//At least one index bit is kept even for a single-way cache
localparam LOG2_WAYS = (CONFIG.DCACHE.WAYS == 1) ? 1 : $clog2(CONFIG.DCACHE.WAYS);
//word_count value of the last word of a line
localparam bit [SCONFIG.SUB_LINE_ADDR_W-1:0] END_OF_LINE_COUNT = SCONFIG.SUB_LINE_ADDR_W'(CONFIG.DCACHE.LINE_W-1);
cache_functions_interface # (.LINE_W(SCONFIG.LINE_ADDR_W), .SUB_LINE_W(SCONFIG.SUB_LINE_ADDR_W)) addr_utils ();
//Load attributes registered on load_request
typedef struct packed{
logic [31:0] addr;
logic uncacheable;
} load_stage2_t;
load_stage2_t stage2_load;
//Store attributes registered on store_request
typedef struct packed{
logic [31:0] addr;
logic [3:0] be;
logic [31:0] data;
logic cache_op;
logic uncacheable;
} store_stage2_t;
store_stage2_t stage2_store;
logic [CONFIG.DCACHE.WAYS-1:0] load_tag_hit_way;
logic [CONFIG.DCACHE.WAYS-1:0] store_tag_hit_way;
logic [CONFIG.DCACHE.WAYS-1:0] replacement_way;
logic [CONFIG.DCACHE.WAYS-1:0] replacement_way_r;
logic load_tag_check;
logic load_hit;
logic store_hit;
logic [LOG2_WAYS-1:0] tag_hit_index;
logic [LOG2_WAYS-1:0] replacement_index;
logic [LOG2_WAYS-1:0] replacement_index_r;
logic [LOG2_WAYS-1:0] load_sel;
logic is_target_word;
logic [SCONFIG.SUB_LINE_ADDR_W-1:0] word_count;
logic miss_data_valid;
logic line_complete;
logic arb_load_sel;
logic load_l1_arb_ack;
logic store_l1_arb_ack;
logic [31:0] ram_load_data [CONFIG.DCACHE.WAYS-1:0];
//The enum values are used as bit positions in load_state/load_state_next (one bit per state)
typedef enum {
LOAD_IDLE = 0,
LOAD_HIT_CHECK = 1,
LOAD_L1_REQUEST = 2,
LOAD_FILL = 3
} load_path_enum_t;
logic [3:0] load_state, load_state_next;
//The enum values are used as bit positions in store_state/store_state_next (one bit per state)
typedef enum {
STORE_IDLE = 0,
STORE_L1_REQUEST = 1
} store_path_enum_t;
logic [1:0] store_state, store_state_next;
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//Load Path
//Reset leaves only the LOAD_IDLE bit set
always_ff @ (posedge clk) begin
if (rst) begin
load_state <= 0;
load_state[LOAD_IDLE] <= 1;
end
else
load_state <= load_state_next;
end
//Next-state equation per state bit:
//IDLE: stay idle without a new request, or return on a hit with no back-to-back request, or when the fill completes
//HIT_CHECK: always the cycle following a load request
//L1_REQUEST: entered when the hit check reports no hit, held until the arbiter acks the load
//FILL: entered on the load ack, held until line_complete
always_comb begin
load_state_next[LOAD_IDLE] = (load_state[LOAD_IDLE] & ~load_request) | ((load_hit & ~load_request) | line_complete);
load_state_next[LOAD_HIT_CHECK] = load_request;
load_state_next[LOAD_L1_REQUEST] = (load_state[LOAD_L1_REQUEST] & ~load_l1_arb_ack) | (load_state[LOAD_HIT_CHECK] & ~load_hit);
load_state_next[LOAD_FILL] = (load_state[LOAD_FILL] & ~line_complete) | (load_state[LOAD_L1_REQUEST] & load_l1_arb_ack);
end
//A new load is accepted only when the load path is free (idle or hitting this cycle)
//and the store path is free (idle or being acked this cycle)
assign load_ready = (load_state[LOAD_IDLE] | load_hit) & (store_state[STORE_IDLE] | store_l1_arb_ack);
always_ff @ (posedge clk) begin
if (load_request) begin
stage2_load.addr <= ls_load.addr;
stage2_load.uncacheable <= uncacheable_load;
end
end
//Tags are only looked up for cacheable loads while the cache is enabled
assign load_tag_check = load_request & dcache_on & ~uncacheable_load;
////////////////////////////////////////////////////
//Load Miss
//Counts returned data words; cleared by every new load request
always_ff @ (posedge clk) begin
if (load_request)
word_count <= 0;
else
word_count <= word_count + SCONFIG.SUB_LINE_ADDR_W'(l1_response.data_valid);
end
//The returned word is the requested one when its position matches the load's word offset;
//an uncacheable load treats every returned word as the target
assign is_target_word = (stage2_load.addr[2 +: SCONFIG.SUB_LINE_ADDR_W] == word_count) | stage2_load.uncacheable;
//Fill is finished on the last word of the line, or on the first returned word when uncacheable
assign line_complete = l1_response.data_valid & ((word_count == END_OF_LINE_COUNT) | stage2_load.uncacheable);
////////////////////////////////////////////////////
//Store Path
//Reset leaves only the STORE_IDLE bit set
always_ff @ (posedge clk) begin
if (rst) begin
store_state <= 0;
store_state[STORE_IDLE] <= 1;
end
else
store_state <= store_state_next;
end
//cache_op stores never leave STORE_IDLE (no L1 request is generated for them);
//all other stores hold STORE_L1_REQUEST until the arbiter acks the store
always_comb begin
store_state_next[STORE_IDLE] = (store_state[STORE_IDLE] & (~store_request | (store_request & ls_store.cache_op))) | (store_l1_arb_ack & ~store_request);
store_state_next[STORE_L1_REQUEST] = (store_state[STORE_L1_REQUEST] & ~store_l1_arb_ack) | (store_request & ~ls_store.cache_op);
end
//Same condition as load_ready: both paths must be free
assign store_ready = (store_state[STORE_IDLE] | store_l1_arb_ack) & (load_state[LOAD_IDLE] | load_hit);
assign ls.ready = is_load ? load_ready : store_ready;
always_ff @ (posedge clk) begin
if (store_request) begin
stage2_store.addr <= ls_store.addr;
stage2_store.uncacheable <= uncacheable_store;
stage2_store.be <= ls_store.be;
stage2_store.data <= ls_store.data_in;
stage2_store.cache_op <= ls_store.cache_op;
end
end
////////////////////////////////////////////////////
//L1 Arbiter Interface
//Priority to oldest request
//The FIFO records, in arrival order, whether each pending request is a load (1) or a store (0).
//An entry is removed when the arbiter acks a request or when a load hits (a hitting load
//never reaches LOAD_L1_REQUEST).
fifo_interface #(.DATA_TYPE(logic)) request_order();
assign request_order.data_in = load_request;
assign request_order.push = load_request | (store_request & ~ls_store.cache_op);
assign request_order.potential_push = request_order.push;
assign request_order.pop = l1_request.ack | load_hit;
cva5_fifo #(.DATA_TYPE(logic), .FIFO_DEPTH(2))
request_order_fifo (
.clk (clk),
.rst (rst),
.fifo (request_order)
);
assign arb_load_sel = request_order.data_out;
assign l1_request.addr = arb_load_sel ? stage2_load.addr : stage2_store.addr;//Memory interface aligns request to burst size (done there to support AMO line-read word-write)
assign l1_request.data = stage2_store.data;
assign l1_request.rnw = arb_load_sel;
assign l1_request.be = stage2_store.be;
//Cacheable loads request a whole line (LINE_W-1); stores and uncacheable loads use size 0
assign l1_request.size = (arb_load_sel & ~stage2_load.uncacheable) ? 5'(CONFIG.DCACHE.LINE_W-1) : 0;//LR and AMO ops are included in load
assign l1_request.is_amo = 0;
assign l1_request.amo = 0;
assign l1_request.request = load_state[LOAD_L1_REQUEST] | store_state[STORE_L1_REQUEST];
//The single arbiter ack is attributed to the load or the store path by the FIFO head
assign load_l1_arb_ack = l1_request.ack & arb_load_sel;
assign store_l1_arb_ack = l1_request.ack & ~arb_load_sel;
////////////////////////////////////////////////////
//Replacement policy (free running one-hot cycler, i.e. pseudo random)
cycler #(CONFIG.DCACHE.WAYS) replacement_policy (
.clk (clk),
.rst (rst),
.en (1'b1),
.one_hot (replacement_way)
);
////////////////////////////////////////////////////
//Tag banks
//The new tag is written when the load request is acked by the arbiter (miss_req),
//using the way the cycler points at in that cycle.
//store_hit is connected here but not used elsewhere in this module.
dcache_tag_banks #(.CONFIG(CONFIG), .SCONFIG(SCONFIG))
tag_banks (
.clk (clk),
.rst (rst),
.load_addr (ls_load.addr),
.load_req (load_tag_check),
.miss_addr (stage2_load.addr),
.miss_req (load_l1_arb_ack),
.miss_way (replacement_way),
.inv_addr ({l1_response.inv_addr, 2'b0}),
.extern_inv (l1_response.inv_valid),
.extern_inv_complete (l1_response.inv_ack),
.store_addr (ls_store.addr),
.store_addr_r (stage2_store.addr),
.store_req (store_request),
.cache_op_req (ls_store.cache_op),
.load_tag_hit (load_hit),
.load_tag_hit_way (load_tag_hit_way),
.store_tag_hit (store_hit),
.store_tag_hit_way (store_tag_hit_way)
);
////////////////////////////////////////////////////
//Data Bank(s)
//Port A is shared by load reads and fill writes: during LOAD_FILL it addresses the word being
//returned (line of the missing load + word_count), otherwise the incoming load address.
//Port B is used for store data on a store tag hit.
logic [SCONFIG.LINE_ADDR_W+SCONFIG.SUB_LINE_ADDR_W-1:0] data_read_addr;
assign data_read_addr = load_state[LOAD_FILL] ? {addr_utils.getTagLineAddr(stage2_load.addr), word_count} : addr_utils.getDataLineAddr(ls_load.addr);
generate for (genvar i=0; i < CONFIG.DCACHE.WAYS; i++) begin : data_bank_gen
byte_en_bram #(CONFIG.DCACHE.LINES*CONFIG.DCACHE.LINE_W) data_bank (
.clk(clk),
.addr_a(data_read_addr),
.addr_b(addr_utils.getDataLineAddr(stage2_store.addr)),
.en_a(load_tag_check | (replacement_way_r[i] & l1_response.data_valid)),
.en_b(store_tag_hit_way[i]),
.be_a({4{(replacement_way_r[i] & l1_response.data_valid)}}),
.be_b(stage2_store.be),
.data_in_a(l1_response.data),
.data_in_b(stage2_store.data),
.data_out_a(ram_load_data[i]),
.data_out_b()
);
end endgenerate
////////////////////////////////////////////////////
//Output
//One-hot tag hit / update logic to binary int
one_hot_to_integer #(CONFIG.DCACHE.WAYS)
hit_way_conv (
.one_hot (load_tag_hit_way),
.int_out (tag_hit_index)
);
one_hot_to_integer #(CONFIG.DCACHE.WAYS)
replacment_way_conv (
.one_hot (replacement_way),
.int_out (replacement_index)
);
//The cycler keeps running, so the way chosen for a fill is captured when the load is acked
always_ff @ (posedge clk) begin
if (load_l1_arb_ack) begin
replacement_way_r <= replacement_way;
replacement_index_r <= replacement_index;
end
end
//Registered so that it lines up with the data bank read output one cycle later
//NOTE(review): assumes byte_en_bram has a one-cycle read latency - confirm
always_ff @ (posedge clk) miss_data_valid <= l1_response.data_valid & is_target_word;
//Store-to-load forwarding: if a load arrives for the same word as a store that is still
//waiting in STORE_L1_REQUEST, the store's enabled bytes are captured and override the
//corresponding bytes of the load result.
logic collision;
logic [31:0] saved_data;
logic [3:0] saved_be;
assign collision = store_state[STORE_L1_REQUEST] & (stage2_store.addr[31:2] == ls_load.addr[31:2]);
always_ff @ (posedge clk) begin
if (load_request) begin
saved_data <= stage2_store.data;
saved_be <= {4{collision}} & stage2_store.be;
end
end
//Hits read from the hitting way; fills read back from the way that was just written
assign load_sel = load_state[LOAD_HIT_CHECK] ? tag_hit_index : replacement_index_r;
always_comb for (int i = 0; i < 4; i++)
ls.data_out[8*i+:8] = saved_be[i] ? saved_data[8*i+:8] : ram_load_data[load_sel][8*i+:8];
assign ls.data_valid = load_hit | miss_data_valid;
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
//Only the load path is checked here; there is no equivalent assertion for store_request
dcache_request_when_not_ready_assertion:
assert property (@(posedge clk) disable iff (rst) load_request |-> load_ready)
else $error("dcache received request when not ready");
//An arbiter ack is only legal while one of the two paths is requesting
dache_suprious_l1_ack_assertion:
assert property (@(posedge clk) disable iff (rst) l1_request.ack |-> (load_state[LOAD_L1_REQUEST] | store_state[STORE_L1_REQUEST]))
else $error("dcache received ack without a request");
endmodule

View file

@ -0,0 +1,114 @@
/*
* Copyright © 2022 Eric Matthews
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Tag storage and hit detection for the data cache (one dual-port tag RAM per way).
//
//Port A of every tag RAM is shared, in priority order, by:
//  1. line fills      (miss_req)     - writes a valid tag into the way selected by miss_way
//  2. invalidations   (extern_inv)   - writes an invalid entry into all ways of the addressed line
//  3. stores          (store_req)    - tag read for store hit detection, or, for cache ops,
//                                      a write of an invalid entry into all ways
//Port B is read-only and used exclusively for load tag lookups.
//
//Hit results are valid the cycle after the corresponding request (load_req_r / store_req_r).
//rst is not used by this module.
module dcache_tag_banks

    import cva5_config::*;
    import cva5_types::*;

    # (
        parameter cpu_config_t CONFIG = EXAMPLE_CONFIG,
        parameter derived_cache_config_t SCONFIG = '{LINE_ADDR_W : 9, SUB_LINE_ADDR_W : 2, TAG_W : 15}
    )
    (
        input logic clk,
        input logic rst,

        //Load lookup (tag RAM port B)
        input logic[31:0] load_addr,
        input logic load_req,
        //Line fill (tag RAM port A, highest priority).
        //miss_addr also supplies the tag that the load lookup result is compared against.
        input logic[31:0] miss_addr,
        input logic miss_req,
        input logic[CONFIG.DCACHE.WAYS-1:0] miss_way,
        //External invalidation (tag RAM port A, below line fills)
        input logic[31:0] inv_addr,
        input logic extern_inv,
        output logic extern_inv_complete,
        //Store lookup / cache op (tag RAM port A, lowest priority)
        input logic[31:0] store_addr,
        input logic[31:0] store_addr_r,
        input logic store_req,
        input logic cache_op_req,

        output logic load_tag_hit,
        output logic store_tag_hit,
        output logic[CONFIG.DCACHE.WAYS-1:0] load_tag_hit_way,
        output logic[CONFIG.DCACHE.WAYS-1:0] store_tag_hit_way
    );

    typedef struct packed {
        logic valid;
        logic [SCONFIG.TAG_W-1:0] tag;
    } dtag_entry_t;

    cache_functions_interface # (.TAG_W(SCONFIG.TAG_W), .LINE_W(SCONFIG.LINE_ADDR_W), .SUB_LINE_W(SCONFIG.SUB_LINE_ADDR_W)) addr_utils ();

    dtag_entry_t tag_line_a [CONFIG.DCACHE.WAYS-1:0];
    dtag_entry_t tag_line_b [CONFIG.DCACHE.WAYS-1:0];
    dtag_entry_t new_tagline;

    logic [SCONFIG.LINE_ADDR_W-1:0] porta_addr;
    logic [SCONFIG.LINE_ADDR_W-1:0] portb_addr;

    logic external_inv;
    logic inv_write;

    logic load_req_r;
    logic store_req_r;
    ////////////////////////////////////////////////////
    //Implementation
    //Requests are delayed one cycle to line up with the tag RAM read outputs
    always_ff @ (posedge clk) load_req_r <= load_req;
    //Cache ops only invalidate; they never produce a store hit
    always_ff @ (posedge clk) store_req_r <= store_req & ~cache_op_req;

    assign external_inv = extern_inv & CONFIG.DCACHE.USE_EXTERNAL_INVALIDATIONS;
    //An invalidation may only write when it actually owns port A. While a fill is in progress
    //porta_addr and new_tagline describe the fill, so letting the invalidation enable the write
    //in every way would store a *valid* copy of the fill tag in all ways of the line.
    assign inv_write = external_inv & ~miss_req;

    assign porta_addr = miss_req ? addr_utils.getTagLineAddr(miss_addr) : external_inv ? addr_utils.getTagLineAddr(inv_addr) : addr_utils.getTagLineAddr(store_addr);
    assign portb_addr = addr_utils.getTagLineAddr(load_addr);

    //The invalidation is acknowledged only in the cycle in which it is performed
    assign extern_inv_complete = inv_write;

    //valid is set only for fills; invalidations and cache ops therefore write an invalid entry
    assign new_tagline = '{valid: miss_req, tag: addr_utils.getTag(miss_addr)};

    //NOTE(review): when an invalidation (or fill) coincides with a normal store, port A is
    //addressed by the invalidation/fill, so the store's own tag lookup does not take place
    //although store_req_r is still set for the following cycle. Confirm that the surrounding
    //logic prevents this overlap or tolerates the resulting store hit/miss indication.

    ////////////////////////////////////////////////////
    //Memory instantiation and hit detection
    generate for (genvar i = 0; i < CONFIG.DCACHE.WAYS; i++) begin : tag_bank_gen
        dual_port_bram #(.WIDTH($bits(dtag_entry_t)), .LINES(CONFIG.DCACHE.LINES)) dtag_bank (
            .clk (clk),
            .en_a (store_req | (miss_req & miss_way[i]) | inv_write),
            .wen_a ((miss_req & miss_way[i]) | inv_write | (store_req & cache_op_req)),
            .addr_a (porta_addr),
            .data_in_a (new_tagline),
            .data_out_a (tag_line_a[i]),
            .en_b (load_req),
            .wen_b ('0),
            .addr_b (portb_addr),
            .data_in_b ('0),
            .data_out_b(tag_line_b[i])
        );
        //A hit requires: request made last cycle, entry valid, and matching tag.
        //The load comparison uses miss_addr, i.e. the registered address of the load being checked.
        assign store_tag_hit_way[i] = ({store_req_r, 1'b1, addr_utils.getTag(store_addr_r)} == {1'b1, tag_line_a[i]});
        assign load_tag_hit_way[i] = ({load_req_r, 1'b1, addr_utils.getTag(miss_addr)} == {1'b1, tag_line_b[i]});
    end endgenerate

    assign load_tag_hit = |load_tag_hit_way;
    assign store_tag_hit = |store_tag_hit_way;

endmodule

View file

@ -0,0 +1,270 @@
/*
* Copyright © 2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
module load_store_queue //ID-based input buffer for Load/Store Unit

    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;
    import fpu_types::*;

    # (
        parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
    )
    (
        input logic clk,
        input logic rst,
        input gc_outputs_t gc,

        load_store_queue_interface.queue lsq,
        input logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] store_forward_wb_group,
        input logic [1:0] fp_store_forward_wb_group,
        //Writeback snooping
        input wb_packet_t wb_packet [CONFIG.NUM_WB_GROUPS],
        input fp_wb_packet_t fp_wb_packet [2],

        //Retire release
        input retire_packet_t store_retire
    );

    localparam LOG2_SQ_DEPTH = $clog2(CONFIG.SQ_DEPTH);
    //Number of FP bits carried by one 32-bit memory word of a double access
    localparam DOUBLE_MIN_WIDTH = (FLEN < 32) ? FLEN : 32;

    //Entry format of the load queue
    typedef struct packed {
        logic [31:0] addr;
        logic [2:0] fn3;
        logic fp;
        logic double;
        id_t id;
        logic store_collision;
        logic [LOG2_SQ_DEPTH-1:0] sq_index;
    } lq_entry_t;

    //Store queue bookkeeping used for load/store ordering
    logic [LOG2_SQ_DEPTH-1:0] sq_index;
    logic [LOG2_SQ_DEPTH-1:0] sq_oldest;
    addr_hash_t addr_hash;
    logic potential_store_conflict;

    //Load output stage
    logic load_pop;
    logic load_addr_bit_3;
    logic [2:0] load_fn3;
    fp_ls_op_t load_type;
    logic [31:0] load_addr_out;

    //Store output stage
    logic store_pop;
    logic store_addr_bit_3;
    logic [31:0] store_data;
    logic [31:0] store_addr_out;

    fifo_interface #(.DATA_TYPE(lq_entry_t)) lq();
    store_queue_interface sq();
    ////////////////////////////////////////////////////
    //Implementation

    //The queue reports full as soon as the store queue is full.
    //Accepting further loads with a full store queue would need additional
    //logic for the case of a collision while the sq is full.
    assign lsq.full = sq.full;

    //Hash of the incoming address, used for load-store collision checking
    addr_hash #(.USE_BIT_3(~CONFIG.INCLUDE_UNIT.FPU))
    lsq_addr_hash (
        .addr (lsq.data_in.addr),
        .addr_hash (addr_hash)
    );

    ////////////////////////////////////////////////////
    //Load Queue
    cva5_fifo #(.DATA_TYPE(lq_entry_t), .FIFO_DEPTH(MAX_IDS))
    load_queue_fifo (
        .fifo(lq),
        .clk(clk),
        .rst(rst)
    );

    //Control
    assign lq.potential_push = lsq.potential_push;
    assign lq.push = lsq.push & lsq.data_in.load;
    assign lq.pop = load_pop;

    //Payload: request attributes plus the collision state sampled on entry
    assign lq.data_in = '{
        id : lsq.data_in.id,
        addr : lsq.data_in.addr,
        fn3 : lsq.data_in.fn3,
        fp : lsq.data_in.fp,
        double : lsq.data_in.double,
        sq_index : sq_index,
        store_collision : potential_store_conflict
    };

    ////////////////////////////////////////////////////
    //Store Queue
    assign sq.data_in = lsq.data_in;
    assign sq.push = lsq.push & (lsq.data_in.store | lsq.data_in.cache_op);
    assign sq.pop = store_pop;

    store_queue # (.CONFIG(CONFIG)) sq_block (
        .clk (clk),
        .rst (rst | gc.sq_flush),
        .sq (sq),
        .addr_hash (addr_hash),
        .potential_store_conflict (potential_store_conflict),
        .sq_index (sq_index),
        .sq_oldest (sq_oldest),
        .store_forward_wb_group (store_forward_wb_group),
        .fp_store_forward_wb_group (fp_store_forward_wb_group),
        .wb_packet (wb_packet),
        .fp_wb_packet (fp_wb_packet),
        .store_retire (store_retire)
    );

    ////////////////////////////////////////////////////
    //Output
    //Loads have priority over stores;
    //a store is only selected when no load is ready
    generate
    if (CONFIG.INCLUDE_UNIT.FPU) begin : gen_fpu_split
        //FP loads are always performed as word accesses
        assign load_fn3 = lq.data_out.fp ? LS_W_fn3 : lq.data_out.fn3;

        if (FLEN > 32) begin : gen_load_split
            //A double precision load occupies two cycles, upper word first
            logic load_p2;
            logic load_fp_hold;

            always_ff @(posedge clk) begin
                if (rst)
                    load_p2 <= 0;
                else if (lsq.load_pop)
                    load_p2 <= load_fp_hold;
            end

            //First half of a double: keep the entry in the queue
            assign load_fp_hold = ~load_p2 & lq.data_out.double;
            assign load_pop = lsq.load_pop & ~load_fp_hold;
            assign load_addr_bit_3 = load_fp_hold | lq.data_out.addr[2];

            always_comb begin
                load_type = DOUBLE_HOLD;
                if (~lq.data_out.fp)
                    load_type = INT_DONE;
                else if (~lq.data_out.double)
                    load_type = SINGLE_DONE;
                else if (load_p2)
                    load_type = DOUBLE_DONE;
            end
        end else begin : gen_no_load_split
            //Every load completes in one cycle (doubles fetch only the upper word)
            assign load_pop = lsq.load_pop;
            assign load_addr_bit_3 = lq.data_out.addr[2] | lq.data_out.double;

            always_comb begin
                load_type = INT_DONE;
                if (lq.data_out.double)
                    load_type = DOUBLE_DONE;
                else if (lq.data_out.fp)
                    load_type = SINGLE_DONE;
            end
        end

        ////////////////////////////////////////////////////
        //Stores
        //Selects between integer, single precision and double precision store data.
        //A double precision store always takes two cycles, lowest 32 bits first,
        //even when FLEN <= 32: some functions load double-precision data as integers
        //and operate on them, so reduced FP numbers are stored as if they were full size.
        logic store_p2;
        logic store_fp_hold;
        logic store_single;

        always_ff @(posedge clk) begin
            if (rst)
                store_p2 <= 0;
            else if (lsq.store_pop)
                store_p2 <= store_fp_hold;
        end

        assign store_single = sq.data_out.fp & ~sq.data_out.double;
        assign store_fp_hold = ~store_p2 & sq.data_out.double;
        assign store_pop = lsq.store_pop & ~store_fp_hold;
        assign store_addr_bit_3 = sq.data_out.double ? store_p2 : sq.data_out.addr[2];

        always_comb begin
            //Default: plain integer store
            store_data = sq.data_out.data;
            if (store_single) begin
                //Single precision value goes in the upper bits, remaining bits are zero
                store_data = '0;
                store_data[31-:FLEN_F] = sq.data_out.fp_data[FLEN_F-1:0];
            end
            else if (store_fp_hold) begin
                //First cycle of a double: lower bits (may be all zero)
                store_data = 32'(sq.data_out.fp_data[DOUBLE_MIN_WIDTH-1:0]) << (64-FLEN);
            end
            else if (store_p2) begin
                //Second cycle of a double: upper bits, remaining bits are zero
                store_data = '0;
                store_data[31-:DOUBLE_MIN_WIDTH] = sq.data_out.fp_data[FLEN-1-:DOUBLE_MIN_WIDTH];
            end
        end
    end else begin : gen_no_fpu
        //Integer-only memory operations pass straight through
        assign load_pop = lsq.load_pop;
        assign load_addr_bit_3 = lq.data_out.addr[2];
        assign load_fn3 = lq.data_out.fn3;
        assign load_type = INT_DONE;

        assign store_pop = lsq.store_pop;
        assign store_addr_bit_3 = sq.data_out.addr[2];
        assign store_data = sq.data_out.data;
    end
    endgenerate

    //A load that collided with a store on entry is held back until that store
    //is the oldest entry of the store queue
    assign lsq.load_valid = lq.valid & ~(lq.data_out.store_collision & (lq.data_out.sq_index != sq_oldest));
    assign lsq.store_valid = sq.valid;

    //Outgoing addresses: bit 2 is replaced by the word select computed above
    assign load_addr_out = {lq.data_out.addr[31:3], load_addr_bit_3, lq.data_out.addr[1:0]};
    assign store_addr_out = {sq.data_out.addr[31:3], store_addr_bit_3, sq.data_out.addr[1:0]};

    assign lsq.load_data_out = '{
        id : lq.data_out.id,
        addr : load_addr_out,
        fn3 : load_fn3,
        fp_op : load_type,
        load : 1,
        store : 0,
        cache_op : 0,
        be : 'x,
        data_in : 'x
    };

    assign lsq.store_data_out = '{
        id : 'x,
        addr : store_addr_out,
        fn3 : 'x,
        fp_op : fp_ls_op_t'('x),
        load : 0,
        store : 1,
        cache_op : sq.data_out.cache_op,
        be : sq.data_out.be,
        data_in : store_data
    };

    assign lsq.sq_empty = sq.empty;
    assign lsq.no_released_stores_pending = sq.no_released_stores_pending;
    assign lsq.empty = ~lq.valid & sq.empty;

    ////////////////////////////////////////////////////
    //End of Implementation
    ////////////////////////////////////////////////////

    ////////////////////////////////////////////////////
    //Assertions

endmodule

View file

@ -25,6 +25,8 @@ module load_store_unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
import fpu_types::*;
import opcodes::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
@ -35,7 +37,26 @@ module load_store_unit
input logic rst,
input gc_outputs_t gc,
input load_store_inputs_t ls_inputs,
input decode_packet_t decode_stage,
output logic unit_needed,
output logic [REGFILE_READ_PORTS-1:0] uses_rs,
output logic [2:0] fp_uses_rs,
output logic uses_rd,
output logic fp_uses_rd,
output logic decode_is_store,
input issue_packet_t issue_stage,
input logic issue_stage_ready,
input logic instruction_issued_with_rd,
input logic fp_instruction_issued_with_rd,
input logic rs2_inuse,
input logic fp_rs2_inuse,
input rs_addr_t issue_rs_addr [REGFILE_READ_PORTS],
input logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] issue_rd_wb_group,
input logic fp_issue_rd_wb_group,
input logic [31:0] rf [REGFILE_READ_PORTS],
input logic[FLEN-1:0] fp_rf[3],
unit_issue_interface.unit issue,
input logic dcache_on,
@ -55,17 +76,16 @@ module load_store_unit
local_memory_interface.master data_bram,
//Writeback-Store Interface
input wb_packet_t wb_snoop,
input wb_packet_t wb_packet [CONFIG.NUM_WB_GROUPS],
input fp_wb_packet_t fp_wb_packet [2],
//Retire release
input id_t retire_ids [RETIRE_PORTS],
input logic retire_port_valid [RETIRE_PORTS],
input retire_packet_t store_retire,
exception_interface.unit exception,
output load_store_status_t load_store_status,
unit_writeback_interface.unit wb,
output logic tr_load_conflict_delay
unit_writeback_interface.unit fp_wb
);
localparam NUM_SUB_UNITS = int'(CONFIG.INCLUDE_DLOCAL_MEM) + int'(CONFIG.INCLUDE_PERIPHERAL_BUS) + int'(CONFIG.INCLUDE_DCACHE);
@ -76,7 +96,7 @@ module load_store_unit
localparam DCACHE_ID = int'(CONFIG.INCLUDE_DLOCAL_MEM) + int'(CONFIG.INCLUDE_PERIPHERAL_BUS);
//Should be equal to pipeline depth of longest load/store subunit
localparam ATTRIBUTES_DEPTH = 2;//CONFIG.INCLUDE_DCACHE ? 2 : 1;
localparam ATTRIBUTES_DEPTH = 1;
//Subunit signals
addr_utils_interface #(CONFIG.DLOCAL_MEM_ADDR.L, CONFIG.DLOCAL_MEM_ADDR.H) dlocal_mem_addr_utils ();
@ -93,15 +113,19 @@ module load_store_unit
logic [NUM_SUB_UNITS-1:0] unit_ready;
logic [NUM_SUB_UNITS-1:0] unit_data_valid;
logic [NUM_SUB_UNITS-1:0] last_unit;
logic [NUM_SUB_UNITS-1:0] current_unit;
logic units_ready;
logic sub_unit_ready;
logic [NUM_SUB_UNITS_W-1:0] subunit_id;
logic unit_switch;
logic unit_switch_in_progress;
logic unit_switch_hold;
logic sel_load;
logic sub_unit_issue;
logic sub_unit_load_issue;
logic sub_unit_store_issue;
logic load_complete;
logic [31:0] virtual_address;
@ -110,46 +134,127 @@ module load_store_unit
logic [31:0] aligned_load_data;
logic [31:0] final_load_data;
logic unaligned_addr;
logic load_exception_complete;
logic exception_is_fp;
logic fence_hold;
typedef struct packed{
logic is_halfword;
logic is_signed;
logic [1:0] byte_addr;
logic [1:0] sign_sel;
logic [1:0] final_mux_sel;
id_t id;
logic [NUM_SUB_UNITS_W-1:0] subunit_id;
fp_ls_op_t fp_op;
} load_attributes_t;
load_attributes_t mem_attr, wb_attr;
load_attributes_t wb_attr;
common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode
logic [3:0] be;
//FIFOs
fifo_interface #(.DATA_WIDTH($bits(load_attributes_t))) load_attributes();
fifo_interface #(.DATA_TYPE(load_attributes_t)) load_attributes();
load_store_queue_interface lsq();
logic tr_possible_load_conflict_delay;
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//Decode
assign instruction = decode_stage.instruction;
assign unit_needed = instruction inside {LB, LH, LW, LBU, LHU, SB, SH, SW, FENCE} |
(CONFIG.INCLUDE_CBO & instruction inside {CBO_INVAL, CBO_CLEAN, CBO_FLUSH}) |
(CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FLW, SP_FSW, DP_FLD, DP_FSD});
always_comb begin
uses_rs = '0;
uses_rs[RS1] = instruction inside {LB, LH, LW, LBU, LHU, SB, SH, SW} |
(CONFIG.INCLUDE_CBO & instruction inside {CBO_INVAL, CBO_CLEAN, CBO_FLUSH}) |
(CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FLW, SP_FSW, DP_FLD, DP_FSD});
uses_rs[RS2] = CONFIG.INCLUDE_FORWARDING_TO_STORES ? 0 : instruction inside {SB, SH, SW};
uses_rd = instruction inside {LB, LH, LW, LBU, LHU};
fp_uses_rs = '0;
fp_uses_rs[RS2] = ~CONFIG.INCLUDE_FORWARDING_TO_STORES & CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FSW, DP_FSD};
fp_uses_rd = CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FLW, DP_FLD};
end
////////////////////////////////////////////////////
//LS specific decode support
typedef struct packed{
logic is_load;
logic is_store;
logic is_fence;
logic is_cbo;
logic is_fpu;
logic is_double;
logic [11:0] offset;
} ls_attr_t;
ls_attr_t decode_attr;
ls_attr_t issue_attr;
logic [11:0] load_offset;
logic [11:0] store_offset;
assign load_offset = instruction[31:20];
assign store_offset = {instruction[31:25], instruction[11:7]};
assign decode_attr = '{
is_load : instruction inside {LB, LH, LW, LBU, LHU} | CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FLW, DP_FLD},
is_store : instruction inside {SB, SH, SW} | CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FSW, DP_FSD},
is_fence : instruction inside {FENCE},
is_cbo : CONFIG.INCLUDE_CBO & instruction inside {CBO_INVAL, CBO_CLEAN, CBO_FLUSH},
is_fpu : CONFIG.INCLUDE_UNIT.FPU & instruction inside {SP_FLW, SP_FSW, DP_FLD, DP_FSD},
is_double : CONFIG.INCLUDE_UNIT.FPU & instruction inside {DP_FLD, DP_FSD},
offset : instruction[5] ? store_offset : ((CONFIG.INCLUDE_CBO & instruction[2]) ? '0 : load_offset)
};
assign decode_is_store = decode_attr.is_store | decode_attr.is_cbo;
always_ff @(posedge clk) begin
if (issue_stage_ready)
issue_attr <= decode_attr;
end
typedef struct packed{
id_t id;
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] wb_group;
logic fp_wb_group;
} rd_attributes_t;
rd_attributes_t rd_attributes;
//Store FP instructions in 32-64
lutram_1w_1r #(.DATA_TYPE(rd_attributes_t), .DEPTH(64))
rd_to_id_table (
.clk(clk),
.waddr({fp_instruction_issued_with_rd, issue_stage.rd_addr}),
.raddr({issue_attr.is_fpu, issue_rs_addr[RS2]}),
.ram_write(instruction_issued_with_rd | fp_instruction_issued_with_rd),
.new_ram_data('{
id : issue_stage.id,
wb_group : issue_rd_wb_group,
fp_wb_group : fp_issue_rd_wb_group
}),
.ram_data_out(rd_attributes)
);
////////////////////////////////////////////////////
//Alignment Exception
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_ls_exceptions
logic new_exception;
always_comb begin
case(ls_inputs.fn3)
LS_H_fn3, L_HU_fn3 : unaligned_addr = virtual_address[0];
LS_W_fn3 : unaligned_addr = |virtual_address[1:0];
default : unaligned_addr = 0;
endcase
if (issue_stage.fn3 == LS_H_fn3 | issue_stage.fn3 == L_HU_fn3)
unaligned_addr = virtual_address[0];
else if (issue_stage.fn3 == LS_W_fn3)
unaligned_addr = |virtual_address[1:0];
//Double-precision operations raise if not aligned on 8 byte boundary even though they are decomposed into 4 byte operations
//This is because the operation might straddle two memory regions
else if (CONFIG.INCLUDE_UNIT.FPU & issue_stage.fn3 == LS_D_fn3)
unaligned_addr = |virtual_address[2:0];
else
unaligned_addr = 0;
end
assign new_exception = unaligned_addr & issue.new_request & ~ls_inputs.fence;
assign new_exception = unaligned_addr & issue.new_request & ~issue_attr.is_fence;
always_ff @(posedge clk) begin
if (rst)
exception.valid <= 0;
@ -157,9 +262,16 @@ module load_store_unit
exception.valid <= (exception.valid & ~exception.ack) | new_exception;
end
always_ff @(posedge clk) begin
if (rst)
exception_is_fp <= 0;
else if (new_exception)
exception_is_fp <= CONFIG.INCLUDE_UNIT.FPU & issue_attr.is_fpu;
end
always_ff @(posedge clk) begin
if (new_exception & ~exception.valid) begin
exception.code <= ls_inputs.store ? STORE_AMO_ADDR_MISSALIGNED : LOAD_ADDR_MISSALIGNED;
exception.code <= issue_attr.is_store ? STORE_AMO_ADDR_MISSALIGNED : LOAD_ADDR_MISSALIGNED;
exception.tval <= virtual_address;
exception.id <= issue.id;
end
@ -178,17 +290,17 @@ module load_store_unit
assign load_store_status = '{
sq_empty : lsq.sq_empty,
no_released_stores_pending : lsq.no_released_stores_pending,
idle : lsq.empty & (~load_attributes.valid) & units_ready
idle : lsq.empty & (~load_attributes.valid) & (&unit_ready)
};
////////////////////////////////////////////////////
//TLB interface
assign virtual_address = ls_inputs.rs1 + 32'(signed'(ls_inputs.offset));
assign virtual_address = rf[RS1] + 32'(signed'(issue_attr.offset));
assign tlb.virtual_address = virtual_address;
assign tlb.new_request = tlb_on & issue.new_request;
assign tlb.execute = 0;
assign tlb.rnw = ls_inputs.load & ~ls_inputs.store;
assign tlb.rnw = issue_attr.is_load & ~issue_attr.is_store;
////////////////////////////////////////////////////
//Byte enable generation
@ -198,7 +310,7 @@ module load_store_unit
// SB: specific byte
always_comb begin
be = 0;
case(ls_inputs.fn3[1:0])
case(issue_stage.fn3[1:0])
LS_B_fn3[1:0] : be[virtual_address[1:0]] = 1;
LS_H_fn3[1:0] : begin
be[virtual_address[1:0]] = 1;
@ -212,44 +324,46 @@ module load_store_unit
//Load Store Queue
assign lsq.data_in = '{
addr : tlb_on ? tlb.physical_address : virtual_address,
fn3 : ls_inputs.fn3,
fn3 : issue_stage.fn3,
be : be,
data : ls_inputs.rs2,
load : ls_inputs.load,
store : ls_inputs.store,
data : rf[RS2],
load : issue_attr.is_load,
store : issue_attr.is_store,
cache_op : issue_attr.is_cbo,
id : issue.id,
forwarded_store : ls_inputs.forwarded_store,
id_needed : ls_inputs.store_forward_id
id_needed : rd_attributes.id,
fp : issue_attr.is_fpu,
double : issue_attr.is_double,
fp_data : fp_rf[RS2]
};
assign lsq.potential_push = issue.possible_issue;
assign lsq.push = issue.new_request & ~unaligned_addr & (~tlb_on | tlb.done) & ~ls_inputs.fence;
assign lsq.push = issue.new_request & ~unaligned_addr & (~tlb_on | tlb.done) & ~issue_attr.is_fence;
load_store_queue # (.CONFIG(CONFIG)) lsq_block (
.clk (clk),
.rst (rst),
.gc (gc),
.lsq (lsq),
.wb_snoop (wb_snoop),
.retire_ids (retire_ids),
.retire_port_valid (retire_port_valid),
.tr_possible_load_conflict_delay (tr_possible_load_conflict_delay)
.store_forward_wb_group (rs2_inuse ? rd_attributes.wb_group : '0),
.fp_store_forward_wb_group ({fp_rs2_inuse & rd_attributes.fp_wb_group, fp_rs2_inuse & ~rd_attributes.fp_wb_group}),
.wb_packet (wb_packet),
.fp_wb_packet (fp_wb_packet),
.store_retire (store_retire)
);
assign shared_inputs = lsq.data_out;
assign lsq.pop = sub_unit_issue;
assign shared_inputs = sel_load ? lsq.load_data_out : lsq.store_data_out;
assign lsq.load_pop = sub_unit_load_issue;
assign lsq.store_pop = sub_unit_store_issue;
////////////////////////////////////////////////////
//Unit tracking
assign current_unit = sub_unit_address_match;
always_ff @ (posedge clk) begin
if (load_attributes.push)
last_unit <= sub_unit_address_match;
end
//When switching units, ensure no outstanding loads so that there can be no timing collisions with results
assign unit_switch = (current_unit != last_unit) & load_attributes.valid;
assign unit_switch = lsq.load_valid & (sub_unit_address_match != last_unit) & load_attributes.valid;
always_ff @ (posedge clk) begin
unit_switch_in_progress <= (unit_switch_in_progress | unit_switch) & ~load_attributes.valid;
end
@ -257,23 +371,27 @@ module load_store_unit
////////////////////////////////////////////////////
//Primary Control Signals
assign units_ready = &unit_ready & (~unit_switch_hold);
assign sel_load = lsq.load_valid;
assign sub_unit_ready = unit_ready[subunit_id] & (~unit_switch_hold);
assign load_complete = |unit_data_valid;
assign issue.ready = (~tlb_on | tlb.ready) & (~lsq.full) & (~fence_hold) & (~exception.valid);
assign sub_unit_issue = lsq.valid & units_ready;
assign sub_unit_load_issue = sel_load & lsq.load_valid & sub_unit_ready & sub_unit_address_match[subunit_id];
assign sub_unit_store_issue = (lsq.store_valid & ~sel_load) & sub_unit_ready & sub_unit_address_match[subunit_id];
assign sub_unit_issue = sub_unit_load_issue | sub_unit_store_issue;
always_ff @ (posedge clk) begin
if (rst)
fence_hold <= 0;
else
fence_hold <= (fence_hold & ~load_store_status.idle) | (issue.new_request & ls_inputs.fence);
fence_hold <= (fence_hold & ~load_store_status.idle) | (issue.new_request & issue_attr.is_fence);
end
////////////////////////////////////////////////////
//Load attributes FIFO
logic [1:0] final_mux_sel;
logic [NUM_SUB_UNITS_W-1:0] subunit_id;
one_hot_to_integer #(NUM_SUB_UNITS)
sub_unit_select (
@ -282,27 +400,26 @@ module load_store_unit
);
always_comb begin
case(shared_inputs.fn3)
case(lsq.load_data_out.fn3)
LS_B_fn3, L_BU_fn3 : final_mux_sel = 0;
LS_H_fn3, L_HU_fn3 : final_mux_sel = 1;
default : final_mux_sel = 2; //LS_W_fn3
endcase
end
assign mem_attr = '{
is_halfword : shared_inputs.fn3[0],
is_signed : ~|shared_inputs.fn3[2:1],
byte_addr : shared_inputs.addr[1:0],
assign load_attributes.data_in = '{
is_signed : ~|lsq.load_data_out.fn3[2:1],
byte_addr : lsq.load_data_out.addr[1:0],
sign_sel : lsq.load_data_out.addr[1:0] | {1'b0, lsq.load_data_out.fn3[0]},//halfword
final_mux_sel : final_mux_sel,
id : shared_inputs.id,
subunit_id : subunit_id
id : lsq.load_data_out.id,
subunit_id : subunit_id,
fp_op : lsq.load_data_out.fp_op
};
assign load_attributes.data_in = mem_attr;
assign load_attributes.push = sub_unit_issue & shared_inputs.load;
assign load_attributes.push = sub_unit_load_issue;
assign load_attributes.potential_push = load_attributes.push;
cva5_fifo #(.DATA_WIDTH($bits(load_attributes_t)), .FIFO_DEPTH(ATTRIBUTES_DEPTH))
cva5_fifo #(.DATA_TYPE(load_attributes_t), .FIFO_DEPTH(ATTRIBUTES_DEPTH))
attributes_fifo (
.clk (clk),
.rst (rst),
@ -367,9 +484,20 @@ module load_store_unit
endgenerate
generate if (CONFIG.INCLUDE_DCACHE) begin : gen_ls_dcache
logic uncacheable;
logic load_ready;
logic store_ready;
logic uncacheable_load;
logic uncacheable_store;
logic dcache_load_request;
logic dcache_store_request;
assign sub_unit_address_match[DCACHE_ID] = dcache_addr_utils.address_range_check(shared_inputs.addr);
assign uncacheable = uncacheable_utils.address_range_check(shared_inputs.addr);
assign uncacheable_load = CONFIG.DCACHE.USE_NON_CACHEABLE & uncacheable_utils.address_range_check(shared_inputs.addr);
assign uncacheable_store = CONFIG.DCACHE.USE_NON_CACHEABLE & uncacheable_utils.address_range_check(shared_inputs.addr);
assign dcache_load_request = sub_unit_load_issue & sub_unit_address_match[DCACHE_ID];
assign dcache_store_request = sub_unit_store_issue & sub_unit_address_match[DCACHE_ID];
dcache # (.CONFIG(CONFIG))
data_cache (
@ -381,8 +509,16 @@ module load_store_unit
.sc_complete (sc_complete),
.sc_success (sc_success),
.clear_reservation (clear_reservation),
.amo (ls_inputs.amo),
.uncacheable (uncacheable),
.amo (),
.uncacheable_load (uncacheable_load),
.uncacheable_store (uncacheable_store),
.is_load (sel_load),
.load_ready (load_ready),
.store_ready (store_ready),
.load_request (dcache_load_request),
.store_request (dcache_store_request),
.ls_load (lsq.load_data_out),
.ls_store (lsq.store_data_out),
.ls (sub_unit[DCACHE_ID])
);
end
@ -391,7 +527,6 @@ module load_store_unit
////////////////////////////////////////////////////
//Output Muxing
logic sign_bit_data [4];
logic [1:0] sign_bit_sel;
logic sign_bit;
assign unit_muxed_load_data = unit_data_array[wb_attr.subunit_id];
@ -402,8 +537,7 @@ module load_store_unit
assign aligned_load_data[7:0] = unit_muxed_load_data[wb_attr.byte_addr*8 +: 8];
assign sign_bit_data = '{unit_muxed_load_data[7], unit_muxed_load_data[15], unit_muxed_load_data[23], unit_muxed_load_data[31]};
assign sign_bit_sel = wb_attr.byte_addr | {1'b0, wb_attr.is_halfword};
assign sign_bit = wb_attr.is_signed & sign_bit_data[sign_bit_sel];
assign sign_bit = wb_attr.is_signed & sign_bit_data[wb_attr.sign_sel];
//Sign extending
always_comb begin
@ -414,12 +548,47 @@ module load_store_unit
endcase
end
//FP buffering first load result
logic[FLEN-1:0] fp_result;
generate if (CONFIG.INCLUDE_UNIT.FPU && FLEN > 32) begin : gen_fp_load_buffering
logic[31:0] saved_msb;
always_ff @(posedge clk) begin
if (rst)
saved_msb <= '1;
else begin
if (load_complete & wb_attr.fp_op == DOUBLE_HOLD)
saved_msb <= unit_muxed_load_data;
else if (load_complete) //Boxing
saved_msb <= '1;
end
end
always_comb begin
fp_result = '1;
fp_result[FLEN-1-:32] = saved_msb;
if (wb_attr.fp_op == SINGLE_DONE)
fp_result[FLEN_F-1:0] = unit_muxed_load_data[31-:FLEN_F];
else
fp_result[FLEN-33:0] = unit_muxed_load_data[31-:FLEN-32];
end
end else if (CONFIG.INCLUDE_UNIT.FPU) begin : gen_fpu_no_buffering
//No buffering ever required - all results are final
assign fp_result = wb_attr.fp_op == SINGLE_DONE ? {{(FLEN-FLEN_F){1'b1}}, unit_muxed_load_data[31-:FLEN_F]} : unit_muxed_load_data[31-:FLEN];
end
else begin : gen_no_fpu
assign fp_result = 'x;
end endgenerate
////////////////////////////////////////////////////
//Output bank
assign wb.rd = final_load_data;
assign wb.done = load_complete | load_exception_complete;
assign wb.done = (load_complete & (~CONFIG.INCLUDE_UNIT.FPU | wb_attr.fp_op == INT_DONE)) | (load_exception_complete & ~exception_is_fp);
//TODO: exceptions seemingly clobber load data if it appears on the same cycle
assign wb.id = load_exception_complete ? exception.id : wb_attr.id;
assign fp_wb.rd = fp_result;
assign fp_wb.done = (load_complete & (wb_attr.fp_op == SINGLE_DONE | wb_attr.fp_op == DOUBLE_DONE)) | (load_exception_complete & exception_is_fp);
assign fp_wb.id = load_exception_complete ? exception.id : wb_attr.id;
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
@ -430,17 +599,5 @@ module load_store_unit
assert property (@(posedge clk) disable iff (rst) load_complete |-> (load_attributes.valid && unit_data_valid[wb_attr.subunit_id]))
else $error("Spurious load complete detected!");
// `ifdef ENABLE_SIMULATION_ASSERTIONS
// invalid_ls_address_assertion:
// assert property (@(posedge clk) disable iff (rst) (sub_unit_issue & ~ls_inputs.fence) |-> |sub_unit_address_match)
// else $error("invalid L/S address");
// `endif
////////////////////////////////////////////////////
//Trace Interface
generate if (ENABLE_TRACE_INTERFACE) begin : gen_ls_trace
assign tr_load_conflict_delay = tr_possible_load_conflict_delay & units_ready;
end
endgenerate
endmodule

View file

@ -0,0 +1,332 @@
/*
* Copyright © 2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Store queue
//
//Stores are pushed through the sq interface and popped in order (sq_oldest).
//Per-store attributes (address, byte enables, cache-op/FP flags) are written
//on push. The store data itself is only written into the final SQ-indexed
//table when the store retires (store_retire.valid); at that point it is
//selected from either the data captured at push or from data snooped off
//the (registered) integer/FP writeback ports.
//
//Ports:
//  sq                        - push/pop/data interface to the load-store queue
//  store_forward_wb_group    - writeback group that supplies the integer store data
//                              (index 0 selects the data supplied on push)
//  fp_store_forward_wb_group - same selection for FP store data
//                              (0: data supplied on push, 1/2: FP writeback port 0/1)
//  addr_hash                 - address hash of the request being added; recorded for
//                              stores and compared against all valid entries
//  sq_index / sq_oldest      - write (newest) and read (oldest) indices
//  potential_store_conflict  - addr_hash matches a valid store not popped this cycle
//  wb_packet / fp_wb_packet  - writeback ports snooped for forwarded store data
//  store_retire              - retiring store; triggers the final data write
module store_queue

    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;

    # (
        parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
    )
    (
        input logic clk,
        input logic rst,

        store_queue_interface.queue sq,
        input logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] store_forward_wb_group,
        input logic [1:0] fp_store_forward_wb_group,

        //Address hash (shared by loads and stores)
        input addr_hash_t addr_hash,
        //hash check on adding a load to the queue
        output logic [$clog2(CONFIG.SQ_DEPTH)-1:0] sq_index,
        output logic [$clog2(CONFIG.SQ_DEPTH)-1:0] sq_oldest,
        output logic potential_store_conflict,

        //Writeback snooping
        input wb_packet_t wb_packet [CONFIG.NUM_WB_GROUPS],
        input fp_wb_packet_t fp_wb_packet [2],

        //Retire
        input retire_packet_t store_retire
    );

    //Width of the final data table: wide enough for FP data when FLEN exceeds 32
    localparam FINAL_TABLE_WIDTH = CONFIG.INCLUDE_UNIT.FPU && FLEN > 32 ? FLEN : 32;
    localparam LOG2_SQ_DEPTH = $clog2(CONFIG.SQ_DEPTH);
    //Writeback group 0 is not snooped; its slot holds the data supplied on push
    localparam NUM_OF_FORWARDING_PORTS = CONFIG.NUM_WB_GROUPS - 1;
    typedef logic [LOG2_SQ_DEPTH-1:0] sq_index_t;

    //Per-store information recorded on push and looked up by ID at retire
    typedef struct packed {
        id_t id_needed;
        logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] wb_group;
        logic [1:0] fp_wb_group;
        logic fp;
        sq_index_t sq_index;
    } retire_table_t;
    retire_table_t retire_table_out;

    //Registered copies of the writeback ports
    wb_packet_t wb_snoop [CONFIG.NUM_WB_GROUPS];
    fp_wb_packet_t fp_wb_snoop [2];

    //Register-based memory blocks
    logic [CONFIG.SQ_DEPTH-1:0] valid;
    logic [CONFIG.SQ_DEPTH-1:0] valid_next;
    addr_hash_t [CONFIG.SQ_DEPTH-1:0] hashes;

    //LUTRAM-based memory blocks
    sq_entry_t output_entry;
    sq_entry_t output_entry_r;
    logic [1:0] retire_alignment;

    sq_index_t sq_index_next;
    sq_index_t sq_oldest_next;
    logic [LOG2_SQ_DEPTH:0] released_count;

    logic [CONFIG.SQ_DEPTH-1:0] new_request_one_hot;
    logic [CONFIG.SQ_DEPTH-1:0] issued_one_hot;

    logic [31:0] data_pre_alignment;
    logic [31:0] marshalled_data;
    logic [FLEN-1:0] fp_marshalled_data;
    logic [FINAL_TABLE_WIDTH-1:0] sq_data_in;
    logic [FINAL_TABLE_WIDTH-1:0] sq_data_out;
    ////////////////////////////////////////////////////
    //Implementation

    //Store Queue indices
    //sq_index advances on push, sq_oldest advances on pop
    assign sq_index_next = sq_index + LOG2_SQ_DEPTH'(sq.push);
    assign sq_oldest_next = sq_oldest + LOG2_SQ_DEPTH'(sq.pop);

    always_ff @ (posedge clk) begin
        if (rst) begin
            sq_index <= 0;
            sq_oldest <= 0;
        end else begin
            sq_index <= sq_index_next;
            sq_oldest <= sq_oldest_next;
        end
    end

    //Per-entry valid tracking: set on push, cleared on pop
    assign new_request_one_hot = CONFIG.SQ_DEPTH'(sq.push) << sq_index;
    assign issued_one_hot = CONFIG.SQ_DEPTH'(sq.pop) << sq_oldest;
    assign valid_next = (valid | new_request_one_hot) & ~issued_one_hot;

    always_ff @ (posedge clk) begin
        if (rst) begin
            valid <= '0;
            sq.full <= 0;
        end else begin
            valid <= valid_next;
            sq.full <= &valid_next;
        end
    end

    assign sq.empty = ~|valid;

    //SQ attributes and issue data
    //Read with the next-cycle oldest index; the result is registered below
    //The data fields are written as zero here, the actual store data lives in sq_data_lutram
    lutram_1w_1r #(.DATA_TYPE(sq_entry_t), .DEPTH(CONFIG.SQ_DEPTH))
    store_attr (
        .clk(clk),
        .waddr(sq_index),
        .raddr(sq_oldest_next),
        .ram_write(sq.push),
        .new_ram_data('{
            addr : sq.data_in.addr,
            be : sq.data_in.be,
            cache_op : sq.data_in.cache_op,
            data : '0,
            fp : sq.data_in.fp,
            double : sq.data_in.double,
            fp_data : '0
        }),
        .ram_data_out(output_entry)
    );
    always_ff @ (posedge clk) begin
        output_entry_r <= output_entry;
    end

    //Lower address bits, saved by store ID and read by the retiring ID for data alignment
    lutram_1w_1r #(.DATA_TYPE(logic[1:0]), .DEPTH(MAX_IDS))
    store_alignment (
        .clk(clk),
        .waddr(sq.data_in.id),
        .raddr(store_retire.id),
        .ram_write(sq.push),
        .new_ram_data(sq.data_in.addr[1:0]),
        .ram_data_out(retire_alignment)
    );

    //Compare store addr-hashes against new load addr-hash
    //An entry being popped this cycle is excluded from the comparison
    always_comb begin
        potential_store_conflict = 0;
        for (int i = 0; i < CONFIG.SQ_DEPTH; i++)
            potential_store_conflict |= {(valid[i] & ~issued_one_hot[i]), addr_hash} == {1'b1, hashes[i]};
    end

    ////////////////////////////////////////////////////
    //Register-based storage
    //Address hashes
    always_ff @ (posedge clk) begin
        for (int i = 0; i < CONFIG.SQ_DEPTH; i++) begin
            if (new_request_one_hot[i])
                hashes[i] <= addr_hash;
        end
    end

    ////////////////////////////////////////////////////
    //Release Handling
    //Counts stores that have retired but have not yet been popped
    always_ff @ (posedge clk) begin
        if (rst)
            released_count <= 0;
        else
            released_count <= released_count + (LOG2_SQ_DEPTH + 1)'(store_retire.valid) - (LOG2_SQ_DEPTH + 1)'(sq.pop);
    end

    assign sq.no_released_stores_pending = ~|released_count;

    ////////////////////////////////////////////////////
    //Forwarding and Store Data
    //Forwarding is only needed from multi-cycle writeback ports
    //Currently this is the LS port [1] and the MUL/DIV/CSR port [2]
    always_ff @ (posedge clk) begin
        wb_snoop <= wb_packet;
        fp_wb_snoop <= fp_wb_packet;
    end

    //Written by store ID on push, read by the retiring store's ID
    lutram_1w_1r #(.DATA_TYPE(retire_table_t), .DEPTH(MAX_IDS))
    store_retire_table_lutram (
        .clk(clk),
        .waddr(sq.data_in.id),
        .raddr(store_retire.id),
        .ram_write(sq.push),
        .new_ram_data('{
            id_needed : sq.data_in.id_needed,
            wb_group : store_forward_wb_group,
            fp_wb_group : fp_store_forward_wb_group,
            fp : sq.data_in.fp,
            sq_index : sq_index
        }),
        .ram_data_out(retire_table_out)
    );

    //Index 0: data supplied on push; higher indices: snooped writeback ports
    logic [31:0] wb_data [NUM_OF_FORWARDING_PORTS+1];
    logic [FLEN-1:0] fp_wb_data [3];

    //Data issued with the store can be stored by store-id
    lutram_1w_1r #(.DATA_TYPE(logic[31:0]), .DEPTH(MAX_IDS))
    non_forwarded_port (
        .clk(clk),
        .waddr(sq.data_in.id),
        .raddr(store_retire.id),
        .ram_write(sq.push),
        .new_ram_data(sq.data_in.data),
        .ram_data_out(wb_data[0])
    );

    //Data from wb ports is stored by ID and then accessed by store-id to store-id-needed translation
    generate
        for (genvar i = 0; i < NUM_OF_FORWARDING_PORTS; i++) begin : lutrams
            lutram_1w_1r #(.DATA_TYPE(logic[31:0]), .DEPTH(MAX_IDS))
            writeback_port (
                .clk(clk),
                .waddr(wb_snoop[i+1].id),
                .raddr(retire_table_out.id_needed),
                .ram_write(wb_snoop[i+1].valid),
                .new_ram_data(wb_snoop[i+1].data),
                .ram_data_out(wb_data[i+1])
            );
        end
    endgenerate

    //All FP data storage is only instantiated when the FPU is included
    generate
        if (CONFIG.INCLUDE_UNIT.FPU) begin : gen_fp_issue_data_storage
            //FP data issued with the store and data from the FP writeback ports is saved
            lutram_1w_1r #(.DATA_TYPE(logic[FLEN-1:0]), .DEPTH(MAX_IDS))
            fp_non_forwarded_port (
                .clk(clk),
                .waddr(sq.data_in.id),
                .raddr(store_retire.id),
                .ram_write(sq.push),
                .new_ram_data(sq.data_in.fp_data),
                .ram_data_out(fp_wb_data[0])
            );

            for (genvar i = 0; i < 2; i++) begin : gen_fp_wb_data_storage
                lutram_1w_1r #(.DATA_TYPE(logic[FLEN-1:0]), .DEPTH(MAX_IDS))
                writeback_port (
                    .clk(clk),
                    .waddr(fp_wb_snoop[i].id),
                    .raddr(retire_table_out.id_needed),
                    .ram_write(fp_wb_snoop[i].valid),
                    .new_ram_data(fp_wb_snoop[i].data),
                    .ram_data_out(fp_wb_data[i+1])
                );
            end
        end else begin : gen_no_fp_data_storage
            //Without an FPU the FP data path is unused; tie it off so nothing is left undriven
            assign fp_wb_data = '{default: '0};
        end
    endgenerate

    ////////////////////////////////////////////////////
    //Data Marshalling
    assign fp_marshalled_data = fp_wb_data[retire_table_out.fp_wb_group];
    assign data_pre_alignment = wb_data[retire_table_out.wb_group];

    always_comb begin
        //Input: ABCD
        //Assuming aligned requests,
        //Possible byte selections: (A/C/D, B/D, C/D, D)
        marshalled_data[7:0] = data_pre_alignment[7:0];
        marshalled_data[15:8] = (retire_alignment[1:0] == 2'b01) ? data_pre_alignment[7:0] : data_pre_alignment[15:8];
        marshalled_data[23:16] = (retire_alignment[1:0] == 2'b10) ? data_pre_alignment[7:0] : data_pre_alignment[23:16];
        case(retire_alignment[1:0])
            2'b10 : marshalled_data[31:24] = data_pre_alignment[15:8];
            2'b11 : marshalled_data[31:24] = data_pre_alignment[7:0];
            default : marshalled_data[31:24] = data_pre_alignment[31:24];
        endcase
    end

    //Final storage table for the store queue (includes FP data)
    //SQ-index addressed
    generate
        if (CONFIG.INCLUDE_UNIT.FPU && FLEN > 32) begin : gen_upper_always_fp
            //Bits above 32 can only ever hold FP data
            assign sq_data_in[FLEN-1:32] = fp_marshalled_data[FLEN-1:32];
            assign sq_data_in[31:0] = retire_table_out.fp ? fp_marshalled_data[31:0] : marshalled_data[31:0];
        end else if (CONFIG.INCLUDE_UNIT.FPU && FLEN == 32) begin : gen_no_upper
            assign sq_data_in = retire_table_out.fp ? fp_marshalled_data : marshalled_data;
        end else if (CONFIG.INCLUDE_UNIT.FPU && FLEN < 32) begin : gen_upper_always_int
            //Bits at and above FLEN can only ever hold integer data
            assign sq_data_in[31:FLEN] = marshalled_data[31:FLEN];
            assign sq_data_in[FLEN-1:0] = retire_table_out.fp ? fp_marshalled_data[FLEN-1:0] : marshalled_data[FLEN-1:0];
        end else begin : gen_no_fpu
            assign sq_data_in = marshalled_data;
        end
    endgenerate

    //Written at retire using the SQ index recorded on push, read at the oldest index
    lutram_1w_1r #(.DATA_TYPE(logic[FINAL_TABLE_WIDTH-1:0]), .DEPTH(CONFIG.SQ_DEPTH))
    sq_data_lutram (
        .clk(clk),
        .waddr(retire_table_out.sq_index),
        .raddr(sq_oldest),
        .ram_write(store_retire.valid),
        .new_ram_data(sq_data_in),
        .ram_data_out(sq_data_out)
    );

    //A store is only presented as valid once it has been released (retired)
    assign sq.valid = |released_count;
    assign sq.data_out = '{
        addr : output_entry_r.addr,
        be : output_entry_r.be,
        cache_op : output_entry_r.cache_op,
        data : sq_data_out[31:0],
        fp : output_entry_r.fp,
        double : output_entry_r.double,
        fp_data : FLEN'(sq_data_out[(CONFIG.INCLUDE_UNIT.FPU ? FLEN : 32)-1:0])
    };

    ////////////////////////////////////////////////////
    //End of Implementation
    ////////////////////////////////////////////////////

    ////////////////////////////////////////////////////
    //Assertions
    sq_overflow_assertion:
        assert property (@(posedge clk) disable iff (rst) sq.push |-> (~sq.full | sq.pop)) else $error("sq overflow");
    fifo_underflow_assertion:
        assert property (@(posedge clk) disable iff (rst) sq.pop |-> sq.valid) else $error("sq underflow");

endmodule

View file

@ -25,22 +25,32 @@ module mul_unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
import opcodes::*;
(
input logic clk,
input logic rst,
input mul_inputs_t mul_inputs,
input decode_packet_t decode_stage,
output logic unit_needed,
output logic [REGFILE_READ_PORTS-1:0] uses_rs,
output logic uses_rd,
input issue_packet_t issue_stage,
input logic issue_stage_ready,
input logic [31:0] rf [REGFILE_READ_PORTS],
unit_issue_interface.unit issue,
unit_writeback_interface.unit wb
);
common_instruction_t instruction;//rs1_addr, rs2_addr, fn3, fn7, rd_addr, upper/lower opcode
logic signed [63:0] result;
logic mulh [2];
logic valid [2];
id_t id [2];
logic rs1_is_signed, rs2_is_signed;
logic rs1_is_signed, rs2_is_signed, is_mulhx;
logic signed [32:0] rs1_ext, rs2_ext;
logic signed [32:0] rs1_r, rs2_r;
@ -48,11 +58,29 @@ module mul_unit
logic stage2_advance;
////////////////////////////////////////////////////
//Implementation
assign rs1_is_signed = mul_inputs.op[1:0] inside {MULH_fn3[1:0], MULHSU_fn3[1:0]};//MUL doesn't matter
assign rs2_is_signed = mul_inputs.op[1:0] inside {MUL_fn3[1:0], MULH_fn3[1:0]};//MUL doesn't matter
assign rs1_ext = signed'({mul_inputs.rs1[31] & rs1_is_signed, mul_inputs.rs1});
assign rs2_ext = signed'({mul_inputs.rs2[31] & rs2_is_signed, mul_inputs.rs2});
////////////////////////////////////////////////////
//Decode
assign unit_needed = decode_stage.instruction inside {MUL, MULH, MULHSU, MULHU};
always_comb begin
uses_rs = '0;
uses_rs[RS1] = unit_needed;
uses_rs[RS2] = unit_needed;
uses_rd = unit_needed;
end
assign instruction = decode_stage.instruction;
always_ff @(posedge clk) begin
if (issue_stage_ready) begin
rs1_is_signed <= instruction.fn3[1:0] inside {MULH_fn3[1:0], MULHSU_fn3[1:0]};
rs2_is_signed <= instruction.fn3[1:0] inside {MULH_fn3[1:0]};
is_mulhx <= instruction.fn3[1:0] inside {MULH_fn3[1:0], MULHSU_fn3[1:0], MULHU_fn3[1:0]};
end
end
////////////////////////////////////////////////////
//Issue
assign rs1_ext = signed'({rs1_is_signed & rf[RS1][31], rf[RS1]});
assign rs2_ext = signed'({rs2_is_signed & rf[RS2][31], rf[RS2]});
//Pipeline advancement control signals
assign issue.ready = stage1_advance;
@ -73,7 +101,7 @@ module mul_unit
//Attribute Pipeline
always_ff @ (posedge clk) begin
if (stage1_advance) begin
mulh[0] <= (mul_inputs.op[1:0] != MUL_fn3[1:0]);
mulh[0] <= is_mulhx;
id[0] <= issue.id;
end
if (stage2_advance) begin

View file

@ -64,10 +64,7 @@ module branch_predictor
localparam BRANCH_ADDR_W = $clog2(CONFIG.BP.ENTRIES);
localparam BTAG_W = get_memory_width() - BRANCH_ADDR_W - 2;
function logic[BTAG_W-1:0] get_tag (input logic[31:0] pc);
return pc[BRANCH_ADDR_W+2 +: BTAG_W];
endfunction
cache_functions_interface #(.TAG_W(BTAG_W), .LINE_W(BRANCH_ADDR_W), .SUB_LINE_W(0)) addr_utils();
typedef struct packed {
logic valid;
@ -86,8 +83,6 @@ module branch_predictor
logic branch_prediction_used;
logic [CONFIG.BP.WAYS-1:0] branch_predictor_update_way;
} branch_metadata_t;
(* ramstyle = "MLAB, no_rw_check" *) logic [$bits(branch_metadata_t)-1:0] branch_metadata_table [MAX_IDS];
branch_metadata_t branch_metadata_if;
branch_metadata_t branch_metadata_ex;
logic branch_predictor_direction_changed;
@ -101,43 +96,52 @@ module branch_predictor
logic [$clog2(CONFIG.BP.WAYS > 1 ? CONFIG.BP.WAYS : 2)-1:0] hit_way;
logic tag_match;
logic use_predicted_pc;
addr_utils_interface #(CONFIG.IBUS_ADDR.L, CONFIG.IBUS_ADDR.H) ibus_addr_utils ();
/////////////////////////////////////////
genvar i;
generate if (CONFIG.INCLUDE_BRANCH_PREDICTOR)
for (i=0; i<CONFIG.BP.WAYS; i++) begin : gen_branch_tag_banks
branch_predictor_ram #(.C_DATA_WIDTH($bits(branch_table_entry_t)), .C_DEPTH(CONFIG.BP.ENTRIES))
dual_port_bram #(.WIDTH($bits(branch_table_entry_t)), .LINES(CONFIG.BP.ENTRIES))
tag_bank (
.clk (clk),
.rst (rst),
.write_addr (br_results.pc[2 +: BRANCH_ADDR_W]),
.write_en (tag_update_way[i]),
.write_data (ex_entry),
.read_addr (bp.next_pc[2 +: BRANCH_ADDR_W]),
.read_en (bp.new_mem_request),
.read_data (if_entry[i]));
.clk (clk),
.en_a (tag_update_way[i]),
.wen_a (tag_update_way[i]),
.addr_a (addr_utils.getHashedLineAddr(br_results.pc, i)),
.data_in_a (ex_entry),
.data_out_a (),
.en_b (bp.new_mem_request),
.wen_b (0),
.addr_b (addr_utils.getHashedLineAddr(bp.next_pc, i)),
.data_in_b ('0),
.data_out_b (if_entry[i]));
end
endgenerate
generate if (CONFIG.INCLUDE_BRANCH_PREDICTOR)
for (i=0; i<CONFIG.BP.WAYS; i++) begin : gen_branch_table_banks
branch_predictor_ram #(.C_DATA_WIDTH(32), .C_DEPTH(CONFIG.BP.ENTRIES))
dual_port_bram #(.WIDTH(32), .LINES(CONFIG.BP.ENTRIES))
addr_table (
.clk (clk),
.rst (rst),
.write_addr(br_results.pc[2 +: BRANCH_ADDR_W]),
.write_en(target_update_way[i]),
.write_data(br_results.target_pc),
.read_addr(bp.next_pc[2 +: BRANCH_ADDR_W]),
.read_en(bp.new_mem_request),
.read_data(predicted_pc[i])
.clk (clk),
.en_a (target_update_way[i]),
.wen_a (target_update_way[i]),
.addr_a (addr_utils.getHashedLineAddr(br_results.pc, i)),
.data_in_a (br_results.target_pc),
.data_out_a (),
.en_b (bp.new_mem_request),
.wen_b (0),
.addr_b (addr_utils.getHashedLineAddr(bp.next_pc, i)),
.data_in_b ('0),
.data_out_b (predicted_pc[i])
);
end
endgenerate
generate if (CONFIG.INCLUDE_BRANCH_PREDICTOR)
for (i=0; i<CONFIG.BP.WAYS; i++) begin : gen_branch_hit_detection
assign tag_matches[i] = ({if_entry[i].valid, if_entry[i].tag} == {1'b1, get_tag(bp.if_pc)});
assign tag_matches[i] = ({if_entry[i].valid, if_entry[i].tag} == {1'b1, addr_utils.getTag(bp.if_pc)});
end
endgenerate
@ -172,20 +176,25 @@ module branch_predictor
.en (1'b1),
.one_hot (replacement_way)
);
assign branch_metadata_if.branch_predictor_metadata = if_entry[hit_way].metadata;
assign branch_metadata_if.branch_prediction_used = use_predicted_pc;
assign branch_metadata_if.branch_predictor_update_way = tag_match ? tag_matches : replacement_way;
always_ff @ (posedge clk) begin
if (bp.pc_id_assigned)
branch_metadata_table[bp.pc_id] <= branch_metadata_if;
end
assign branch_metadata_ex = branch_metadata_table[br_results.id];
lutram_1w_1r #(.DATA_TYPE(branch_metadata_t), .DEPTH(MAX_IDS))
branch_metadata_table (
.clk(clk),
.waddr(bp.pc_id),
.raddr(br_results.id),
.ram_write(bp.pc_id_assigned),
.new_ram_data('{
branch_predictor_metadata : if_entry[hit_way].metadata,
branch_prediction_used : use_predicted_pc,
branch_predictor_update_way : tag_match ? tag_matches : replacement_way
}),
.ram_data_out(branch_metadata_ex)
);
////////////////////////////////////////////////////
//Execution stage update
assign ex_entry.valid = 1;
assign ex_entry.tag = get_tag(br_results.pc);
assign ex_entry.tag = addr_utils.getTag(br_results.pc);
assign ex_entry.is_branch = br_results.is_branch;
assign ex_entry.is_return = br_results.is_return;
assign ex_entry.is_call = br_results.is_call;

View file

@ -36,7 +36,6 @@ module fetch
input logic branch_flush,
input gc_outputs_t gc,
input logic tlb_on,
input logic exception,
//ID Support
@ -60,10 +59,7 @@ module fetch
wishbone_interface.master iwishbone,
input logic icache_on,
l1_arbiter_request_interface.master l1_request,
l1_arbiter_return_interface.master l1_response,
//Trace Interface
output logic tr_early_branch_correction
l1_arbiter_return_interface.master l1_response
);
localparam NUM_SUB_UNITS = int'(CONFIG.INCLUDE_ILOCAL_MEM) + int'(CONFIG.INCLUDE_ICACHE) + int'(CONFIG.INCLUDE_IBUS);
@ -97,7 +93,6 @@ module fetch
logic mmu_fault;
logic [NUM_SUB_UNITS_W-1:0] subunit_id;
} fetch_attributes_t;
fetch_attributes_t fetch_attr_next;
fetch_attributes_t fetch_attr;
logic [MAX_OUTSTANDING_REQUESTS_W:0] inflight_count;
@ -113,15 +108,13 @@ module fetch
logic [31:0] pc;
logic flush_or_rst;
fifo_interface #(.DATA_WIDTH($bits(fetch_attributes_t))) fetch_attr_fifo();
fifo_interface #(.DATA_TYPE(fetch_attributes_t)) fetch_attr_fifo();
logic update_pc;
logic new_mem_request;
logic exception_pending;
logic internal_fetch_complete;
logic [31:0] translated_address;
genvar i;
////////////////////////////////////////////////////
//Implementation
@ -164,50 +157,55 @@ module fetch
assign bp.pc_id = pc_id;
assign bp.pc_id_assigned = pc_id_assigned;
assign ras.pop = bp.use_prediction & bp.is_return & ~branch_flush & ~gc.pc_override & new_mem_request & (~early_branch_flush);
assign ras.push = bp.use_prediction & bp.is_call & ~branch_flush & ~gc.pc_override & new_mem_request & (~early_branch_flush);
////////////////////////////////////////////////////
//RAS support
//ras_update_permitted is the common qualifier for every RAS update below: the branch
//predictor's prediction is being used for a fetch request issued this cycle
//(new_mem_request), and none of branch_flush, gc.pc_override or early_branch_flush is asserted.
logic ras_update_permitted;
assign ras_update_permitted = bp.use_prediction & new_mem_request & ~(branch_flush | gc.pc_override | early_branch_flush);
//Pop on a predicted return, push on a predicted call
assign ras.pop = bp.is_return & ras_update_permitted;
assign ras.push = bp.is_call & ras_update_permitted;
//Reported to the RAS for every predicted branch that is fetched
assign ras.branch_fetched = bp.is_branch & ras_update_permitted;
//Address written on a push: pc_plus_4 (presumably the return address of the call - confirm where pc_plus_4 is computed)
assign ras.new_addr = pc_plus_4;
assign ras.branch_fetched = bp.use_prediction & bp.is_branch & new_mem_request & (~early_branch_flush); //flush not needed as FIFO resets inside of RAS
////////////////////////////////////////////////////
//TLB
assign tlb.virtual_address = pc;
assign tlb.execute = 1;
assign tlb.rnw = 0;
assign tlb.new_request = tlb.ready & (CONFIG.INCLUDE_S_MODE & tlb_on);
assign translated_address = (CONFIG.INCLUDE_S_MODE & tlb_on) ? tlb.physical_address : pc;
assign tlb.new_request = tlb.ready;
//////////////////////////////////////////////
//Issue Control Signals
assign flush_or_rst = (rst | gc.fetch_flush | early_branch_flush);
assign new_mem_request = (~tlb_on | tlb.done) & pc_id_available & ~fetch_attr_fifo.full & units_ready & (~gc.fetch_hold) & (~exception_pending);
assign new_mem_request = tlb.done & pc_id_available & ~fetch_attr_fifo.full & units_ready & (~gc.fetch_hold) & (~exception_pending);
assign pc_id_assigned = new_mem_request | tlb.is_fault;
//////////////////////////////////////////////
//Subunit Tracking
assign fetch_attr_fifo.push = pc_id_assigned;
assign fetch_attr_fifo.potential_push = pc_id_assigned;
assign fetch_attr_fifo.pop = internal_fetch_complete;
logic [NUM_SUB_UNITS_W-1:0] subunit_id;
one_hot_to_integer #(NUM_SUB_UNITS)
hit_way_conv (
.one_hot (sub_unit_address_match),
.int_out (fetch_attr_next.subunit_id)
.int_out (subunit_id)
);
assign fetch_attr_next.is_predicted_branch_or_jump = bp.use_prediction;
assign fetch_attr_next.is_branch = bp.use_prediction & bp.is_branch;
assign fetch_attr_next.address_valid = address_valid;
assign fetch_attr_next.mmu_fault = tlb.is_fault;
assign fetch_attr_fifo.data_in = '{
is_predicted_branch_or_jump : bp.use_prediction,
is_branch : (bp.use_prediction & bp.is_branch),
address_valid : address_valid,
mmu_fault : tlb.is_fault,
subunit_id : subunit_id
};
assign fetch_attr_fifo.push = pc_id_assigned;
assign fetch_attr_fifo.potential_push = pc_id_assigned;
assign fetch_attr_fifo.pop = internal_fetch_complete;
assign fetch_attr_fifo.data_in = fetch_attr_next;
cva5_fifo #(.DATA_WIDTH($bits(fetch_attributes_t)), .FIFO_DEPTH(MAX_OUTSTANDING_REQUESTS))
cva5_fifo #(.DATA_TYPE(fetch_attributes_t), .FIFO_DEPTH(MAX_OUTSTANDING_REQUESTS))
attributes_fifo (
.clk (clk),
.rst (rst),
.fifo (fetch_attr_fifo)
);
assign fetch_attr = fetch_attr_fifo.data_out;
assign inflight_count_next = inflight_count + MAX_OUTSTANDING_REQUESTS_W'(fetch_attr_fifo.push) - MAX_OUTSTANDING_REQUESTS_W'(fetch_attr_fifo.pop);
@ -234,7 +232,7 @@ module fetch
//In either case, data_valid must NOT be asserted.
generate for (i=0; i < NUM_SUB_UNITS; i++) begin : gen_fetch_sources
assign sub_unit[i].new_request = fetch_attr_fifo.push & sub_unit_address_match[i];
assign sub_unit[i].addr = translated_address;
assign sub_unit[i].addr = tlb.physical_address;
assign sub_unit[i].re = 1;
assign sub_unit[i].we = 0;
assign sub_unit[i].be = '0;
@ -247,7 +245,7 @@ module fetch
endgenerate
generate if (CONFIG.INCLUDE_ILOCAL_MEM) begin : gen_fetch_local_mem
assign sub_unit_address_match[LOCAL_MEM_ID] = ilocal_mem_addr_utils.address_range_check(translated_address);
assign sub_unit_address_match[LOCAL_MEM_ID] = ilocal_mem_addr_utils.address_range_check(tlb.physical_address);
local_mem_sub_unit i_local_mem (
.clk (clk),
.rst (rst),
@ -258,7 +256,7 @@ module fetch
endgenerate
generate if (CONFIG.INCLUDE_IBUS) begin : gen_fetch_ibus
assign sub_unit_address_match[BUS_ID] = ibus_addr_utils.address_range_check(translated_address);
assign sub_unit_address_match[BUS_ID] = ibus_addr_utils.address_range_check(tlb.physical_address);
wishbone_master iwishbone_bus (
.clk (clk),
.rst (rst),
@ -269,7 +267,7 @@ module fetch
endgenerate
generate if (CONFIG.INCLUDE_ICACHE) begin : gen_fetch_icache
assign sub_unit_address_match[ICACHE_ID] = icache_addr_utils.address_range_check(translated_address);
assign sub_unit_address_match[ICACHE_ID] = icache_addr_utils.address_range_check(tlb.physical_address);
icache #(.CONFIG(CONFIG))
i_cache (
.clk (clk),
@ -308,8 +306,6 @@ module fetch
assign is_branch_or_jump = fetch_instruction[6:2] inside {JAL_T, JALR_T, BRANCH_T};
assign early_branch_flush = (valid_fetch_result & (|unit_data_valid)) & fetch_attr.is_predicted_branch_or_jump & (~is_branch_or_jump);
assign early_branch_flush_ras_adjust = (valid_fetch_result & (|unit_data_valid)) & fetch_attr.is_branch & (~is_branch_or_jump);
if (ENABLE_TRACE_INTERFACE)
assign tr_early_branch_correction = early_branch_flush;
end endgenerate
////////////////////////////////////////////////////
//End of Implementation

View file

@ -42,6 +42,9 @@ module icache
);
localparam derived_cache_config_t SCONFIG = get_derived_cache_params(CONFIG, CONFIG.ICACHE, CONFIG.ICACHE_ADDR);
localparam bit [SCONFIG.SUB_LINE_ADDR_W-1:0] END_OF_LINE_COUNT = SCONFIG.SUB_LINE_ADDR_W'(CONFIG.ICACHE.LINE_W-1);
cache_functions_interface #(.TAG_W(SCONFIG.TAG_W), .LINE_W(SCONFIG.LINE_ADDR_W), .SUB_LINE_W(SCONFIG.SUB_LINE_ADDR_W)) addr_utils();
logic tag_hit;
logic [CONFIG.ICACHE.WAYS-1:0] tag_hit_way;
@ -51,20 +54,24 @@ module icache
logic [CONFIG.ICACHE.WAYS-1:0] tag_update_way;
logic [SCONFIG.SUB_LINE_ADDR_W-1:0] word_count;
logic [SCONFIG.SUB_LINE_ADDR_W-1:0] target_word;
logic is_target_word;
logic line_complete;
logic [31:0] data_out [CONFIG.ICACHE.WAYS-1:0];
logic [31:0] miss_data;
logic miss_in_progress;
logic linefill_in_progress;
logic request_in_progress;
logic miss_data_valid;
logic second_cycle;
logic [31:0] second_cycle_addr;
logic idle;
logic memory_complete;
fifo_interface #(.DATA_TYPE(logic[31:0])) input_fifo();
logic new_request;
logic [31:0] new_request_addr;
////////////////////////////////////////////////////
//Implementation
@ -72,18 +79,44 @@ module icache
//On the second cycle of a request hit/miss determination is performed
//On a miss, the memory request starts on the third cycle
//Request acceptance and input buffering
//A request (either arriving this cycle or already buffered in input_fifo) is started when
//no line fill is in progress and either no request is in progress or the current one hit.
assign new_request = (fetch_sub.new_request | input_fifo.valid) & ((~request_in_progress | tag_hit) & ~linefill_in_progress);
//An incoming request is buffered if nothing can start this cycle, or if the FIFO already
//holds a request (the FIFO's request is the one selected by new_request_addr below)
assign input_fifo.push = fetch_sub.new_request & (~new_request | input_fifo.valid);
assign input_fifo.potential_push = input_fifo.push;
//Pop the buffered request when it is the one being started
assign input_fifo.pop = new_request & input_fifo.valid;
assign input_fifo.data_in = fetch_sub.addr;
//Buffered address takes precedence over the address presented on the interface
assign new_request_addr = input_fifo.valid ? input_fifo.data_out : fetch_sub.addr;
//Two-entry FIFO holding the 32-bit addresses of requests that could not start immediately
cva5_fifo #(.DATA_TYPE(logic[31:0]), .FIFO_DEPTH(2))
cache_input_fifo (
.clk (clk),
.rst (rst),
.fifo (input_fifo)
);
////////////////////////////////////////////////////
//Ready determination
always_ff @ (posedge clk) begin
if (rst)
request_in_progress <= 0;
else
request_in_progress <= (request_in_progress & ~fetch_sub.data_valid) | new_request;
end
assign fetch_sub.ready = ~input_fifo.full;
////////////////////////////////////////////////////
//General Control Logic
always_ff @ (posedge clk) begin
if (rst)
second_cycle <= 0;
else
second_cycle <= fetch_sub.new_request;
second_cycle <= new_request;
end
always_ff @(posedge clk) begin
if (fetch_sub.new_request)
second_cycle_addr <= fetch_sub.addr;
if (new_request)
second_cycle_addr <= new_request_addr;
end
//As request can be aborted on any cycle, only update tags if memory request is in progress
@ -96,13 +129,13 @@ module icache
//Replacement policy is pseudo random
cycler #(CONFIG.ICACHE.WAYS) replacement_policy (
.clk (clk),
.rst (rst),
.en (1'b1),
.one_hot (replacement_way)
.clk (clk),
.rst (rst),
.en (1'b1),
.one_hot (replacement_way)
);
always_ff @ (posedge clk) begin
if (second_cycle)
if (second_cycle & ~linefill_in_progress)
tag_update_way <= replacement_way;
end
@ -132,102 +165,84 @@ module icache
//Miss state tracking
always_ff @ (posedge clk) begin
if (rst)
miss_in_progress <= 0;
linefill_in_progress <= 0;
else
miss_in_progress <= l1_request.ack | (miss_in_progress & ~line_complete);
linefill_in_progress <= (linefill_in_progress & ~line_complete) | l1_request.ack;
end
////////////////////////////////////////////////////
//Tag banks
itag_banks #(.CONFIG(CONFIG), .SCONFIG(SCONFIG))
icache_tag_banks (
.clk(clk),
.rst(rst), //clears the read_hit_allowed flag
.stage1_addr(fetch_sub.addr),
.stage2_addr(second_cycle_addr),
.update_way(tag_update_way),
.update(tag_update),
.stage1_adv(fetch_sub.new_request & icache_on),
.tag_hit(tag_hit),
.tag_hit_way(tag_hit_way)
.clk(clk),
.rst(rst), //clears the read_hit_allowed flag
.stage1_line_addr(addr_utils.getTagLineAddr(new_request_addr)),
.stage2_line_addr(addr_utils.getTagLineAddr(second_cycle_addr)),
.stage2_tag(addr_utils.getTag(second_cycle_addr)),
.update_way(tag_update_way),
.update(tag_update),
.stage1_adv(new_request & icache_on),
.tag_hit(tag_hit),
.tag_hit_way(tag_hit_way)
);
////////////////////////////////////////////////////
//Data Banks
genvar i;
generate for (i=0; i < CONFIG.ICACHE.WAYS; i++) begin : idata_bank_gen
byte_en_BRAM #(CONFIG.ICACHE.LINES*CONFIG.ICACHE.LINE_W) idata_bank (
dual_port_bram #(.WIDTH(32), .LINES(CONFIG.ICACHE.LINES*CONFIG.ICACHE.LINE_W)) idata_bank (
.clk(clk),
.addr_a(fetch_sub.addr[2 +: SCONFIG.LINE_ADDR_W+SCONFIG.SUB_LINE_ADDR_W]),
.addr_b({second_cycle_addr[(2+SCONFIG.SUB_LINE_ADDR_W) +: SCONFIG.LINE_ADDR_W], word_count}),
.en_a(fetch_sub.new_request),
.en_b(tag_update_way[i] & l1_response.data_valid),
.be_a('0),
.be_b('1),
.en_a(new_request),
.wen_a(0),
.addr_a(addr_utils.getDataLineAddr(new_request_addr)),
.data_in_a('0),
.data_in_b(l1_response.data),
.data_out_a(data_out[i]),
.en_b(1),
.wen_b(tag_update_way[i] & l1_response.data_valid),
.addr_b(addr_utils.getDataLineAddr({second_cycle_addr[31:SCONFIG.SUB_LINE_ADDR_W+2], word_count, 2'b0})),
.data_in_b(l1_response.data),
.data_out_b()
);
end endgenerate
////////////////////////////////////////////////////
//Miss data path
assign target_word = second_cycle_addr[2 +: SCONFIG.SUB_LINE_ADDR_W];
assign is_target_word = (target_word == word_count);
always_ff @ (posedge clk) begin
if (rst)
word_count <= 0;
else if (l1_response.data_valid)
word_count <= word_count + 1;
end
assign is_target_word = (second_cycle_addr[2 +: SCONFIG.SUB_LINE_ADDR_W] == word_count);
always_ff @ (posedge clk) begin
if (l1_response.data_valid & is_target_word)
miss_data <= l1_response.data;
else
miss_data <= 0;
word_count <= word_count + SCONFIG.SUB_LINE_ADDR_W'(l1_response.data_valid);
end
always_ff @ (posedge clk) begin
if (rst)
miss_data_valid <= 0;
else
miss_data_valid <= miss_in_progress & l1_response.data_valid & is_target_word;
end
assign line_complete = (l1_response.data_valid && (word_count == SCONFIG.SUB_LINE_ADDR_W'(CONFIG.ICACHE.LINE_W-1)));
always_ff @ (posedge clk) begin
if (rst)
memory_complete <= 0;
else
memory_complete <= line_complete;
end
assign miss_data_valid = request_in_progress & l1_response.data_valid & is_target_word;
assign line_complete = l1_response.data_valid & (word_count == END_OF_LINE_COUNT);
////////////////////////////////////////////////////
//Output muxing
localparam OMUX_W = CONFIG.ICACHE.WAYS+1;
logic [OMUX_W-1:0] priority_vector;
logic [$clog2(OMUX_W)-1:0] output_sel;
logic [31:0] output_array [OMUX_W];
always_comb begin
fetch_sub.data_out = miss_data;//zero if not a miss
priority_vector[0] = miss_data_valid;
output_array[0] = l1_response.data;
for (int i = 0; i < CONFIG.ICACHE.WAYS; i++) begin
fetch_sub.data_out = fetch_sub.data_out | (data_out[i] & {32{tag_hit_way[i]}});
priority_vector[i+1] = tag_hit_way[i];
output_array[i+1] = data_out[i];
end
end
priority_encoder #(.WIDTH(OMUX_W))
arb_encoder
(
.priority_vector (priority_vector),
.encoded_result (output_sel)
);
assign fetch_sub.data_out = output_array[output_sel];
assign fetch_sub.data_valid = miss_data_valid | tag_hit;
////////////////////////////////////////////////////
//Ready determination
always_ff @ (posedge clk) begin
if (rst)
idle <= 1;
else if (fetch_sub.new_request)
idle <= 0;
else if (memory_complete | tag_hit) //read miss OR write through complete
idle <= 1;
end
assign fetch_sub.ready = tag_hit | memory_complete | idle;
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
@ -239,7 +254,7 @@ module icache
else $error("Spurious icache ack received from arbiter!");
icache_l1_arb_data_valid_assertion:
assert property (@(posedge clk) disable iff (rst) l1_response.data_valid |-> miss_in_progress)
assert property (@(posedge clk) disable iff (rst) l1_response.data_valid |-> linefill_in_progress)
else $error("Spurious icache data received from arbiter!");
endmodule

View file

@ -34,8 +34,9 @@ module itag_banks
input logic clk,
input logic rst,
input logic[31:0] stage1_addr,
input logic[31:0] stage2_addr,
input logic[SCONFIG.LINE_ADDR_W-1:0] stage1_line_addr,
input logic[SCONFIG.LINE_ADDR_W-1:0] stage2_line_addr,
input logic[SCONFIG.TAG_W-1:0] stage2_tag,
input logic[CONFIG.ICACHE.WAYS-1:0] update_way,
input logic update,
@ -46,22 +47,11 @@ module itag_banks
output logic[CONFIG.ICACHE.WAYS-1:0] tag_hit_way
);
//Valid + tag
typedef logic [SCONFIG.TAG_W : 0] itag_entry_t;
function logic[SCONFIG.TAG_W-1:0] getTag(logic[31:0] addr);
return addr[2+SCONFIG.SUB_LINE_ADDR_W+SCONFIG.LINE_ADDR_W +: SCONFIG.TAG_W];
endfunction
function logic[SCONFIG.LINE_ADDR_W-1:0] getLineAddr(logic[31:0] addr);
return addr[SCONFIG.LINE_ADDR_W + SCONFIG.SUB_LINE_ADDR_W + 1 : SCONFIG.SUB_LINE_ADDR_W + 2];
endfunction
logic hit_allowed;
itag_entry_t tag_line[CONFIG.ICACHE.WAYS-1:0];
itag_entry_t stage2_tag;
assign stage2_tag = {1'b1, getTag(stage2_addr)};
logic hit_allowed;
always_ff @ (posedge clk) begin
if (rst)
@ -73,23 +63,23 @@ module itag_banks
genvar i;
generate
for (i=0; i < CONFIG.ICACHE.WAYS; i++) begin : tag_bank_gen
tag_bank #(SCONFIG.TAG_W+1, CONFIG.ICACHE.LINES) itag_bank (.*,
.en_a(stage1_adv), .wen_a('0),
.addr_a(getLineAddr(stage1_addr)),
.data_in_a('0), .data_out_a(tag_line[i]),
.en_b(update), .wen_b(update_way[i]),
.addr_b(getLineAddr(stage2_addr)),
.data_in_b(stage2_tag), .data_out_b()
dual_port_bram #(.WIDTH(SCONFIG.TAG_W+1), .LINES(CONFIG.ICACHE.LINES)) itag_bank (.*,
.clk(clk),
.en_a(stage1_adv),
.wen_a('0),
.addr_a(stage1_line_addr),
.data_in_a('0),
.data_out_a(tag_line[i]),
.en_b(update),
.wen_b(update_way[i]),
.addr_b(stage2_line_addr),
.data_in_b({1'b1, stage2_tag}),
.data_out_b()
);
assign tag_hit_way[i] = ({hit_allowed,stage2_tag} == {1'b1,tag_line[i]});
assign tag_hit_way[i] = ({hit_allowed, 1'b1, stage2_tag} == {1'b1, tag_line[i]});
end
endgenerate
assign tag_hit = |tag_hit_way;
endmodule

View file

@ -38,33 +38,36 @@ module ras
ras_interface.self ras
);
(* ramstyle = "MLAB, no_rw_check" *) logic[31:0] lut_ram [CONFIG.BP.RAS_ENTRIES];
localparam RAS_DEPTH_W = $clog2(CONFIG.BP.RAS_ENTRIES);
logic [RAS_DEPTH_W-1:0] read_index;
logic [RAS_DEPTH_W-1:0] new_index;
fifo_interface #(.DATA_WIDTH(RAS_DEPTH_W)) ri_fifo();
///////////////////////////////////////////////////////
//For simulation purposes
initial lut_ram = '{default: 0};
fifo_interface #(.DATA_TYPE(logic[RAS_DEPTH_W-1:0])) ri_fifo();
///////////////////////////////////////////////////////
assign ras.addr = lut_ram[read_index];
//On a speculative branch, save the current stack pointer
//Restored if branch is mispredicted (gc_fetch_flush)
cva5_fifo #(.DATA_WIDTH(RAS_DEPTH_W), .FIFO_DEPTH(MAX_IDS))
read_index_fifo (.clk, .rst(rst | gc.fetch_flush | early_branch_flush_ras_adjust), .fifo(ri_fifo));
cva5_fifo #(.DATA_TYPE(logic[RAS_DEPTH_W-1:0]), .FIFO_DEPTH(MAX_IDS))
read_index_fifo (
.clk,
.rst(rst | gc.fetch_flush | early_branch_flush_ras_adjust),
.fifo(ri_fifo)
);
assign ri_fifo.data_in = read_index;
assign ri_fifo.push = ras.branch_fetched;
assign ri_fifo.potential_push = ras.branch_fetched;
assign ri_fifo.pop = ras.branch_retired & ri_fifo.valid; //Prevent popping from fifo if reset due to early_branch_flush_ras_adjust
always_ff @ (posedge clk) begin
if (ras.push)
lut_ram[new_index] <= ras.new_addr;
end
lutram_1w_1r #(.DATA_TYPE(logic[31:0]), .DEPTH(CONFIG.BP.RAS_ENTRIES))
ras_stack (
.clk(clk),
.waddr(new_index),
.raddr(read_index),
.ram_write(ras.push),
.new_ram_data(ras.new_addr),
.ram_data_out(ras.addr)
);
//Rolls over when full, most recent calls will be correct, but calls greater than depth
//will be lost.
logic [RAS_DEPTH_W-1:0] new_index_base;

View file

@ -1,5 +1,5 @@
/*
* Copyright © 2019 Eric Matthews, Lesley Shannon
* Copyright © 2023 Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -17,31 +17,29 @@
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
module shift_counter
module fp_writeback
import cva5_config::*;
import cva5_types::*;
#(parameter DEPTH = 16)
(
input logic clk,
input logic rst,
input logic start,
output logic done
//Unit writeback
unit_writeback_interface.wb unit_wb[2],
//WB output
output fp_wb_packet_t wb_packet[2]
);
logic [DEPTH-1:0] counter;
////////////////////////////////////////////////////
//Implementation
//Because there are two writeback ports for the FP register file, no arbitration is needed
assign wb_packet[0].id = unit_wb[0].id;
assign wb_packet[0].valid = unit_wb[0].done;
assign wb_packet[0].data = unit_wb[0].rd;
assign unit_wb[0].ack = unit_wb[0].done;
//TLB_CLEAR state shift reg
always_ff @ (posedge clk) begin
counter[0] <= start;
counter[DEPTH-1:1] <= counter[DEPTH-2:0];
end
assign done = counter[DEPTH-1];
assign wb_packet[1].id = unit_wb[1].id;
assign wb_packet[1].valid = unit_wb[1].done;
assign wb_packet[1].data = unit_wb[1].rd;
assign unit_wb[1].ack = unit_wb[1].done;
endmodule

View file

@ -51,25 +51,34 @@ module instruction_metadata_and_id_management
output decode_packet_t decode,
input logic decode_advance,
input logic decode_uses_rd,
input logic fp_decode_uses_rd,
input rs_addr_t decode_rd_addr,
input exception_sources_t decode_exception_unit,
input logic decode_is_store,
//renamer
input phys_addr_t decode_phys_rd_addr,
input phys_addr_t fp_decode_phys_rd_addr,
//Issue stage
input issue_packet_t issue,
input logic instruction_issued,
input logic instruction_issued_with_rd,
input logic fp_instruction_issued_with_rd,
//WB
input wb_packet_t wb_packet [CONFIG.NUM_WB_GROUPS],
output commit_packet_t commit_packet [CONFIG.NUM_WB_GROUPS],
input fp_wb_packet_t fp_wb_packet [2],
output phys_addr_t wb_phys_addr [CONFIG.NUM_WB_GROUPS],
output phys_addr_t fp_wb_phys_addr [2],
//Retirer
output retire_packet_t retire,
output retire_packet_t wb_retire,
output retire_packet_t fp_wb_retire,
output retire_packet_t store_retire,
output id_t retire_ids [RETIRE_PORTS],
output id_t retire_ids_next [RETIRE_PORTS],
output logic retire_port_valid [RETIRE_PORTS],
output logic [LOG2_RETIRE_PORTS : 0] retire_count,
//CSR
output logic [LOG2_MAX_IDS:0] post_issue_count,
@ -78,16 +87,19 @@ module instruction_metadata_and_id_management
output logic [$clog2(NUM_EXCEPTION_SOURCES)-1:0] current_exception_unit
);
//////////////////////////////////////////
(* ramstyle = "MLAB, no_rw_check" *) logic [31:0] pc_table [MAX_IDS];
(* ramstyle = "MLAB, no_rw_check" *) logic [31:0] instruction_table [MAX_IDS];
(* ramstyle = "MLAB, no_rw_check" *) logic [0:0] valid_fetch_addr_table [MAX_IDS];
localparam NUM_WB_GROUPS = CONFIG.NUM_WB_GROUPS + 32'(CONFIG.INCLUDE_UNIT.FPU) + 32'(CONFIG.INCLUDE_UNIT.FPU);
logic [31:0] decode_pc;
logic [31:0] decode_instruction;
fetch_metadata_t decode_fetch_metadata;
(* ramstyle = "MLAB, no_rw_check" *) phys_addr_t phys_addr_table [MAX_IDS];
(* ramstyle = "MLAB, no_rw_check" *) logic [0:0] uses_rd_table [MAX_IDS];
(* ramstyle = "MLAB, no_rw_check" *) logic [$bits(fetch_metadata_t)-1:0] fetch_metadata_table [MAX_IDS];
(* ramstyle = "MLAB, no_rw_check" *) logic [$bits(exception_sources_t)-1:0] exception_unit_table [MAX_IDS];
//Classification of an instruction for retirement purposes.
//Determined in decode (decode_type) and read back per retire port (retire_type).
typedef enum logic[1:0] {
NONE = 2'b00, //None of the classes below apply
RD = 2'b01, //Uses an integer rd whose address is non-zero
STORE = 2'b10, //decode_is_store is set
FP_RD = 2'b11 //fp_decode_uses_rd is set
} instruction_type_t;
instruction_type_t decode_type;
instruction_type_t retire_type [RETIRE_PORTS];
id_t decode_id;
id_t oldest_pre_issue_id;
@ -99,60 +111,128 @@ module instruction_metadata_and_id_management
logic [LOG2_MAX_IDS:0] post_issue_count_next;
logic [LOG2_MAX_IDS:0] inflight_count;
retire_packet_t retire_next;
logic retire_port_valid_next [RETIRE_PORTS];
retire_packet_t wb_retire_next;
retire_packet_t fp_wb_retire_next;
retire_packet_t store_retire_next;
genvar i;
logic retire_port_valid_next [RETIRE_PORTS];
logic [LOG2_RETIRE_PORTS : 0] retire_count_next;
////////////////////////////////////////////////////
//Implementation
////////////////////////////////////////////////////
//Instruction Metadata
//PC table
//Number of read ports = 1 or 2 (decode stage + exception logic (if enabled))
always_ff @ (posedge clk) begin
if (pc_id_assigned)
pc_table[pc_id] <= if_pc;
end
//PC table(s)
lutram_1w_1r #(.DATA_TYPE(logic[31:0]), .DEPTH(MAX_IDS))
pc_table (
.clk(clk),
.waddr(pc_id),
.raddr(decode_id),
.ram_write(pc_id_assigned),
.new_ram_data(if_pc),
.ram_data_out(decode_pc)
);
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_pc_id_exception_support
lutram_1w_1r #(.DATA_TYPE(logic[31:0]), .DEPTH(MAX_IDS))
pc_table_exception (
.clk(clk),
.waddr(pc_id),
.raddr(retire_ids_next[0]),
.ram_write(pc_id_assigned),
.new_ram_data(if_pc),
.ram_data_out(oldest_pc)
);
end endgenerate
////////////////////////////////////////////////////
//Instruction table
//Number of read ports = 1 (decode stage)
always_ff @ (posedge clk) begin
if (fetch_complete)
instruction_table[fetch_id] <= fetch_instruction;
end
lutram_1w_1r #(.DATA_TYPE(logic[31:0]), .DEPTH(MAX_IDS))
instruction_table (
.clk(clk),
.waddr(fetch_id),
.raddr(decode_id),
.ram_write(fetch_complete),
.new_ram_data(fetch_instruction),
.ram_data_out(decode_instruction)
);
////////////////////////////////////////////////////
//Valid fetched address table
//Number of read ports = 1 (decode stage)
always_ff @ (posedge clk) begin
if (fetch_complete)
fetch_metadata_table[fetch_id] <= fetch_metadata;
end
lutram_1w_1r #(.DATA_TYPE(fetch_metadata_t), .DEPTH(MAX_IDS))
fetch_metadata_table (
.clk(clk),
.waddr(fetch_id),
.raddr(decode_id),
.ram_write(fetch_complete),
.new_ram_data(fetch_metadata),
.ram_data_out(decode_fetch_metadata)
);
////////////////////////////////////////////////////
//Phys rd table
//Number of read ports = (NUM_WB_GROUPS - 1) (ALU WB group uses issue_phys_rd_addr)
always_ff @ (posedge clk) begin
if (decode_advance)
phys_addr_table[decode_id] <= decode_phys_rd_addr;
end
////////////////////////////////////////////////////
//Uses rd table
//Retire Instruction Type Table
//Number of read ports = RETIRE_PORTS
always_ff @ (posedge clk) begin
if (decode_advance)
uses_rd_table[decode_id] <= decode_uses_rd & |decode_rd_addr;
//Classify the instruction in decode for the retire instruction type table.
//The checks are prioritized: integer rd (with a non-zero rd address) first, then store,
//then FP rd; anything else is NONE.
always_comb begin
if (decode_uses_rd & |decode_rd_addr) //rd address of zero is not treated as a register write
decode_type = RD;
else if (decode_is_store)
decode_type = STORE;
else if (fp_decode_uses_rd)
decode_type = FP_RD;
else
decode_type = NONE;
end
lutram_1w_mr #(.DATA_TYPE(logic[1:0]), .DEPTH(MAX_IDS), .NUM_READ_PORTS(RETIRE_PORTS))
retire_instruction_type_table (
.clk(clk),
.waddr(decode_id),
.raddr(retire_ids_next),
.ram_write(decode_advance),
.new_ram_data(decode_type),
.ram_data_out(retire_type)
);
////////////////////////////////////////////////////
//id_to_phys_rd_table
//Number of read ports = WB_GROUPS
id_t wb_ids [NUM_WB_GROUPS];
phys_addr_t wb_phys_addrs [NUM_WB_GROUPS];
//Builds the read addresses (wb_ids) for the ID to physical rd table and routes the
//looked-up physical addresses (wb_phys_addrs) to the integer and FP outputs.
//Integer writeback groups use indices [0, CONFIG.NUM_WB_GROUPS); the two FP writeback
//ports use the top two indices, NUM_WB_GROUPS-2 and NUM_WB_GROUPS-1.
//NOTE(review): without the FPU, NUM_WB_GROUPS equals CONFIG.NUM_WB_GROUPS, so fp_wb_phys_addr
//then mirrors integer entries, and NUM_WB_GROUPS-2 would be negative if CONFIG.NUM_WB_GROUPS
//were 1 - confirm that configuration is never used.
always_comb begin
wb_ids[NUM_WB_GROUPS-2] = fp_wb_packet[0].id;
wb_ids[NUM_WB_GROUPS-1] = fp_wb_packet[1].id;
fp_wb_phys_addr[0] = wb_phys_addrs[NUM_WB_GROUPS-2];
fp_wb_phys_addr[1] = wb_phys_addrs[NUM_WB_GROUPS-1];
for (int i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin
//This will overwrite the FP packets if the configuration does not include it
wb_ids[i] = wb_packet[i].id;
wb_phys_addr[i] = wb_phys_addrs[i];
end
end
lutram_1w_mr #(.DATA_TYPE(phys_addr_t), .DEPTH(MAX_IDS), .NUM_READ_PORTS(NUM_WB_GROUPS))
id_to_phys_rd_table (
.clk(clk),
.waddr(decode_id),
.raddr(wb_ids),
.ram_write(decode_advance),
.new_ram_data(fp_decode_uses_rd ? fp_decode_phys_rd_addr : decode_phys_rd_addr),
.ram_data_out(wb_phys_addrs)
);
////////////////////////////////////////////////////
//Exception unit table
always_ff @ (posedge clk) begin
if (decode_advance)
exception_unit_table[decode_id] <= decode_exception_unit;
end
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_id_exception_support
lutram_1w_1r #(.DATA_TYPE(logic[$bits(exception_sources_t)-1:0]), .DEPTH(MAX_IDS))
exception_unit_table (
.clk(clk),
.waddr(decode_id),
.raddr(retire_ids_next[0]),
.ram_write(decode_advance),
.new_ram_data(decode_exception_unit),
.ram_data_out(current_exception_unit)
);
end endgenerate
////////////////////////////////////////////////////
//ID Management
@ -182,19 +262,18 @@ module instruction_metadata_and_id_management
end
//Retire IDs
//Each retire port lags behind the previous one by one index (eg. [3, 2, 1, 0])
generate for (i = 0; i < RETIRE_PORTS; i++) begin :gen_retire_ids
generate for (genvar i = 0; i < RETIRE_PORTS; i++) begin :gen_retire_ids
always_ff @ (posedge clk) begin
if (rst)
retire_ids_next[i] <= LOG2_MAX_IDS'(i);
else
retire_ids_next[i] <= retire_ids_next[i] + LOG2_MAX_IDS'(retire_next.count);
retire_ids_next[i] <= retire_ids_next[i] + LOG2_MAX_IDS'(retire_count_next);
end
always_ff @ (posedge clk) begin
if (~gc.retire_hold)
retire_ids[i] <= retire_ids_next[i];
end
end endgenerate
//Represented as a negative value so that the MSB indicates that the decode stage is valid
@ -216,7 +295,7 @@ module instruction_metadata_and_id_management
pre_issue_count <= pre_issue_count_next;
end
assign post_issue_count_next = post_issue_count + ID_COUNTER_W'(instruction_issued) - ID_COUNTER_W'(retire_next.count);
assign post_issue_count_next = post_issue_count + ID_COUNTER_W'(instruction_issued) - ID_COUNTER_W'(retire_count_next);
always_ff @ (posedge clk) begin
if (rst)
post_issue_count <= 0;
@ -238,19 +317,36 @@ module instruction_metadata_and_id_management
//Non-writeback instructions are not included, as those in the current instruction set
//either complete in their first cycle of the execute stage or do not cause an
//exception after that point
logic id_waiting_toggle [NUM_WB_GROUPS];
id_t id_waiting_toggle_addr [NUM_WB_GROUPS];
//Toggle sources for the waiting-for-writeback tracking memory, one per write port:
//  port 0                          : issue of a multicycle instruction with an integer rd,
//                                    or of an instruction with an FP rd (address = issue.id)
//  ports 1..CONFIG.NUM_WB_GROUPS-1 : integer writeback groups (address = wb_packet[i].id)
//  top two ports                   : FP writeback ports (address = fp_wb_packet[n].id)
//Assignment order matters: within always_comb the last assignment to an element wins.
always_comb begin
//FP writeback ports occupy the top two indices
id_waiting_toggle[NUM_WB_GROUPS-2] = fp_wb_packet[0].valid;
id_waiting_toggle_addr[NUM_WB_GROUPS-2] = fp_wb_packet[0].id;
id_waiting_toggle[NUM_WB_GROUPS-1] = fp_wb_packet[1].valid;
id_waiting_toggle_addr[NUM_WB_GROUPS-1] = fp_wb_packet[1].id;
//This will overwrite the FP packets if the configuration does not include it
for (int i = 1; i < CONFIG.NUM_WB_GROUPS; i++) begin
id_waiting_toggle[i] = wb_packet[i].valid;
id_waiting_toggle_addr[i] = wb_packet[i].id;
end
//Port 0 is assigned last. Without the FPU and with CONFIG.NUM_WB_GROUPS == 2 the FP
//assignment above lands on index 0 (NUM_WB_GROUPS-2 == 0); the loop starts at 1 and
//would not restore it, so the issue-stage toggle must take precedence here.
id_waiting_toggle[0] = (instruction_issued_with_rd & issue.is_multicycle) | fp_instruction_issued_with_rd;
id_waiting_toggle_addr[0] = issue.id;
end
toggle_memory_set # (
.DEPTH (MAX_IDS),
.NUM_WRITE_PORTS (2),
.NUM_READ_PORTS (RETIRE_PORTS),
.WRITE_INDEX_FOR_RESET (0),
.READ_INDEX_FOR_RESET (0)
.NUM_WRITE_PORTS (NUM_WB_GROUPS),
.NUM_READ_PORTS (RETIRE_PORTS)
) id_waiting_for_writeback_toggle_mem_set
(
.clk (clk),
.rst (rst),
.init_clear (gc.init_clear),
.toggle ('{(instruction_issued_with_rd & issue.is_multicycle), wb_packet[1].valid}),
.toggle_addr ('{issue.id, wb_packet[1].id}),
.toggle (id_waiting_toggle),
.toggle_addr (id_waiting_toggle_addr),
.read_addr (retire_ids_next),
.in_use (id_waiting_for_writeback)
);
@ -260,14 +356,9 @@ module instruction_metadata_and_id_management
logic contiguous_retire;
logic id_is_post_issue [RETIRE_PORTS];
logic id_ready_to_retire [RETIRE_PORTS];
logic [LOG2_RETIRE_PORTS-1:0] phys_id_sel;
logic [RETIRE_PORTS-1:0] retire_id_uses_rd;
logic [RETIRE_PORTS-1:0] retire_id_waiting_for_writeback;
generate for (i = 0; i < RETIRE_PORTS; i++) begin : gen_retire_writeback
assign retire_id_uses_rd[i] = uses_rd_table[retire_ids_next[i]];
assign retire_id_waiting_for_writeback[i] = id_waiting_for_writeback[i];
end endgenerate
logic [LOG2_RETIRE_PORTS-1:0] retire_with_rd_sel;
logic [LOG2_RETIRE_PORTS-1:0] retire_with_fp_rd_sel;
logic [LOG2_RETIRE_PORTS-1:0] retire_with_store_sel;
//Supports retiring up to RETIRE_PORTS instructions. The retired block of instructions must be
//contiguous and must start with the first retire port. Additionally, only one register file writing
@ -275,40 +366,64 @@ module instruction_metadata_and_id_management
//If an exception is pending, only retire a single instruction per cycle. As such, the pending
//exception will have to become the oldest instruction retire_ids[0] before it can retire.
logic retire_with_rd_found;
logic retire_with_fp_rd_found;
logic retire_with_store_found;
always_comb begin
contiguous_retire = ~gc.retire_hold;
retire_with_rd_found = 0;
retire_with_fp_rd_found = 0;
retire_with_store_found = 0;
retire_with_rd_sel = 0;
retire_with_fp_rd_sel = 0;
retire_with_store_sel = 0;
for (int i = 0; i < RETIRE_PORTS; i++) begin
id_is_post_issue[i] = post_issue_count > ID_COUNTER_W'(i);
id_ready_to_retire[i] = (id_is_post_issue[i] & contiguous_retire & ~id_waiting_for_writeback[i]);
retire_port_valid_next[i] = id_ready_to_retire[i] & ~(retire_id_uses_rd[i] & retire_with_rd_found);
retire_port_valid_next[i] = id_ready_to_retire[i] & ~((retire_type[i] == RD & retire_with_rd_found) | (retire_type[i] == STORE & retire_with_store_found) | (retire_type[i] == FP_RD & retire_with_fp_rd_found));
retire_with_rd_found |= retire_port_valid_next[i] & retire_id_uses_rd[i];
retire_with_rd_found |= retire_port_valid_next[i] & retire_type[i] == RD;
retire_with_fp_rd_found |= retire_port_valid_next[i] & retire_type[i] == FP_RD;
retire_with_store_found |= retire_port_valid_next[i] & retire_type[i] == STORE;
contiguous_retire &= retire_port_valid_next[i] & ~gc.exception_pending;
if (retire_port_valid_next[i] & retire_type[i] == RD)
retire_with_rd_sel = LOG2_RETIRE_PORTS'(i);
if (retire_port_valid_next[i] & retire_type[i] == FP_RD)
retire_with_fp_rd_sel = LOG2_RETIRE_PORTS'(i);
if (retire_port_valid_next[i] & retire_type[i] == STORE)
retire_with_store_sel = LOG2_RETIRE_PORTS'(i);
end
end
//retire_next packet
priority_encoder #(.WIDTH(RETIRE_PORTS))
phys_id_sel_encoder (
.priority_vector (retire_id_uses_rd),
.encoded_result (phys_id_sel)
);
assign retire_next.phys_id = retire_ids_next[phys_id_sel];
assign retire_next.valid = retire_with_rd_found;
//retire_next packets
assign wb_retire_next = '{
id : retire_ids_next[retire_with_rd_sel],
valid : retire_with_rd_found
};
assign fp_wb_retire_next = '{
id : retire_ids_next[retire_with_fp_rd_sel],
valid : retire_with_fp_rd_found
};
assign store_retire_next = '{
id : retire_ids_next[retire_with_store_sel],
valid : retire_with_store_found
};
always_comb begin
retire_next.count = 0;
retire_count_next = 0;
for (int i = 0; i < RETIRE_PORTS; i++) begin
retire_next.count += retire_port_valid_next[i];
retire_count_next += retire_port_valid_next[i];
end
end
always_ff @ (posedge clk) begin
retire.valid <= retire_next.valid;
retire.phys_id <= retire_next.phys_id;
retire.count <= gc.writeback_supress ? '0 : retire_next.count;
wb_retire <= wb_retire_next;
fp_wb_retire <= fp_wb_retire_next;
store_retire <= store_retire_next;
retire_count <= gc.writeback_supress ? '0 : retire_count_next;
for (int i = 0; i < RETIRE_PORTS; i++)
retire_port_valid[i] <= retire_port_valid_next[i] & ~gc.writeback_supress;
end
@ -318,31 +433,14 @@ module instruction_metadata_and_id_management
assign pc_id_available = ~inflight_count[LOG2_MAX_IDS];
//Decode
assign decode.id = decode_id;
assign decode.valid = fetched_count_neg[LOG2_MAX_IDS];
assign decode.pc = pc_table[decode_id];
assign decode.instruction = instruction_table[decode_id];
assign decode.fetch_metadata = CONFIG.INCLUDE_M_MODE ? fetch_metadata_table[decode_id] : '{ok : 1, error_code : INST_ACCESS_FAULT};
//Writeback/Commit support
phys_addr_t commit_phys_addr [CONFIG.NUM_WB_GROUPS];
assign commit_phys_addr[0] = issue.phys_rd_addr;
generate for (i = 1; i < CONFIG.NUM_WB_GROUPS; i++) begin : gen_commit_phys_addr
assign commit_phys_addr[i] = phys_addr_table[wb_packet[i].id];
end endgenerate
generate for (i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : gen_commit_packet
assign commit_packet[i].id = wb_packet[i].id;
assign commit_packet[i].phys_addr = commit_phys_addr[i];
assign commit_packet[i].valid = wb_packet[i].valid & |commit_phys_addr[i];
assign commit_packet[i].data = wb_packet[i].data;
end endgenerate
//Exception Support
generate if (CONFIG.INCLUDE_M_MODE) begin : gen_id_exception_support
assign oldest_pc = pc_table[retire_ids_next[0]];
assign current_exception_unit = exception_unit_table[retire_ids_next[0]];
end endgenerate
localparam fetch_metadata_t ADDR_OK = '{ok : 1, error_code : INST_ADDR_MISSALIGNED};
assign decode = '{
id : decode_id,
valid : fetched_count_neg[LOG2_MAX_IDS],
pc : decode_pc,
instruction : decode_instruction,
fetch_metadata : CONFIG.INCLUDE_M_MODE ? decode_fetch_metadata : ADDR_OK
};
////////////////////////////////////////////////////
//End of Implementation

View file

@ -50,7 +50,7 @@ module l1_arbiter
logic [L1_CONNECTIONS-1:0] acks;
logic [((L1_CONNECTIONS == 1) ? 0 : ($clog2(L1_CONNECTIONS)-1)) : 0] arb_sel;
logic push_ready;
logic fifos_full;
logic request_exists;
////////////////////////////////////////////////////
//Implementation
@ -69,14 +69,14 @@ module l1_arbiter
assign sc_success = CONFIG.INCLUDE_AMO & l2.con_result;
//Arbiter can pop address FIFO at a different rate than the data FIFO, so check that both have space.
assign push_ready = ~(l2.request_full | l2.data_full);
assign fifos_full = l2.request_full | l2.data_full;
assign request_exists = |requests;
assign l2.request_push = push_ready & request_exists;
assign l2.request_push = request_exists & ~fifos_full;
////////////////////////////////////////////////////
//Dcache Specific
assign l2.wr_data_push = CONFIG.INCLUDE_DCACHE & (push_ready & l1_request[L1_DCACHE_ID].request & ~l1_request[L1_DCACHE_ID].rnw); //Assumes data cache has highest priority
assign l2.wr_data_push = l2.request_push & ~l2.rnw;
assign l2.wr_data = l1_request[L1_DCACHE_ID].data;
assign l2.wr_data_be = l1_request[L1_DCACHE_ID].be;
@ -87,29 +87,38 @@ module l1_arbiter
////////////////////////////////////////////////////
//Interface mapping
generate for (genvar i = 0; i < L1_CONNECTIONS; i++) begin : gen_l2_requests
always_comb begin
l2_requests[i].addr = l1_request[i].addr[31:2];
l2_requests[i].rnw = l1_request[i].rnw;
l2_requests[i].is_amo = l1_request[i].is_amo;
l2_requests[i].amo_type_or_burst_size = l1_request[i].size;
l2_requests[i].sub_id = L2_SUB_ID_W'(i);
end
assign l2_requests[i] = '{
addr : l1_request[i].addr[31:2],
rnw : l1_request[i].rnw,
is_amo : l1_request[i].is_amo,
amo_type_or_burst_size : l1_request[i].size,
sub_id : L2_SUB_ID_W'(i)
};
end endgenerate
////////////////////////////////////////////////////
//Arbitration
priority_encoder
#(.WIDTH(L1_CONNECTIONS))
arb_encoder
(
.priority_vector (requests),
.encoded_result (arb_sel)
);
logic [$clog2(L1_CONNECTIONS)-1:0] state;
logic [$clog2(L1_CONNECTIONS)-1:0] muxes [L1_CONNECTIONS-1:0];
always_comb begin
acks = '0;
acks[arb_sel] = l2.request_push;
always_ff @(posedge clk) begin
if (rst)
state <= 0;
else if (l2.request_push)
state <= arb_sel;
end
always_comb begin
for (int i = 0; i < L1_CONNECTIONS; i++) begin
muxes[i] = $clog2(L1_CONNECTIONS)'(i);
for (int j = 0; j < L1_CONNECTIONS; j++) begin
if (requests[(i + j) % L1_CONNECTIONS])
muxes[i] = $clog2(L1_CONNECTIONS)'((i + j) % L1_CONNECTIONS);
end
end
end
assign arb_sel = muxes[state];
assign acks = L1_CONNECTIONS'(l2.request_push) << arb_sel;
assign l2.addr = l2_requests[arb_sel].addr;
assign l2.rnw = l2_requests[arb_sel].rnw;
@ -119,7 +128,7 @@ module l1_arbiter
generate for (genvar i = 0; i < L1_CONNECTIONS; i++) begin : gen_l1_responses
assign l1_response[i].data = l2.rd_data;
assign l1_response[i].data_valid = l2.rd_data_valid && (l2.rd_sub_id == i);
assign l1_response[i].data_valid = l2.rd_data_valid & (l2.rd_sub_id == i);
end endgenerate
endmodule

View file

@ -1,159 +0,0 @@
/*
* Copyright © 2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//ID-based input buffer for the Load/Store Unit.
//Requests arriving on lsq are split by type: loads are pushed into a FIFO
//(load_queue_fifo) and stores are pushed into the store_queue block. On the
//output side the head of the load FIFO is presented whenever it is valid and
//no store conflict is flagged; otherwise the store queue output is presented.
module load_store_queue //ID-based input buffer for Load/Store Unit
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
input gc_outputs_t gc,
load_store_queue_interface.queue lsq,
//Writeback snooping
input wb_packet_t wb_snoop,
//Retire release
input id_t retire_ids [RETIRE_PORTS],
input logic retire_port_valid [RETIRE_PORTS],
//Trace output; only driven when ENABLE_TRACE_INTERFACE is set (see end of module)
output logic tr_possible_load_conflict_delay
);
//Contents of one load queue entry. potential_store_conflicts has one bit per
//store queue slot (SQ_DEPTH wide) and is captured when the load is pushed.
typedef struct packed {
logic [31:0] addr;
logic [2:0] fn3;
id_t id;
logic [CONFIG.SQ_DEPTH-1:0] potential_store_conflicts;
} lq_entry_t;
addr_hash_t addr_hash;
logic [CONFIG.SQ_DEPTH-1:0] potential_store_conflicts;
//NOTE(review): sq_entry is declared but never referenced in this module
sq_entry_t sq_entry;
logic store_conflict;
logic load_selected;
lq_entry_t lq_data_in;
lq_entry_t lq_data_out;
fifo_interface #(.DATA_WIDTH($bits(lq_entry_t))) lq();
store_queue_interface sq();
////////////////////////////////////////////////////
//Implementation
//Can accept requests so long as store queue is not needed or is not full
//(full is only reported for an incoming store; the load FIFO never contributes to it)
assign lsq.full = lsq.data_in.store & sq.full;
//Address hash for load-store collision checking
//Reset together with the store queue: both use (rst | gc.sq_flush)
addr_hash lsq_addr_hash (
.clk (clk),
.rst (rst | gc.sq_flush),
.addr (lsq.data_in.addr),
.addr_hash (addr_hash)
);
////////////////////////////////////////////////////
//Load Queue
//Sized to MAX_IDS entries. Unlike the store queue it is reset by rst only,
//not by gc.sq_flush.
cva5_fifo #(.DATA_WIDTH($bits(lq_entry_t)), .FIFO_DEPTH(MAX_IDS))
load_queue_fifo (
.clk(clk),
.rst(rst),
.fifo(lq)
);
//FIFO control signals
//push/pop are qualified by request type; potential_push is forwarded unqualified
assign lq.push = lsq.push & lsq.data_in.load;
assign lq.potential_push = lsq.potential_push;
assign lq.pop = lsq.pop & load_selected;
//FIFO data ports
//The conflict vector produced by the store queue for the incoming address
//hash is stored alongside the load
assign lq_data_in = '{
addr : lsq.data_in.addr,
fn3 : lsq.data_in.fn3,
id : lsq.data_in.id,
potential_store_conflicts : potential_store_conflicts
};
assign lq.data_in = lq_data_in;
assign lq_data_out = lq.data_out;
////////////////////////////////////////////////////
//Store Queue
//A pop is routed to the store queue whenever the load queue was not selected
assign sq.push = lsq.push & lsq.data_in.store;
assign sq.pop = lsq.pop & ~load_selected;
assign sq.data_in = lsq.data_in;
//The conflict vector saved with the head load (prev_store_conflicts) is fed
//back to the store queue, which reports store_conflict for that load
store_queue # (.CONFIG(CONFIG)) sq_block (
.clk (clk),
.rst (rst | gc.sq_flush),
.lq_push (lq.push),
.lq_pop (lq.pop),
.sq (sq),
.addr_hash (addr_hash),
.potential_store_conflicts (potential_store_conflicts),
.prev_store_conflicts (lq_data_out.potential_store_conflicts),
.store_conflict (store_conflict),
.wb_snoop (wb_snoop),
.retire_ids (retire_ids),
.retire_port_valid (retire_port_valid)
);
////////////////////////////////////////////////////
//Output
//Priority is for loads over stores.
//A store is selected only when no load is ready, i.e. the load queue is empty or
//its head load has a store conflict. The "store queue full and a store is ready"
//override is currently disabled (commented out at the end of the next line).
assign load_selected = lq.valid & ~store_conflict;// & ~(sq_full & sq.valid);
assign lsq.valid = load_selected | sq.valid;
//Output packet: addr/fn3 follow the selected queue, byte enables are zeroed for
//loads, data_in always comes from the store queue, and id always comes from the
//load queue head regardless of which queue is selected
assign lsq.data_out = '{
addr : load_selected ? lq_data_out.addr : sq.data_out.addr,
load : load_selected,
store : ~load_selected,
be : load_selected ? '0 : sq.data_out.be,
fn3 : load_selected ? lq_data_out.fn3 : sq.data_out.fn3,
data_in : sq.data_out.data,
id : lq_data_out.id
};
assign lsq.sq_empty = sq.empty;
assign lsq.no_released_stores_pending = sq.no_released_stores_pending;
//Empty only when the load queue holds nothing valid and the store queue is empty
assign lsq.empty = ~lq.valid & sq.empty;
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
////////////////////////////////////////////////////
//Trace Interface
//Flags a valid load at the head of the load queue that either has a store
//conflict or coincides with a full, valid store queue. There is no driver for
//this output in this module when ENABLE_TRACE_INTERFACE is not set.
generate if (ENABLE_TRACE_INTERFACE) begin : gen_lsq_trace
assign tr_possible_load_conflict_delay = lq.valid & (store_conflict | (sq.full & sq.valid));
end
endgenerate
endmodule

View file

@ -1,69 +0,0 @@
/*
* Copyright © 2017-2019 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//One-hot occupancy tracker for a DEPTH-entry queue.
//A single set bit in a (DEPTH+1)-bit vector marks the current entry count:
//bit 0 set means empty, bit DEPTH set means full. A push without a pop moves
//the bit up by one position, a pop without a push moves it down, and any other
//combination (both or neither) leaves it where it is.
module one_hot_occupancy
    #(parameter DEPTH = 4)
    (
        input logic clk,
        input logic rst,
        input logic push,
        input logic pop,
        output logic almost_full,
        output logic full,
        output logic empty,
        output logic almost_empty,
        output logic valid
    );

    //Bit N is set when the queue holds N entries
    logic [DEPTH:0] fill_onehot;

    ////////////////////////////////////////////////////
    //Occupancy Tracking
    always_ff @ (posedge clk) begin
        if (rst)
            fill_onehot <= {{DEPTH{1'b0}}, 1'b1}; //start at "zero entries"
        else if (push & ~pop)
            fill_onehot <= {fill_onehot[DEPTH-1:0], 1'b0};
        else if (pop & ~push)
            fill_onehot <= {1'b0, fill_onehot[DEPTH:1]};
        //simultaneous push and pop, or neither: occupancy is unchanged
    end

    ////////////////////////////////////////////////////
    //Status outputs are direct taps of the one-hot vector
    assign full         = fill_onehot[DEPTH];
    assign almost_full  = fill_onehot[DEPTH-1];
    assign almost_empty = fill_onehot[1];
    assign empty        = fill_onehot[0];
    assign valid        = ~fill_onehot[0];

    ////////////////////////////////////////////////////
    //Assertions
    //Pushing while full or popping while empty is an error outside of reset
    always_ff @ (posedge clk) begin
        assert (rst | ~fill_onehot[DEPTH] | ~push) else $error("overflow");
        assert (rst | ~fill_onehot[0] | ~pop) else $error("underflow");
    end

endmodule

View file

@ -1,16 +0,0 @@
//Registers a single bit that is set when any of the sampled bits selected by
//PLACER_SEED is set. Changing PLACER_SEED changes which sample bits take part.
module placer_randomizer # (
    parameter logic [7:0] PLACER_SEED = 8'h2B
)
(
    input logic clk,
    input logic [7:0] samples,
    output logic result
);

    //Sample bits that survive the seed mask
    logic [7:0] seeded_samples;
    assign seeded_samples = samples & PLACER_SEED;

    //OR-reduce the masked samples into the registered output
    always_ff @(posedge clk)
        result <= |seeded_samples;

endmodule

View file

@ -1,109 +0,0 @@
/*
* Copyright © 2019 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
import cva5_config::*;
import cva5_types::*;
//In-use tracker for 32 registers addressed by 5-bit register addresses.
//Each register has one bit in bankA and one bit in bankB; a register is
//reported as in use when its two bits differ. Issuing an instruction makes the
//bits for its rd differ, retiring makes them equal again.
module reg_inuse (
input logic clk,
input logic rst,
input logic clr,
input logic [4:0] rs1_addr,
input logic [4:0] rs2_addr,
input logic [4:0] issued_rd_addr,
input logic [4:0] retired_rd_addr,
input logic issued,
input logic retired,
output logic rs1_inuse,
output logic rs2_inuse
);
////////////////////////////////////////////////////
//Memory organized as 2 sets of dual-ported memories
//bankA is written on issue, bankB is written on retire/clear
logic bankA [32];
logic bankB [32];
//Address counter used to walk through the memory while clr is asserted
logic [4:0] w_clear;
//Write address for bankB: the clear counter during clr, otherwise the retired rd
logic [4:0] wb_rd_addr_muxed;
//Set when a retire targets the same register that is being issued
logic wb_collision;
////////////////////////////////////////////////////
//Implementation
//////////////////////////////////////////
//For simulation both banks are initialized to 0, so every register starts
//out as not in use (0 ^ 0). Per the original note, in hardware the memories
//are cleared by GC after reset.
// synthesis translate_off
initial bankA = '{default: 0};
initial bankB = '{default: 0};
// synthesis translate_on
//After reset, clear is held for at least 32 cycles to reset memory block
assign wb_rd_addr_muxed = clr ? w_clear : retired_rd_addr;
//reset is for simulation purposes only, not needed for actual design
//The counter advances by one for every cycle in which clr is high
always_ff @ (posedge clk) begin
if (rst)
w_clear <= 0;
else
w_clear <= w_clear + 5'(clr);
end
//NOTE(review): the comparison uses retired_rd_addr directly, while the bankA read
//below uses wb_rd_addr_muxed; the two differ while clr is asserted - confirm that
//issued/retired cannot be active during the clear sequence
assign wb_collision = retired && (issued_rd_addr == retired_rd_addr);
//Issue: write bankA so that it differs from bankB for the issued rd.
//On a same-cycle retire of the same register, bankB is about to take the current
//bankA value (see below), so bankA is inverted relative to itself instead.
always_ff @ (posedge clk) begin
if (issued)
bankA[issued_rd_addr] <= wb_collision ? ~bankA[wb_rd_addr_muxed] : ~bankB[issued_rd_addr];
end
//Retire or clear: copy the bankA bit into bankB so the two bits match (not in use)
always_ff @ (posedge clk) begin
if (retired | clr)
bankB[wb_rd_addr_muxed] <= bankA[wb_rd_addr_muxed];
end
//A source register is in use when its two bank bits differ
assign rs1_inuse = bankA[rs1_addr] ^ bankB[rs1_addr];
assign rs2_inuse = bankA[rs2_addr] ^ bankB[rs2_addr];
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
////////////////////////////////////////////////////
//Simulation Only
//Flattened view of the in-use state of all 32 registers for waveform inspection
// synthesis translate_off
logic sim_inuse [32];
always_comb begin
foreach (sim_inuse[i])
sim_inuse[i] = bankA[i] ^ bankB[i];
end
// synthesis translate_on
endmodule

View file

@ -1,69 +0,0 @@
/*
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Single register file bank: one synchronous write port and NUM_READ_PORTS
//combinational read ports over a 64-entry, 32-bit wide memory.
module register_bank
    import cva5_config::*;
    import riscv_types::*;
    import cva5_types::*;
    #(
        parameter NUM_READ_PORTS = 2
    )
    (
        input logic clk,
        input logic rst,

        //Writeback
        input phys_addr_t write_addr,
        input logic [31:0] new_data,
        input logic commit,

        //Issue
        input phys_addr_t read_addr [NUM_READ_PORTS],
        output logic [31:0] data [NUM_READ_PORTS]
    );

    (* ramstyle = "MLAB, no_rw_check" *) logic [31:0] register_file_bank [64];

    ////////////////////////////////////////////////////
    //Implementation

    ////////////////////////////////////////////////////
    //Register File
    //Every entry (including entry zero) starts out as zero for simulation
    initial register_file_bank = '{default: 0};

    //Write port: store new_data at write_addr on a commit
    always_ff @ (posedge clk) begin
        if (commit)
            register_file_bank[write_addr] <= new_data;
    end

    //Read ports: each port reads its own address combinationally
    always_comb begin
        for (int port = 0; port < NUM_READ_PORTS; port++)
            data[port] = register_file_bank[read_addr[port]];
    end

    ////////////////////////////////////////////////////
    //Assertions
    //A commit must never target address zero (checked outside of reset)
    write_to_zero_reg_assertion:
        assert property (@(posedge clk) disable iff (rst) (~commit | (write_addr != 0)))
        else $error("Write to zero reg occured!");

endmodule

View file

@ -1,5 +1,5 @@
/*
* Copyright © 2020 Eric Matthews, Lesley Shannon
* Copyright © 2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -26,8 +26,12 @@ module register_file
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
#(
parameter NUM_WB_GROUPS = 2,
parameter READ_PORTS = 2,
parameter PORT_ZERO_ABSENT = 0,
parameter USE_ZERO = 0,
parameter type WB_PACKET_TYPE = wb_packet_t
)
(
@ -36,25 +40,34 @@ module register_file
input gc_outputs_t gc,
//decode write interface
input phys_addr_t decode_phys_rs_addr [REGFILE_READ_PORTS],
input logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] decode_rs_wb_group [REGFILE_READ_PORTS],
input phys_addr_t decode_phys_rs_addr [READ_PORTS],
input logic [$clog2(NUM_WB_GROUPS)-1:0] decode_rs_wb_group [READ_PORTS],
input phys_addr_t decode_phys_rd_addr,
input logic decode_advance,
input logic decode_uses_rd,
input rs_addr_t decode_rd_addr, //Ignored if USE_ZERO
//Issue interface
register_file_issue_interface.register_file rf_issue,
//Writeback
input commit_packet_t commit [CONFIG.NUM_WB_GROUPS]
input WB_PACKET_TYPE commit [NUM_WB_GROUPS],
input phys_addr_t wb_phys_addr [NUM_WB_GROUPS]
);
typedef logic [31:0] rs_data_set_t [REGFILE_READ_PORTS];
rs_data_set_t rs_data_set [CONFIG.NUM_WB_GROUPS];
localparam TOGGLE_PORTS = NUM_WB_GROUPS+1+32'(PORT_ZERO_ABSENT);
localparam DATA_WIDTH = $bits(commit[0].data);
typedef logic [DATA_WIDTH-1:0] rs_data_t [READ_PORTS];
rs_data_t regfile_rs_data [NUM_WB_GROUPS];
rs_data_t regfile_rs_data_r;
rs_data_t commit_rs_data [NUM_WB_GROUPS];
logic bypass [READ_PORTS];
logic decode_inuse [REGFILE_READ_PORTS];
logic decode_inuse_r [REGFILE_READ_PORTS];
logic decode_inuse [READ_PORTS];
genvar i;
phys_addr_t inuse_read_addr [READ_PORTS*2];
logic inuse [READ_PORTS*2];
logic toggle [TOGGLE_PORTS];
phys_addr_t toggle_addr [TOGGLE_PORTS];
////////////////////////////////////////////////////
//Implementation
@ -62,83 +75,106 @@ module register_file
//Phys register inuse
//toggle ports: decode advance, single-cycle/fetch_flush, multi-cycle commit
//read ports: rs-decode, rs-issue
always_comb begin
for (int i = 0; i < READ_PORTS; i++) begin
inuse_read_addr[i] = decode_phys_rs_addr[i];
inuse_read_addr[i+READ_PORTS] = rf_issue.phys_rs_addr[i];
decode_inuse[i] = inuse[i];
rf_issue.inuse[i] = inuse[i+READ_PORTS];
end
toggle[0] = decode_advance & decode_uses_rd & (USE_ZERO | |decode_rd_addr) & ~gc.fetch_flush;
toggle_addr[0] = decode_phys_rd_addr;
toggle[1] = rf_issue.single_cycle_or_flush;
toggle_addr[1] = rf_issue.phys_rd_addr;
for (int i = 1; i < NUM_WB_GROUPS+PORT_ZERO_ABSENT; i++) begin
toggle[i+1] = commit[i-PORT_ZERO_ABSENT].valid & (USE_ZERO | |wb_phys_addr[i-PORT_ZERO_ABSENT]);
toggle_addr[i+1] = wb_phys_addr[i-PORT_ZERO_ABSENT];
end
end
toggle_memory_set # (
.DEPTH (64),
.NUM_WRITE_PORTS (3),
.NUM_READ_PORTS (REGFILE_READ_PORTS*2),
.WRITE_INDEX_FOR_RESET (0),
.READ_INDEX_FOR_RESET (0)
.NUM_WRITE_PORTS (TOGGLE_PORTS),
.NUM_READ_PORTS (READ_PORTS*2)
) id_inuse_toggle_mem_set
(
.clk (clk),
.rst (rst),
.init_clear (gc.init_clear),
.toggle ('{
(decode_advance & decode_uses_rd & |decode_phys_rd_addr & ~gc.fetch_flush),
rf_issue.single_cycle_or_flush,
commit[1].valid
}),
.toggle_addr ('{
decode_phys_rd_addr,
rf_issue.phys_rd_addr,
commit[1].phys_addr
}),
.read_addr ('{
decode_phys_rs_addr[RS1],
decode_phys_rs_addr[RS2],
rf_issue.phys_rs_addr[RS1],
rf_issue.phys_rs_addr[RS2]
}),
.in_use ('{
decode_inuse[RS1],
decode_inuse[RS2],
rf_issue.inuse[RS1],
rf_issue.inuse[RS2]
})
.toggle (toggle),
.toggle_addr (toggle_addr),
.read_addr (inuse_read_addr),
.in_use (inuse)
);
always_ff @ (posedge clk) begin
if (decode_advance)
decode_inuse_r <= decode_inuse;
end
////////////////////////////////////////////////////
//Register Banks
//Implemented in a separate module as there is no universal tool support for inferring
//arrays of memory blocks.
generate for (i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : register_file_gen
register_bank #(.NUM_READ_PORTS(REGFILE_READ_PORTS))
reg_group (
.clk, .rst,
.write_addr(commit[i].phys_addr),
.new_data(commit[i].data),
.commit(commit[i].valid & ~gc.writeback_supress),
.read_addr(decode_phys_rs_addr),
.data(rs_data_set[i])
);
//LUTRAM implementation
//Read in decode stage, writeback groups muxed and output registered per regfile read port
generate for (genvar i = 0; i < NUM_WB_GROUPS; i++) begin : register_file_gen
lutram_1w_mr #(.DATA_TYPE(logic[DATA_WIDTH-1:0]), .DEPTH(64), .NUM_READ_PORTS(READ_PORTS))
register_file_bank (
.clk,
.waddr(wb_phys_addr[i]),
.raddr(decode_phys_rs_addr),
.ram_write(commit[i].valid & ~gc.writeback_supress),
.new_ram_data(commit[i].data),
.ram_data_out(regfile_rs_data[i])
);
end endgenerate
generate for (genvar i = 0; i < READ_PORTS; i++) begin : register_file_ff_gen
always_ff @ (posedge clk) begin
if (((~|decode_phys_rs_addr[i] & ~USE_ZERO) & decode_advance))
regfile_rs_data_r[i] <= '0;
else if (decode_advance)
regfile_rs_data_r[i] <= regfile_rs_data[decode_rs_wb_group[i]][i];
end
end endgenerate
////////////////////////////////////////////////////
//Register File Muxing
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] rs_wb_group [REGFILE_READ_PORTS];
logic bypass [REGFILE_READ_PORTS];
assign rs_wb_group = decode_advance ? decode_rs_wb_group : rf_issue.rs_wb_group;
assign bypass = decode_advance ? decode_inuse : decode_inuse_r;
//Bypass registers
//(per wb group and per read port)
always_ff @ (posedge clk) begin
for (int i = 0; i < REGFILE_READ_PORTS; i++) begin
if (decode_advance | rf_issue.inuse[i])
rf_issue.data[i] <= bypass[i] ? commit[rs_wb_group[i]].data : rs_data_set[rs_wb_group[i]][i];
end
for (int i = 0; i < NUM_WB_GROUPS; i++)
for (int j = 0; j < READ_PORTS; j++)
if (decode_advance | rf_issue.inuse[j])
commit_rs_data[i][j] <= commit[i].data;
end
////////////////////////////////////////////////////
//Register File Muxing
//Output mux per read port: selects between the per-wb_group bypass registers and the registered register file data (last mux entry)
localparam MUX_W = $clog2(NUM_WB_GROUPS+1);
typedef logic [DATA_WIDTH-1:0] issue_data_mux_t [2**MUX_W];
issue_data_mux_t issue_data_mux [READ_PORTS];
logic [MUX_W-1:0] issue_sel [READ_PORTS];
always_ff @ (posedge clk) begin
for (int i = 0; i < READ_PORTS; i++)
if (decode_advance)
issue_sel[i] <= decode_inuse[i] ? (MUX_W)'(decode_rs_wb_group[i]) : (MUX_W)'(2**MUX_W-1);
end
always_comb begin
for (int i = 0; i < READ_PORTS; i++) begin
issue_data_mux[i] = '{default: 'x};
issue_data_mux[i][2**MUX_W-1] = regfile_rs_data_r[i];
for (int j = 0; j < NUM_WB_GROUPS; j++)
issue_data_mux[i][j] = commit_rs_data[j][i];
end
end
always_comb for (int i = 0; i < READ_PORTS; i++)
rf_issue.data[i] = issue_data_mux[i][issue_sel[i]];
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
for (genvar i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : write_to_rd_zero_assertion
assert property (@(posedge clk) disable iff (rst) (commit[i].valid) |-> (commit[i].phys_addr != 0)) else $error("write to register zero");
end
endmodule

View file

@ -32,7 +32,7 @@ module register_free_list
import cva5_types::*;
#(
parameter DATA_WIDTH = 70,
parameter type DATA_TYPE = logic,
parameter FIFO_DEPTH = 4
)
(
@ -45,7 +45,7 @@ module register_free_list
localparam LOG2_FIFO_DEPTH = $clog2(FIFO_DEPTH);
//Force FIFO depth to next power of 2
(* ramstyle = "MLAB, no_rw_check" *) logic [DATA_WIDTH-1:0] lut_ram [(2**LOG2_FIFO_DEPTH)];
(* ramstyle = "MLAB, no_rw_check" *) logic [$bits(DATA_TYPE)-1:0] lut_ram [(2**LOG2_FIFO_DEPTH)];
logic [LOG2_FIFO_DEPTH-1:0] write_index;
logic [LOG2_FIFO_DEPTH-1:0] read_index;
logic [LOG2_FIFO_DEPTH:0] inflight_count;

View file

@ -27,7 +27,9 @@ module renamer
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
parameter NUM_WB_GROUPS = 2,
parameter READ_PORTS = 2,
parameter RENAME_ZERO = 0 //If set, will use issue.fp_uses_rd instead of issue.uses_rd (in addition to what the name implies)
)
(
@ -44,22 +46,21 @@ module renamer
input logic instruction_issued_with_rd,
//Retire response
input retire_packet_t retire
input retire_packet_t wb_retire
);
//////////////////////////////////////////
typedef struct packed{
typedef struct packed {
rs_addr_t rd_addr;
phys_addr_t spec_phys_addr;
phys_addr_t previous_phys_addr;
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] previous_wb_group;
logic [$clog2(NUM_WB_GROUPS)-1:0] previous_wb_group;
} renamer_metadata_t;
renamer_metadata_t inuse_list_input;
renamer_metadata_t inuse_list_output;
renamer_metadata_t inuse_table_input;
renamer_metadata_t inuse_table_output;
logic [5:0] clear_index;
fifo_interface #(.DATA_WIDTH($bits(phys_addr_t))) free_list ();
fifo_interface #(.DATA_WIDTH($bits(renamer_metadata_t))) inuse_list ();
fifo_interface #(.DATA_TYPE(phys_addr_t)) free_list ();
logic rename_valid;
logic rollback;
@ -69,10 +70,10 @@ module renamer
//Zero register is never renamed
//If a renamed destination is flushed in the issue stage, state is rolled back
//When an instruction reaches the retire stage it either commits or reverts its renaming depending on whether the instruction retires or is discarded
assign rename_valid = (~gc.fetch_flush) & decode_advance & decode.uses_rd & |decode.rd_addr;
assign rename_valid = (~gc.fetch_flush) & decode_advance & decode.uses_rd & (RENAME_ZERO | |decode.rd_addr);
//Revert physical address assignment on a flush
assign rollback = gc.fetch_flush & issue.stage_valid & issue.uses_rd & |issue.rd_addr;
assign rollback = gc.fetch_flush & issue.stage_valid & (RENAME_ZERO ? issue.fp_uses_rd : issue.uses_rd) & (RENAME_ZERO | |issue.rd_addr);
//counter for indexing through memories for post-reset clearing/initialization
lfsr #(.WIDTH(6), .NEEDS_RESET(0))
@ -84,7 +85,7 @@ module renamer
////////////////////////////////////////////////////
//Free list FIFO
register_free_list #(.DATA_WIDTH($bits(phys_addr_t)), .FIFO_DEPTH(32)) free_list_fifo (
register_free_list #(.DATA_TYPE(phys_addr_t), .FIFO_DEPTH(32)) free_list_fifo (
.clk (clk),
.rst (rst),
.fifo (free_list),
@ -92,58 +93,56 @@ module renamer
);
//During post reset init, initialize FIFO with free list (registers 32-63)
assign free_list.potential_push = (gc.init_clear & ~clear_index[5]) | (retire.valid);
assign free_list.potential_push = (gc.init_clear & ~clear_index[5]) | (wb_retire.valid);
assign free_list.push = free_list.potential_push;
assign free_list.data_in = gc.init_clear ? {1'b1, clear_index[4:0]} : (gc.writeback_supress ? inuse_list_output.spec_phys_addr : inuse_list_output.previous_phys_addr);
assign free_list.data_in = gc.init_clear ? {1'b1, clear_index[4:0]} : (gc.writeback_supress ? inuse_table_output.spec_phys_addr : inuse_table_output.previous_phys_addr);
assign free_list.pop = rename_valid;
////////////////////////////////////////////////////
//Inuse list FIFO
cva5_fifo #(.DATA_WIDTH($bits(renamer_metadata_t)), .FIFO_DEPTH(32)) inuse_list_fifo (
//Inuse table
assign inuse_table_input = '{
rd_addr : issue.rd_addr,
spec_phys_addr : issue.phys_rd_addr,
previous_phys_addr : spec_table_previous_r.phys_addr,
previous_wb_group : spec_table_previous_r.wb_group
};
lutram_1w_1r #(.DATA_TYPE(renamer_metadata_t), .DEPTH(MAX_IDS))
inuse_table (
.clk (clk),
.rst (rst),
.fifo (inuse_list)
.waddr (issue.id),
.raddr (wb_retire.id),
.ram_write (instruction_issued_with_rd),
.new_ram_data (inuse_table_input),
.ram_data_out (inuse_table_output)
);
assign inuse_list.potential_push = instruction_issued_with_rd & |issue.rd_addr;
assign inuse_list.push = inuse_list.potential_push;
assign inuse_list_input.rd_addr = issue.rd_addr;
assign inuse_list_input.spec_phys_addr = issue.phys_rd_addr;
assign inuse_list_input.previous_phys_addr = spec_table_previous_r.phys_addr;
assign inuse_list_input.previous_wb_group = spec_table_previous_r.wb_group;
assign inuse_list.data_in = inuse_list_input;
assign inuse_list_output = inuse_list.data_out;
assign inuse_list.pop = retire.valid;
////////////////////////////////////////////////////
//Speculative rd-to-phys Table
//On rollback restore the previous contents
//During post reset init, initialize rd_to_phys with in-use list (lower 32 registers)
typedef struct packed{
typedef struct packed {
phys_addr_t phys_addr;
logic [$clog2(CONFIG.NUM_WB_GROUPS)-1:0] wb_group;
logic [$clog2(NUM_WB_GROUPS)-1:0] wb_group;
} spec_table_t;
rs_addr_t spec_table_read_addr [REGFILE_READ_PORTS+1];
spec_table_t spec_table_read_data [REGFILE_READ_PORTS+1];
rs_addr_t spec_table_read_addr [READ_PORTS+1];
spec_table_t spec_table_read_data [READ_PORTS+1];
spec_table_t spec_table_next;
spec_table_t spec_table_next_mux [4];
spec_table_t spec_table_previous;
spec_table_t spec_table_previous_r;
logic spec_table_update;
rs_addr_t spec_table_write_index;
rs_addr_t spec_table_write_index_mux [4];
assign spec_table_update = rename_valid | rollback | gc.init_clear | (retire.valid & gc.writeback_supress);
assign spec_table_update = rename_valid | rollback | gc.init_clear | (wb_retire.valid & gc.writeback_supress);
logic [1:0] spec_table_sel;
one_hot_to_integer #(.C_WIDTH(4)) spec_table_sel_one_hot_to_int (
.one_hot ({gc.init_clear, rollback, (retire.valid & gc.writeback_supress), 1'b0}),
.one_hot ({gc.init_clear, rollback, (wb_retire.valid & gc.writeback_supress), 1'b0}),
.int_out (spec_table_sel)
);
@ -152,9 +151,9 @@ module renamer
assign spec_table_next_mux[0].phys_addr = free_list.data_out;
assign spec_table_next_mux[0].wb_group = decode.rd_wb_group;
//gc.writeback_supress
assign spec_table_write_index_mux[1] = inuse_list_output.rd_addr;
assign spec_table_next_mux[1].phys_addr = inuse_list_output.previous_phys_addr;
assign spec_table_next_mux[1].wb_group = inuse_list_output.previous_wb_group;
assign spec_table_write_index_mux[1] = inuse_table_output.rd_addr;
assign spec_table_next_mux[1].phys_addr = inuse_table_output.previous_phys_addr;
assign spec_table_next_mux[1].wb_group = inuse_table_output.previous_wb_group;
//rollback
assign spec_table_write_index_mux[2] = issue.rd_addr;
assign spec_table_next_mux[2].phys_addr = spec_table_previous_r.phys_addr;
@ -168,12 +167,12 @@ module renamer
assign spec_table_next = spec_table_next_mux[spec_table_sel];
assign spec_table_read_addr[0] = spec_table_write_index;
assign spec_table_read_addr[1:REGFILE_READ_PORTS] = '{decode.rs_addr[RS1], decode.rs_addr[RS2]};
assign spec_table_read_addr[1+:READ_PORTS] = decode.rs_addr;
lutram_1w_mr #(
.WIDTH($bits(spec_table_t)),
.DATA_TYPE(spec_table_t),
.DEPTH(32),
.NUM_READ_PORTS(REGFILE_READ_PORTS+1)
.NUM_READ_PORTS(READ_PORTS+1)
)
spec_table_ram (
.clk(clk),
@ -183,24 +182,20 @@ module renamer
.new_ram_data(spec_table_next),
.ram_data_out(spec_table_read_data)
);
assign spec_table_previous = spec_table_read_data[0];
always_ff @ (posedge clk) begin
if (spec_table_update) begin
spec_table_previous_r <= spec_table_previous;
end
if (spec_table_update)
spec_table_previous_r <= spec_table_read_data[0];
end
////////////////////////////////////////////////////
//Renamed Outputs
spec_table_t [REGFILE_READ_PORTS-1:0] spec_table_decode;
generate for (genvar i = 0; i < REGFILE_READ_PORTS; i++) begin : gen_renamed_addrs
assign spec_table_decode[i] = spec_table_read_data[i+1];
assign decode.phys_rs_addr[i] = spec_table_decode[i].phys_addr;
assign decode.rs_wb_group[i] = spec_table_decode[i].wb_group;
generate for (genvar i = 0; i < READ_PORTS; i++) begin : gen_renamed_addrs
assign decode.phys_rs_addr[i] = spec_table_read_data[i+1].phys_addr;
assign decode.rs_wb_group[i] = spec_table_read_data[i+1].wb_group;
end endgenerate
assign decode.phys_rd_addr = RENAME_ZERO | |decode.rd_addr ? free_list.data_out : '0;
assign decode.phys_rd_addr = |decode.rd_addr ? free_list.data_out : '0;
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
@ -208,10 +203,10 @@ module renamer
////////////////////////////////////////////////////
//Assertions
rename_rd_zero_assertion:
assert property (@(posedge clk) disable iff (rst) (decode.rd_addr == 0) |-> (decode.phys_rd_addr == 0)) else $error("rd zero renamed");
assert property (@(posedge clk) disable iff (rst || RENAME_ZERO) (decode.rd_addr == 0) |-> (decode.phys_rd_addr == 0)) else $error("rd zero renamed");
for (genvar i = 0; i < REGFILE_READ_PORTS; i++) begin : rename_rs_zero_assertion
assert property (@(posedge clk) disable iff (rst) (decode.rs_addr[i] == 0) |-> (decode.phys_rs_addr[i] == 0)) else $error("rs zero renamed");
for (genvar i = 0; i < READ_PORTS; i++) begin : rename_rs_zero_assertion
assert property (@(posedge clk) disable iff (rst || RENAME_ZERO) (decode.rs_addr[i] == 0) |-> (decode.phys_rs_addr[i] == 0)) else $error("rs zero renamed");
end
endmodule

View file

@ -1,267 +0,0 @@
/*
* Copyright © 2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Eric Matthews <ematthew@sfu.ca>
*/
//Store queue
//Circular buffer of CONFIG.SQ_DEPTH pending stores. Entries are allocated at sq_index on
//sq.push and leave from sq_oldest on sq.pop. For every entry the module also keeps:
// - an address hash, compared against addr_hash to flag potentially conflicting loads
// - a released flag, set when a retire port reports the entry's instruction ID
// - the ID whose writeback supplies the store data when the data is forwarded
// - a count of loads that were recorded as potentially depending on the entry
module store_queue
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic clk,
input logic rst,
//Load queue push/pop events, used to maintain the per-entry load counts
input logic lq_push,
input logic lq_pop,
store_queue_interface.queue sq,
//Address hash (shared by loads and stores)
input addr_hash_t addr_hash,
//hash check on adding a load to the queue
output logic [CONFIG.SQ_DEPTH-1:0] potential_store_conflicts,
//Load issue collision check
input logic [CONFIG.SQ_DEPTH-1:0] prev_store_conflicts,
output logic store_conflict,
//Writeback snooping
input wb_packet_t wb_snoop,
//Retire
input id_t retire_ids [RETIRE_PORTS],
input logic retire_port_valid [RETIRE_PORTS]
);
localparam LOG2_SQ_DEPTH = $clog2(CONFIG.SQ_DEPTH);
//Counter type is LOG2_MAX_IDS+1 bits wide
typedef logic [LOG2_MAX_IDS:0] load_check_count_t;
//One-cycle delayed copy of wb_snoop
wb_packet_t wb_snoop_r;
//Register-based memory blocks
logic [CONFIG.SQ_DEPTH-1:0] valid;
logic [CONFIG.SQ_DEPTH-1:0] valid_next;
addr_hash_t [CONFIG.SQ_DEPTH-1:0] hashes;
logic [CONFIG.SQ_DEPTH-1:0] released;
id_t [CONFIG.SQ_DEPTH-1:0] id_needed;
load_check_count_t [CONFIG.SQ_DEPTH-1:0] load_check_count;
logic [31:0] store_data_from_wb [CONFIG.SQ_DEPTH];
//LUTRAM-based memory blocks
sq_entry_t sq_entry_in;
(* ramstyle = "MLAB, no_rw_check" *) logic [$bits(sq_entry_t)-1:0] sq_entry [CONFIG.SQ_DEPTH];
(* ramstyle = "MLAB, no_rw_check" *) id_t [CONFIG.SQ_DEPTH-1:0] ids;
(* ramstyle = "MLAB, no_rw_check" *) logic [LOG2_SQ_DEPTH-1:0] sq_ids [MAX_IDS];
load_check_count_t [CONFIG.SQ_DEPTH-1:0] load_check_count_next;
//sq_index: slot written by the next push; sq_oldest: slot removed by the next pop
logic [LOG2_SQ_DEPTH-1:0] sq_index;
logic [LOG2_SQ_DEPTH-1:0] sq_index_next;
logic [LOG2_SQ_DEPTH-1:0] sq_oldest;
//One-hot slot masks for this cycle's push and pop (all zero when no push/pop)
logic [CONFIG.SQ_DEPTH-1:0] new_request_one_hot;
logic [CONFIG.SQ_DEPTH-1:0] issued_one_hot;
logic [CONFIG.SQ_DEPTH-1:0] wb_id_match;
////////////////////////////////////////////////////
//Implementation
//Write pointer advances by one on every push and wraps at the LOG2_SQ_DEPTH-bit width
assign sq_index_next = sq_index +LOG2_SQ_DEPTH'(sq.push);
always_ff @ (posedge clk) begin
if (rst)
sq_index <= 0;
else
sq_index <= sq_index_next;
end
//Read pointer advances by one on every pop
always_ff @ (posedge clk) begin
if (rst)
sq_oldest <= 0;
else
sq_oldest <= sq_oldest +LOG2_SQ_DEPTH'(sq.pop);
end
assign new_request_one_hot = CONFIG.SQ_DEPTH'(sq.push) << sq_index;
assign issued_one_hot = CONFIG.SQ_DEPTH'(sq.pop) << sq_oldest;
//A slot becomes valid when pushed and invalid when popped; a pop wins if both hit the same slot
assign valid_next = (valid | new_request_one_hot) & ~issued_one_hot;
always_ff @ (posedge clk) begin
if (rst)
valid <= '0;
else
valid <= valid_next;
end
assign sq.empty = ~|valid;
//Registered full flag: the slot the next push would use is either still valid
//or still has a non-zero load count recorded against it
always_ff @ (posedge clk) begin
if (rst)
sq.full <= 0;
else
sq.full <= valid_next[sq_index_next] | (|load_check_count_next[sq_index_next]);
end
//SQ attributes and issue data
assign sq_entry_in = '{
addr : sq.data_in.addr,
be : sq.data_in.be,
fn3 : sq.data_in.fn3,
forwarded_store : sq.data_in.forwarded_store,
data : sq.data_in.data
};
always_ff @ (posedge clk) begin
if (sq.push)
sq_entry[sq_index] <= sq_entry_in;
end
//Hash mem
always_ff @ (posedge clk) begin
if (sq.push)
hashes[sq_index] <= addr_hash;
end
//Keep count of the number of pending loads that might need a store result
//Mask out any store completing on this cycle
logic [CONFIG.SQ_DEPTH-1:0] new_load_waiting;
logic [CONFIG.SQ_DEPTH-1:0] waiting_load_completed;
always_comb begin
for (int i = 0; i < CONFIG.SQ_DEPTH; i++) begin
//Entry i is valid, is not being popped this cycle, and its stored hash equals addr_hash
potential_store_conflicts[i] = (valid[i] & ~issued_one_hot[i]) & (addr_hash == hashes[i]);
//Count up when a load is pushed against this entry, down when a load that recorded it is popped
new_load_waiting[i] = potential_store_conflicts[i] & lq_push;
waiting_load_completed[i] = prev_store_conflicts[i] & lq_pop;
load_check_count_next[i] =
load_check_count[i]
+ LOG2_MAX_IDS'(new_load_waiting[i])
- LOG2_MAX_IDS'(waiting_load_completed[i]);
end
end
always_ff @ (posedge clk) begin
if (rst)
load_check_count <= '0;
else
load_check_count <= load_check_count_next;
end
//If a potential blocking store has not been issued yet, the load is blocked until the store(s) complete
assign store_conflict = |(prev_store_conflicts & valid);
////////////////////////////////////////////////////
//ID Handling
//sq_id to global_id mem
always_ff @ (posedge clk) begin
if (sq.push)
ids[sq_index] <= sq.data_in.id;
end
// global_id to sq_id mem
always_ff @ (posedge clk) begin
if (sq.push)
sq_ids[sq.data_in.id] <= sq_index;
end
//waiting on ID mem
always_ff @ (posedge clk) begin
if (sq.push)
id_needed[sq_index] <= sq.data_in.id_needed;
end
////////////////////////////////////////////////////
//Release Handling
//For each retire port: look up the slot last recorded for the retiring ID (sq_ids), then
//confirm that the port is valid and that the slot currently holds that same ID (ids).
//The second check is needed because sq_ids is only written on push and may hold a stale slot.
logic [CONFIG.SQ_DEPTH-1:0] newly_released;
logic [LOG2_SQ_DEPTH-1:0] store_released_index [RETIRE_PORTS];
logic store_released [RETIRE_PORTS];
always_comb begin
newly_released = '0;
for (int i = 0; i < RETIRE_PORTS; i++) begin
store_released_index[i] = sq_ids[retire_ids[i]];
store_released[i] = {1'b1, ids[store_released_index[i]]} == {retire_port_valid[i], retire_ids[i]};
newly_released |= CONFIG.SQ_DEPTH'(store_released[i]) << store_released_index[i];
end
end
//Released bits are sticky until the slot is reallocated by a new push.
//There is no reset term here; consumers below qualify released with valid.
always_ff @ (posedge clk) begin
released <= (released | newly_released) & ~new_request_one_hot;
end
assign sq.no_released_stores_pending = ~|(valid & released);
////////////////////////////////////////////////////
//Forwarded Store Data
always_ff @ (posedge clk) begin
wb_snoop_r <= wb_snoop;
end
//An entry captures the (registered) writeback data when the snoop is valid, the snooped ID
//equals the entry's id_needed, and the entry has not yet been released
always_ff @ (posedge clk) begin
for (int i = 0; i < CONFIG.SQ_DEPTH; i++) begin
if ({1'b0, wb_snoop_r.valid, wb_snoop_r.id} == {released[i], 1'b1, id_needed[i]})
store_data_from_wb[i] <= wb_snoop_r.data;
end
end
////////////////////////////////////////////////////
//Store Transaction Outputs
logic [31:0] data_for_alignment;
logic [31:0] sq_data;
sq_entry_t output_entry;
assign output_entry = sq_entry[sq_oldest];
always_comb begin
//Input: ABCD
//Assuming aligned requests,
//Possible byte selections: (A/C/D, B/D, C/D, D)
//Source word is the snooped writeback data for forwarded stores, otherwise the data stored at push
data_for_alignment = output_entry.forwarded_store ? store_data_from_wb[sq_oldest] : output_entry.data;
//Low-order bytes are replicated into the byte lanes selected by addr[1:0]
sq_data[7:0] = data_for_alignment[7:0];
sq_data[15:8] = (output_entry.addr[1:0] == 2'b01) ? data_for_alignment[7:0] : data_for_alignment[15:8];
sq_data[23:16] = (output_entry.addr[1:0] == 2'b10) ? data_for_alignment[7:0] : data_for_alignment[23:16];
case(output_entry.addr[1:0])
2'b10 : sq_data[31:24] = data_for_alignment[15:8];
2'b11 : sq_data[31:24] = data_for_alignment[7:0];
default : sq_data[31:24] = data_for_alignment[31:24];
endcase
end
//The oldest entry is presented as valid only once it has been released
assign sq.valid = valid[sq_oldest] & released[sq_oldest];
assign sq.data_out = '{
addr : output_entry.addr,
be : output_entry.be,
fn3 : output_entry.fn3,
forwarded_store : output_entry.forwarded_store,
data : sq_data
};
////////////////////////////////////////////////////
//End of Implementation
////////////////////////////////////////////////////
////////////////////////////////////////////////////
//Assertions
//A push while full is only legal when a pop happens in the same cycle
sq_overflow_assertion:
assert property (@(posedge clk) disable iff (rst) sq.push |-> (~sq.full | sq.pop)) else $error("sq overflow");
//A pop is only legal when the oldest entry is valid and released (sq.valid)
fifo_underflow_assertion:
assert property (@(posedge clk) disable iff (rst) sq.pop |-> sq.valid) else $error("sq underflow");
endmodule

View file

@ -91,7 +91,7 @@ module tlb_lut_ram
genvar i;
generate
for (i=0; i<WAYS; i=i+1) begin : lut_rams
lutram_1w_1r #(.WIDTH($bits(tlb_entry_t)), .DEPTH(DEPTH))
lutram_1w_1r #(.DATA_TYPE(tlb_entry_t), .DEPTH(DEPTH))
write_port (
.clk(clk),
.waddr(tlb_addr),

View file

@ -64,9 +64,9 @@ package csr_types;
logic I; //Base
logic H;
logic G;
logic F;
logic F; //Single precision
logic E;
logic D;
logic D; //Double precision
logic C;
logic B;
logic A; //Atomic

View file

@ -33,7 +33,7 @@ package cva5_config;
////////////////////////////////////////////////////
//CSR Options
typedef struct packed {
int unsigned COUNTER_W; //CSR counter width (33-64 bits): 48-bits --> 32 days @ 100MHz
int unsigned COUNTER_W; //CSR counter width (33-64 bits): 48-bits --> 32 days @ 100MHz
bit MCYCLE_WRITEABLE;
bit MINSTR_WRITEABLE;
bit MTVEC_WRITEABLE;
@ -99,20 +99,84 @@ package cva5_config;
int unsigned DEPTH;
} tlb_config_t;
////////////////////////////////////////////////////
//Unit IDs
//To add a new unit update:
// - MAX_NUM_UNITS
// - units_t
// - unit_id_enum_t
//ensuring that the bit index in units_t matches the enum value in unit_id_enum_t
//Additionally, writeback units must be grouped before non-writeback units
//Number of entries in units_t and unit_id_enum_t below
localparam MAX_NUM_UNITS = 9;
//One include bit per unit
//The struct is packed, so the first field (IEC) is the most significant bit (bit 8) and
//the last field (ALU) is bit 0; this is what makes the bit index equal the enum value below
typedef struct packed {
bit IEC;
bit BR;
//End of Write-Back Units
bit CUSTOM;
bit FPU;
bit CSR;
bit DIV;
bit MUL;
bit LS;
bit ALU;
} units_t;
//Unit identifiers; each value is the bit position of the matching field in units_t
typedef enum bit [$clog2(MAX_NUM_UNITS)-1:0] {
IEC_ID = 8,
BR_ID = 7,
//End of Write-Back Units (insert new writeback units here)
CUSTOM_ID = 6,
FPU_ID = 5,
CSR_ID = 4,
DIV_ID = 3,
MUL_ID = 2,
LS_ID = 1,
ALU_ID = 0
} unit_id_enum_t;
//Marker value for "no writeback unit in this slot" in a writeback group configuration
//Aliased to BR_ID, the lowest-numbered unit above the write-back units
localparam unit_id_enum_t NON_WRITEBACK_ID = BR_ID;
//WB Group config
// First index is write-back port
// Second index is position within the write-back port (Priority selection, with highest priority for index 0)
// See EXAMPLE_WB_GROUP_CONFIG below for an example of how to specify the configuration
typedef unit_id_enum_t [MAX_NUM_UNITS-1:0][MAX_NUM_UNITS-1:0] wb_group_config_t;
//Returns how many slots of a single writeback group name a writeback unit
//Slots holding NON_WRITEBACK_ID are placeholders and are not counted
function int unsigned get_num_wb_units (input unit_id_enum_t [MAX_NUM_UNITS-1:0] ids);
    int unsigned num_units;
    num_units = 0;
    for (int slot = 0; slot < MAX_NUM_UNITS; slot++) begin
        if (ids[slot] != NON_WRITEBACK_ID)
            num_units = num_units + 1;
    end
    return num_units;
endfunction
//Converts the enum-based writeback grouping into one units_t bit-vector per group
//For every slot that is not NON_WRITEBACK_ID, the bit indexed by that unit's ID is set in its group
//The decode stage uses this form to determine the writeback group of the current instruction
function units_t [MAX_NUM_UNITS-1:0] get_wb_units_type_representation(input wb_group_config_t ids);
    units_t [MAX_NUM_UNITS-1:0] group_units;
    unit_id_enum_t unit_id;
    group_units = '{default : '0};
    for (int group = 0; group < MAX_NUM_UNITS; group++) begin
        for (int slot = 0; slot < MAX_NUM_UNITS; slot++) begin
            unit_id = ids[group][slot];
            if (unit_id != NON_WRITEBACK_ID)
                group_units[group][unit_id] = 1;
        end
    end
    return group_units;
endfunction
typedef struct packed {
//ISA options
bit INCLUDE_M_MODE;
bit INCLUDE_S_MODE;
bit INCLUDE_U_MODE;
bit INCLUDE_MUL;
bit INCLUDE_DIV;
bit INCLUDE_IFENCE; //local mem operations only
bit INCLUDE_CSRS;
bit INCLUDE_AMO; //cache operations only
bit INCLUDE_AMO;
bit INCLUDE_CBO; //Data cache invalidation operations
//Units
units_t INCLUDE_UNIT;
//CSR constants
csr_config_t CSRS;
//Memory Options
int unsigned SQ_DEPTH;//CAM-based reasonable max of 4
bit INCLUDE_FORWARDING_TO_STORES;
//Caches
bit INCLUDE_ICACHE;
cache_config_t ICACHE;
@ -139,6 +203,7 @@ package cva5_config;
branch_predictor_config_t BP;
//Writeback Options
int unsigned NUM_WB_GROUPS;
wb_group_config_t WB_GROUP;
} cpu_config_t;
//Function to generate derived cache parameters
@ -151,17 +216,42 @@ package cva5_config;
};
endfunction
////////////////////////////////////////////////////
//Example Config
// ALU requires its own WB port
// LS unit must be the first unit on its writeback port (LS unit does not use ack signal for timing considerations)
// Index in group is the priority order (highest priority for index zero)
// For optimal resource usage, there should be no holes in the write-back unit ordering
// (i.e. if a unit is often not included, either remove from the WB config or place at the end of a writeback group)
// Outer key is the writeback group, inner key is the position within that group
// Every unused position, and every unused group, is filled with NON_WRITEBACK_ID
localparam wb_group_config_t EXAMPLE_WB_GROUP_CONFIG = '{
0 : '{0: ALU_ID, default : NON_WRITEBACK_ID}, //Group 0: ALU only
1 : '{0: LS_ID, default : NON_WRITEBACK_ID}, //Group 1: LS in the first position
2 : '{0: MUL_ID, 1: DIV_ID, 2: CSR_ID, 3: FPU_ID, 4: CUSTOM_ID, default : NON_WRITEBACK_ID}, //Group 2: remaining units in priority order
default : '{default : NON_WRITEBACK_ID}
};
localparam cpu_config_t EXAMPLE_CONFIG = '{
//ISA options
INCLUDE_M_MODE : 1,
INCLUDE_S_MODE : 1,
INCLUDE_U_MODE : 1,
INCLUDE_MUL : 1,
INCLUDE_DIV : 1,
INCLUDE_S_MODE : 0,
INCLUDE_U_MODE : 0,
INCLUDE_UNIT : '{
ALU : 1,
LS : 1,
MUL : 1,
DIV : 1,
CSR : 1,
FPU : 1,
CUSTOM : 0,
BR : 1,
IEC : 1
},
INCLUDE_IFENCE : 1,
INCLUDE_CSRS : 1,
INCLUDE_AMO : 0,
INCLUDE_CBO : 0,
//CSR constants
CSRS : '{
MACHINE_IMPLEMENTATION_ID : 0,
@ -170,16 +260,17 @@ package cva5_config;
RESET_MTVEC : 32'h80000100,
NON_STANDARD_OPTIONS : '{
COUNTER_W : 33,
MCYCLE_WRITEABLE : 1,
MINSTR_WRITEABLE : 1,
MCYCLE_WRITEABLE : 0,
MINSTR_WRITEABLE : 0,
MTVEC_WRITEABLE : 1,
INCLUDE_MSCRATCH : 1,
INCLUDE_MSCRATCH : 0,
INCLUDE_MCAUSE : 1,
INCLUDE_MTVAL : 1
}
},
//Memory Options
SQ_DEPTH : 4,
INCLUDE_FORWARDING_TO_STORES : 1,
INCLUDE_ICACHE : 0,
ICACHE_ADDR : '{
L: 32'h80000000,
@ -249,29 +340,8 @@ package cva5_config;
RAS_ENTRIES : 8
},
//Writeback Options
NUM_WB_GROUPS : 2
};
////////////////////////////////////////////////////
//Unit IDs
typedef struct packed {
int unsigned ALU;
int unsigned LS;
int unsigned CSR;
int unsigned MUL;
int unsigned DIV;
int unsigned BR;
int unsigned IEC;
} unit_id_param_t;
localparam unit_id_param_t EXAMPLE_UNIT_IDS = '{
ALU : 0,
LS : 1,
CSR : 2,
MUL : 3,
DIV : 4,
BR : 5,
IEC : 6
NUM_WB_GROUPS : 3,
WB_GROUP : EXAMPLE_WB_GROUP_CONFIG
};
////////////////////////////////////////////////////
@ -282,17 +352,28 @@ package cva5_config;
////////////////////////////////////////////////////
//ID limit
//MAX_IDS restricted to a power of 2
localparam MAX_IDS = 8; //8 sufficient for rv32im configs
localparam MAX_IDS = 16; //8 sufficient for rv32imd configs
////////////////////////////////////////////////////
//Number of commit ports
localparam RETIRE_PORTS = 2; //min 1. (Non-powers of two supported) > 1 is recommended to allow stores to commit sooner
localparam REGFILE_READ_PORTS = 2; //min 2, for RS1 and RS2. (Non-powers of two supported)
typedef enum bit {
typedef enum {
RS1 = 0,
RS2 = 1
} rs1_index_t;
RS2 = 1,
RS3 = 2
} rs_index_t;
////////////////////////////////////////////////////
//FP number widths
//The unsuffixed parameters describe the wide (double precision) format,
//the _F suffixed parameters describe the narrow (single precision) format
localparam EXPO_WIDTH = 11; //Double precision exponent bits; 11 is compliant
localparam FRAC_WIDTH = 52; //Double precision fraction bits; 52 is compliant
localparam EXPO_WIDTH_F = 8; //Single precision exponent bits; 8 is compliant
localparam FRAC_WIDTH_F = 23; //Single precision fraction bits; 23 is compliant
localparam GRS_WIDTH = FRAC_WIDTH*2; //Should be FRAC_WIDTH*2 for full compliance
//Do not change these values, they are derived from the previous
//Each total is 1 sign bit + exponent bits + fraction bits
localparam FLEN = 1+EXPO_WIDTH+FRAC_WIDTH; //Double precision (64 bits)
localparam FLEN_F = 1+EXPO_WIDTH_F+FRAC_WIDTH_F; //Single precision (32 bits)
////////////////////////////////////////////////////
//Exceptions
@ -304,19 +385,13 @@ package cva5_config;
PRE_ISSUE_EXCEPTION = 2
} exception_sources_t;
////////////////////////////////////////////////////
//Trace Options
//Trace interface is necessary for verilator simulation
localparam ENABLE_TRACE_INTERFACE = 1;
////////////////////////////////////////////////////
//L1 Arbiter IDs
localparam L1_CONNECTIONS = 4;
typedef enum bit [1:0] {
L1_DCACHE_ID = 0,
L1_DMMU_ID = 1,
L1_ICACHE_ID = 2,
L1_ICACHE_ID = 1,
L1_DMMU_ID = 2,
L1_IMMU_ID = 3
} l1_id_t;

View file

@ -41,13 +41,6 @@ package cva5_types;
ALU_SHIFT = 2'b11
} alu_op_t;
typedef enum logic [1:0] {
ALU_LOGIC_XOR = 2'b00,
ALU_LOGIC_OR = 2'b01,
ALU_LOGIC_AND = 2'b10,
ALU_LOGIC_ADD = 2'b11
} alu_logic_op_t;
typedef struct packed{
logic valid;
exception_code_t code;
@ -77,8 +70,10 @@ package cva5_types;
rs_addr_t rd_addr;
phys_addr_t phys_rd_addr;
phys_addr_t fp_phys_rd_addr;
logic uses_rd;
logic fp_uses_rd;
logic is_multicycle;
id_t id;
exception_sources_t exception_unit;
@ -86,34 +81,6 @@ package cva5_types;
fetch_metadata_t fetch_metadata;
} issue_packet_t;
typedef struct packed{
logic [XLEN:0] in1;//contains sign padding bit for slt operation
logic [XLEN:0] in2;//contains sign padding bit for slt operation
logic [XLEN-1:0] shifter_in;
logic [31:0] constant_adder;
alu_op_t alu_op;
alu_logic_op_t logic_op;
logic [4:0] shift_amount;
logic subtract;
logic arith;//contains sign padding bit for arithmetic shift right operation
logic lshift;
} alu_inputs_t;
typedef struct packed {
logic [XLEN:0] rs1;
logic [XLEN:0] rs2;
logic [31:0] pc_p4;
logic [2:0] fn3;
logic [31:0] issue_pc;
logic issue_pc_valid;
logic jal;
logic jalr;
logic jal_jalr;
logic is_call;
logic is_return;
logic [20:0] pc_offset;
} branch_inputs_t;
typedef struct packed {
id_t id;
logic valid;
@ -138,66 +105,29 @@ package cva5_types;
logic [4:0] op;
} amo_details_t;
typedef struct packed{
logic [XLEN-1:0] rs1;
logic [XLEN-1:0] rs2;
logic [11:0] offset;
logic [2:0] fn3;
logic load;
logic store;
logic fence;
logic forwarded_store;
id_t store_forward_id;
//amo support
amo_details_t amo;
} load_store_inputs_t;
typedef struct packed{
logic [XLEN-1:0] rs1;
logic [XLEN-1:0] rs2;
logic [1:0] op;
} mul_inputs_t;
typedef struct packed{
logic [XLEN-1:0] rs1;
logic [XLEN-1:0] rs2;
logic [1:0] op;
logic reuse_result;
} div_inputs_t;
typedef struct packed{
csr_addr_t addr;
logic[1:0] op;
logic reads;
logic writes;
logic [XLEN-1:0] data;
} csr_inputs_t;
typedef struct packed{
logic [31:0] pc_p4;
logic is_ifence;
logic is_mret;
logic is_sret;
} gc_inputs_t;
typedef struct packed {
logic [31:0] addr;
logic load;
logic store;
logic cache_op;
logic [3:0] be;
logic [2:0] fn3;
logic [31:0] data;
id_t id;
logic forwarded_store;
id_t id_needed;
logic fp;
logic double;
logic [FLEN-1:0] fp_data;
} lsq_entry_t;
typedef struct packed {
logic [31:0] addr;
logic [3:0] be;
logic [2:0] fn3;
logic forwarded_store;
logic cache_op;
logic [31:0] data;
logic fp;
logic double;
logic [FLEN-1:0] fp_data;
} sq_entry_t;
typedef struct packed {
@ -212,27 +142,34 @@ package cva5_types;
logic [31:0] data;
} wb_packet_t;
typedef struct packed {
id_t id;
logic valid;
logic[FLEN-1:0] data;
} fp_wb_packet_t;
typedef struct packed{
id_t id;
logic valid;
phys_addr_t phys_addr;
logic [31:0] data;
} commit_packet_t;
typedef struct packed{
logic valid;
id_t phys_id;
logic [LOG2_RETIRE_PORTS : 0] count;
} retire_packet_t;
typedef enum logic[1:0] {
INT_DONE,
SINGLE_DONE,
DOUBLE_HOLD,
DOUBLE_DONE
} fp_ls_op_t;
typedef struct packed {
logic [31:0] addr;
logic load;
logic store;
logic cache_op;
logic [3:0] be;
logic [2:0] fn3;
logic [31:0] data_in;
id_t id;
fp_ls_op_t fp_op;
} data_access_shared_inputs_t;
typedef enum {
@ -262,46 +199,50 @@ package cva5_types;
logic external;
} interrupt_t;
typedef struct packed {
//Fetch
logic early_branch_correction;
typedef enum {
FETCH_EARLY_BR_CORRECTION_STAT,
FETCH_SUB_UNIT_STALL_STAT,
FETCH_ID_STALL_STAT,
FETCH_IC_HIT_STAT,
FETCH_IC_MISS_STAT,
FETCH_IC_ARB_STALL_STAT,
//Decode
logic operand_stall;
logic unit_stall;
logic no_id_stall;
logic no_instruction_stall;
logic other_stall;
logic instruction_issued_dec;
logic branch_operand_stall;
logic alu_operand_stall;
logic ls_operand_stall;
logic div_operand_stall;
FETCH_BP_BR_CORRECT_STAT,
FETCH_BP_BR_MISPREDICT_STAT,
FETCH_BP_RAS_CORRECT_STAT,
FETCH_BP_RAS_MISPREDICT_STAT,
//Instruction mix
logic alu_op;
logic branch_or_jump_op;
logic load_op;
logic store_op;
logic mul_op;
logic div_op;
logic misc_op;
ISSUE_NO_INSTRUCTION_STAT,
ISSUE_NO_ID_STAT,
ISSUE_FLUSH_STAT,
ISSUE_UNIT_BUSY_STAT,
ISSUE_OPERANDS_NOT_READY_STAT,
ISSUE_HOLD_STAT,
ISSUE_MULTI_SOURCE_STAT,
ISSUE_OPERAND_STALL_ON_LOAD_STAT,
ISSUE_OPERAND_STALL_ON_MULTIPLY_STAT,
ISSUE_OPERAND_STALL_ON_DIVIDE_STAT,
ISSUE_OPERAND_STALL_FOR_BRANCH_STAT,
ISSUE_STORE_WITH_FORWARDED_DATA_STAT,
ISSUE_DIVIDER_RESULT_REUSE_STAT,
//Branch Unit
logic branch_correct;
logic branch_misspredict;
logic return_correct;
logic return_misspredict;
LSU_LOAD_BLOCKED_BY_STORE_STAT,
LSU_SUB_UNIT_STALL_STAT,
LSU_DC_HIT_STAT,
LSU_DC_MISS_STAT,
LSU_DC_ARB_STALL_STAT
} stats_t;
//Load Store Unit
logic load_conflict_delay;
//Register File
logic rs1_forwarding_needed;
logic rs2_forwarding_needed;
logic rs1_and_rs2_forwarding_needed;
} cva5_trace_events_t;
typedef enum {
ALU_STAT,
BR_STAT,
MUL_STAT,
DIV_STAT,
LOAD_STAT,
STORE_STAT,
FPU_STAT,
MISC_STAT
} instruction_mix_stats_t;
typedef struct packed {
logic [31:0] pc;
@ -309,10 +250,4 @@ package cva5_types;
logic valid;
} trace_retire_outputs_t;
typedef struct packed {
logic [31:0] instruction_pc_dec;
logic [31:0] instruction_data_dec;
cva5_trace_events_t events;
} trace_outputs_t;
endpackage

View file

@ -0,0 +1,236 @@
/*
* Copyright © 2019-2023 Yuhui Gao, Chris Keilbart, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* Initial code developed under the supervision of Dr. Lesley Shannon,
* Reconfigurable Computing Lab, Simon Fraser University.
*
* Author(s):
* Yuhui Gao <yuhuig@sfu.ca>
* Chris Keilbart <ckeilbar@sfu.ca>
*/
//Type and constant definitions shared by the floating-point unit
//All widths are derived from the EXPO_WIDTH/FRAC_WIDTH/FLEN parameters imported from cva5_config
package fpu_types;
import cva5_config::*;
import cva5_types::*;
//GRS_WIDTH-bit vector (presumably the guard/round/sticky extension bits - confirm against the users)
typedef logic[GRS_WIDTH-1:0] grs_t;
//Shift amounts are as wide as the (wide format) exponent field
typedef logic[EXPO_WIDTH-1:0] fp_shift_amt_t;
//Constants
//Exponent bias of each format: 2^(exponent width - 1) - 1
localparam BIAS = 2**(EXPO_WIDTH-1) - 1;
localparam BIAS_F = 2**(EXPO_WIDTH_F-1)-1;
//Sign 0, exponent all ones, most significant fraction bit set, remaining fraction bits zero
localparam [FLEN-1:0] CANONICAL_NAN = {1'b0, {EXPO_WIDTH{1'b1}}, 1'b1, {(FRAC_WIDTH-1){1'b0}}}; //canonical NaN
typedef logic[EXPO_WIDTH-1:0] expo_d_t;
typedef logic[EXPO_WIDTH_F-1:0] expo_s_t;
typedef logic[FRAC_WIDTH-1:0] frac_d_t;
//FLEN-bit floating-point value with three overlaid views:
// raw - the plain bit vector
// d - wide format: sign, exponent, fraction (MSB to LSB)
// s - narrow format in the low FLEN_F bits, with the upper FLEN-FLEN_F bits exposed as "box"
typedef union packed {
logic[FLEN-1:0] raw;
struct packed {
logic sign;
expo_d_t expo;
frac_d_t frac;
} d;
struct packed {
logic[FLEN-FLEN_F-1:0] box;
logic sign;
expo_s_t expo;
logic[FRAC_WIDTH_F-1:0] frac;
} s;
} fp_t;
//Per-operand classification flags (infinity, signalling NaN, quiet NaN, zero - per the field names)
typedef struct packed {
logic inf;
logic snan;
logic qnan;
logic zero;
} special_case_t;
//3-bit rounding mode; some packets reuse this field to carry fn3 (see fp_wb2int_misc_inputs_t)
typedef logic[2:0] rm_t;
//Exception flags; field names and MSB-to-LSB order match the RISC-V fflags layout (NV, DZ, OF, UF, NX)
typedef struct packed {
logic nv;
logic dz;
logic of;
logic uf;
logic nx;
} fflags_t;
//Request packet for the FP preprocessing stage (per the type name):
//rounding mode, three FP operands plus one integer operand, the instruction ID,
//and one flag per operation class / operation modifier
typedef struct packed {
rm_t rm;
logic valid;
logic[4:0] unit;
fp_t rs1;
fp_t rs2;
fp_t rs3;
logic[31:0] int_rs1;
id_t id;
logic is_single;
logic is_fma;
logic is_fadd;
logic is_i2f;
logic is_d2s;
logic is_minmax;
logic is_sign_inj;
logic is_sign_inj_single;
logic is_f2i;
logic is_mv_i2f;
logic is_fcmp;
logic is_class;
logic add;
logic neg_mul;
logic conv_signed;
} fp_preprocessing_packet_t;
//Arguments for an FP addition: both operands with their hidden bits and special-case flags,
//plus the precomputed exponent difference (EXPO_WIDTH+1 bits) and swap/add control flags
typedef struct packed {
fp_t rs1;
fp_t rs2;
logic rs1_hidden;
logic rs2_hidden;
logic rs1_safe;
logic rs2_safe;
special_case_t rs1_special_case;
special_case_t rs2_special_case;
logic rs1_expo_overflow;
logic[EXPO_WIDTH:0] expo_diff;
logic add;
logic swap;
grs_t fp_add_grs;
rm_t rm;
logic single;
} fp_add_inputs_t;
//Arguments for an FP multiplication; only rs2 carries a prenormalization shift amount
typedef struct packed {
special_case_t rs1_special_case;
special_case_t rs2_special_case;
logic rs1_hidden;
logic rs2_hidden;
fp_t rs1;
fp_t rs2;
rm_t rm;
logic single;
fp_shift_amt_t rs2_prenormalize_shift_amt;
} fp_mul_inputs_t;
//Additional arguments for a fused multiply-add: the third operand and the two sign controls
typedef struct packed {
logic mul_sign;
logic add_sign;
fp_t rs3;
logic rs3_hidden;
special_case_t rs3_special_case;
} fp_fma_inputs_t;
//Combined arguments for the multiply/add path; the add and fma flags select the operation
typedef struct packed {
logic add;
logic fma;
//mul is implicit if others unset
fp_add_inputs_t add_args;
fp_fma_inputs_t fma_args;
fp_mul_inputs_t mul_args;
} fp_madd_inputs_t;
//Arguments for an FP division; both operands carry a prenormalization shift amount
typedef struct packed {
fp_t rs1;
fp_t rs2;
rm_t rm;
logic rs1_hidden;
logic rs2_hidden;
fp_shift_amt_t rs1_prenormalize_shift_amt;
fp_shift_amt_t rs2_prenormalize_shift_amt;
logic single;
special_case_t rs1_special_case;
special_case_t rs2_special_case;
} fp_div_inputs_t;
//Digit set for division
//3-bit encodings of the quotient digits -3..+2 (there is no encoding for +3)
typedef enum logic[2:0] {
NEG_THREE = 3'b010, //Only reached by subtraction when last quotient digit is -2
NEG_TWO = 3'b011,
NEG_ONE = 3'b001,
ZERO = 3'b000,
POS_ONE = 3'b101,
POS_TWO = 3'b111
} q_t;
//Arguments for an FP square root (single operand)
typedef struct packed {
fp_t rs1;
logic rs1_hidden;
special_case_t special_case;
fp_shift_amt_t rs1_prenormalize_shift_amt;
rm_t rm;
logic single;
} fp_sqrt_inputs_t;
//Arguments for the miscellaneous operations listed by the select flags below
//(named "wb2fp"; presumably the ones that write back to the FP register file - confirm against the users)
//The leading flags select the operation; the remaining fields are grouped by which operations read them
typedef struct packed {
logic i2f;
logic fminmax;
logic fsgnj;
logic fmv;
logic d2s;
//s2d is implicit if others unset
//Used by FMV
logic[31:0] int_rs;
//Used by S2D, D2S
logic rs1_hidden;
special_case_t rs1_special_case;
//Used by S2D, D2S, FSGNJ
fp_t rs1;
//Used by FSGNJ
logic fsgnj_single;
logic rs1_boxed;
logic rs2_boxed;
//Used by FSGNJ, FMINMAX
logic swap;
fp_t rs2;
//Used by FSGNJ, FMINMAX, I2F
logic single;
rm_t rm;
//Used by FMINMAX
special_case_t rs2_special_case;
//Used by I2F
logic[31:0] int_rs_abs;
logic i2f_sign;
} fp_wb2fp_misc_inputs_t;
//Arguments for the miscellaneous operations listed by the select flags below
//(named "wb2int"; presumably the ones that write back to the integer register file - confirm against the users)
//The leading flags select the operation; the remaining fields are grouped by which operations read them
typedef struct packed {
logic fclass;
logic fcmp;
logic f2i;
//fmv is implicit if others unset
//Used by f2i
logic int_less_than_1;
expo_d_t rs1_expo_unbiased;
//Used by fclass, fcmp, f2i
fp_t rs1;
//Used by fclass
logic rs1_original_hidden_bit;
//Used by fclass, fcmp
special_case_t rs1_special_case;
//Used by fcmp
special_case_t rs2_special_case;
fp_t rs2;
logic swap;
//Used by fcmp as fn3 and f2i as rounding
rm_t rm;
//Used by f2i
logic rs1_hidden;
logic is_signed;
} fp_wb2int_misc_inputs_t;
endpackage

View file

@ -64,24 +64,18 @@ interface unit_issue_interface;
modport unit (output ready, input possible_issue, new_request, id);
endinterface
interface unit_writeback_interface;
import riscv_types::*;
interface unit_writeback_interface #(parameter DATA_WIDTH = 32);
import cva5_types::*;
logic ack;
//Handshaking
logic ack;
logic done;
id_t id;
logic done;
logic [XLEN-1:0] rd;
id_t id;
logic [DATA_WIDTH-1:0] rd;
modport unit (
input ack,
output id, done, rd
);
modport wb (
output ack,
input id, done, rd
);
modport unit (input ack, output done, id, rd);
modport wb (output ack, input done, id, rd);
endinterface
interface ras_interface;
@ -114,25 +108,11 @@ interface exception_interface;
modport econtrol (input valid, code, id, tval, output ack);
endinterface
interface csr_exception_interface;
import riscv_types::*;
import cva5_types::*;
logic valid;
exception_code_t code;
logic [31:0] tval;
logic [31:0] exception_pc;
logic [31:0] trap_pc;
modport econtrol (output valid, code, tval, exception_pc, input trap_pc);
modport csr (input valid, code, tval, exception_pc, output trap_pc);
endinterface
interface fifo_interface #(parameter DATA_WIDTH = 42);//#(parameter type data_type = logic[31:0]);
interface fifo_interface #(parameter type DATA_TYPE = logic);
logic push;
logic pop;
logic [DATA_WIDTH-1:0] data_in;
logic [DATA_WIDTH-1:0] data_out;
DATA_TYPE data_in;
DATA_TYPE data_out;
logic valid;
logic full;
logic potential_push;
@ -198,12 +178,17 @@ interface load_store_queue_interface;
lsq_entry_t data_in;
logic potential_push;
logic push;
logic full;
logic load_pop;
logic store_pop;
//LSQ outputs
data_access_shared_inputs_t data_out;
logic valid;
logic pop;
data_access_shared_inputs_t load_data_out;
data_access_shared_inputs_t store_data_out;
logic load_valid;
logic store_valid;
logic full;
//LSQ status
logic sq_empty;
@ -211,12 +196,12 @@ interface load_store_queue_interface;
logic no_released_stores_pending;
modport queue (
input data_in, potential_push, push, pop,
output full, data_out, valid, sq_empty, empty, no_released_stores_pending
input data_in, potential_push, push, load_pop, store_pop,
output full, load_data_out, store_data_out, load_valid, store_valid, sq_empty, empty, no_released_stores_pending
);
modport ls (
output data_in, potential_push, push, pop,
input full, data_out, valid, sq_empty, empty, no_released_stores_pending
output data_in, potential_push, push, load_pop, store_pop,
input full, load_data_out, store_data_out, load_valid, store_valid, sq_empty, empty, no_released_stores_pending
);
endinterface
@ -228,12 +213,11 @@ interface store_queue_interface;
//Issue inputs
lsq_entry_t data_in;
logic push;
logic full;
logic pop;
sq_entry_t data_out;
logic valid;
logic pop;
logic full;
//SQ status
logic empty;
@ -249,26 +233,29 @@ interface store_queue_interface;
);
endinterface
interface writeback_store_interface;
import riscv_types::*;
import cva5_types::*;
interface cache_functions_interface #(parameter int TAG_W = 8, parameter int LINE_W = 4, parameter int SUB_LINE_W = 2);
id_t id_needed;
logic possibly_waiting;
logic waiting;
logic ack;
function logic [LINE_W-1:0] xor_mask (int WAY);
for (int i = 0; i < LINE_W; i++)
xor_mask[i] = ((WAY % 2) == 0) ? 1'b1 : 1'b0;
endfunction
logic id_done;
logic [31:0] data;
function logic [LINE_W-1:0] getHashedLineAddr (logic[31:0] addr, int WAY);
getHashedLineAddr = addr[2 + SUB_LINE_W +: LINE_W] ^ (addr[2 + SUB_LINE_W + LINE_W +: LINE_W] & xor_mask(WAY));
endfunction
//Returns the TAG_W bits of addr that sit directly above the line and sub-line index bits
function logic[TAG_W-1:0] getTag(logic[31:0] addr);
    return addr[2 + LINE_W + SUB_LINE_W +: TAG_W];
endfunction
//Returns the LINE_W line-index bits of addr (skipping the low 2 bits and the sub-line bits)
function logic [LINE_W-1:0] getTagLineAddr (logic[31:0] addr);
    return addr[2 + SUB_LINE_W +: LINE_W];
endfunction
//Returns the combined line and sub-line index bits of addr, starting at bit 2
function logic [LINE_W+SUB_LINE_W-1:0] getDataLineAddr (logic[31:0] addr);
    return addr[2 +: LINE_W + SUB_LINE_W];
endfunction
modport ls (
input id_done, data,
output id_needed, possibly_waiting ,waiting, ack
);
modport wb (
input id_needed, possibly_waiting, waiting, ack,
output id_done, data
);
endinterface
interface addr_utils_interface #(parameter bit [31:0] BASE_ADDR = 32'h00000000, parameter bit [31:0] UPPER_BOUND = 32'hFFFFFFFF);
@ -285,6 +272,7 @@ interface addr_utils_interface #(parameter bit [31:0] BASE_ADDR = 32'h00000000,
localparam int unsigned BIT_RANGE = bit_range();
/* verilator lint_off SELRANGE */
//Returns 1 when the top BIT_RANGE bits of addr equal the top BIT_RANGE bits of BASE_ADDR.
//A BIT_RANGE of zero means there are no bits to compare, so every address matches.
function address_range_check (input logic[31:0] addr);
    if (BIT_RANGE == 0)
        return 1;
    else
        return (addr[31:32-BIT_RANGE] == BASE_ADDR[31:32-BIT_RANGE]);
endfunction
@ -327,21 +315,31 @@ interface unsigned_division_interface #(parameter DATA_WIDTH = 32);
modport divider (output remainder, quotient, done, input dividend, dividend_CLZ, divisor, divisor_CLZ, divisor_is_zero, start);
endinterface
interface renamer_interface #(parameter NUM_WB_GROUPS = 2);
import cva5_config::*;
//Signal bundle between a requester and an unsigned square-root unit.
//All data signals are DATA_WIDTH bits wide.
interface unsigned_sqrt_interface #(parameter DATA_WIDTH = 32);
    //Driven through the requester modport
    logic start;
    logic [DATA_WIDTH-1:0] radicand;

    //Driven through the sqrt modport
    logic [DATA_WIDTH-1:0] remainder;
    logic [DATA_WIDTH-1:0] result;
    logic done;

    //Side that supplies the operand and consumes the result
    modport requester (
        output start, radicand,
        input done, result, remainder
    );

    //Side that performs the computation
    modport sqrt (
        input start, radicand,
        output done, result, remainder
    );
endinterface
interface renamer_interface #(parameter NUM_WB_GROUPS = 3, parameter READ_PORTS = 2);
import riscv_types::*;
import cva5_types::*;
rs_addr_t rd_addr;
rs_addr_t rs_addr [REGFILE_READ_PORTS];
rs_addr_t rs_addr [READ_PORTS];
logic [$clog2(NUM_WB_GROUPS)-1:0] rd_wb_group;
logic uses_rd;
id_t id;
phys_addr_t phys_rs_addr [REGFILE_READ_PORTS];
phys_addr_t phys_rs_addr [READ_PORTS];
phys_addr_t phys_rd_addr;
logic [$clog2(NUM_WB_GROUPS)-1:0] rs_wb_group [REGFILE_READ_PORTS];
logic [$clog2(NUM_WB_GROUPS)-1:0] rs_wb_group [READ_PORTS];
modport renamer (
input rd_addr, rs_addr, rd_wb_group, uses_rd, id,
@ -353,16 +351,14 @@ interface renamer_interface #(parameter NUM_WB_GROUPS = 2);
);
endinterface
interface register_file_issue_interface #(parameter NUM_WB_GROUPS = 2);
import cva5_config::*;
import riscv_types::*;
interface register_file_issue_interface #(parameter NUM_WB_GROUPS = 3, parameter DATA_WIDTH = 32, parameter READ_PORTS = 2);
import cva5_types::*;
//read interface
phys_addr_t phys_rs_addr [REGFILE_READ_PORTS];
logic [$clog2(NUM_WB_GROUPS)-1:0] rs_wb_group [REGFILE_READ_PORTS];
logic [31:0] data [REGFILE_READ_PORTS];
logic inuse [REGFILE_READ_PORTS];
phys_addr_t phys_rs_addr [READ_PORTS];
logic [$clog2(NUM_WB_GROUPS)-1:0] rs_wb_group [READ_PORTS];
logic [DATA_WIDTH-1:0] data [READ_PORTS];
logic inuse [READ_PORTS];
//issue write interface
phys_addr_t phys_rd_addr;
@ -377,3 +373,36 @@ interface register_file_issue_interface #(parameter NUM_WB_GROUPS = 2);
input data, inuse
);
endinterface
//Signal bundle between a floating-point unit and the writeback side.
//The unit modport drives every signal except ack; the wb modport drives only ack.
interface fp_intermediate_wb_interface;
    import cva5_types::*;
    import fpu_types::*;

    //Driven through the wb modport
    logic ack;

    //Driven through the unit modport
    id_t id;
    logic done;
    fp_t rd;
    logic expo_overflow;
    fflags_t fflags;
    rm_t rm;
    logic carry;
    logic safe;
    logic hidden;
    grs_t grs; //NOTE(review): presumably guard/round/sticky bits - confirm against fpu_types
    fp_shift_amt_t clz;
    logic right_shift;
    fp_shift_amt_t right_shift_amt;
    logic subnormal;
    logic ignore_max_expo;
    logic d2s;

    modport unit (
        input ack,
        output id, done, rd,
               expo_overflow, fflags, rm,
               hidden, grs, clz, carry, safe, subnormal,
               right_shift, right_shift_amt,
               ignore_max_expo, d2s
    );

    modport wb (
        output ack,
        input id, done, rd,
              expo_overflow, fflags, rm,
              hidden, grs, clz, carry, safe, subnormal,
              right_shift, right_shift_amt,
              ignore_max_expo, d2s
    );
endinterface

View file

@ -1,5 +1,5 @@
/*
* Copyright © 2020 Eric Matthews, Lesley Shannon
* Copyright © 2017-2020 Eric Matthews, Lesley Shannon
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -20,23 +20,7 @@
* Eric Matthews <ematthew@sfu.ca>
*/
module illegal_instruction_checker
import cva5_config::*;
import riscv_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG
)
(
input logic [31:0] instruction,
output logic illegal_instruction
);
////////////////////////////////////////////////////
//Instruction Patterns for Illegal Instruction Checking
package opcodes;
//Base ISA
localparam [31:0] BEQ = 32'b?????????????????000?????1100011;
localparam [31:0] BNE = 32'b?????????????????001?????1100011;
@ -98,6 +82,62 @@ module illegal_instruction_checker
localparam [31:0] REM = 32'b0000001??????????110?????0110011;
localparam [31:0] REMU = 32'b0000001??????????111?????0110011;
//Single precision
localparam [31:0] SP_FLW = 32'b?????????????????010?????0000111;
localparam [31:0] SP_FSW = 32'b?????????????????010?????0100111;
localparam [31:0] SP_FMADD = 32'b?????00??????????????????1000011;
localparam [31:0] SP_FMSUB = 32'b?????00??????????????????1000111;
localparam [31:0] SP_FNMSUB = 32'b?????00??????????????????1001011;
localparam [31:0] SP_FNMADD = 32'b?????00??????????????????1001111;
localparam [31:0] SP_FADD = 32'b0000000??????????????????1010011;
localparam [31:0] SP_FSUB = 32'b0000100??????????????????1010011;
localparam [31:0] SP_FMUL = 32'b0001000??????????????????1010011;
localparam [31:0] SP_FDIV = 32'b0001100??????????????????1010011;
localparam [31:0] SP_FSQRT = 32'b010110000000?????????????1010011;
localparam [31:0] SP_FSGNJ = 32'b0010000??????????000?????1010011;
localparam [31:0] SP_FSGNJN = 32'b0010000??????????001?????1010011;
localparam [31:0] SP_FSGNJX = 32'b0010000??????????010?????1010011;
localparam [31:0] SP_FMIN = 32'b0010100??????????000?????1010011;
localparam [31:0] SP_FMAX = 32'b0010100??????????001?????1010011;
localparam [31:0] SP_FCVT_W_S = 32'b110000000000?????????????1010011;
localparam [31:0] SP_FCVT_WU_S = 32'b110000000001?????????????1010011;
localparam [31:0] SP_FMV_X_W = 32'b111000000000?????000?????1010011;
localparam [31:0] SP_FEQ = 32'b1010000??????????010?????1010011;
localparam [31:0] SP_FLT = 32'b1010000??????????001?????1010011;
localparam [31:0] SP_FLE = 32'b1010000??????????000?????1010011;
localparam [31:0] SP_FCLASS = 32'b111000000000?????001?????1010011;
localparam [31:0] SP_FCVT_S_W = 32'b110100000000?????????????1010011;
localparam [31:0] SP_FCVT_S_WU = 32'b110100000001?????????????1010011;
localparam [31:0] SP_FMV_W_X = 32'b111100000000?????000?????1010011;
//Double precision
localparam [31:0] DP_FLD = 32'b?????????????????011?????0000111;
localparam [31:0] DP_FSD = 32'b?????????????????011?????0100111;
localparam [31:0] DP_FMADD = 32'b?????01??????????????????1000011;
localparam [31:0] DP_FMSUB = 32'b?????01??????????????????1000111;
localparam [31:0] DP_FNMSUB = 32'b?????01??????????????????1001011;
localparam [31:0] DP_FNMADD = 32'b?????01??????????????????1001111;
localparam [31:0] DP_FADD = 32'b0000001??????????????????1010011;
localparam [31:0] DP_FSUB = 32'b0000101??????????????????1010011;
localparam [31:0] DP_FMUL = 32'b0001001??????????????????1010011;
localparam [31:0] DP_FDIV = 32'b0001101??????????????????1010011;
localparam [31:0] DP_FSQRT = 32'b010110100000?????????????1010011;
localparam [31:0] DP_FSGNJ = 32'b0010001??????????000?????1010011;
localparam [31:0] DP_FSGNJN = 32'b0010001??????????001?????1010011;
localparam [31:0] DP_FSGNJX = 32'b0010001??????????010?????1010011;
localparam [31:0] DP_FMIN = 32'b0010101??????????000?????1010011;
localparam [31:0] DP_FMAX = 32'b0010101??????????001?????1010011;
localparam [31:0] DP_FCVT_S_D = 32'b010000000001?????????????1010011;
localparam [31:0] DP_FCVT_D_S = 32'b010000100000?????????????1010011;
localparam [31:0] DP_FEQ = 32'b1010001??????????010?????1010011;
localparam [31:0] DP_FLT = 32'b1010001??????????001?????1010011;
localparam [31:0] DP_FLE = 32'b1010001??????????000?????1010011;
localparam [31:0] DP_FCLASS = 32'b111000100000?????001?????1010011;
localparam [31:0] DP_FCVT_W_D = 32'b110000100000?????????????1010011;
localparam [31:0] DP_FCVT_WU_D = 32'b110000100001?????????????1010011;
localparam [31:0] DP_FCVT_D_W = 32'b110100100000?????????????1010011;
localparam [31:0] DP_FCVT_D_WU = 32'b110100100001?????????????1010011;
//AMO
localparam [31:0] AMO_ADD = 32'b00000????????????010?????0101111;
localparam [31:0] AMO_XOR = 32'b00100????????????010?????0101111;
@ -117,93 +157,12 @@ module illegal_instruction_checker
localparam [31:0] SFENCE_VMA = 32'b0001001??????????000000001110011;
localparam [31:0] WFI = 32'b00010000010100000000000001110011;
logic base_legal;
logic csr_legal;
logic csr_addr_base;
logic csr_addr_machine;
logic csr_addr_supervisor;
logic csr_addr_debug;
logic mul_legal;
logic div_legal;
logic ifence_legal;
logic amo_legal;
logic machine_legal;
logic supervisor_legal;
////////////////////////////////////////////////////
//Implementation
//Cache
localparam [31:0] CBO_INVAL = 32'b000000000000?????010000000001111;
localparam [31:0] CBO_CLEAN = 32'b000000000001?????010000000001111;
localparam [31:0] CBO_FLUSH = 32'b000000000010?????010000000001111;
assign base_legal = instruction inside {
BEQ, BNE, BLT, BGE, BLTU, BGEU, JALR, JAL, LUI, AUIPC,
ADDI, SLLI, SLTI, SLTIU, XORI, SRLI, SRAI, ORI, ANDI,
ADD, SUB, SLL, SLT, SLTU, XOR, SRL, SRA, OR, AND,
LB, LH, LW, LBU, LHU, SB, SH, SW,
FENCE
};
assign csr_addr_base = instruction[31:20] inside {
FFLAGS, FRM, FCSR,
CYCLE, TIME, INSTRET, CYCLEH, TIMEH, INSTRETH
};
localparam [31:0] CUSTOM = 32'b?????????????????????????1111011;
assign csr_addr_machine = instruction[31:20] inside {
MVENDORID, MARCHID, MIMPID, MHARTID,
MSTATUS, MISA, MEDELEG, MIDELEG, MIE, MTVEC, MCOUNTEREN,
MSCRATCH, MEPC, MCAUSE, MTVAL, MIP,
MCYCLE, MINSTRET, MCYCLEH, MINSTRETH
};
assign csr_addr_supervisor = instruction[31:20] inside {
SSTATUS, SEDELEG, SIDELEG, SIE, STVEC, SCOUNTEREN,
SSCRATCH, SEPC, SCAUSE, STVAL, SIP,
SATP
};
assign csr_addr_debug = instruction[31:20] inside {
DCSR, DPC, DSCRATCH
};
//Privilege check done later on instruction issue
//Here we just check instruction encoding and valid CSR address
assign csr_legal = instruction inside {
CSRRW, CSRRS, CSRRC, CSRRWI, CSRRSI, CSRRCI
} && (
csr_addr_base |
(CONFIG.INCLUDE_M_MODE & csr_addr_machine) |
(CONFIG.INCLUDE_S_MODE & csr_addr_supervisor)
);
assign mul_legal = instruction inside {
MUL, MULH, MULHSU, MULHU
};
assign div_legal = instruction inside {
DIV, DIVU, REM, REMU
};
assign ifence_legal = instruction inside {FENCE_I};
assign amo_legal = instruction inside {
AMO_ADD, AMO_XOR, AMO_OR, AMO_AND, AMO_MIN, AMO_MAX, AMO_MINU, AMO_MAXU, AMO_SWAP,
LR, SC
};
assign machine_legal = instruction inside {
MRET, ECALL, EBREAK
};
assign supervisor_legal = instruction inside {
SRET, SFENCE_VMA, WFI
};
assign illegal_instruction = ~(
base_legal |
(CONFIG.INCLUDE_CSRS & csr_legal) |
(CONFIG.INCLUDE_MUL & mul_legal) |
(CONFIG.INCLUDE_DIV & div_legal) |
(CONFIG.INCLUDE_AMO & amo_legal) |
(CONFIG.INCLUDE_IFENCE & ifence_legal) |
(CONFIG.INCLUDE_M_MODE & machine_legal) |
(CONFIG.INCLUDE_S_MODE & supervisor_legal)
);
endmodule
endpackage

View file

@ -35,7 +35,8 @@ package riscv_types;
logic [4:0] rs1_addr;
logic [2:0] fn3;
logic [4:0] rd_addr;
logic [6:0] opcode;
logic [6:2] upper_opcode;
logic [1:0] lower_opcode;
} common_instruction_t;
typedef enum logic [4:0] {
@ -51,7 +52,14 @@ package riscv_types;
FENCE_T = 5'b00011,
AMO_T = 5'b01011,
SYSTEM_T = 5'b11100,
//end of RV32I
FPU_LOAD_T = 5'b00001,
FPU_STORE_T = 5'b01001,
FPU_MADD_T = 5'b10000,
FPU_MSUB_T = 5'b10001,
FPU_NMSUB_T = 5'b10010,
FPU_NMADD_T = 5'b10011,
FPU_OP_T = 5'b10100,
//end of RV32IMD
CUSTOM_T = 5'b11110
} opcodes_trimmed_t;
@ -70,7 +78,7 @@ package riscv_types;
LS_B_fn3 = 3'b000,
LS_H_fn3 = 3'b001,
LS_W_fn3 = 3'b010,
//unused 011
LS_D_fn3 = 3'b011,
L_BU_fn3 = 3'b100,
L_HU_fn3 = 3'b101
//unused 110
@ -282,4 +290,39 @@ package riscv_types;
logic [XLEN-1:0] t5;
logic [XLEN-1:0] t6;
} simulation_named_regfile;
//Packed view of 32 FLEN-bit floating-point registers under named fields.
//Member order is significant in a packed struct, so the declaration order below is
//unchanged: ft0 is the first (most significant) member and ft11 the last.
typedef struct packed{
    //ft0-ft7
    logic [FLEN-1:0] ft0, ft1, ft2, ft3, ft4, ft5, ft6, ft7;
    //fs0-fs1
    logic [FLEN-1:0] fs0, fs1;
    //fa0-fa7
    logic [FLEN-1:0] fa0, fa1, fa2, fa3, fa4, fa5, fa6, fa7;
    //fs2-fs11
    logic [FLEN-1:0] fs2, fs3, fs4, fs5, fs6, fs7, fs8, fs9, fs10, fs11;
    //ft8-ft11
    logic [FLEN-1:0] ft8, ft9, ft10, ft11;
} fp_simulation_named_regfile;
endpackage

View file

@ -23,111 +23,57 @@
module writeback
import cva5_config::*;
import riscv_types::*;
import cva5_types::*;
# (
parameter cpu_config_t CONFIG = EXAMPLE_CONFIG,
parameter int unsigned NUM_UNITS [CONFIG.NUM_WB_GROUPS] = '{1, 4},
parameter int unsigned NUM_WB_UNITS = 5
parameter int unsigned NUM_WB_UNITS = 5,
parameter unit_id_enum_t [MAX_NUM_UNITS-1:0] WB_INDEX = '{0: ALU_ID, 1: MUL_ID, 2: DIV_ID, 3: LS_ID, 4: CSR_ID, 5: FPU_ID, default: NON_WRITEBACK_ID}
)
(
input logic clk,
input logic rst,
//Unit writeback
unit_writeback_interface.wb unit_wb[NUM_WB_UNITS],
unit_writeback_interface.wb unit_wb[MAX_NUM_UNITS],
//WB output
output wb_packet_t wb_packet [CONFIG.NUM_WB_GROUPS],
//Snoop interface (LS unit)
output wb_packet_t wb_snoop
output wb_packet_t wb_packet
);
//Writeback
logic [NUM_WB_UNITS-1:0] unit_ack [CONFIG.NUM_WB_GROUPS];
//aliases for write-back-interface signals
id_t [NUM_WB_UNITS-1:0] unit_instruction_id [CONFIG.NUM_WB_GROUPS];
logic [NUM_WB_UNITS-1:0] unit_done [CONFIG.NUM_WB_GROUPS];
id_t [NUM_WB_UNITS-1:0] unit_instruction_id;
logic [NUM_WB_UNITS-1:0] unit_done;
logic [31:0] unit_rd [NUM_WB_UNITS];
logic [NUM_WB_UNITS-1:0] unit_ack;
typedef logic [XLEN-1:0] unit_rd_t [NUM_WB_UNITS];
unit_rd_t unit_rd [CONFIG.NUM_WB_GROUPS];
//Per-ID muxes for commit buffer
logic [$clog2(NUM_WB_UNITS)-1:0] unit_sel [CONFIG.NUM_WB_GROUPS];
localparam int unsigned LOG2_NUM_WB_UNITS = (NUM_WB_UNITS == 1) ? 1 : $clog2(NUM_WB_UNITS);
logic [LOG2_NUM_WB_UNITS-1:0] unit_sel;
typedef int unsigned unit_count_t [CONFIG.NUM_WB_GROUPS];
function unit_count_t get_cumulative_unit_count();
unit_count_t counts;
int unsigned cumulative_count = 0;
for (int i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin
counts[i] = cumulative_count;
cumulative_count += NUM_UNITS[i];
end
return counts;
endfunction
localparam unit_count_t CUMULATIVE_NUM_UNITS = get_cumulative_unit_count();
genvar i, j;
////////////////////////////////////////////////////
//Implementation
//Re-assigning interface inputs to array types so that they can be dynamically indexed
generate
for (i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : gen_wb_group_unpacking
for (j = 0; j < NUM_UNITS[i]; j++) begin : gen_wb_unit_unpacking
assign unit_instruction_id[i][j] = unit_wb[CUMULATIVE_NUM_UNITS[i] + j].id;
assign unit_done[i][j] = unit_wb[CUMULATIVE_NUM_UNITS[i] + j].done;
assign unit_wb[CUMULATIVE_NUM_UNITS[i] + j].ack = unit_ack[i][j];
end
end
endgenerate
//As units are selected for commit ports based on their unit ID,
//for each additional commit port one unit can be skipped for the commit mux
generate
for (i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : gen_wb_port_grouping
for (j = 0; j < NUM_UNITS[i]; j++) begin : gen_wb_unit_grouping
assign unit_rd[i][j] = unit_wb[CUMULATIVE_NUM_UNITS[i] + j].rd;
end
end
endgenerate
generate for (genvar i = 0; i < NUM_WB_UNITS; i++) begin : gen_wb_unit_unpacking
assign unit_instruction_id[i] = unit_wb[WB_INDEX[i]].id;
assign unit_done[i] = unit_wb[WB_INDEX[i]].done;
assign unit_rd[i] = unit_wb[WB_INDEX[i]].rd;
assign unit_wb[WB_INDEX[i]].ack = unit_ack[i];
end endgenerate
////////////////////////////////////////////////////
//Unit select for register file
//Iterating through all commit ports:
// Search for complete units (in fixed unit order)
// Assign to a commit port, mask that unit and commit port
generate for (i = 0; i < CONFIG.NUM_WB_GROUPS; i++) begin : gen_wb_mux
priority_encoder
#(.WIDTH(NUM_UNITS[i]))
unit_done_encoder
(
.priority_vector (unit_done[i][NUM_UNITS[i]-1 : 0]),
.encoded_result (unit_sel[i][NUM_UNITS[i] == 1 ? 0 : ($clog2(NUM_UNITS[i])-1) : 0])
);
assign wb_packet[i].valid = |unit_done[i];
assign wb_packet [i].id = unit_instruction_id[i][unit_sel[i]];
assign wb_packet[i].data = unit_rd[i][unit_sel[i]];
priority_encoder #(.WIDTH(NUM_WB_UNITS))
unit_done_encoder
(
.priority_vector (unit_done),
.encoded_result (unit_sel)
);
assign wb_packet = '{
valid : |unit_done,
id : unit_instruction_id[unit_sel],
data : unit_rd[unit_sel]
};
assign unit_ack[i] = NUM_WB_UNITS'(wb_packet[i].valid) << unit_sel[i];
end endgenerate
////////////////////////////////////////////////////
//Store Forwarding Support
//TODO: support additional writeback groups
//currently limited to one writeback group with the
//assumption that writeback group zero has single-cycle
//operation
always_ff @ (posedge clk) begin
if (rst)
wb_snoop.valid <= 0;
else
wb_snoop.valid <= wb_packet[1].valid;
end
always_ff @ (posedge clk) begin
wb_snoop.data <= wb_packet[1].data;
wb_snoop.id <= wb_packet[1].id;
end
assign unit_ack = NUM_WB_UNITS'(wb_packet.valid) << unit_sel;
////////////////////////////////////////////////////
//End of Implementation

View file

@ -37,13 +37,10 @@ module l1_to_wishbone
localparam MAX_REQUESTS = 32;
fifo_interface #(.DATA_WIDTH($bits(l2_request_t))) request_fifo ();
fifo_interface #(.DATA_WIDTH($bits(l2_data_request_t))) data_fifo ();
fifo_interface #(.DATA_TYPE(l2_request_t)) request_fifo ();
fifo_interface #(.DATA_TYPE(l2_data_request_t)) data_fifo ();
l2_request_t request_in;
l2_request_t request;
l2_data_request_t data_request_in;
l2_data_request_t data_request;
logic request_complete;
@ -53,34 +50,34 @@ module l1_to_wishbone
assign cpu.data_full = data_fifo.full;
//Repack input attributes
assign request_in.addr = cpu.addr;
assign request_in.rnw = cpu.rnw;
assign request_in.is_amo = cpu.is_amo;
assign request_in.amo_type_or_burst_size = cpu.amo_type_or_burst_size;
assign request_in.sub_id = cpu.sub_id;
assign request_fifo.data_in = '{
addr : cpu.addr,
rnw : cpu.rnw,
is_amo : cpu.is_amo,
amo_type_or_burst_size : cpu.amo_type_or_burst_size,
sub_id : cpu.sub_id
};
assign request_fifo.push = cpu.request_push;
assign request_fifo.potential_push = cpu.request_push;
assign request_fifo.pop = request_complete;
assign request_fifo.data_in = request_in;
assign request = request_fifo.data_out;
assign data_request_in.data = cpu.wr_data;
assign data_request_in.be = cpu.wr_data_be;
assign data_fifo.push = cpu.wr_data_push;
assign data_fifo.potential_push = cpu.wr_data_push;
assign data_fifo.pop = wishbone.we & wishbone.ack;
assign data_fifo.data_in = data_request_in;
//Pack the CPU-side write data and byte enables into the data FIFO entry.
//Both fields are members of the cpu interface, so they are accessed as cpu.<member>;
//the previous `cpu_wr_data_be` named an undeclared identifier instead of cpu.wr_data_be.
assign data_fifo.data_in = '{
    data : cpu.wr_data,
    be : cpu.wr_data_be
};
assign data_request = data_fifo.data_out;
cva5_fifo #(.DATA_WIDTH($bits(l2_request_t)), .FIFO_DEPTH(MAX_REQUESTS))
cva5_fifo #(.DATA_TYPE(l2_request_t), .FIFO_DEPTH(MAX_REQUESTS))
request_fifo_block (
.clk (clk),
.rst (rst),
.fifo (request_fifo)
);
cva5_fifo #(.DATA_WIDTH($bits(l2_data_request_t)), .FIFO_DEPTH(MAX_REQUESTS))
cva5_fifo #(.DATA_TYPE(l2_data_request_t), .FIFO_DEPTH(MAX_REQUESTS))
data_fifo_block (
.clk (clk),
.rst (rst),

View file

@ -74,15 +74,29 @@ module litex_wrapper
input logic idbus_err
);
//Writeback group assignment referenced by MINIMAL_CONFIG (WB_GROUP field):
// group 0: ALU_ID
// group 1: LS_ID, CSR_ID
//Every other slot, and every other group, is filled with NON_WRITEBACK_ID.
localparam wb_group_config_t MINIMAL_WB_GROUP_CONFIG = '{
0 : '{0: ALU_ID, default : NON_WRITEBACK_ID},
1 : '{0: LS_ID, 1: CSR_ID, default : NON_WRITEBACK_ID},
default : '{default : NON_WRITEBACK_ID}
};
localparam cpu_config_t MINIMAL_CONFIG = '{
//ISA options
INCLUDE_M_MODE : 1,
INCLUDE_S_MODE : 0,
INCLUDE_U_MODE : 0,
INCLUDE_MUL : 0,
INCLUDE_DIV : 0,
INCLUDE_UNIT : '{
ALU : 1,
LS : 1,
MUL : 0,
DIV : 0,
CSR : 1,
CUSTOM : 0,
BR : 1,
IEC : 1
},
INCLUDE_IFENCE : 0,
INCLUDE_CSRS : 1,
INCLUDE_AMO : 0,
//CSR constants
CSRS : '{
@ -102,6 +116,7 @@ module litex_wrapper
},
//Memory Options
SQ_DEPTH : 2,
INCLUDE_FORWARDING_TO_STORES : 0,
INCLUDE_ICACHE : 0,
ICACHE_ADDR : '{
L: 32'h40000000,
@ -171,7 +186,15 @@ module litex_wrapper
RAS_ENTRIES : 8
},
//Writeback Options
NUM_WB_GROUPS : 2
NUM_WB_GROUPS : 2,
WB_GROUP : MINIMAL_WB_GROUP_CONFIG
};
//Writeback group assignment referenced by STANDARD_CONFIG (WB_GROUP field):
// group 0: ALU_ID
// group 1: LS_ID
// group 2: MUL_ID, DIV_ID, CSR_ID, CUSTOM_ID
//Every other slot, and every other group, is filled with NON_WRITEBACK_ID.
localparam wb_group_config_t STANDARD_WB_GROUP_CONFIG = '{
0 : '{0: ALU_ID, default : NON_WRITEBACK_ID},
1 : '{0: LS_ID, default : NON_WRITEBACK_ID},
2 : '{0: MUL_ID, 1: DIV_ID, 2: CSR_ID, 3: CUSTOM_ID, default : NON_WRITEBACK_ID},
default : '{default : NON_WRITEBACK_ID}
};
localparam cpu_config_t STANDARD_CONFIG = '{
@ -179,10 +202,17 @@ module litex_wrapper
INCLUDE_M_MODE : 1,
INCLUDE_S_MODE : 0,
INCLUDE_U_MODE : 0,
INCLUDE_MUL : 1,
INCLUDE_DIV : 1,
INCLUDE_UNIT : '{
ALU : 1,
LS : 1,
MUL : 1,
DIV : 1,
CSR : 1,
CUSTOM : 0,
BR : 1,
IEC : 1
},
INCLUDE_IFENCE : 0,
INCLUDE_CSRS : 1,
INCLUDE_AMO : 0,
//CSR constants
CSRS : '{
@ -202,6 +232,7 @@ module litex_wrapper
},
//Memory Options
SQ_DEPTH : 4,
INCLUDE_FORWARDING_TO_STORES : 1,
INCLUDE_ICACHE : 1,
ICACHE_ADDR : '{
L : 32'h00000000,
@ -271,7 +302,8 @@ module litex_wrapper
RAS_ENTRIES : 8
},
//Writeback Options
NUM_WB_GROUPS : 2
NUM_WB_GROUPS : 3,
WB_GROUP : STANDARD_WB_GROUP_CONFIG
};
function cpu_config_t config_select (input integer variant);
@ -290,7 +322,6 @@ module litex_wrapper
avalon_interface m_avalon();
local_memory_interface instruction_bram();
local_memory_interface data_bram();
trace_outputs_t tr;
interrupt_t s_interrupt;
//L2 to Wishbone

Some files were not shown because too many files have changed in this diff Show more