diff --git a/README.md b/README.md index e840e7bf..50fe8ddc 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ The options include different choices for the architecture of the multiplier uni The table below indicates performance, area and verification status for a few selected configurations. These are configurations on which lowRISC is focusing for performance evaluation and design verification (see [supported configs](ibex_configs.yaml)). -| Config | "small" | "maxperf" | "maxperf-pmp-bm" | +| Config | "small" | "maxperf" | "maxperf-pmp-bmfull" | | ------ | ------- | --------- | ---------------- | | Features | RV32IMC, 3 cycle mult | RV32IMC, 1 cycle mult, Branch target ALU, Writeback stage | RV32IMCB, 1 cycle mult, Branch target ALU, Writeback stage, 16 PMP regions | | Performance (Coremark/MHz) | 2.44 | 3.09 | 3.09 | diff --git a/azure-pipelines.yml b/azure-pipelines.yml index db7212ad..076912b5 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -159,4 +159,4 @@ jobs: ibex_configs: - small - experimental-maxperf-pmp - - experimental-maxperf-pmp-bm + - experimental-maxperf-pmp-bmfull diff --git a/doc/instruction_decode_execute.rst b/doc/instruction_decode_execute.rst index 00e090f1..1ec7f444 100644 --- a/doc/instruction_decode_execute.rst +++ b/doc/instruction_decode_execute.rst @@ -64,10 +64,46 @@ Other blocks use the ALU for the following tasks: * It computes memory addresses for loads and stores with a Reg + Imm calculation * The LSU uses it to increment addresses when performing two accesses to handle an unaligned access -Support for the RISC-V Bitmanipulation Extension (Document Version 0.92, November 8, 2019) is enabled via the parameter ``RV32B``. -This feature is *EXPERIMENTAL* and the details of its impact are not yet documented here. -Currently the Zbb, Zbs, Zbp, Zbe, Zbf, Zbc, Zbr and Zbt sub-extensions are implemented. 
-The rotate instructions `ror` and `rol` (Zbb), ternary instructions `cmov`, `cmix`, `fsl` and `fsr` as well as cyclic redundancy checks `crc32[c]` (Zbr) are completed in 2 cycles. All remaining instructions complete in one cycle. +Bit Manipulation Extension + Support for the `RISC-V Bit Manipulation Extension (Document Version 0.92, November 8, 2019) `_ is enabled via the enumerated parameter ``RV32B`` defined in :file:`rtl/ibex_pkg.sv`. + This feature is *Experimental*. + + There are two versions of the bit manipulation extension available: + The balanced implementation comprises a set of sub-extensions aiming for good benefits at a reasonable area overhead. + The full implementation comprises all 32 bit instructions defined in the extension. + The following table lists the implemented instructions in each version. + Multi-cycle instructions are completed in 2 cycles. + All remaining instructions complete in a single cycle. + + +---------------------------+---------------+--------------------------+ + | Z-Extension | Version | Multi-Cycle Instructions | + +===========================+===============+==========================+ + | Zbb (Base) | Balanced/Full | rol, ror[i] | + +---------------------------+---------------+--------------------------+ + | Zbs (Single-bit) | Balanced/Full | None | + +---------------------------+---------------+--------------------------+ + | Zbp (Permutation) | Full | None | + +---------------------------+---------------+--------------------------+ + | Zbe (Bit extract/deposit) | Full | All | + +---------------------------+---------------+--------------------------+ + | Zbf (Bit-field place) | Balanced/Full | All | + +---------------------------+---------------+--------------------------+ + | Zbc (Carry-less multiply) | Full | None | + +---------------------------+---------------+--------------------------+ + | Zbr (Crc) | Full | All | + +---------------------------+---------------+--------------------------+ + | Zbt (Ternary) | 
Balanced/Full | All | + +---------------------------+---------------+--------------------------+ + | Zb_tmp (Temporary)* | Balanced/Full | None | + +---------------------------+---------------+--------------------------+ + + * The sign-extend instructions `sext.b/sext.h` are defined but not yet classified in version 0.92 of the extension proposal. + Temporarily, they are assigned a separate Z-extension. + + The implementation of the B-extension comes with an area overhead of 1.8 to 3.0 kGE for the balanced version and 6.0 to 8.7 kGE for the full version. + That corresponds to an approximate percentage increase in area of 9 to 14 % and 25 to 30 % for the balanced and full versions respectively. + The ranges correspond to synthesis results generated using relaxed and maximum frequency targets respectively. + The designs have been synthesized using Synopsys Design Compiler targeting TSMC 65 nm technology. .. _mult-div: diff --git a/doc/integration.rst b/doc/integration.rst index d410bbf8..f7233d3f 100644 --- a/doc/integration.rst +++ b/doc/integration.rst @@ -12,21 +12,21 @@ Instantiation Template .. 
code-block:: verilog ibex_core #( - .PMPEnable ( 0 ), - .PMPGranularity ( 0 ), - .PMPNumRegions ( 4 ), - .MHPMCounterNum ( 0 ), - .MHPMCounterWidth ( 40 ), - .RV32E ( 0 ), - .RV32M ( 1 ), - .RV32B ( 0 ), - .MultiplierImplementation ( "fast" ), - .ICache ( 0 ), - .ICacheECC ( 0 ), - .SecureIbex ( 0 ), - .DbgTriggerEn ( 0 ), - .DmHaltAddr ( 32'h1A110800 ), - .DmExceptionAddr ( 32'h1A110808 ) + .PMPEnable ( 0 ), + .PMPGranularity ( 0 ), + .PMPNumRegions ( 4 ), + .MHPMCounterNum ( 0 ), + .MHPMCounterWidth ( 40 ), + .RV32E ( 0 ), + .RV32M ( 1 ), + .RV32B ( ibex_pkg::RV32BNone ), + .MultiplierImplementation ( "fast" ), + .ICache ( 0 ), + .ICacheECC ( 0 ), + .SecureIbex ( 0 ), + .DbgTriggerEn ( 0 ), + .DmHaltAddr ( 32'h1A110800 ), + .DmExceptionAddr ( 32'h1A110808 ) ) u_core ( // Clock and reset .clk_i (), @@ -74,55 +74,55 @@ Instantiation Template Parameters ---------- -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| Name | Type/Range | Default | Description | -+==============================+=============+============+=================================================================+ -| ``PMPEnable`` | bit | 0 | Enable PMP support | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``PMPGranularity`` | int (0..31) | 0 | Minimum granularity of PMP address matching | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``PMPNumRegions`` | int (1..16) | 4 | Number implemented PMP regions (ignored if PMPEnable == 0) | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``MHPMCounterNum`` | int (0..10) | 0 | Number of performance monitor event counters | 
-+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``MHPMCounterWidth`` | int (64..1) | 40 | Bit width of performance monitor event counters | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``RV32E`` | bit | 0 | RV32E mode enable (16 integer registers only) | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``RV32M`` | bit | 1 | M(ultiply) extension enable | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``RV32B`` | bit | 0 | *EXPERIMENTAL* - B(itmanipulation) extension enable: | -| | | | Currently supported Z-extensions: Zbb (base), Zbs (single-bit) | -| | | | Zbp (bit permutation), Zbe (bit extract/deposit), | -| | | | Zbf (bit-field place) Zbc (carry-less multiplication) | -| | | | Zbr (cyclic redundancy check) and Zbt (ternary) | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``BranchTargetALU`` | bit | 0 | *EXPERIMENTAL* - Enables branch target ALU removing a stall | -| | | | cycle from taken branches | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``WritebackStage`` | bit | 0 | *EXPERIMENTAL* - Enables third pipeline stage (writeback) | -| | | | improving performance of loads and stores | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``MultiplierImplementation`` | string | "fast" | Multiplicator type: | -| | | | "slow": multi-cycle slow, | -| | | | "fast": multi-cycle fast, | -| | | | "single-cycle": single-cycle | 
-+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``ICache`` | bit | 0 | *EXPERIMENTAL* Enable instruction cache instead of prefetch | -| | | | buffer | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``ICacheECC`` | bit | 0 | *EXPERIMENTAL* Enable SECDED ECC protection in ICache (if | -| | | | ICache == 1) | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``SecureIbex`` | bit | 0 | *EXPERIMENTAL* Enable various additional features targeting | -| | | | secure code execution. | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``DbgTriggerEn`` | bit | 0 | Enable debug trigger support (one trigger only) | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``DmHaltAddr`` | int | 0x1A110800 | Address to jump to when entering Debug Mode | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ -| ``DmExceptionAddr`` | int | 0x1A110808 | Address to jump to when an exception occurs while in Debug Mode | -+------------------------------+-------------+------------+-----------------------------------------------------------------+ ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| Name | Type/Range | Default | Description | ++==============================+===================+============+=================================================================+ +| ``PMPEnable`` | bit | 0 | Enable PMP support | 
++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``PMPGranularity`` | int (0..31) | 0 | Minimum granularity of PMP address matching | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``PMPNumRegions`` | int (1..16) | 4 | Number implemented PMP regions (ignored if PMPEnable == 0) | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``MHPMCounterNum`` | int (0..10) | 0 | Number of performance monitor event counters | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``MHPMCounterWidth`` | int (64..1) | 40 | Bit width of performance monitor event counters | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``RV32E`` | bit | 0 | RV32E mode enable (16 integer registers only) | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``RV32M`` | bit | 1 | M(ultiply) extension enable | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``RV32B`` | ibex_pkg::rv32b_e | RV32BNone | *EXPERIMENTAL* - B(itmanipulation) extension select: | +| | | | "RV32BNone": No B-extension | +| | | | "RV32BBalanced": Sub-extensions Zbb, Zbs, Zbf and | +| | | | Zbt | +| | | | "RV32BFull": All sub-extensions | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``BranchTargetALU`` | bit | 0 | *EXPERIMENTAL* - Enables branch target ALU removing a stall | +| | | | cycle from taken branches | 
++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``WritebackStage`` | bit | 0 | *EXPERIMENTAL* - Enables third pipeline stage (writeback) | +| | | | improving performance of loads and stores | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``MultiplierImplementation`` | string | "fast" | Multiplicator type: | +| | | | "slow": multi-cycle slow, | +| | | | "fast": multi-cycle fast, | +| | | | "single-cycle": single-cycle | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``ICache`` | bit | 0 | *EXPERIMENTAL* Enable instruction cache instead of prefetch | +| | | | buffer | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``ICacheECC`` | bit | 0 | *EXPERIMENTAL* Enable SECDED ECC protection in ICache (if | +| | | | ICache == 1) | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``SecureIbex`` | bit | 0 | *EXPERIMENTAL* Enable various additional features targeting | +| | | | secure code execution. 
| ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``DbgTriggerEn`` | bit | 0 | Enable debug trigger support (one trigger only) | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``DmHaltAddr`` | int | 0x1A110800 | Address to jump to when entering Debug Mode | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ +| ``DmExceptionAddr`` | int | 0x1A110808 | Address to jump to when an exception occurs while in Debug Mode | ++------------------------------+-------------------+------------+-----------------------------------------------------------------+ Any parameter marked *EXPERIMENTAL* when enabled is not verified to the same standard as the rest of the Ibex core. diff --git a/doc/introduction.rst b/doc/introduction.rst index e0d8c0c5..0aaa5fdc 100644 --- a/doc/introduction.rst +++ b/doc/introduction.rst @@ -46,6 +46,10 @@ In addition, the following instruction set extensions are available. - 2.0 - optional + * - **B**: *EXPERIMENTAL* Standard Extension for Bit Manipulation Instructions + - 0.92 + - optional + * - **Zicsr**: Control and Status Register Instructions - 2.0 - always enabled diff --git a/dv/riscv_compliance/ibex_riscv_compliance.core b/dv/riscv_compliance/ibex_riscv_compliance.core index 99661cb7..33b5d40a 100644 --- a/dv/riscv_compliance/ibex_riscv_compliance.core +++ b/dv/riscv_compliance/ibex_riscv_compliance.core @@ -37,10 +37,10 @@ parameters: description: "Enable the E ISA extension (reduced register set) [0/1]" RV32B: - datatype: int - paramtype: vlogparam - default: 0 - description: "Enable the B ISA extension (bit manipulation EXPERIMENTAL) [0/1]" + datatype: str + default: ibex_pkg::RV32BNone + paramtype: vlogdefine + description: "Bitmanip implementation parameter enum. 
See ibex_pkg.sv (EXPERIMENTAL)" SRAM_INIT_FILE: datatype: str diff --git a/dv/uvm/core_ibex/Makefile b/dv/uvm/core_ibex/Makefile index 3291a8c0..105b04e3 100644 --- a/dv/uvm/core_ibex/Makefile +++ b/dv/uvm/core_ibex/Makefile @@ -65,7 +65,7 @@ PMP_REGIONS := 16 # PMP Granularity PMP_GRANULARITY := 0 -IBEX_CONFIG := experimental-maxperf-pmp-bm +IBEX_CONFIG := experimental-maxperf-pmp-bmfull # TODO(udinator) - might need options for SAIL/Whisper/Spike ifeq (${ISS},ovpsim) diff --git a/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml b/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml index 9605190b..a9cb249e 100644 --- a/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml +++ b/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml @@ -643,12 +643,22 @@ +pmp_allow_addr_overlap=1 rtl_test: core_ibex_base_test -- test: riscv_bitmanip_test +- test: riscv_bitmanip_full_test desc: > - Random instruction test with supported B extension instructions + Random instruction test with supported B extension instructions in full configuration iterations: 10 gen_test: riscv_rand_instr_test gen_opts: > +enable_b_extension=1 - +enable_bitmanip_groups=zbb,zbt,zbs,zbp,zbf,zbe,zbc,zbr + +enable_bitmanip_groups=zbb,zb_tmp,zbt,zbs,zbp,zbf,zbe,zbc,zbr + rtl_test: core_ibex_base_test + +- test: riscv_bitmanip_balanced_test + desc: > + Random instruction test with supported B extension instructions in balanced configuration + iterations: 10 + gen_test: riscv_rand_instr_test + gen_opts: > + +enable_b_extension=1 + +enable_bitmanip_groups=zbb,zb_tmp,zbt,zbs,zbf rtl_test: core_ibex_base_test diff --git a/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv b/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv index 87c7a885..57c5915a 100644 --- a/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv +++ b/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv @@ -32,12 +32,16 @@ module core_ibex_tb_top; `define IBEX_MULTIPLIER_IMPLEMENTATION fast `endif + `ifndef IBEX_CFG_RV32B + `define IBEX_CFG_RV32B ibex_pkg::RV32BNone + `endif + parameter bit 
PMPEnable = 1'b0; parameter int unsigned PMPGranularity = 0; parameter int unsigned PMPNumRegions = 4; parameter bit RV32E = 1'b0; parameter bit RV32M = 1'b1; - parameter bit RV32B = 1'b0; + parameter ibex_pkg::rv32b_e RV32B = `IBEX_CFG_RV32B; parameter bit BranchTargetALU = 1'b0; parameter bit WritebackStage = 1'b0; diff --git a/examples/simple_system/ibex_simple_system.core b/examples/simple_system/ibex_simple_system.core index 46db5399..f5aa9fe2 100644 --- a/examples/simple_system/ibex_simple_system.core +++ b/examples/simple_system/ibex_simple_system.core @@ -36,10 +36,10 @@ parameters: description: "Enable the E ISA extension (reduced register set) [0/1]" RV32B: - datatype: int - paramtype: vlogparam - default: 0 - description: "Enable the B ISA extension (bit manipulation EXPERIMENTAL) [0/1]" + datatype: str + default: ibex_pkg::RV32BNone + paramtype: vlogdefine + description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)" SRAM_INIT_FILE: datatype: str diff --git a/examples/simple_system/rtl/ibex_simple_system.sv b/examples/simple_system/rtl/ibex_simple_system.sv index a37bd993..23926e1a 100644 --- a/examples/simple_system/rtl/ibex_simple_system.sv +++ b/examples/simple_system/rtl/ibex_simple_system.sv @@ -2,6 +2,10 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 +`ifndef RV32B + `define RV32B ibex_pkg::RV32BNone +`endif + /** * Ibex simple system * @@ -19,15 +23,15 @@ module ibex_simple_system ( input IO_RST_N ); - parameter bit PMPEnable = 1'b0; - parameter int unsigned PMPGranularity = 0; - parameter int unsigned PMPNumRegions = 4; - parameter bit RV32E = 1'b0; - parameter bit RV32M = 1'b1; - parameter bit RV32B = 1'b0; - parameter bit BranchTargetALU = 1'b0; - parameter bit WritebackStage = 1'b0; - parameter MultiplierImplementation = "fast"; + parameter bit PMPEnable = 1'b0; + parameter int unsigned PMPGranularity = 0; + parameter int unsigned PMPNumRegions = 4; + parameter bit RV32E = 1'b0; + parameter bit RV32M = 1'b1; + parameter ibex_pkg::rv32b_e RV32B = `RV32B; + parameter bit BranchTargetALU = 1'b0; + parameter bit WritebackStage = 1'b0; + parameter MultiplierImplementation = "fast"; logic clk_sys = 1'b0, rst_sys_n; diff --git a/ibex_configs.yaml b/ibex_configs.yaml index 6929140b..ed0913c5 100644 --- a/ibex_configs.yaml +++ b/ibex_configs.yaml @@ -10,7 +10,7 @@ small: RV32E : 0 RV32M : 1 - RV32B : 0 + RV32B : "ibex_pkg::RV32BNone" BranchTargetALU : 0 WritebackStage : 0 MultiplierImplementation : "fast" @@ -28,7 +28,7 @@ small: experimental-maxperf: RV32E : 0 RV32M : 1 - RV32B : 0 + RV32B : "ibex_pkg::RV32BNone" BranchTargetALU : 1 WritebackStage : 1 MultiplierImplementation : "single-cycle" @@ -40,7 +40,7 @@ experimental-maxperf: experimental-maxperf-pmp: RV32E : 0 RV32M : 1 - RV32B : 0 + RV32B : "ibex_pkg::RV32BNone" BranchTargetALU : 1 WritebackStage : 1 MultiplierImplementation : "single-cycle" @@ -48,14 +48,27 @@ experimental-maxperf-pmp: PMPGranularity : 0 PMPNumRegions : 16 -# experimental-maxperf-pmp config above with bitmanip extension -experimental-maxperf-pmp-bm: +# experimental-maxperf-pmp config above with balanced bitmanip extension +experimental-maxperf-pmp-bmbalanced: RV32E : 0 RV32M : 1 - RV32B : 1 + RV32B : "ibex_pkg::RV32BBalanced" BranchTargetALU : 1 
WritebackStage : 1 MultiplierImplementation : "single-cycle" PMPEnable : 1 PMPGranularity : 0 PMPNumRegions : 16 + +# experimental-maxperf-pmp config above with full bitmanip extension +experimental-maxperf-pmp-bmfull: + RV32E : 0 + RV32M : 1 + RV32B : "ibex_pkg::RV32BFull" + BranchTargetALU : 1 + WritebackStage : 1 + MultiplierImplementation : "single-cycle" + PMPEnable : 1 + PMPGranularity : 0 + PMPNumRegions : 16 + diff --git a/ibex_core.core b/ibex_core.core index 0c45f590..6c640658 100644 --- a/ibex_core.core +++ b/ibex_core.core @@ -72,9 +72,10 @@ parameters: paramtype: vlogparam RV32B: - datatype: int - default: 0 - paramtype: vlogparam + datatype: str + default: ibex_pkg::RV32BNone + paramtype: vlogdefine + description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)" MultiplierImplementation: datatype: str diff --git a/ibex_core_tracing.core b/ibex_core_tracing.core index 619436b1..765bbe67 100644 --- a/ibex_core_tracing.core +++ b/ibex_core_tracing.core @@ -43,9 +43,10 @@ parameters: paramtype: vlogparam RV32B: - datatype: int - default: 0 - paramtype: vlogparam + datatype: str + default: ibex_pkg::RV32BNone + paramtype: vlogdefine + description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)" MultiplierImplementation: datatype: str diff --git a/lint/verilator_waiver.vlt b/lint/verilator_waiver.vlt index ee041ae2..8049d659 100644 --- a/lint/verilator_waiver.vlt +++ b/lint/verilator_waiver.vlt @@ -37,12 +37,18 @@ lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'shift_amt_compl'[5]*" // cleaner to write all bits even if not all are used lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'shift_result_ext'[32]*" -// Signal is not used for RV32B == 0: imd_val_q_i +// Signal is not used for RV32B == RV32BNone: imd_val_q_i // // No ALU multicycle instructions exist to use the intermediate value register, // if bitmanipulation extension is not enabled. 
lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'imd_val_q_i'" +// Signal is not used for RV32B == RV32BNone: butterfly_result, invbutterfly_result +// +// Need to be declared; referenced in unused if-generate block +lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'butterfly_result'" +lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'invbutterfly_result'" + // Bits of signal are not used: fetch_addr_n[0] // cleaner to write all bits even if not all are used lint_off -rule UNUSED -file "*/rtl/ibex_if_stage.sv" -match "*'fetch_addr_n'[0]*" diff --git a/rtl/ibex_alu.sv b/rtl/ibex_alu.sv index 0f593285..faa154c5 100644 --- a/rtl/ibex_alu.sv +++ b/rtl/ibex_alu.sv @@ -7,7 +7,7 @@ * Arithmetic logic unit */ module ibex_alu #( - parameter bit RV32B = 1'b0 + parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone ) ( input ibex_pkg::alu_op_e operator_i, input logic [31:0] operand_a_i, @@ -20,9 +20,9 @@ module ibex_alu #( input logic multdiv_sel_i, - input logic [31:0] imd_val_q_i, - output logic [31:0] imd_val_d_o, - output logic imd_val_we_o, + input logic [31:0] imd_val_q_i[2], + output logic [31:0] imd_val_d_o[2], + output logic [1:0] imd_val_we_o, output logic [31:0] adder_result_o, output logic [33:0] adder_result_ext_o, @@ -241,16 +241,16 @@ module ibex_alu #( logic [31:0] bfp_result; // bfp: shares the shifter structure to compute bfp_mask << bfp_off - assign bfp_op = RV32B ? (operator_i == ALU_BFP) : 1'b0; + assign bfp_op = (RV32B != RV32BNone) ? (operator_i == ALU_BFP) : 1'b0; assign bfp_len = {~(|operand_b_i[27:24]), operand_b_i[27:24]}; // len = 0 encodes for len = 16 assign bfp_off = operand_b_i[20:16]; - assign bfp_mask = RV32B ? ~(32'hffff_ffff << bfp_len) : '0; + assign bfp_mask = (RV32B != RV32BNone) ? ~(32'hffff_ffff << bfp_len) : '0; for (genvar i=0; i<32; i++) begin : gen_rev_bfp_mask assign bfp_mask_rev[i] = bfp_mask[31-i]; end - assign bfp_result = - RV32B ? 
(~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0; + assign bfp_result =(RV32B != RV32BNone) ? + (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0; // bit shift_amt[5]: word swap bit: only considered for FSL/FSR. // if set, reverse operations in first and second cycle. @@ -267,9 +267,8 @@ module ibex_alu #( end end - // single-bit mode: shift - assign shift_sbmode = RV32B ? + assign shift_sbmode = (RV32B != RV32BNone) ? (operator_i == ALU_SBSET) | (operator_i == ALU_SBCLR) | (operator_i == ALU_SBINV) : 1'b0; // left shift if this is: @@ -284,13 +283,13 @@ module ibex_alu #( unique case (operator_i) ALU_SLL: shift_left = 1'b1; ALU_SLO, - ALU_BFP: shift_left = RV32B ? 1'b1 : 1'b0; - ALU_ROL: shift_left = RV32B ? instr_first_cycle_i : 0; - ALU_ROR: shift_left = RV32B ? ~instr_first_cycle_i : 0; - ALU_FSL: shift_left = - RV32B ? (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0; - ALU_FSR: shift_left = - RV32B ? (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0; + ALU_BFP: shift_left = (RV32B != RV32BNone) ? 1'b1 : 1'b0; + ALU_ROL: shift_left = (RV32B != RV32BNone) ? instr_first_cycle_i : 0; + ALU_ROR: shift_left = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 0; + ALU_FSL: shift_left = (RV32B != RV32BNone) ? + (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0; + ALU_FSR: shift_left = (RV32B != RV32BNone) ? + (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0; default: shift_left = 1'b0; endcase if (shift_sbmode) begin @@ -298,26 +297,26 @@ module ibex_alu #( end end - assign shift_arith = (operator_i == ALU_SRA); - assign shift_ones = RV32B ? (operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0; - assign shift_funnel = RV32B ? (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0; + assign shift_arith = (operator_i == ALU_SRA); + assign shift_ones = + (RV32B != RV32BNone) ? 
(operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0; + assign shift_funnel = + (RV32B != RV32BNone) ? (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0; // shifter structure. always_comb begin - // select shifter input // for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen. - if (shift_sbmode) begin - shift_result = 32'h8000_0000; // rev(32'h1) + if (RV32B == RV32BNone) begin + shift_result = shift_left ? operand_a_rev : operand_a_i; end else begin unique case (1'b1) bfp_op: shift_result = bfp_mask_rev; - shift_left: shift_result = operand_a_rev; - default: shift_result = operand_a_i; + shift_sbmode: shift_result = 32'h8000_0000; + default: shift_result = shift_left ? operand_a_rev : operand_a_i; endcase end - shift_result_ext = $signed({shift_ones | (shift_arith & shift_result[31]), shift_result}) >>> shift_amt[4:0]; @@ -350,8 +349,8 @@ module ibex_alu #( // Logic-with-negate OPs (RV32B Ops) ALU_XNOR, ALU_ORN, - ALU_ANDN: bwlogic_op_b_negate = RV32B ? 1'b1 : 1'b0; - ALU_CMIX: bwlogic_op_b_negate = RV32B ? ~instr_first_cycle_i : 1'b0; + ALU_ANDN: bwlogic_op_b_negate = (RV32B != RV32BNone) ? 1'b1 : 1'b0; + ALU_CMIX: bwlogic_op_b_negate = (RV32B != RV32BNone) ? 
~instr_first_cycle_i : 1'b0; default: bwlogic_op_b_negate = 1'b0; endcase end @@ -373,19 +372,19 @@ module ibex_alu #( endcase end + logic [5:0] bitcnt_result; + logic [31:0] minmax_result; + logic [31:0] pack_result; + logic [31:0] sext_result; + logic [31:0] singlebit_result; + logic [31:0] rev_result; logic [31:0] shuffle_result; logic [31:0] butterfly_result; logic [31:0] invbutterfly_result; - - logic [31:0] minmax_result; - logic [5:0] bitcnt_result; - logic [31:0] pack_result; - logic [31:0] sext_result; - logic [31:0] multicycle_result; - logic [31:0] singlebit_result; logic [31:0] clmul_result; + logic [31:0] multicycle_result; - if (RV32B) begin : g_alu_rvb + if (RV32B != RV32BNone) begin : g_alu_rvb ///////////////// // Bitcounting // @@ -404,6 +403,8 @@ module ibex_alu #( logic [31:0] bitcnt_mask_op; logic [31:0] bitcnt_bit_mask; logic [ 5:0] bitcnt_partial [32]; + logic [31:0] bitcnt_partial_lsb_d; + logic [31:0] bitcnt_partial_msb_d; assign bitcnt_ctz = operator_i == ALU_CTZ; @@ -427,6 +428,8 @@ module ibex_alu #( bitcnt_bit_mask = ~bitcnt_bit_mask; end + assign zbe_op = (operator_i == ALU_BEXT) | (operator_i == ALU_BDEP); + always_comb begin case(1'b1) zbe_op: bitcnt_bits = operand_b_i; @@ -517,524 +520,12 @@ module ibex_alu #( end end - /////////////// - // Butterfly // - /////////////// - - // The butterfly / inverse butterfly network is shared between bext/bdep (zbe)instructions - // respectively and grev / gorc instructions (zbp). - // For bdep, the control bits mask of a local left region is generated by - // the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the number - // of ones in the deposit bitmask to the right of the segment. n hereby denotes the width - // of the according segment. The bitmask for a pertaining local right region is equal to the - // corresponding local left region. Bext uses an analogue inverse process. - // Consider the following 8-bit example. For details, see Hilewitz et al. 
"Fast Bit Gather, - // Bit Scatter and Bit Permuation Instructions for Commodity Microprocessors", (2008). - - // 8-bit example: (Hilewitz et al.) - // Consider the instruction bdep operand_a_i deposit_mask - // Let operand_a_i = 8'babcd_efgh - // deposit_mask = 8'b1010_1101 - // - // control bitmask for stage 1: - // - number of ones in the right half of the deposit bitmask: 3 - // - width of the segment: 4 - // - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000 - // - // control bitmask: c3 c2 c1 c0 c3 c2 c1 c0 - // 1 0 0 0 1 0 0 0 - // <- L -----> <- R -----> - // operand_a_i a b c d e f g h - // :\ | | | /: | | | - // : +|---|--|-+ : | | | - // :/ | | | \: | | | - // stage 1 e b c d a f g h - // - // control bitmask: c3 c2 c3 c2 c1 c0 c1 c0 - // 1 1 1 1 1 0 1 0 - // :\ :\ /: /: :\ | /: | - // : +:-+-:+ : : +|-+ : | - // :/ :/ \: \: :/ | \: | - // stage 2 c d e b g f a h - // L R L R L R L R - // control bitmask: c3 c3 c2 c2 c1 c1 c0 c0 - // 1 1 0 0 1 1 0 0 - // :\/: | | :\/: | | - // : : | | : : | | - // :/\: | | :/\: | | - // stage 3 d c e b f g a h - // & deposit bitmask: 1 0 1 0 1 1 0 1 - // result: d 0 e 0 f g 0 h - - assign zbe_op = (operator_i == ALU_BEXT) | (operator_i == ALU_BDEP); - - logic [31:0] butterfly_mask_l[5]; - logic [31:0] butterfly_mask_r[5]; - logic [31:0] butterfly_mask_not[5]; - logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap - - // bext / bdep - logic [31:0] butterfly_zbe_mask_l[5]; - logic [31:0] butterfly_zbe_mask_r[5]; - logic [31:0] butterfly_zbe_mask_not[5]; - - // grev / gorc - logic [31:0] butterfly_zbp_mask_l[5]; - logic [31:0] butterfly_zbp_mask_r[5]; - logic [31:0] butterfly_zbp_mask_not[5]; - - logic grev_op; - logic gorc_op; - logic zbp_op; - - // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage - `define _N(stg) (16 >> stg) - - // bext / bdep control bit generation - for (genvar stg=0; stg<5; stg++) begin : gen_stage - // number of segs: 2** stg - for (genvar seg=0; seg<2**stg; seg++) begin 
: gen_segment - - assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] = - {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} << - bitcnt_partial[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0]; - - assign butterfly_zbe_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] - = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]; - - assign butterfly_zbe_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)] - = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]; - - assign butterfly_zbe_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)] = '0; - assign butterfly_zbe_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0; - end - end - `undef _N - - for (genvar stg=0; stg<5; stg++) begin : gen_zbe_mask - assign butterfly_zbe_mask_not[stg] = - ~(butterfly_zbe_mask_l[stg] | butterfly_zbe_mask_r[stg]); - end - - // grev / gorc control bit generation - assign butterfly_zbp_mask_l[0] = shift_amt[4] ? 32'hffff_0000 : 32'h0000_0000; - assign butterfly_zbp_mask_r[0] = shift_amt[4] ? 32'h0000_ffff : 32'h0000_0000; - assign butterfly_zbp_mask_not[0] = - !shift_amt[4] || (shift_amt[4] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000; - - assign butterfly_zbp_mask_l[1] = shift_amt[3] ? 32'hff00_ff00 : 32'h0000_0000; - assign butterfly_zbp_mask_r[1] = shift_amt[3] ? 32'h00ff_00ff : 32'h0000_0000; - assign butterfly_zbp_mask_not[1] = - !shift_amt[3] || (shift_amt[3] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000; - - assign butterfly_zbp_mask_l[2] = shift_amt[2] ? 32'hf0f0_f0f0 : 32'h0000_0000; - assign butterfly_zbp_mask_r[2] = shift_amt[2] ? 32'h0f0f_0f0f : 32'h0000_0000; - assign butterfly_zbp_mask_not[2] = - !shift_amt[2] || (shift_amt[2] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000; - - assign butterfly_zbp_mask_l[3] = shift_amt[1] ? 32'hcccc_cccc : 32'h0000_0000; - assign butterfly_zbp_mask_r[3] = shift_amt[1] ? 32'h3333_3333 : 32'h0000_0000; - assign butterfly_zbp_mask_not[3] = - !shift_amt[1] || (shift_amt[1] && gorc_op) ? 
32'hffff_ffff : 32'h0000_0000; - - assign butterfly_zbp_mask_l[4] = shift_amt[0] ? 32'haaaa_aaaa : 32'h0000_0000; - assign butterfly_zbp_mask_r[4] = shift_amt[0] ? 32'h5555_5555 : 32'h0000_0000; - assign butterfly_zbp_mask_not[4] = - !shift_amt[0] || (shift_amt[0] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000; - - // grev / gorc instructions - assign grev_op = RV32B ? (operator_i == ALU_GREV) : 1'b0; - assign gorc_op = RV32B ? (operator_i == ALU_GORC) : 1'b0; - assign zbp_op = grev_op | gorc_op; - - // select set of masks: - assign butterfly_mask_l = zbp_op ? butterfly_zbp_mask_l : butterfly_zbe_mask_l; - assign butterfly_mask_r = zbp_op ? butterfly_zbp_mask_r : butterfly_zbe_mask_r; - assign butterfly_mask_not = zbp_op ? butterfly_zbp_mask_not : butterfly_zbe_mask_not; - - always_comb begin - butterfly_result = operand_a_i; - - butterfly_result = butterfly_result & butterfly_mask_not[0] | - ((butterfly_result & butterfly_mask_l[0]) >> 16)| - ((butterfly_result & butterfly_mask_r[0]) << 16); - - butterfly_result = butterfly_result & butterfly_mask_not[1] | - ((butterfly_result & butterfly_mask_l[1]) >> 8)| - ((butterfly_result & butterfly_mask_r[1]) << 8); - - butterfly_result = butterfly_result & butterfly_mask_not[2] | - ((butterfly_result & butterfly_mask_l[2]) >> 4)| - ((butterfly_result & butterfly_mask_r[2]) << 4); - - butterfly_result = butterfly_result & butterfly_mask_not[3] | - ((butterfly_result & butterfly_mask_l[3]) >> 2)| - ((butterfly_result & butterfly_mask_r[3]) << 2); - - butterfly_result = butterfly_result & butterfly_mask_not[4] | - ((butterfly_result & butterfly_mask_l[4]) >> 1)| - ((butterfly_result & butterfly_mask_r[4]) << 1); - - if (!zbp_op) begin - butterfly_result = butterfly_result & operand_b_i; - end - end - - always_comb begin - invbutterfly_result = operand_a_i & operand_b_i; - - invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] | - ((invbutterfly_result & butterfly_mask_l[4]) >> 1)| - ((invbutterfly_result & 
butterfly_mask_r[4]) << 1); - - invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] | - ((invbutterfly_result & butterfly_mask_l[3]) >> 2)| - ((invbutterfly_result & butterfly_mask_r[3]) << 2); - - invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] | - ((invbutterfly_result & butterfly_mask_l[2]) >> 4)| - ((invbutterfly_result & butterfly_mask_r[2]) << 4); - - invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] | - ((invbutterfly_result & butterfly_mask_l[1]) >> 8)| - ((invbutterfly_result & butterfly_mask_r[1]) << 8); - - invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] | - ((invbutterfly_result & butterfly_mask_l[0]) >> 16)| - ((invbutterfly_result & butterfly_mask_r[0]) << 16); - end - - ///////////////////////// - // Shuffle / Unshuffle // - ///////////////////////// - - localparam logic [31:0] SHUFFLE_MASK_L [4] = - '{32'h4444_4444, 32'h3030_3030, 32'h0f00_0f00, 32'h00ff_0000}; - localparam logic [31:0] SHUFFLE_MASK_R [4] = - '{32'h2222_2222, 32'h0c0c_0c0c, 32'h00f0_00f0, 32'h0000_ff00}; - - localparam logic [31:0] FLIP_MASK_L [4] = - '{32'h1100_0000, 32'h4411_0000, 32'h0044_0000, 32'h2200_1100}; - localparam logic [31:0] FLIP_MASK_R [4] = - '{32'h0000_0088, 32'h0000_8822, 32'h0000_2200, 32'h0088_0044}; - - logic [31:0] SHUFFLE_MASK_NOT [4]; - for(genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not - assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]); - end - - logic shuffle_flip; - assign shuffle_flip = operator_i == ALU_UNSHFL; - - logic [3:0] shuffle_mode; - - always_comb begin - shuffle_result = operand_a_i; - - if (shuffle_flip) begin - shuffle_mode[3] = shift_amt[0]; - shuffle_mode[2] = shift_amt[1]; - shuffle_mode[1] = shift_amt[2]; - shuffle_mode[0] = shift_amt[3]; - end else begin - shuffle_mode = shift_amt[3:0]; - end - - if (shuffle_flip) begin - shuffle_result = (shuffle_result & 32'h8822_4411) | - ((shuffle_result << 6) & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & 
FLIP_MASK_R[0]) | - ((shuffle_result << 9) & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1]) | - ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) | - ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]); - end - - if (shuffle_mode[3]) begin - shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) | - (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) | - ((shuffle_result >> 8) & SHUFFLE_MASK_R[0])); - end - if (shuffle_mode[2]) begin - shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) | - (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) | - ((shuffle_result >> 4) & SHUFFLE_MASK_R[1])); - end - if (shuffle_mode[1]) begin - shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) | - (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) | - ((shuffle_result >> 2) & SHUFFLE_MASK_R[2])); - end - if (shuffle_mode[0]) begin - shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) | - (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) | - ((shuffle_result >> 1) & SHUFFLE_MASK_R[3])); - end - - if (shuffle_flip) begin - shuffle_result = (shuffle_result & 32'h8822_4411) | - ((shuffle_result << 6) & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & FLIP_MASK_R[0]) | - ((shuffle_result << 9) & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1]) | - ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) | - ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]); - end - - end - /////////////////////////////////////////////////// - // Carry-less Multiply + Cyclic Redundancy Check // - /////////////////////////////////////////////////// - - // Carry-less multiplication can be understood as multiplication based on - // the addition interpreted as the bit-wise xor operation. 
- // - // Example: 1101 X 1011 = 1111111: - // - // 1011 X 1101 - // ----------- - // 1101 - // xor 1101 - // --------- - // 10111 - // xor 0000 - // ---------- - // 010111 - // xor 1101 - // ----------- - // 1111111 - // - // Architectural details: - // A 32 x 32-bit array - // [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ] - // is generated. The entries of the array are pairwise 'xor-ed' - // together in a 5-stage binary tree. - // - // - // Cyclic Redundancy Check: - // - // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For - // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.) - // see http://reveng.sourceforge.net/crc-catalogue/all.htm - // A useful guide to crc arithmetic and algorithms is given here: - // http://www.piclist.com/techref/method/math/crcguide.html. - // - // The CRC operation solves the following equation using binary polynomial arithmetic: - // - // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x) - // - // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal - // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation. - // - // Using barret reduction, one can show that - // - // M(x) mod P(x) = R(x) = - // (M(x) * x**n) & {deg(P(x)'{1'b1}}) ^ (M(x) x**-(deg(P(x) - n)) cx mu(x) cx P(x), - // - // Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less - // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for - // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get - // - // rd = rev( (rev(rs1) << n) ^ ((rev(rs1) >> (32-n)) cx mu cx P) - // = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P) - // ^-- cycle 0--------------------^ - // ^- cycle 1 -------------------------------------------^ - // - // In the last step we used the fact that carry-less multiplication is bit-order agnostic: - // rev(a cx b) = rev(a) cx rev(b). 
- - logic clmul_rmode; - logic clmul_hmode; - logic [31:0] clmul_op_a; - logic [31:0] clmul_op_b; - logic [31:0] operand_b_rev; - logic [31:0] clmul_and_stage[32]; - logic [31:0] clmul_xor_stage1[16]; - logic [31:0] clmul_xor_stage2[8]; - logic [31:0] clmul_xor_stage3[4]; - logic [31:0] clmul_xor_stage4[2]; - - logic [31:0] clmul_result_raw; - logic [31:0] clmul_result_rev; - - for (genvar i=0; i<32; i++) begin: gen_rev_operand_b - assign operand_b_rev[i] = operand_b_i[31-i]; - end - - assign clmul_rmode = operator_i == ALU_CLMULR; - assign clmul_hmode = operator_i == ALU_CLMULH; - - // CRC - localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7; - localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641; - - localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41; - localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1; - - logic crc_op; - logic crc_hmode; - logic crc_bmode; - - logic crc_cpoly; - - logic [31:0] crc_operand; - logic [31:0] crc_poly; - logic [31:0] crc_mu_rev; - - assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) | - (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) | - (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B); - - assign crc_cpoly = (operator_i == ALU_CRC32C_W) | - (operator_i == ALU_CRC32C_H) | - (operator_i == ALU_CRC32C_B); - - assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H); - assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B); - - assign crc_poly = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL; - assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV; - - always_comb begin - unique case(1'b1) - crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0}; - crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0}; - default: crc_operand = operand_a_i; - endcase - end - - // Select clmul input - always_comb begin - if (crc_op) begin - clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i; - clmul_op_b = instr_first_cycle_i ? 
crc_mu_rev : crc_poly; - end else begin - clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i; - clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i; - end - end - - for (genvar i=0; i<32; i++) begin : gen_clmul_and_op - assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0; - end - - for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1 - assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1]; - end - - for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2 - assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1]; - end - - for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3 - assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1]; - end - - for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4 - assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1]; - end - - assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1]; - - for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result - assign clmul_result_rev[i] = clmul_result_raw[31-i]; - end - - // clmulr_result = rev(clmul(rev(a), rev(b))) - // clmulh_result = clmulr_result >> 1 - always_comb begin - case(1'b1) - clmul_rmode: clmul_result = clmul_result_rev; - clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]}; - default: clmul_result = clmul_result_raw; - endcase - end - - ////////////////////////////////////// - // Multicycle Bitmanip Instructions // - ////////////////////////////////////// - // Ternary instructions + Shift Rotations + CRC - // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the - // second cycle. operand_b_i is always tied to rs2. - - - always_comb begin - unique case (operator_i) - ALU_CMOV: begin - imd_val_d_o = operand_a_i; - multicycle_result = (operand_b_i == 32'h0) ? 
operand_a_i : imd_val_q_i; - if (instr_first_cycle_i) begin - imd_val_we_o = 1'b1; - end else begin - imd_val_we_o = 1'b0; - end - end - - ALU_CMIX: begin - multicycle_result = imd_val_q_i | bwlogic_and_result; - imd_val_d_o = bwlogic_and_result; - if (instr_first_cycle_i) begin - imd_val_we_o = 1'b1; - end else begin - imd_val_we_o = 1'b0; - end - end - - ALU_FSR, ALU_FSL, - ALU_ROL, ALU_ROR: begin - if (shift_amt[4:0] == 5'h0) begin - multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i; - end else begin - multicycle_result = imd_val_q_i | shift_result; - end - imd_val_d_o = shift_result; - if (instr_first_cycle_i) begin - imd_val_we_o = 1'b1; - end else begin - imd_val_we_o = 1'b0; - end - end - - ALU_CRC32_W, ALU_CRC32C_W, - ALU_CRC32_H, ALU_CRC32C_H, - ALU_CRC32_B, ALU_CRC32C_B: begin - imd_val_d_o = clmul_result_rev; - unique case(1'b1) - crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8); - crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16); - default: multicycle_result = clmul_result_rev; - endcase - if (instr_first_cycle_i) begin - imd_val_we_o = 1'b1; - end else begin - imd_val_we_o = 1'b0; - end - end - - default: begin - imd_val_d_o = operand_a_i; - imd_val_we_o = 1'b0; - multicycle_result = operand_a_i; - end - endcase - end - - ///////////////////////////// - // Single-bit Instructions // - ///////////////////////////// - - always_comb begin - unique case (operator_i) - ALU_SBSET: singlebit_result = operand_a_i | shift_result; - ALU_SBCLR: singlebit_result = operand_a_i & ~shift_result; - ALU_SBINV: singlebit_result = operand_a_i ^ shift_result; - default: singlebit_result = {31'h0, shift_result[0]}; // ALU_SBEXT - endcase - end - /////////////// // Min / Max // /////////////// assign minmax_result = cmp_result ? operand_a_i : operand_b_i; - ////////// // Pack // ////////// @@ -1059,21 +550,623 @@ module ibex_alu #( assign sext_result = (operator_i == ALU_SEXTB) ? 
{ {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]}; + ///////////////////////////// + // Single-bit Instructions // + ///////////////////////////// + + always_comb begin + unique case (operator_i) + ALU_SBSET: singlebit_result = operand_a_i | shift_result; + ALU_SBCLR: singlebit_result = operand_a_i & ~shift_result; + ALU_SBINV: singlebit_result = operand_a_i ^ shift_result; + default: singlebit_result = {31'h0, shift_result[0]}; // ALU_SBEXT + endcase + end + + //////////////////////////////////// + // General Reverse and Or-combine // + //////////////////////////////////// + + // Only a subset of the General reverse and or-combine instructions are implemented in the + // balanced version of the B extension. Currently rev, rev8 and orc.b are supported in the + // base extension. + + logic [4:0] zbp_shift_amt; + logic gorc_op; + + assign gorc_op = (operator_i == ALU_GORC); + assign zbp_shift_amt[2:0] = (RV32B == RV32BFull) ? shift_amt[2:0] : {3{&shift_amt[2:0]}}; + assign zbp_shift_amt[4:3] = (RV32B == RV32BFull) ? shift_amt[4:3] : {2{&shift_amt[4:3]}}; + + always_comb begin + rev_result = operand_a_i; + + if (zbp_shift_amt[0]) begin + rev_result = (gorc_op ? rev_result : 32'h0) | + ((rev_result & 32'h5555_5555) << 1) | + ((rev_result & 32'haaaa_aaaa) >> 1); + end + + if (zbp_shift_amt[1]) begin + rev_result = (gorc_op ? rev_result : 32'h0) | + ((rev_result & 32'h3333_3333) << 2) | + ((rev_result & 32'hcccc_cccc) >> 2); + end + + if (zbp_shift_amt[2]) begin + rev_result = (gorc_op ? rev_result : 32'h0) | + ((rev_result & 32'h0f0f_0f0f) << 4) | + ((rev_result & 32'hf0f0_f0f0) >> 4); + end + + if (zbp_shift_amt[3]) begin + rev_result = (gorc_op & (RV32B == RV32BFull) ? rev_result : 32'h0) | + ((rev_result & 32'h00ff_00ff) << 8) | + ((rev_result & 32'hff00_ff00) >> 8); + end + + if (zbp_shift_amt[4]) begin + rev_result = (gorc_op & (RV32B == RV32BFull) ? 
rev_result : 32'h0) | + ((rev_result & 32'h0000_ffff) << 16) | + ((rev_result & 32'hffff_0000) >> 16); + end + end + + logic crc_hmode; + logic crc_bmode; + logic [31:0] clmul_result_rev; + + if (RV32B == RV32BFull) begin : gen_alu_rvb_full + + ///////////////////////// + // Shuffle / Unshuffle // + ///////////////////////// + + localparam logic [31:0] SHUFFLE_MASK_L [0:3] = + '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444}; + localparam logic [31:0] SHUFFLE_MASK_R [0:3] = + '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222}; + + localparam logic [31:0] FLIP_MASK_L [0:3] = + '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000}; + localparam logic [31:0] FLIP_MASK_R [0:3] = + '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088}; + + logic [31:0] SHUFFLE_MASK_NOT [0:3]; + for(genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not + assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]); + end + + logic shuffle_flip; + assign shuffle_flip = operator_i == ALU_UNSHFL; + + logic [3:0] shuffle_mode; + + always_comb begin + shuffle_result = operand_a_i; + + if (shuffle_flip) begin + shuffle_mode[3] = shift_amt[0]; + shuffle_mode[2] = shift_amt[1]; + shuffle_mode[1] = shift_amt[2]; + shuffle_mode[0] = shift_amt[3]; + end else begin + shuffle_mode = shift_amt[3:0]; + end + + if (shuffle_flip) begin + shuffle_result = (shuffle_result & 32'h8822_4411) | + ((shuffle_result << 6) & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & FLIP_MASK_R[0]) | + ((shuffle_result << 9) & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1]) | + ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) | + ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]); + end + + if (shuffle_mode[3]) begin + shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) | + (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) | + ((shuffle_result >> 8) & SHUFFLE_MASK_R[0])); + end + if (shuffle_mode[2]) 
begin + shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) | + (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) | + ((shuffle_result >> 4) & SHUFFLE_MASK_R[1])); + end + if (shuffle_mode[1]) begin + shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) | + (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) | + ((shuffle_result >> 2) & SHUFFLE_MASK_R[2])); + end + if (shuffle_mode[0]) begin + shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) | + (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) | + ((shuffle_result >> 1) & SHUFFLE_MASK_R[3])); + end + + if (shuffle_flip) begin + shuffle_result = (shuffle_result & 32'h8822_4411) | + ((shuffle_result << 6) & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & FLIP_MASK_R[0]) | + ((shuffle_result << 9) & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1]) | + ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) | + ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]); + end + end + + /////////////// + // Butterfly // + /////////////// + + // The butterfly / inverse butterfly network executing bext/bdep (zbe) instructions. + // For bdep, the control bits mask of a local left region is generated by + // the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the number + // of ones in the deposit bitmask to the right of the segment. n hereby denotes the width + // of the according segment. The bitmask for a pertaining local right region is equal to the + // corresponding local left region. Bext uses an analogue inverse process. + // Consider the following 8-bit example. For details, see Hilewitz et al. "Fast Bit Gather, + // Bit Scatter and Bit Permuation Instructions for Commodity Microprocessors", (2008). + // + // The bext/bdep instructions are completed in 2 cycles. In the first cycle, the control + // bitmask is prepared by executing the parallel prefix bit count. 
In the second cycle, + // the bit swapping is executed according to the control masks. + + // 8-bit example: (Hilewitz et al.) + // Consider the instruction bdep operand_a_i deposit_mask + // Let operand_a_i = 8'babcd_efgh + // deposit_mask = 8'b1010_1101 + // + // control bitmask for stage 1: + // - number of ones in the right half of the deposit bitmask: 3 + // - width of the segment: 4 + // - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000 + // + // control bitmask: c3 c2 c1 c0 c3 c2 c1 c0 + // 1 0 0 0 1 0 0 0 + // <- L -----> <- R -----> + // operand_a_i a b c d e f g h + // :\ | | | /: | | | + // : +|---|--|-+ : | | | + // :/ | | | \: | | | + // stage 1 e b c d a f g h + // + // control bitmask: c3 c2 c3 c2 c1 c0 c1 c0 + // 1 1 1 1 1 0 1 0 + // :\ :\ /: /: :\ | /: | + // : +:-+-:+ : : +|-+ : | + // :/ :/ \: \: :/ | \: | + // stage 2 c d e b g f a h + // L R L R L R L R + // control bitmask: c3 c3 c2 c2 c1 c1 c0 c0 + // 1 1 0 0 1 1 0 0 + // :\/: | | :\/: | | + // : : | | : : | | + // :/\: | | :/\: | | + // stage 3 d c e b f g a h + // & deposit bitmask: 1 0 1 0 1 1 0 1 + // result: d 0 e 0 f g 0 h + + logic [ 5:0] bitcnt_partial_q [32]; + + // first cycle + // Store partial bitcnts + for (genvar i=0; i<32; i++) begin : gen_bitcnt_reg_in_lsb + assign bitcnt_partial_lsb_d[i] = bitcnt_partial[i][0]; + end + + for (genvar i=0; i<16; i++) begin : gen_bitcnt_reg_in_b1 + assign bitcnt_partial_msb_d[i] = bitcnt_partial[2*i+1][1]; + end + + for (genvar i=0; i<8; i++) begin : gen_bitcnt_reg_in_b2 + assign bitcnt_partial_msb_d[16+i] = bitcnt_partial[4*i+3][2]; + end + + for (genvar i=0; i<4; i++) begin : gen_bitcnt_reg_in_b3 + assign bitcnt_partial_msb_d[24+i] = bitcnt_partial[8*i+7][3]; + end + + for (genvar i=0; i<2; i++) begin : gen_bitcnt_reg_in_b4 + assign bitcnt_partial_msb_d[28+i] = bitcnt_partial[16*i+15][4]; + end + + assign bitcnt_partial_msb_d[30] = bitcnt_partial[31][5]; + assign bitcnt_partial_msb_d[31] = 1'b0; // unused + + // Second cycle + // Load partial 
bitcnts + always_comb begin + bitcnt_partial_q = '{default: '0}; + + for (int unsigned i=0; i<32; i++) begin : gen_bitcnt_reg_out_lsb + bitcnt_partial_q[i][0] = imd_val_q_i[0][i]; + end + + for (int unsigned i=0; i<16; i++) begin : gen_bitcnt_reg_out_b1 + bitcnt_partial_q[2*i+1][1] = imd_val_q_i[1][i]; + end + + for (int unsigned i=0; i<8; i++) begin : gen_bitcnt_reg_out_b2 + bitcnt_partial_q[4*i+3][2] = imd_val_q_i[1][16+i]; + end + + for (int unsigned i=0; i<4; i++) begin : gen_bitcnt_reg_out_b3 + bitcnt_partial_q[8*i+7][3] = imd_val_q_i[1][24+i]; + end + + for (int unsigned i=0; i<2; i++) begin : gen_bitcnt_reg_out_b4 + bitcnt_partial_q[16*i+15][4] = imd_val_q_i[1][28+i]; + end + + bitcnt_partial_q[31][5] = imd_val_q_i[1][30]; + end + + logic [31:0] butterfly_mask_l[5]; + logic [31:0] butterfly_mask_r[5]; + logic [31:0] butterfly_mask_not[5]; + logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap + + // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage + `define _N(stg) (16 >> stg) + + // bext / bdep control bit generation + for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_ctrl_stage + // number of segs: 2** stg + for (genvar seg=0; seg<2**stg; seg++) begin : gen_butterfly_ctrl + + assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] = + {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} << + bitcnt_partial_q[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0]; + + assign butterfly_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] + = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]; + + assign butterfly_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)] + = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]; + + assign butterfly_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)] = '0; + assign butterfly_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0; + end + end + `undef _N + + for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_not + assign butterfly_mask_not[stg] = + ~(butterfly_mask_l[stg] | 
butterfly_mask_r[stg]); + end + + always_comb begin + butterfly_result = operand_a_i; + + butterfly_result = butterfly_result & butterfly_mask_not[0] | + ((butterfly_result & butterfly_mask_l[0]) >> 16)| + ((butterfly_result & butterfly_mask_r[0]) << 16); + + butterfly_result = butterfly_result & butterfly_mask_not[1] | + ((butterfly_result & butterfly_mask_l[1]) >> 8)| + ((butterfly_result & butterfly_mask_r[1]) << 8); + + butterfly_result = butterfly_result & butterfly_mask_not[2] | + ((butterfly_result & butterfly_mask_l[2]) >> 4)| + ((butterfly_result & butterfly_mask_r[2]) << 4); + + butterfly_result = butterfly_result & butterfly_mask_not[3] | + ((butterfly_result & butterfly_mask_l[3]) >> 2)| + ((butterfly_result & butterfly_mask_r[3]) << 2); + + butterfly_result = butterfly_result & butterfly_mask_not[4] | + ((butterfly_result & butterfly_mask_l[4]) >> 1)| + ((butterfly_result & butterfly_mask_r[4]) << 1); + + butterfly_result = butterfly_result & operand_b_i; + end + + always_comb begin + invbutterfly_result = operand_a_i & operand_b_i; + + invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] | + ((invbutterfly_result & butterfly_mask_l[4]) >> 1)| + ((invbutterfly_result & butterfly_mask_r[4]) << 1); + + invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] | + ((invbutterfly_result & butterfly_mask_l[3]) >> 2)| + ((invbutterfly_result & butterfly_mask_r[3]) << 2); + + invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] | + ((invbutterfly_result & butterfly_mask_l[2]) >> 4)| + ((invbutterfly_result & butterfly_mask_r[2]) << 4); + + invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] | + ((invbutterfly_result & butterfly_mask_l[1]) >> 8)| + ((invbutterfly_result & butterfly_mask_r[1]) << 8); + + invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] | + ((invbutterfly_result & butterfly_mask_l[0]) >> 16)| + ((invbutterfly_result & butterfly_mask_r[0]) << 16); + end + + 
/////////////////////////////////////////////////// + // Carry-less Multiply + Cyclic Redundancy Check // + /////////////////////////////////////////////////// + + // Carry-less multiplication can be understood as multiplication based on + // the addition interpreted as the bit-wise xor operation. + // + // Example: 1101 X 1011 = 1111111: + // + // 1011 X 1101 + // ----------- + // 1101 + // xor 1101 + // --------- + // 10111 + // xor 0000 + // ---------- + // 010111 + // xor 1101 + // ----------- + // 1111111 + // + // Architectural details: + // A 32 x 32-bit array + // [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ] + // is generated. The entries of the array are pairwise 'xor-ed' + // together in a 5-stage binary tree. + // + // + // Cyclic Redundancy Check: + // + // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For + // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.) + // see http://reveng.sourceforge.net/crc-catalogue/all.htm + // A useful guide to crc arithmetic and algorithms is given here: + // http://www.piclist.com/techref/method/math/crcguide.html. + // + // The CRC operation solves the following equation using binary polynomial arithmetic: + // + // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x) + // + // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal + // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation. + // + // Using Barrett reduction, one can show that + // + // M(x) mod P(x) = R(x) = + // (M(x) * x**n) & {deg(P(x)){1'b1}} ^ (M(x) * x**-(deg(P(x)) - n)) cx mu(x) cx P(x), + // + // where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less + // multiplication. 
Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for + // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get + // + // rd = rev( (rev(rs1) << n) ^ ((rev(rs1) >> (32-n)) cx mu cx P) + // = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P) + // ^-- cycle 0--------------------^ + // ^- cycle 1 -------------------------------------------^ + // + // In the last step we used the fact that carry-less multiplication is bit-order agnostic: + // rev(a cx b) = rev(a) cx rev(b). + + logic clmul_rmode; + logic clmul_hmode; + logic [31:0] clmul_op_a; + logic [31:0] clmul_op_b; + logic [31:0] operand_b_rev; + logic [31:0] clmul_and_stage[32]; + logic [31:0] clmul_xor_stage1[16]; + logic [31:0] clmul_xor_stage2[8]; + logic [31:0] clmul_xor_stage3[4]; + logic [31:0] clmul_xor_stage4[2]; + + logic [31:0] clmul_result_raw; + + for (genvar i=0; i<32; i++) begin: gen_rev_operand_b + assign operand_b_rev[i] = operand_b_i[31-i]; + end + + assign clmul_rmode = operator_i == ALU_CLMULR; + assign clmul_hmode = operator_i == ALU_CLMULH; + + // CRC + localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7; + localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641; + + localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41; + localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1; + + logic crc_op; + + logic crc_cpoly; + + logic [31:0] crc_operand; + logic [31:0] crc_poly; + logic [31:0] crc_mu_rev; + + assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) | + (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) | + (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B); + + assign crc_cpoly = (operator_i == ALU_CRC32C_W) | + (operator_i == ALU_CRC32C_H) | + (operator_i == ALU_CRC32C_B); + + assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H); + assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B); + + assign crc_poly = crc_cpoly ? 
CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL; + assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV; + + always_comb begin + unique case(1'b1) + crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0}; + crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0}; + default: crc_operand = operand_a_i; + endcase + end + + // Select clmul input + always_comb begin + if (crc_op) begin + clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i[0]; + clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly; + end else begin + clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i; + clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i; + end + end + + for (genvar i=0; i<32; i++) begin : gen_clmul_and_op + assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0; + end + + for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1 + assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1]; + end + + for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2 + assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1]; + end + + for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3 + assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1]; + end + + for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4 + assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1]; + end + + assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1]; + + for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result + assign clmul_result_rev[i] = clmul_result_raw[31-i]; + end + + // clmulr_result = rev(clmul(rev(a), rev(b))) + // clmulh_result = clmulr_result >> 1 + always_comb begin + case(1'b1) + clmul_rmode: clmul_result = clmul_result_rev; + clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]}; + default: clmul_result = clmul_result_raw; + endcase + end + end else begin + assign shuffle_result = '0; + assign butterfly_result = '0; + assign invbutterfly_result = '0; + assign 
clmul_result = '0; + // support signals + assign bitcnt_partial_lsb_d = '0; + assign bitcnt_partial_msb_d = '0; + assign clmul_result_rev = '0; + assign crc_bmode = '0; + assign crc_hmode = '0; + end + + ////////////////////////////////////// + // Multicycle Bitmanip Instructions // + ////////////////////////////////////// + // Ternary instructions + Shift Rotations + Bit extract/deposit + CRC + // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the + // second cycle. operand_b_i is always tied to rs2. + + always_comb begin + unique case (operator_i) + ALU_CMOV: begin + multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i[0]; + imd_val_d_o = '{operand_a_i, 32'h0}; + if (instr_first_cycle_i) begin + imd_val_we_o = 2'b01; + end else begin + imd_val_we_o = 2'b00; + end + end + + ALU_CMIX: begin + multicycle_result = imd_val_q_i[0] | bwlogic_and_result; + imd_val_d_o = '{bwlogic_and_result, 32'h0}; + if (instr_first_cycle_i) begin + imd_val_we_o = 2'b01; + end else begin + imd_val_we_o = 2'b00; + end + end + + ALU_FSR, ALU_FSL, + ALU_ROL, ALU_ROR: begin + if (shift_amt[4:0] == 5'h0) begin + multicycle_result = shift_amt[5] ? 
operand_a_i : imd_val_q_i[0]; + end else begin + multicycle_result = imd_val_q_i[0] | shift_result; + end + imd_val_d_o = '{shift_result, 32'h0}; + if (instr_first_cycle_i) begin + imd_val_we_o = 2'b01; + end else begin + imd_val_we_o = 2'b00; + end + end + + ALU_CRC32_W, ALU_CRC32C_W, + ALU_CRC32_H, ALU_CRC32C_H, + ALU_CRC32_B, ALU_CRC32C_B: begin + if (RV32B == RV32BFull) begin + unique case(1'b1) + crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8); + crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16); + default: multicycle_result = clmul_result_rev; + endcase + imd_val_d_o = '{clmul_result_rev, 32'h0}; + if (instr_first_cycle_i) begin + imd_val_we_o = 2'b01; + end else begin + imd_val_we_o = 2'b00; + end + end else begin + imd_val_d_o = '{operand_a_i, 32'h0}; + imd_val_we_o = 2'b00; + multicycle_result = '0; + end + end + + ALU_BEXT, ALU_BDEP: begin + if (RV32B == RV32BFull) begin + multicycle_result = (operator_i == ALU_BDEP) ? butterfly_result : invbutterfly_result; + imd_val_d_o = '{bitcnt_partial_lsb_d, bitcnt_partial_msb_d}; + if (instr_first_cycle_i) begin + imd_val_we_o = 2'b11; + end else begin + imd_val_we_o = 2'b00; + end + end else begin + imd_val_d_o = '{operand_a_i, 32'h0}; + imd_val_we_o = 2'b00; + multicycle_result = '0; + end + end + + default: begin + imd_val_d_o = '{operand_a_i, 32'h0}; + imd_val_we_o = 2'b00; + multicycle_result = '0; + end + endcase + end + + end else begin : g_no_alu_rvb // RV32B result signals - assign minmax_result = '0; assign bitcnt_result = '0; + assign minmax_result = '0; assign pack_result = '0; assign sext_result = '0; - assign multicycle_result = '0; assign singlebit_result = '0; + assign rev_result = '0; assign shuffle_result = '0; assign butterfly_result = '0; assign invbutterfly_result = '0; assign clmul_result = '0; + assign multicycle_result = '0; // RV32B support signals - assign imd_val_d_o = '0; - assign imd_val_we_o = '0; + assign imd_val_d_o = '{default: '0}; + assign 
imd_val_we_o = '{default: '0}; end //////////////// @@ -1130,18 +1223,16 @@ module ibex_alu #( // Cyclic Redundancy Checks (RV32B) ALU_CRC32_W, ALU_CRC32C_W, ALU_CRC32_H, ALU_CRC32C_H, - ALU_CRC32_B, ALU_CRC32C_B: result_o = multicycle_result; + ALU_CRC32_B, ALU_CRC32C_B, + // Bit Extract / Deposit (RV32B) + ALU_BEXT, ALU_BDEP: result_o = multicycle_result; // Single-Bit Bitmanip Operations (RV32B) ALU_SBSET, ALU_SBCLR, ALU_SBINV, ALU_SBEXT: result_o = singlebit_result; - // Bit Extract / Deposit (RV32B) - ALU_BDEP: result_o = butterfly_result; - ALU_BEXT: result_o = invbutterfly_result; - // General Reverse / Or-combine (RV32B) - ALU_GREV, ALU_GORC: result_o = butterfly_result; + ALU_GREV, ALU_GORC: result_o = rev_result; // Bit Field Place (RV32B) ALU_BFP: result_o = bfp_result; diff --git a/rtl/ibex_core.sv b/rtl/ibex_core.sv index 6fd1b407..1de776bb 100644 --- a/rtl/ibex_core.sv +++ b/rtl/ibex_core.sv @@ -9,27 +9,31 @@ `include "prim_assert.sv" +`ifndef RV32B + `define RV32B ibex_pkg::RV32BNone +`endif + /** * Top level module of the ibex RISC-V core */ module ibex_core #( - parameter bit PMPEnable = 1'b0, - parameter int unsigned PMPGranularity = 0, - parameter int unsigned PMPNumRegions = 4, - parameter int unsigned MHPMCounterNum = 0, - parameter int unsigned MHPMCounterWidth = 40, - parameter bit RV32E = 1'b0, - parameter bit RV32M = 1'b1, - parameter bit RV32B = 1'b0, - parameter bit BranchTargetALU = 1'b0, - parameter bit WritebackStage = 1'b0, - parameter MultiplierImplementation = "fast", - parameter bit ICache = 1'b0, - parameter bit ICacheECC = 1'b0, - parameter bit DbgTriggerEn = 1'b0, - parameter bit SecureIbex = 1'b0, - parameter int unsigned DmHaltAddr = 32'h1A110800, - parameter int unsigned DmExceptionAddr = 32'h1A110808 + parameter bit PMPEnable = 1'b0, + parameter int unsigned PMPGranularity = 0, + parameter int unsigned PMPNumRegions = 4, + parameter int unsigned MHPMCounterNum = 0, + parameter int unsigned MHPMCounterWidth = 40, + parameter 
bit RV32E = 1'b0, + parameter bit RV32M = 1'b1, + parameter ibex_pkg::rv32b_e RV32B = `RV32B, + parameter bit BranchTargetALU = 1'b0, + parameter bit WritebackStage = 1'b0, + parameter MultiplierImplementation = "fast", + parameter bit ICache = 1'b0, + parameter bit ICacheECC = 1'b0, + parameter bit DbgTriggerEn = 1'b0, + parameter bit SecureIbex = 1'b0, + parameter int unsigned DmHaltAddr = 32'h1A110800, + parameter int unsigned DmExceptionAddr = 32'h1A110808 ) ( // Clock and Reset input logic clk_i, @@ -129,9 +133,9 @@ module ibex_core #( logic [31:0] pc_if; // Program counter in IF stage logic [31:0] pc_id; // Program counter in ID stage logic [31:0] pc_wb; // Program counter in WB stage - logic [33:0] imd_val_d_ex; // Intermediate register for multicycle Ops - logic [33:0] imd_val_q_ex; // Intermediate register for multicycle Ops - logic imd_val_we_ex; + logic [33:0] imd_val_d_ex[2]; // Intermediate register for multicycle Ops + logic [33:0] imd_val_q_ex[2]; // Intermediate register for multicycle Ops + logic [1:0] imd_val_we_ex; logic data_ind_timing; logic dummy_instr_en; diff --git a/rtl/ibex_core_tracing.sv b/rtl/ibex_core_tracing.sv index 1c019d5a..e290c35a 100644 --- a/rtl/ibex_core_tracing.sv +++ b/rtl/ibex_core_tracing.sv @@ -2,28 +2,32 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. 
// SPDX-License-Identifier: Apache-2.0 +`ifndef RV32B + `define RV32B ibex_pkg::RV32BNone +`endif /** * Top level module of the ibex RISC-V core with tracing enabled */ + module ibex_core_tracing #( - parameter bit PMPEnable = 1'b0, - parameter int unsigned PMPGranularity = 0, - parameter int unsigned PMPNumRegions = 4, - parameter int unsigned MHPMCounterNum = 0, - parameter int unsigned MHPMCounterWidth = 40, - parameter bit RV32E = 1'b0, - parameter bit RV32M = 1'b1, - parameter bit RV32B = 1'b0, - parameter bit BranchTargetALU = 1'b0, - parameter bit WritebackStage = 1'b0, - parameter MultiplierImplementation = "fast", - parameter bit ICache = 1'b0, - parameter bit ICacheECC = 1'b0, - parameter bit DbgTriggerEn = 1'b0, - parameter bit SecureIbex = 1'b0, - parameter int unsigned DmHaltAddr = 32'h1A110800, - parameter int unsigned DmExceptionAddr = 32'h1A110808 + parameter bit PMPEnable = 1'b0, + parameter int unsigned PMPGranularity = 0, + parameter int unsigned PMPNumRegions = 4, + parameter int unsigned MHPMCounterNum = 0, + parameter int unsigned MHPMCounterWidth = 40, + parameter bit RV32E = 1'b0, + parameter bit RV32M = 1'b1, + parameter ibex_pkg::rv32b_e RV32B = `RV32B, + parameter bit BranchTargetALU = 1'b0, + parameter bit WritebackStage = 1'b0, + parameter MultiplierImplementation = "fast", + parameter bit ICache = 1'b0, + parameter bit ICacheECC = 1'b0, + parameter bit DbgTriggerEn = 1'b0, + parameter bit SecureIbex = 1'b0, + parameter int unsigned DmHaltAddr = 32'h1A110800, + parameter int unsigned DmExceptionAddr = 32'h1A110808 ) ( // Clock and Reset input logic clk_i, diff --git a/rtl/ibex_decoder.sv b/rtl/ibex_decoder.sv index b8952754..3b2807eb 100644 --- a/rtl/ibex_decoder.sv +++ b/rtl/ibex_decoder.sv @@ -14,10 +14,10 @@ `include "prim_assert.sv" module ibex_decoder #( - parameter bit RV32E = 0, - parameter bit RV32M = 1, - parameter bit RV32B = 0, - parameter bit BranchTargetALU = 0 + parameter bit RV32E = 0, + parameter bit RV32M = 1, + 
parameter bit BranchTargetALU = 0, + parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone ) ( input logic clk_i, input logic rst_ni, @@ -112,7 +112,8 @@ module ibex_decoder #( logic [4:0] instr_rs3; logic [4:0] instr_rd; - logic use_rs3; + logic use_rs3_d; + logic use_rs3_q; csr_op_e csr_op; @@ -139,11 +140,20 @@ module ibex_decoder #( // immediate for CSR manipulation (zero extended) assign zimm_rs1_type_o = { 27'b0, instr_rs1 }; // rs1 + // the use of rs3 is known one cycle ahead. + always_ff @(posedge clk_i or negedge rst_ni) begin + if (!rst_ni) begin + use_rs3_q <= 1'b0; + end else begin + use_rs3_q <= use_rs3_d; + end + end + // source registers assign instr_rs1 = instr[19:15]; assign instr_rs2 = instr[24:20]; assign instr_rs3 = instr[31:27]; - assign rf_raddr_a_o = use_rs3 ? instr_rs3 : instr_rs1; // rs3 / rs1 + assign rf_raddr_a_o = (use_rs3_q & ~instr_first_cycle_i) ? instr_rs3 : instr_rs1; // rs3 / rs1 assign rf_raddr_b_o = instr_rs2; // rs2 // destination register @@ -338,29 +348,29 @@ module ibex_decoder #( 3'b001: begin unique case (instr[31:27]) - 5'b0_0000: illegal_insn = 1'b0; // slli - 5'b0_0100, // sloi - 5'b0_1001, // sbclri - 5'b0_0101, // sbseti - 5'b0_1101: illegal_insn = RV32B ? 1'b0 : 1'b1; // sbinvi + 5'b0_0000: illegal_insn = 1'b0; // slli + 5'b0_0100, // sloi + 5'b0_1001, // sbclri + 5'b0_0101, // sbseti + 5'b0_1101: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // sbinvi 5'b0_0001: if (instr[26] == 1'b0) begin - illegal_insn = RV32B ? 1'b0 : 1'b1; // shfl + illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1; // shfl end else begin illegal_insn = 1'b1; end 5'b0_1100: begin unique case(instr[26:20]) - 7'b000_0000, // clz - 7'b000_0001, // ctz - 7'b000_0010, // pcnt - 7'b000_0100, // sext.b - 7'b000_0101, // sext.h - 7'b001_0000, // crc32.b - 7'b001_0001, // crc32.h - 7'b001_0010, // crc32.w - 7'b001_1000, // crc32c.b - 7'b001_1001, // crc32c.h - 7'b001_1010: illegal_insn = RV32B ? 
1'b0 : 1'b1; // crc32c.w + 7'b000_0000, // clz + 7'b000_0001, // ctz + 7'b000_0010, // pcnt + 7'b000_0100, // sext.b + 7'b000_0101: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // sext.h + 7'b001_0000, // crc32.b + 7'b001_0001, // crc32.h + 7'b001_0010, // crc32.w + 7'b001_1000, // crc32c.b + 7'b001_1001, // crc32c.h + 7'b001_1010: illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1; // crc32c.w default: illegal_insn = 1'b1; endcase @@ -371,22 +381,43 @@ module ibex_decoder #( 3'b101: begin if (instr[26]) begin - illegal_insn = RV32B ? 1'b0 : 1'b1; // fsri + illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // fsri end else begin unique case (instr[31:27]) - 5'b0_0000, // srli - 5'b0_1000: illegal_insn = 1'b0; // srai + 5'b0_0000, // srli + 5'b0_1000: illegal_insn = 1'b0; // srai - 5'b0_0100, // sroi - 5'b0_1100, // rori - 5'b0_1001: illegal_insn = RV32B ? 1'b0 : 1'b1; // sbexti + 5'b0_0100, // sroi + 5'b0_1100, // rori + 5'b0_1001: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // sbexti - 5'b0_1101, // grevi - 5'b0_0101: illegal_insn = RV32B ? 1'b0 : 1'b1; // gorci - 5'b0_0001: if (instr[26] == 1'b0) begin - illegal_insn = RV32B ? 1'b0 : 1'b1; // unshfl - end else begin - illegal_insn = 1'b1; + 5'b0_1101: begin + if ((RV32B == RV32BFull)) begin + illegal_insn = 1'b0; // grevi + end else begin + unique case (instr[24:20]) + 5'b11111, // rev + 5'b11000: illegal_insn = (RV32B == RV32BBalanced) ? 1'b0 : 1'b1; // rev8 + + default: illegal_insn = 1'b1; + endcase + end + end + 5'b0_0101: begin + if ((RV32B == RV32BFull)) begin + illegal_insn = 1'b0; // gorci + end else if (instr[24:20] == 5'b00111) begin + illegal_insn = (RV32B == RV32BBalanced) ? 1'b0 : 1'b1; // orc.b + end else begin + illegal_insn = 1'b1; // any other gorci shamt requires RV32BFull + end + end + 5'b0_0001: begin + if (instr[26] == 1'b0) begin + illegal_insn = (RV32B == RV32BFull) ? 
1'b0 : 1'b1; // unshfl + end else begin + illegal_insn = 1'b1; + end end default: illegal_insn = 1'b1; @@ -403,7 +432,7 @@ module ibex_decoder #( rf_ren_b_o = 1'b1; rf_we = 1'b1; if ({instr[26], instr[13:12]} == {1'b1, 2'b01}) begin - illegal_insn = RV32B ? 1'b0 : 1'b1; // cmix / cmov / fsl / fsr + illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // cmix / cmov / fsl / fsr end else begin unique case ({instr[31:25], instr[14:12]}) // RV32I ALU operations @@ -438,6 +467,8 @@ module ibex_decoder #( {7'b001_0100, 3'b001}, // sbset {7'b011_0100, 3'b001}, // sbinv {7'b010_0100, 3'b101}, // sbext + // RV32B zbf + {7'b010_0100, 3'b111}: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // bfp // RV32B zbe {7'b010_0100, 3'b110}, // bdep {7'b000_0100, 3'b110}, // bext @@ -446,12 +477,10 @@ module ibex_decoder #( {7'b001_0100, 3'b101}, // gorc {7'b000_0100, 3'b001}, // shfl {7'b000_0100, 3'b101}, // unshfl - // RV32B zbf - {7'b010_0100, 3'b111}, // bfp // RV32B zbc {7'b000_0101, 3'b001}, // clmul {7'b000_0101, 3'b010}, // clmulr - {7'b000_0101, 3'b011}: illegal_insn = RV32B ? 1'b0 : 1'b1; // clmulh + {7'b000_0101, 3'b011}: illegal_insn = (RV32B == RV32BFull) ? 
1'b0 : 1'b1; // clmulh // RV32M instructions {7'b000_0001, 3'b000}: begin // mul @@ -627,7 +656,7 @@ module ibex_decoder #( opcode_alu = opcode_e'(instr_alu[6:0]); - use_rs3 = 1'b0; + use_rs3_d = 1'b0; alu_multicycle_o = 1'b0; mult_sel_o = 1'b0; div_sel_o = 1'b0; @@ -774,7 +803,7 @@ module ibex_decoder #( 3'b111: alu_operator_o = ALU_AND; // And with Immediate 3'b001: begin - if (RV32B) begin + if (RV32B != RV32BNone) begin unique case (instr_alu[31:27]) 5'b0_0000: alu_operator_o = ALU_SLL; // Shift Left Logical by Immediate 5'b0_0100: alu_operator_o = ALU_SLO; // Shift Left Ones by Immediate @@ -785,34 +814,46 @@ module ibex_decoder #( 5'b0_0001: if (instr_alu[26] == 0) alu_operator_o = ALU_SHFL; 5'b0_1100: begin unique case (instr_alu[26:20]) - 7'b000_0000: alu_operator_o = ALU_CLZ; // clz - 7'b000_0001: alu_operator_o = ALU_CTZ; // ctz - 7'b000_0010: alu_operator_o = ALU_PCNT; // pcnt - 7'b000_0100: alu_operator_o = ALU_SEXTB; // sext.b - 7'b000_0101: alu_operator_o = ALU_SEXTH; // sext.h + 7'b000_0000: alu_operator_o = ALU_CLZ; // clz + 7'b000_0001: alu_operator_o = ALU_CTZ; // ctz + 7'b000_0010: alu_operator_o = ALU_PCNT; // pcnt + 7'b000_0100: alu_operator_o = ALU_SEXTB; // sext.b + 7'b000_0101: alu_operator_o = ALU_SEXTH; // sext.h 7'b001_0000: begin - alu_operator_o = ALU_CRC32_B; // crc32.b - alu_multicycle_o = 1'b1; + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_CRC32_B; // crc32.b + alu_multicycle_o = 1'b1; + end end 7'b001_0001: begin - alu_operator_o = ALU_CRC32_H; // crc32.h - alu_multicycle_o = 1'b1; + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_CRC32_H; // crc32.h + alu_multicycle_o = 1'b1; + end end 7'b001_0010: begin - alu_operator_o = ALU_CRC32_W; // crc32.w - alu_multicycle_o = 1'b1; + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_CRC32_W; // crc32.w + alu_multicycle_o = 1'b1; + end end 7'b001_1000: begin - alu_operator_o = ALU_CRC32C_B; // crc32c.b - alu_multicycle_o = 1'b1; + if (RV32B == RV32BFull) begin + 
alu_operator_o = ALU_CRC32C_B; // crc32c.b + alu_multicycle_o = 1'b1; + end end 7'b001_1001: begin - alu_operator_o = ALU_CRC32C_H; // crc32c.h - alu_multicycle_o = 1'b1; + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_CRC32C_H; // crc32c.h + alu_multicycle_o = 1'b1; + end end 7'b001_1010: begin - alu_operator_o = ALU_CRC32C_W; // crc32c.w - alu_multicycle_o = 1'b1; + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_CRC32C_W; // crc32c.w + alu_multicycle_o = 1'b1; + end end default: ; endcase @@ -821,19 +862,19 @@ module ibex_decoder #( default: ; endcase end else begin - alu_operator_o = ALU_SLL; // Shift Left Logical by Immediate + alu_operator_o = ALU_SLL; // Shift Left Logical by Immediate end end 3'b101: begin - if (RV32B) begin + if (RV32B != RV32BNone) begin if (instr_alu[26] == 1'b1) begin alu_operator_o = ALU_FSR; alu_multicycle_o = 1'b1; if (instr_first_cycle_i) begin - use_rs3 = 1'b0; + use_rs3_d = 1'b1; end else begin - use_rs3 = 1'b1; + use_rs3_d = 1'b0; end end else begin unique case (instr_alu[31:27]) @@ -842,22 +883,26 @@ module ibex_decoder #( 5'b0_0100: alu_operator_o = ALU_SRO; // Shift Right Ones by Immediate 5'b0_1001: alu_operator_o = ALU_SBEXT; // Extract bit specified by immediate. 
5'b0_1100: begin - alu_operator_o = ALU_ROR; // Rotate Right by Immediate + alu_operator_o = ALU_ROR; // Rotate Right by Immediate alu_multicycle_o = 1'b1; end - 5'b0_1101: alu_operator_o = ALU_GREV; // General Reverse with Imm Control Val - 5'b0_0101: alu_operator_o = ALU_GORC; // General Or-combine with Imm Control Val + 5'b0_1101: alu_operator_o = ALU_GREV; // General Reverse with Imm Control Val + 5'b0_0101: alu_operator_o = ALU_GORC; // General Or-combine with Imm Control Val // Unshuffle with Immediate Control Value - 5'b0_0001: if (instr_alu[26] == 1'b0) alu_operator_o = ALU_UNSHFL; + 5'b0_0001: begin + if (RV32B == RV32BFull) begin + if (instr_alu[26] == 1'b0) alu_operator_o = ALU_UNSHFL; + end + end default: ; endcase end end else begin if (instr_alu[31:27] == 5'b0_0000) begin - alu_operator_o = ALU_SRL; // Shift Right Logical by Immediate + alu_operator_o = ALU_SRL; // Shift Right Logical by Immediate end else if (instr_alu[31:27] == 5'b0_1000) begin - alu_operator_o = ALU_SRA; // Shift Right Arithmetically by Immediate + alu_operator_o = ALU_SRA; // Shift Right Arithmetically by Immediate end end end @@ -871,42 +916,42 @@ module ibex_decoder #( alu_op_b_mux_sel_o = OP_B_REG_B; if (instr_alu[26]) begin - if (RV32B) begin + if (RV32B != RV32BNone) begin unique case ({instr_alu[26:25], instr_alu[14:12]}) {2'b11, 3'b001}: begin alu_operator_o = ALU_CMIX; // cmix alu_multicycle_o = 1'b1; if (instr_first_cycle_i) begin - use_rs3 = 1'b0; + use_rs3_d = 1'b1; end else begin - use_rs3 = 1'b1; + use_rs3_d = 1'b0; end end {2'b11, 3'b101}: begin alu_operator_o = ALU_CMOV; // cmov alu_multicycle_o = 1'b1; if (instr_first_cycle_i) begin - use_rs3 = 1'b0; + use_rs3_d = 1'b1; end else begin - use_rs3 = 1'b1; + use_rs3_d = 1'b0; end end {2'b10, 3'b001}: begin alu_operator_o = ALU_FSL; // fsl alu_multicycle_o = 1'b1; if (instr_first_cycle_i) begin - use_rs3 = 1'b0; + use_rs3_d = 1'b1; end else begin - use_rs3 = 1'b1; + use_rs3_d = 1'b0; end end {2'b10, 3'b101}: begin 
alu_operator_o = ALU_FSR; // fsr alu_multicycle_o = 1'b1; if (instr_first_cycle_i) begin - use_rs3 = 1'b0; + use_rs3_d = 1'b1; end else begin - use_rs3 = 1'b1; + use_rs3_d = 1'b0; end end default: ; @@ -927,56 +972,67 @@ module ibex_decoder #( {7'b010_0000, 3'b101}: alu_operator_o = ALU_SRA; // Shift Right Arithmetic // RV32B ALU Operations - {7'b001_0000, 3'b001}: if (RV32B) alu_operator_o = ALU_SLO; // slo - {7'b001_0000, 3'b101}: if (RV32B) alu_operator_o = ALU_SRO; // sro + {7'b001_0000, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SLO; // slo + {7'b001_0000, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_SRO; // sro {7'b011_0000, 3'b001}: begin - if (RV32B) begin + if (RV32B != RV32BNone) begin alu_operator_o = ALU_ROL; // rol alu_multicycle_o = 1'b1; end end {7'b011_0000, 3'b101}: begin - if (RV32B) begin + if (RV32B != RV32BNone) begin alu_operator_o = ALU_ROR; // ror alu_multicycle_o = 1'b1; end end - {7'b000_0101, 3'b100}: if (RV32B) alu_operator_o = ALU_MIN; // min - {7'b000_0101, 3'b101}: if (RV32B) alu_operator_o = ALU_MAX; // max - {7'b000_0101, 3'b110}: if (RV32B) alu_operator_o = ALU_MINU; // minu - {7'b000_0101, 3'b111}: if (RV32B) alu_operator_o = ALU_MAXU; // maxu + {7'b000_0101, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_MIN; // min + {7'b000_0101, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_MAX; // max + {7'b000_0101, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_MINU; // minu + {7'b000_0101, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_MAXU; // maxu - {7'b000_0100, 3'b100}: if (RV32B) alu_operator_o = ALU_PACK; // pack - {7'b010_0100, 3'b100}: if (RV32B) alu_operator_o = ALU_PACKU; // packu - {7'b000_0100, 3'b111}: if (RV32B) alu_operator_o = ALU_PACKH; // packh + {7'b000_0100, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACK; // pack + {7'b010_0100, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACKU; // packu + {7'b000_0100, 3'b111}: if (RV32B != RV32BNone) alu_operator_o 
= ALU_PACKH; // packh - {7'b010_0000, 3'b100}: if (RV32B) alu_operator_o = ALU_XNOR; // xnor - {7'b010_0000, 3'b110}: if (RV32B) alu_operator_o = ALU_ORN; // orn - {7'b010_0000, 3'b111}: if (RV32B) alu_operator_o = ALU_ANDN; // andn - - // RV32B zbp - {7'b011_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_GREV; // grev - {7'b001_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_GORC; // grev - {7'b000_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SHFL; // shfl - {7'b000_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_UNSHFL; // unshfl + {7'b010_0000, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_XNOR; // xnor + {7'b010_0000, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_ORN; // orn + {7'b010_0000, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_ANDN; // andn // RV32B zbs - {7'b010_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SBCLR; // sbclr - {7'b001_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SBSET; // sbset - {7'b011_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SBINV; // sbinv - {7'b010_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_SBEXT; // sbext + {7'b010_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBCLR; // sbclr + {7'b001_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBSET; // sbset + {7'b011_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBINV; // sbinv + {7'b010_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBEXT; // sbext + + // RV32B zbf + {7'b010_0100, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_BFP; // bfp + + // RV32B zbp + {7'b011_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_GREV; // grev + {7'b001_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_GORC; // gorc + {7'b000_0100, 3'b001}: if (RV32B == RV32BFull) alu_operator_o = ALU_SHFL; // shfl + {7'b000_0100, 3'b101}: if (RV32B == RV32BFull) alu_operator_o = ALU_UNSHFL; // unshfl // RV32B zbc - {7'b000_0101, 3'b001}: if (RV32B) alu_operator_o = ALU_CLMUL; // clmul - {7'b000_0101, 
3'b010}: if (RV32B) alu_operator_o = ALU_CLMULR; // clmulr - {7'b000_0101, 3'b011}: if (RV32B) alu_operator_o = ALU_CLMULH; // clmulh + {7'b000_0101, 3'b001}: if (RV32B == RV32BFull) alu_operator_o = ALU_CLMUL; // clmul + {7'b000_0101, 3'b010}: if (RV32B == RV32BFull) alu_operator_o = ALU_CLMULR; // clmulr + {7'b000_0101, 3'b011}: if (RV32B == RV32BFull) alu_operator_o = ALU_CLMULH; // clmulh // RV32B zbe - {7'b010_0100, 3'b110}: if (RV32B) alu_operator_o = ALU_BDEP; // bdep - {7'b000_0100, 3'b110}: if (RV32B) alu_operator_o = ALU_BEXT; // bext - // RV32B zbf - {7'b010_0100, 3'b111}: if (RV32B) alu_operator_o = ALU_BFP; // bfp + {7'b010_0100, 3'b110}: begin + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_BDEP; // bdep + alu_multicycle_o = 1'b1; + end + end + {7'b000_0100, 3'b110}: begin + if (RV32B == RV32BFull) begin + alu_operator_o = ALU_BEXT; // bext + alu_multicycle_o = 1'b1; + end + end // RV32M instructions, all use the same ALU operation {7'b000_0001, 3'b000}: begin // mul diff --git a/rtl/ibex_ex_block.sv b/rtl/ibex_ex_block.sv index 73ffc888..eccc68e9 100644 --- a/rtl/ibex_ex_block.sv +++ b/rtl/ibex_ex_block.sv @@ -9,10 +9,10 @@ * Execution block: Hosts ALU and MUL/DIV unit */ module ibex_ex_block #( - parameter bit RV32M = 1, - parameter bit RV32B = 0, - parameter bit BranchTargetALU = 0, - parameter MultiplierImplementation = "fast" + parameter bit RV32M = 1, + parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone, + parameter bit BranchTargetALU = 0, + parameter MultiplierImplementation = "fast" ) ( input logic clk_i, input logic rst_ni, @@ -41,9 +41,9 @@ module ibex_ex_block #( input logic data_ind_timing_i, // intermediate val reg - output logic imd_val_we_o, - output logic [33:0] imd_val_d_o, - input logic [33:0] imd_val_q_i, + output logic [1:0] imd_val_we_o, + output logic [33:0] imd_val_d_o[2], + input logic [33:0] imd_val_q_i[2], // Outputs output logic [31:0] alu_adder_result_ex_o, // to LSU @@ -63,10 +63,11 @@ module ibex_ex_block #( 
logic alu_cmp_result, alu_is_equal_result; logic multdiv_valid; logic multdiv_sel; - logic [31:0] alu_imd_val_d; - logic alu_imd_val_we; - logic [33:0] multdiv_imd_val_d; - logic multdiv_imd_val_we; + logic [31:0] alu_imd_val_q[2]; + logic [31:0] alu_imd_val_d[2]; + logic [ 1:0] alu_imd_val_we; + logic [33:0] multdiv_imd_val_d[2]; + logic [ 1:0] multdiv_imd_val_we; /* The multdiv_i output is never selected if RV32M=0 @@ -80,8 +81,11 @@ module ibex_ex_block #( end // Intermediate Value Register Mux - assign imd_val_d_o = multdiv_sel ? multdiv_imd_val_d : {2'b0, alu_imd_val_d}; - assign imd_val_we_o = multdiv_sel ? multdiv_imd_val_we : alu_imd_val_we; + assign imd_val_d_o[0] = multdiv_sel ? multdiv_imd_val_d[0] : {2'b0, alu_imd_val_d[0]}; + assign imd_val_d_o[1] = multdiv_sel ? multdiv_imd_val_d[1] : {2'b0, alu_imd_val_d[1]}; + assign imd_val_we_o = multdiv_sel ? multdiv_imd_val_we : alu_imd_val_we; + + assign alu_imd_val_q = '{imd_val_q_i[0][31:0], imd_val_q_i[1][31:0]}; assign result_ex_o = multdiv_sel ? multdiv_result : alu_result; @@ -117,7 +121,7 @@ module ibex_ex_block #( .operand_a_i ( alu_operand_a_i ), .operand_b_i ( alu_operand_b_i ), .instr_first_cycle_i ( alu_instr_first_cycle_i ), - .imd_val_q_i ( imd_val_q_i[31:0] ), + .imd_val_q_i ( alu_imd_val_q ), .imd_val_we_o ( alu_imd_val_we ), .imd_val_d_o ( alu_imd_val_d ), .multdiv_operand_a_i ( multdiv_alu_operand_a ), @@ -218,6 +222,6 @@ module ibex_ex_block #( // Multiplier/divider may require multiple cycles. The ALU output is valid in the same cycle // unless the intermediate result register is being written (which indicates this isn't the // final cycle of ALU operation). - assign ex_valid_o = multdiv_sel ? multdiv_valid : !alu_imd_val_we; + assign ex_valid_o = multdiv_sel ? 
multdiv_valid : ~(|alu_imd_val_we); endmodule diff --git a/rtl/ibex_id_stage.sv b/rtl/ibex_id_stage.sv index ee63142a..2552b86b 100644 --- a/rtl/ibex_id_stage.sv +++ b/rtl/ibex_id_stage.sv @@ -17,13 +17,13 @@ `include "prim_assert.sv" module ibex_id_stage #( - parameter bit RV32E = 0, - parameter bit RV32M = 1, - parameter bit RV32B = 0, - parameter bit DataIndTiming = 1'b0, - parameter bit BranchTargetALU = 0, - parameter bit SpecBranch = 0, - parameter bit WritebackStage = 0 + parameter bit RV32E = 0, + parameter bit RV32M = 1, + parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone, + parameter bit DataIndTiming = 1'b0, + parameter bit BranchTargetALU = 0, + parameter bit SpecBranch = 0, + parameter bit WritebackStage = 0 ) ( input logic clk_i, input logic rst_ni, @@ -68,9 +68,9 @@ module ibex_id_stage #( output logic [31:0] alu_operand_b_ex_o, // Multicycle Operation Stage Register - input logic imd_val_we_ex_i, - input logic [33:0] imd_val_d_ex_i, - output logic [33:0] imd_val_q_ex_o, + input logic [1:0] imd_val_we_ex_i, + input logic [33:0] imd_val_d_ex_i[2], + output logic [33:0] imd_val_q_ex_o[2], // Branch target ALU output logic [31:0] bt_a_operand_o, @@ -247,7 +247,7 @@ module ibex_id_stage #( logic alu_multicycle_dec; logic stall_alu; - logic [33:0] imd_val_q; + logic [33:0] imd_val_q[2]; op_a_sel_e bt_a_mux_sel; imm_b_sel_e bt_b_mux_sel; @@ -379,11 +379,13 @@ module ibex_id_stage #( // Multicycle Operation Stage Register // ///////////////////////////////////////// - always_ff @(posedge clk_i or negedge rst_ni) begin : intermediate_val_reg - if (!rst_ni) begin - imd_val_q <= '0; - end else if (imd_val_we_ex_i) begin - imd_val_q <= imd_val_d_ex_i; + for (genvar i=0; i<2; i++) begin : gen_intermediate_val_reg + always_ff @(posedge clk_i or negedge rst_ni) begin : intermediate_val_reg + if (!rst_ni) begin + imd_val_q[i] <= '0; + end else if (imd_val_we_ex_i[i]) begin + imd_val_q[i] <= imd_val_d_ex_i[i]; + end end end diff --git 
a/rtl/ibex_multdiv_fast.sv b/rtl/ibex_multdiv_fast.sv index 53fd6913..617bb516 100644 --- a/rtl/ibex_multdiv_fast.sv +++ b/rtl/ibex_multdiv_fast.sv @@ -35,9 +35,9 @@ module ibex_multdiv_fast #( output logic [32:0] alu_operand_a_o, output logic [32:0] alu_operand_b_o, - input logic [33:0] imd_val_q_i, - output logic [33:0] imd_val_d_o, - output logic imd_val_we_o, + input logic [33:0] imd_val_q_i[2], + output logic [33:0] imd_val_d_o[2], + output logic [1:0] imd_val_we_o, input logic multdiv_ready_id_i, @@ -99,13 +99,11 @@ module ibex_multdiv_fast #( if (!rst_ni) begin div_counter_q <= '0; md_state_q <= MD_IDLE; - op_denominator_q <= '0; op_numerator_q <= '0; op_quotient_q <= '0; div_by_zero_q <= '0; end else if (div_en_internal) begin div_counter_q <= div_counter_d; - op_denominator_q <= op_denominator_d; op_numerator_q <= op_numerator_d; op_quotient_q <= op_quotient_d; md_state_q <= md_state_d; @@ -113,18 +111,24 @@ module ibex_multdiv_fast #( end end - `ASSERT_KNOWN(DivEnKnown, div_en_internal); `ASSERT_KNOWN(MultEnKnown, mult_en_internal); `ASSERT_KNOWN(MultDivEnKnown, multdiv_en); assign multdiv_en = mult_en_internal | div_en_internal; - assign imd_val_d_o = div_sel_i ? op_remainder_d : mac_res_d; - assign imd_val_we_o = multdiv_en; + // Intermediate value register shared with ALU + assign imd_val_d_o[0] = div_sel_i ? op_remainder_d : mac_res_d; + assign imd_val_we_o[0] = multdiv_en; + + assign imd_val_d_o[1] = {2'b0, op_denominator_d}; + assign imd_val_we_o[1] = div_en_internal; + assign op_denominator_q = imd_val_q_i[1][31:0]; + logic [1:0] unused_imd_val; + assign unused_imd_val = imd_val_q_i[1][33:32]; assign signed_mult = (signed_mode_i != 2'b00); - assign multdiv_result_o = div_sel_i ? imd_val_q_i[31:0] : mac_res_d[31:0]; + assign multdiv_result_o = div_sel_i ? imd_val_q_i[0][31:0] : mac_res_d[31:0]; // The single cycle multiplier uses three 17 bit multipliers to compute MUL instructions in a // single cycle and MULH instructions in two cycles. 
@@ -170,8 +174,8 @@ module ibex_multdiv_fast #( assign mult2_op_b = op_b_i[`OP_H]; // used in MULH - assign accum[17:0] = imd_val_q_i[33:16]; - assign accum[33:18] = {16{signed_mult & imd_val_q_i[33]}}; + assign accum[17:0] = imd_val_q_i[0][33:16]; + assign accum[33:18] = {16{signed_mult & imd_val_q_i[0][33]}}; always_comb begin // Default values == MULL @@ -268,7 +272,7 @@ module ibex_multdiv_fast #( mult_op_b = op_b_i[`OP_L]; sign_a = 1'b0; sign_b = 1'b0; - accum = imd_val_q_i; + accum = imd_val_q_i[0]; mac_res_d = mac_res; mult_state_d = mult_state_q; mult_valid = 1'b0; @@ -293,10 +297,10 @@ module ibex_multdiv_fast #( mult_op_b = op_b_i[`OP_H]; sign_a = 1'b0; sign_b = signed_mode_i[1] & op_b_i[31]; - // result of AL*BL (in imd_val_q_i) always unsigned with no carry, so carries_q always 00 - accum = {18'b0, imd_val_q_i[31:16]}; + // result of AL*BL (in imd_val_q_i[0]) always unsigned with no carry, so carries_q always 00 + accum = {18'b0, imd_val_q_i[0][31:16]}; if (operator_i == MD_OP_MULL) begin - mac_res_d = {2'b0, mac_res[`OP_L], imd_val_q_i[`OP_L]}; + mac_res_d = {2'b0, mac_res[`OP_L], imd_val_q_i[0][`OP_L]}; end else begin // MD_OP_MULH mac_res_d = mac_res; @@ -311,15 +315,15 @@ module ibex_multdiv_fast #( sign_a = signed_mode_i[0] & op_a_i[31]; sign_b = 1'b0; if (operator_i == MD_OP_MULL) begin - accum = {18'b0, imd_val_q_i[31:16]}; - mac_res_d = {2'b0, mac_res[15:0], imd_val_q_i[15:0]}; + accum = {18'b0, imd_val_q_i[0][31:16]}; + mac_res_d = {2'b0, mac_res[15:0], imd_val_q_i[0][15:0]}; mult_valid = 1'b1; // Note no state transition will occur if mult_hold is set mult_state_d = ALBL; mult_hold = ~multdiv_ready_id_i; end else begin - accum = imd_val_q_i; + accum = imd_val_q_i[0]; mac_res_d = mac_res; mult_state_d = AHBH; end @@ -332,8 +336,8 @@ module ibex_multdiv_fast #( mult_op_b = op_b_i[`OP_H]; sign_a = signed_mode_i[0] & op_a_i[31]; sign_b = signed_mode_i[1] & op_b_i[31]; - accum[17: 0] = imd_val_q_i[33:16]; - accum[33:18] = {16{signed_mult & 
imd_val_q_i[33]}}; + accum[17: 0] = imd_val_q_i[0][33:16]; + accum[33:18] = {16{signed_mult & imd_val_q_i[0][33]}}; // result of AH*BL is not signed only if signed_mode_i == 2'b00 mac_res_d = mac_res; mult_valid = 1'b1; @@ -366,7 +370,7 @@ module ibex_multdiv_fast #( // Divider assign res_adder_h = alu_adder_ext_i[33:1]; - assign next_remainder = is_greater_equal ? res_adder_h[31:0] : imd_val_q_i[31:0]; + assign next_remainder = is_greater_equal ? res_adder_h[31:0] : imd_val_q_i[0][31:0]; assign next_quotient = is_greater_equal ? {1'b0, op_quotient_q} | {1'b0, one_shift} : {1'b0, op_quotient_q}; @@ -376,10 +380,10 @@ module ibex_multdiv_fast #( // Remainder - Divisor. If Remainder - Divisor >= 0, is_greater_equal is equal to 1, // the next Remainder is Remainder - Divisor contained in res_adder_h and the always_comb begin - if ((imd_val_q_i[31] ^ op_denominator_q[31]) == 1'b0) begin + if ((imd_val_q_i[0][31] ^ op_denominator_q[31]) == 1'b0) begin is_greater_equal = (res_adder_h[31] == 1'b0); end else begin - is_greater_equal = imd_val_q_i[31]; + is_greater_equal = imd_val_q_i[0][31]; end end @@ -391,7 +395,7 @@ module ibex_multdiv_fast #( always_comb begin div_counter_d = div_counter_q - 5'h1; - op_remainder_d = imd_val_q_i; + op_remainder_d = imd_val_q_i[0]; op_quotient_d = op_quotient_q; md_state_d = md_state_q; op_numerator_d = op_numerator_q; @@ -457,13 +461,13 @@ module ibex_multdiv_fast #( op_quotient_d = next_quotient[31:0]; md_state_d = (div_counter_q == 5'd1) ? MD_LAST : MD_COMP; // Division - alu_operand_a_o = {imd_val_q_i[31:0], 1'b1}; // it contains the remainder + alu_operand_a_o = {imd_val_q_i[0][31:0], 1'b1}; // it contains the remainder alu_operand_b_o = {~op_denominator_q[31:0], 1'b1}; // -denominator two's compliment end MD_LAST: begin if (operator_i == MD_OP_DIV) begin - // this time we save the quotient in op_remainder_d (i.e. imd_val_q_i) since + // this time we save the quotient in op_remainder_d (i.e. 
imd_val_q_i[0]) since // we do not need anymore the remainder op_remainder_d = {1'b0, next_quotient}; end else begin @@ -471,7 +475,7 @@ module ibex_multdiv_fast #( op_remainder_d = {2'b0, next_remainder[31:0]}; end // Division - alu_operand_a_o = {imd_val_q_i[31:0], 1'b1}; // it contains the remainder + alu_operand_a_o = {imd_val_q_i[0][31:0], 1'b1}; // it contains the remainder alu_operand_b_o = {~op_denominator_q[31:0], 1'b1}; // -denominator two's compliment md_state_d = MD_CHANGE_SIGN; @@ -480,13 +484,13 @@ module ibex_multdiv_fast #( MD_CHANGE_SIGN: begin md_state_d = MD_FINISH; if (operator_i == MD_OP_DIV) begin - op_remainder_d = (div_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i; + op_remainder_d = (div_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i[0]; end else begin - op_remainder_d = (rem_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i; + op_remainder_d = (rem_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i[0]; end // ABS(Quotient) = 0 - Quotient (or Remainder) alu_operand_a_o = {32'h0 , 1'b1}; - alu_operand_b_o = {~imd_val_q_i[31:0], 1'b1}; + alu_operand_b_o = {~imd_val_q_i[0][31:0], 1'b1}; end MD_FINISH: begin diff --git a/rtl/ibex_multdiv_slow.sv b/rtl/ibex_multdiv_slow.sv index b3038cb4..bcd04b0f 100644 --- a/rtl/ibex_multdiv_slow.sv +++ b/rtl/ibex_multdiv_slow.sv @@ -31,9 +31,9 @@ module ibex_multdiv_slow output logic [32:0] alu_operand_a_o, output logic [32:0] alu_operand_b_o, - input logic [33:0] imd_val_q_i, - output logic [33:0] imd_val_d_o, - output logic imd_val_we_o, + input logic [33:0] imd_val_q_i[2], + output logic [33:0] imd_val_d_o[2], + output logic [1:0] imd_val_we_o, input logic multdiv_ready_id_i, @@ -50,7 +50,8 @@ module ibex_multdiv_slow md_fsm_e md_state_q, md_state_d; logic [32:0] accum_window_q, accum_window_d; - logic unused_imd_val; + logic unused_imd_val0; + logic [ 1:0] unused_imd_val1; logic [32:0] res_adder_l; logic [32:0] res_adder_h; @@ -81,11 +82,16 @@ module ibex_multdiv_slow // ALU Operand MUX // 
///////////////////// - // Use shared intermediate value register in id_stage for accum_window - assign imd_val_d_o = {1'b0,accum_window_d}; - assign imd_val_we_o = ~multdiv_hold; - assign accum_window_q = imd_val_q_i[32:0]; - assign unused_imd_val = imd_val_q_i[33]; + // Intermediate value register shared with ALU + assign imd_val_d_o[0] = {1'b0,accum_window_d}; + assign imd_val_we_o[0] = ~multdiv_hold; + assign accum_window_q = imd_val_q_i[0][32:0]; + assign unused_imd_val0 = imd_val_q_i[0][33]; + + assign imd_val_d_o[1] = {2'b00, op_numerator_d}; + assign imd_val_we_o[1] = multdiv_en; + assign op_numerator_q = imd_val_q_i[1][31:0]; + assign unused_imd_val1 = imd_val_q_i[1][33:32]; always_comb begin alu_operand_a_o = accum_window_q; @@ -328,14 +334,12 @@ module ibex_multdiv_slow multdiv_count_q <= 5'h0; op_b_shift_q <= 33'h0; op_a_shift_q <= 33'h0; - op_numerator_q <= 32'h0; md_state_q <= MD_IDLE; div_by_zero_q <= 1'b0; end else if (multdiv_en) begin multdiv_count_q <= multdiv_count_d; op_b_shift_q <= op_b_shift_d; op_a_shift_q <= op_a_shift_d; - op_numerator_q <= op_numerator_d; md_state_q <= md_state_d; div_by_zero_q <= div_by_zero_d; end diff --git a/rtl/ibex_pkg.sv b/rtl/ibex_pkg.sv index 3ecd4015..bb086ecb 100644 --- a/rtl/ibex_pkg.sv +++ b/rtl/ibex_pkg.sv @@ -8,6 +8,15 @@ */ package ibex_pkg; +////////////////////////// +// RV32B Parameter Enum // +////////////////////////// + +typedef enum integer { + RV32BNone, + RV32BBalanced, + RV32BFull +} rv32b_e; ///////////// // Opcodes //