diff --git a/README.md b/README.md
index e840e7bf..50fe8ddc 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ The options include different choices for the architecture of the multiplier uni
 The table below indicates performance, area and verification status for a few selected configurations.
 These are configurations on which lowRISC is focusing for performance evaluation and design verification (see [supported configs](ibex_configs.yaml)).
 
-| Config | "small" | "maxperf" | "maxperf-pmp-bm" |
+| Config | "small" | "maxperf" | "maxperf-pmp-bmfull" |
 | ------ | ------- | --------- | ---------------- |
 | Features | RV32IMC, 3 cycle mult | RV32IMC, 1 cycle mult, Branch target ALU, Writeback stage | RV32IMCB, 1 cycle mult, Branch target ALU, Writeback stage, 16 PMP regions |
 | Performance (Coremark/MHz) | 2.44 | 3.09 | 3.09 |
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index db7212ad..076912b5 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -159,4 +159,4 @@ jobs:
       ibex_configs:
         - small
         - experimental-maxperf-pmp
-        - experimental-maxperf-pmp-bm
+        - experimental-maxperf-pmp-bmfull
diff --git a/doc/instruction_decode_execute.rst b/doc/instruction_decode_execute.rst
index 00e090f1..1ec7f444 100644
--- a/doc/instruction_decode_execute.rst
+++ b/doc/instruction_decode_execute.rst
@@ -64,10 +64,46 @@ Other blocks use the ALU for the following tasks:
 * It computes memory addresses for loads and stores with a Reg + Imm calculation
 * The LSU uses it to increment addresses when performing two accesses to handle an unaligned access
 
-Support for the RISC-V Bitmanipulation Extension (Document Version 0.92, November 8, 2019) is enabled via the parameter ``RV32B``.
-This feature is *EXPERIMENTAL* and the details of its impact are not yet documented here.
-Currently the Zbb, Zbs, Zbp, Zbe, Zbf, Zbc, Zbr and Zbt sub-extensions are implemented.
-The rotate instructions `ror` and `rol` (Zbb), ternary instructions `cmov`, `cmix`, `fsl` and `fsr` as well as cyclic redundancy checks `crc32[c]` (Zbr) are completed in 2 cycles. All remaining instructions complete in one cycle.
+Bit Manipulation Extension
+  Support for the `RISC-V Bit Manipulation Extension (Document Version 0.92, November 8, 2019) <https://github.com/riscv/riscv-bitmanip/blob/master/bitmanip-0.92.pdf>`_ is enabled via the enumerated parameter ``RV32B`` defined in :file:`rtl/ibex_pkg.sv`.
+  This feature is *Experimental*.
+
+  There are two versions of the bit manipulation extension available:
+  The balanced implementation comprises a set of sub-extensions aiming for good benefits at a reasonable area overhead.
+  The full implementation comprises all 32-bit instructions defined in the extension.
+  The following table lists the implemented instructions in each version.
+  Multi-cycle instructions are completed in 2 cycles.
+  All remaining instructions complete in a single cycle.
+
+  +---------------------------+---------------+--------------------------+
+  | Z-Extension               | Version       | Multi-Cycle Instructions |
+  +===========================+===============+==========================+
+  | Zbb (Base)                | Balanced/Full | rol, ror[i]              |
+  +---------------------------+---------------+--------------------------+
+  | Zbs (Single-bit)          | Balanced/Full | None                     |
+  +---------------------------+---------------+--------------------------+
+  | Zbp (Permutation)         | Full          | None                     |
+  +---------------------------+---------------+--------------------------+
+  | Zbe (Bit extract/deposit) | Full          | All                      |
+  +---------------------------+---------------+--------------------------+
+  | Zbf (Bit-field place)     | Balanced/Full | All                      |
+  +---------------------------+---------------+--------------------------+
+  | Zbc (Carry-less multiply) | Full          | None                     |
+  +---------------------------+---------------+--------------------------+
+  | Zbr (Crc)                 | Full          | All                      |
+  +---------------------------+---------------+--------------------------+
+  | Zbt (Ternary)             | Balanced/Full | All                      |
+  +---------------------------+---------------+--------------------------+
+  | Zb_tmp (Temporary)*       | Balanced/Full | None                     |
+  +---------------------------+---------------+--------------------------+
+
+  * The sign-extend instructions `sext.b/sext.h` are defined but not yet classified in version 0.92 of the extension proposal.
+    Temporarily, they are assigned a separate Z-extension.
+
+  The implementation of the B-extension comes with an area overhead of 1.8 to 3.0 kGE for the balanced version and 6.0 to 8.7 kGE for the full version.
+  That corresponds to an approximate percentage increase in area of 9 to 14 % and 25 to 30 % for the balanced and full versions respectively.
+  The ranges correspond to synthesis results generated using relaxed and maximum frequency targets respectively.
+  The designs have been synthesized using Synopsys Design Compiler targeting TSMC 65 nm technology.
 
 
 .. _mult-div:
diff --git a/doc/integration.rst b/doc/integration.rst
index d410bbf8..f7233d3f 100644
--- a/doc/integration.rst
+++ b/doc/integration.rst
@@ -12,21 +12,21 @@ Instantiation Template
 .. code-block:: verilog
 
   ibex_core #(
-      .PMPEnable                ( 0            ),
-      .PMPGranularity           ( 0            ),
-      .PMPNumRegions            ( 4            ),
-      .MHPMCounterNum           ( 0            ),
-      .MHPMCounterWidth         ( 40           ),
-      .RV32E                    ( 0            ),
-      .RV32M                    ( 1            ),
-      .RV32B                    ( 0            ),
-      .MultiplierImplementation ( "fast"       ),
-      .ICache                   ( 0            ),
-      .ICacheECC                ( 0            ),
-      .SecureIbex               ( 0            ),
-      .DbgTriggerEn             ( 0            ),
-      .DmHaltAddr               ( 32'h1A110800 ),
-      .DmExceptionAddr          ( 32'h1A110808 )
+      .PMPEnable                ( 0                   ),
+      .PMPGranularity           ( 0                   ),
+      .PMPNumRegions            ( 4                   ),
+      .MHPMCounterNum           ( 0                   ),
+      .MHPMCounterWidth         ( 40                  ),
+      .RV32E                    ( 0                   ),
+      .RV32M                    ( 1                   ),
+      .RV32B                    ( ibex_pkg::RV32BNone ),
+      .MultiplierImplementation ( "fast"              ),
+      .ICache                   ( 0                   ),
+      .ICacheECC                ( 0                   ),
+      .SecureIbex               ( 0                   ),
+      .DbgTriggerEn             ( 0                   ),
+      .DmHaltAddr               ( 32'h1A110800        ),
+      .DmExceptionAddr          ( 32'h1A110808        )
   ) u_core (
       // Clock and reset
       .clk_i          (),
@@ -74,55 +74,55 @@ Instantiation Template
 Parameters
 ----------
 
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| Name                         | Type/Range  | Default    | Description                                                     |
-+==============================+=============+============+=================================================================+
-| ``PMPEnable``                | bit         | 0          | Enable PMP support                                              |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``PMPGranularity``           | int (0..31) | 0          | Minimum granularity of PMP address matching                     |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``PMPNumRegions``            | int (1..16) | 4          | Number implemented PMP regions (ignored if PMPEnable == 0)      |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``MHPMCounterNum``           | int (0..10) | 0          | Number of performance monitor event counters                    |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``MHPMCounterWidth``         | int (64..1) | 40         | Bit width of performance monitor event counters                 |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``RV32E``                    | bit         | 0          | RV32E mode enable (16 integer registers only)                   |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``RV32M``                    | bit         | 1          | M(ultiply) extension enable                                     |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``RV32B``                    | bit         | 0          | *EXPERIMENTAL* - B(itmanipulation) extension enable:            |
-|                              |             |            | Currently supported Z-extensions: Zbb (base), Zbs (single-bit)  |
-|                              |             |            | Zbp (bit permutation), Zbe (bit extract/deposit),               |
-|                              |             |            | Zbf (bit-field place) Zbc (carry-less multiplication)           |
-|                              |             |            | Zbr (cyclic redundancy check) and Zbt (ternary)                 |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``BranchTargetALU``          | bit         | 0          | *EXPERIMENTAL* - Enables branch target ALU removing a stall     |
-|                              |             |            | cycle from taken branches                                       |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``WritebackStage``           | bit         | 0          | *EXPERIMENTAL* - Enables third pipeline stage (writeback)       |
-|                              |             |            | improving performance of loads and stores                       |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``MultiplierImplementation`` | string      | "fast"     | Multiplicator type:                                             |
-|                              |             |            | "slow": multi-cycle slow,                                       |
-|                              |             |            | "fast": multi-cycle fast,                                       |
-|                              |             |            | "single-cycle": single-cycle                                    |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``ICache``                   | bit         | 0          | *EXPERIMENTAL* Enable instruction cache instead of prefetch     |
-|                              |             |            | buffer                                                          |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``ICacheECC``                | bit         | 0          | *EXPERIMENTAL* Enable SECDED ECC protection in ICache (if       |
-|                              |             |            | ICache == 1)                                                    |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``SecureIbex``               | bit         | 0          | *EXPERIMENTAL* Enable various additional features targeting     |
-|                              |             |            | secure code execution.                                          |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``DbgTriggerEn``             | bit         | 0          | Enable debug trigger support (one trigger only)                 |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``DmHaltAddr``               | int         | 0x1A110800 | Address to jump to when entering Debug Mode                     |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
-| ``DmExceptionAddr``          | int         | 0x1A110808 | Address to jump to when an exception occurs while in Debug Mode |
-+------------------------------+-------------+------------+-----------------------------------------------------------------+
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| Name                         | Type/Range        | Default    | Description                                                     |
++==============================+===================+============+=================================================================+
+| ``PMPEnable``                | bit               | 0          | Enable PMP support                                              |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``PMPGranularity``           | int (0..31)       | 0          | Minimum granularity of PMP address matching                     |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``PMPNumRegions``            | int (1..16)       | 4          | Number of implemented PMP regions (ignored if PMPEnable == 0)   |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``MHPMCounterNum``           | int (0..10)       | 0          | Number of performance monitor event counters                    |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``MHPMCounterWidth``         | int (1..64)       | 40         | Bit width of performance monitor event counters                 |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``RV32E``                    | bit               | 0          | RV32E mode enable (16 integer registers only)                   |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``RV32M``                    | bit               | 1          | M(ultiply) extension enable                                     |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``RV32B``                    | ibex_pkg::rv32b_e | RV32BNone  | *EXPERIMENTAL* - B(itmanipulation) extension select:            |
+|                              |                   |            | "RV32BNone": No B-extension                                     |
+|                              |                   |            | "RV32BBalanced": Sub-extensions Zbb, Zbs, Zbf and               |
+|                              |                   |            | Zbt                                                             |
+|                              |                   |            | "RV32BFull": All sub-extensions                                 |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``BranchTargetALU``          | bit               | 0          | *EXPERIMENTAL* - Enables branch target ALU removing a stall     |
+|                              |                   |            | cycle from taken branches                                       |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``WritebackStage``           | bit               | 0          | *EXPERIMENTAL* - Enables third pipeline stage (writeback)       |
+|                              |                   |            | improving performance of loads and stores                       |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``MultiplierImplementation`` | string            | "fast"     | Multiplier type:                                                |
+|                              |                   |            | "slow": multi-cycle slow,                                       |
+|                              |                   |            | "fast": multi-cycle fast,                                       |
+|                              |                   |            | "single-cycle": single-cycle                                    |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``ICache``                   | bit               | 0          | *EXPERIMENTAL* Enable instruction cache instead of prefetch     |
+|                              |                   |            | buffer                                                          |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``ICacheECC``                | bit               | 0          | *EXPERIMENTAL* Enable SECDED ECC protection in ICache (if       |
+|                              |                   |            | ICache == 1)                                                    |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``SecureIbex``               | bit               | 0          | *EXPERIMENTAL* Enable various additional features targeting     |
+|                              |                   |            | secure code execution.                                          |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``DbgTriggerEn``             | bit               | 0          | Enable debug trigger support (one trigger only)                 |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``DmHaltAddr``               | int               | 0x1A110800 | Address to jump to when entering Debug Mode                     |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
+| ``DmExceptionAddr``          | int               | 0x1A110808 | Address to jump to when an exception occurs while in Debug Mode |
++------------------------------+-------------------+------------+-----------------------------------------------------------------+
 
 Any parameter marked *EXPERIMENTAL* when enabled is not verified to the same standard as the rest of the Ibex core.
 
diff --git a/doc/introduction.rst b/doc/introduction.rst
index e0d8c0c5..0aaa5fdc 100644
--- a/doc/introduction.rst
+++ b/doc/introduction.rst
@@ -46,6 +46,10 @@ In addition, the following instruction set extensions are available.
      - 2.0
      - optional
 
+   * - **B**: *EXPERIMENTAL* Standard Extension for Bit Manipulation Instructions
+     - 0.92
+     - optional
+
    * - **Zicsr**: Control and Status Register Instructions
      - 2.0
      - always enabled
diff --git a/dv/riscv_compliance/ibex_riscv_compliance.core b/dv/riscv_compliance/ibex_riscv_compliance.core
index 99661cb7..33b5d40a 100644
--- a/dv/riscv_compliance/ibex_riscv_compliance.core
+++ b/dv/riscv_compliance/ibex_riscv_compliance.core
@@ -37,10 +37,10 @@ parameters:
     description: "Enable the E ISA extension (reduced register set) [0/1]"
 
   RV32B:
-    datatype: int
-    paramtype: vlogparam
-    default: 0
-    description: "Enable the B ISA extension (bit manipulation EXPERIMENTAL) [0/1]"
+    datatype: str
+    default: ibex_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)"
 
   SRAM_INIT_FILE:
     datatype: str
diff --git a/dv/uvm/core_ibex/Makefile b/dv/uvm/core_ibex/Makefile
index 3291a8c0..105b04e3 100644
--- a/dv/uvm/core_ibex/Makefile
+++ b/dv/uvm/core_ibex/Makefile
@@ -65,7 +65,7 @@ PMP_REGIONS         := 16
 # PMP Granularity
 PMP_GRANULARITY     := 0
 
-IBEX_CONFIG         := experimental-maxperf-pmp-bm
+IBEX_CONFIG         := experimental-maxperf-pmp-bmfull
 
 # TODO(udinator) - might need options for SAIL/Whisper/Spike
 ifeq (${ISS},ovpsim)
diff --git a/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml b/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml
index 9605190b..a9cb249e 100644
--- a/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml
+++ b/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml
@@ -643,12 +643,22 @@
     +pmp_allow_addr_overlap=1
   rtl_test: core_ibex_base_test
 
-- test: riscv_bitmanip_test
+- test: riscv_bitmanip_full_test
   desc: >
-    Random instruction test with supported B extension instructions
+    Random instruction test with supported B extension instructions in full configuration
   iterations: 10
   gen_test: riscv_rand_instr_test
   gen_opts: >
     +enable_b_extension=1
-    +enable_bitmanip_groups=zbb,zbt,zbs,zbp,zbf,zbe,zbc,zbr
+    +enable_bitmanip_groups=zbb,zb_tmp,zbt,zbs,zbp,zbf,zbe,zbc,zbr
+  rtl_test: core_ibex_base_test
+
+- test: riscv_bitmanip_balanced_test
+  desc: >
+    Random instruction test with supported B extension instructions in balanced configuration
+  iterations: 10
+  gen_test: riscv_rand_instr_test
+  gen_opts: >
+    +enable_b_extension=1
+    +enable_bitmanip_groups=zbb,zb_tmp,zbt,zbs,zbf
   rtl_test: core_ibex_base_test
diff --git a/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv b/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv
index 87c7a885..57c5915a 100644
--- a/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv
+++ b/dv/uvm/core_ibex/tb/core_ibex_tb_top.sv
@@ -32,12 +32,16 @@ module core_ibex_tb_top;
     `define IBEX_MULTIPLIER_IMPLEMENTATION fast
   `endif
 
+  `ifndef IBEX_CFG_RV32B
+    `define IBEX_CFG_RV32B ibex_pkg::RV32BNone
+  `endif
+
   parameter bit          PMPEnable       = 1'b0;
   parameter int unsigned PMPGranularity  = 0;
   parameter int unsigned PMPNumRegions   = 4;
   parameter bit RV32E                    = 1'b0;
   parameter bit RV32M                    = 1'b1;
-  parameter bit RV32B                    = 1'b0;
+  parameter ibex_pkg::rv32b_e RV32B      = `IBEX_CFG_RV32B;
   parameter bit BranchTargetALU          = 1'b0;
   parameter bit WritebackStage           = 1'b0;
 
diff --git a/examples/simple_system/ibex_simple_system.core b/examples/simple_system/ibex_simple_system.core
index 46db5399..f5aa9fe2 100644
--- a/examples/simple_system/ibex_simple_system.core
+++ b/examples/simple_system/ibex_simple_system.core
@@ -36,10 +36,10 @@ parameters:
     description: "Enable the E ISA extension (reduced register set) [0/1]"
 
   RV32B:
-    datatype: int
-    paramtype: vlogparam
-    default: 0
-    description: "Enable the B ISA extension (bit manipulation EXPERIMENTAL) [0/1]"
+    datatype: str
+    default: ibex_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)"
 
   SRAM_INIT_FILE:
     datatype: str
diff --git a/examples/simple_system/rtl/ibex_simple_system.sv b/examples/simple_system/rtl/ibex_simple_system.sv
index a37bd993..23926e1a 100644
--- a/examples/simple_system/rtl/ibex_simple_system.sv
+++ b/examples/simple_system/rtl/ibex_simple_system.sv
@@ -2,6 +2,10 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+`ifndef RV32B
+  `define RV32B ibex_pkg::RV32BNone
+`endif
+
 /**
  * Ibex simple system
  *
@@ -19,15 +23,15 @@ module ibex_simple_system (
   input IO_RST_N
 );
 
-  parameter bit          PMPEnable       = 1'b0;
-  parameter int unsigned PMPGranularity  = 0;
-  parameter int unsigned PMPNumRegions   = 4;
-  parameter bit RV32E                    = 1'b0;
-  parameter bit RV32M                    = 1'b1;
-  parameter bit RV32B                    = 1'b0;
-  parameter bit BranchTargetALU          = 1'b0;
-  parameter bit WritebackStage           = 1'b0;
-  parameter     MultiplierImplementation = "fast";
+  parameter bit               PMPEnable                = 1'b0;
+  parameter int unsigned      PMPGranularity           = 0;
+  parameter int unsigned      PMPNumRegions            = 4;
+  parameter bit               RV32E                    = 1'b0;
+  parameter bit               RV32M                    = 1'b1;
+  parameter ibex_pkg::rv32b_e RV32B                    = `RV32B;
+  parameter bit               BranchTargetALU          = 1'b0;
+  parameter bit               WritebackStage           = 1'b0;
+  parameter                   MultiplierImplementation = "fast";
 
   logic clk_sys = 1'b0, rst_sys_n;
 
diff --git a/ibex_configs.yaml b/ibex_configs.yaml
index 6929140b..ed0913c5 100644
--- a/ibex_configs.yaml
+++ b/ibex_configs.yaml
@@ -10,7 +10,7 @@
 small:
   RV32E                    : 0
   RV32M                    : 1
-  RV32B                    : 0
+  RV32B                    : "ibex_pkg::RV32BNone"
   BranchTargetALU          : 0
   WritebackStage           : 0
   MultiplierImplementation : "fast"
@@ -28,7 +28,7 @@ small:
 experimental-maxperf:
   RV32E                    : 0
   RV32M                    : 1
-  RV32B                    : 0
+  RV32B                    : "ibex_pkg::RV32BNone"
   BranchTargetALU          : 1
   WritebackStage           : 1
   MultiplierImplementation : "single-cycle"
@@ -40,7 +40,7 @@ experimental-maxperf:
 experimental-maxperf-pmp:
   RV32E                    : 0
   RV32M                    : 1
-  RV32B                    : 0
+  RV32B                    : "ibex_pkg::RV32BNone"
   BranchTargetALU          : 1
   WritebackStage           : 1
   MultiplierImplementation : "single-cycle"
@@ -48,14 +48,27 @@ experimental-maxperf-pmp:
   PMPGranularity           : 0
   PMPNumRegions            : 16
 
-# experimental-maxperf-pmp config above with bitmanip extension
-experimental-maxperf-pmp-bm:
+# experimental-maxperf-pmp config above with balanced bitmanip extension
+experimental-maxperf-pmp-bmbalanced:
   RV32E                    : 0
   RV32M                    : 1
-  RV32B                    : 1
+  RV32B                    : "ibex_pkg::RV32BBalanced"
   BranchTargetALU          : 1
   WritebackStage           : 1
   MultiplierImplementation : "single-cycle"
   PMPEnable                : 1
   PMPGranularity           : 0
   PMPNumRegions            : 16
+
+# experimental-maxperf-pmp config above with full bitmanip extension
+experimental-maxperf-pmp-bmfull:
+  RV32E                    : 0
+  RV32M                    : 1
+  RV32B                    : "ibex_pkg::RV32BFull"
+  BranchTargetALU          : 1
+  WritebackStage           : 1
+  MultiplierImplementation : "single-cycle"
+  PMPEnable                : 1
+  PMPGranularity           : 0
+  PMPNumRegions            : 16
+
diff --git a/ibex_core.core b/ibex_core.core
index 0c45f590..6c640658 100644
--- a/ibex_core.core
+++ b/ibex_core.core
@@ -72,9 +72,10 @@ parameters:
     paramtype: vlogparam
 
   RV32B:
-    datatype: int
-    default: 0
-    paramtype: vlogparam
+    datatype: str
+    default: ibex_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)"
 
   MultiplierImplementation:
     datatype: str
diff --git a/ibex_core_tracing.core b/ibex_core_tracing.core
index 619436b1..765bbe67 100644
--- a/ibex_core_tracing.core
+++ b/ibex_core_tracing.core
@@ -43,9 +43,10 @@ parameters:
     paramtype: vlogparam
 
   RV32B:
-    datatype: int
-    default: 0
-    paramtype: vlogparam
+    datatype: str
+    default: ibex_pkg::RV32BNone
+    paramtype: vlogdefine
+    description: "Bitmanip implementation parameter enum. See ibex_pkg.sv (EXPERIMENTAL)"
 
   MultiplierImplementation:
     datatype: str
diff --git a/lint/verilator_waiver.vlt b/lint/verilator_waiver.vlt
index ee041ae2..8049d659 100644
--- a/lint/verilator_waiver.vlt
+++ b/lint/verilator_waiver.vlt
@@ -37,12 +37,18 @@ lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'shift_amt_compl'[5]*"
 // cleaner to write all bits even if not all are used
 lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'shift_result_ext'[32]*"
 
-// Signal is not used for RV32B == 0: imd_val_q_i
+// Signal is not used for RV32B == RV32BNone: imd_val_q_i
 //
 // No ALU multicycle instructions exist to use the intermediate value register,
 // if bitmanipulation extension is not enabled.
 lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'imd_val_q_i'"
 
+// Signal is not used for RV32B == RV32BNone: butterfly_result, invbutterfly_result
+//
+// Need to be declared; referenced in unused if-generate block
+lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'butterfly_result'"
+lint_off -rule UNUSED -file "*/rtl/ibex_alu.sv" -match "*'invbutterfly_result'"
+
 // Bits of signal are not used: fetch_addr_n[0]
 // cleaner to write all bits even if not all are used
 lint_off -rule UNUSED -file "*/rtl/ibex_if_stage.sv" -match "*'fetch_addr_n'[0]*"
diff --git a/rtl/ibex_alu.sv b/rtl/ibex_alu.sv
index 0f593285..faa154c5 100644
--- a/rtl/ibex_alu.sv
+++ b/rtl/ibex_alu.sv
@@ -7,7 +7,7 @@
  * Arithmetic logic unit
  */
 module ibex_alu #(
-  parameter bit RV32B = 1'b0
+  parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone
 ) (
     input  ibex_pkg::alu_op_e operator_i,
     input  logic [31:0]       operand_a_i,
@@ -20,9 +20,9 @@ module ibex_alu #(
 
     input  logic              multdiv_sel_i,
 
-    input  logic [31:0]       imd_val_q_i,
-    output logic [31:0]       imd_val_d_o,
-    output logic              imd_val_we_o,
+    input  logic [31:0]       imd_val_q_i[2],
+    output logic [31:0]       imd_val_d_o[2],
+    output logic [1:0]        imd_val_we_o,
 
     output logic [31:0]       adder_result_o,
     output logic [33:0]       adder_result_ext_o,
@@ -241,16 +241,16 @@ module ibex_alu #(
   logic [31:0] bfp_result;
 
   // bfp: shares the shifter structure to compute bfp_mask << bfp_off
-  assign bfp_op = RV32B ? (operator_i == ALU_BFP) : 1'b0;
+  assign bfp_op = (RV32B != RV32BNone) ? (operator_i == ALU_BFP) : 1'b0;
   assign bfp_len = {~(|operand_b_i[27:24]), operand_b_i[27:24]}; // len = 0 encodes for len = 16
   assign bfp_off = operand_b_i[20:16];
-  assign bfp_mask = RV32B ? ~(32'hffff_ffff << bfp_len) : '0;
+  assign bfp_mask = (RV32B != RV32BNone) ? ~(32'hffff_ffff << bfp_len) : '0;
   for (genvar i=0; i<32; i++) begin : gen_rev_bfp_mask
     assign bfp_mask_rev[i] = bfp_mask[31-i];
   end
 
-  assign bfp_result =
-      RV32B ? (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0;
+  assign bfp_result =(RV32B != RV32BNone) ?
+      (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0;
 
   // bit shift_amt[5]: word swap bit: only considered for FSL/FSR.
   // if set, reverse operations in first and second cycle.
@@ -267,9 +267,8 @@ module ibex_alu #(
     end
   end
 
-
   // single-bit mode: shift
-  assign shift_sbmode = RV32B ?
+  assign shift_sbmode = (RV32B != RV32BNone) ?
       (operator_i == ALU_SBSET) | (operator_i == ALU_SBCLR) | (operator_i == ALU_SBINV) : 1'b0;
 
   // left shift if this is:
@@ -284,13 +283,13 @@ module ibex_alu #(
     unique case (operator_i)
       ALU_SLL: shift_left = 1'b1;
       ALU_SLO,
-      ALU_BFP: shift_left = RV32B ? 1'b1 : 1'b0;
-      ALU_ROL: shift_left = RV32B ? instr_first_cycle_i : 0;
-      ALU_ROR: shift_left = RV32B ? ~instr_first_cycle_i : 0;
-      ALU_FSL: shift_left =
-          RV32B ? (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0;
-      ALU_FSR: shift_left =
-          RV32B ? (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0;
+      ALU_BFP: shift_left = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
+      ALU_ROL: shift_left = (RV32B != RV32BNone) ? instr_first_cycle_i : 0;
+      ALU_ROR: shift_left = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 0;
+      ALU_FSL: shift_left = (RV32B != RV32BNone) ?
+        (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0;
+      ALU_FSR: shift_left = (RV32B != RV32BNone) ?
+          (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0;
       default: shift_left = 1'b0;
     endcase
     if (shift_sbmode) begin
@@ -298,26 +297,26 @@ module ibex_alu #(
     end
   end
 
-  assign shift_arith      = (operator_i == ALU_SRA);
-  assign shift_ones       = RV32B ? (operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0;
-  assign shift_funnel     = RV32B ? (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0;
+  assign shift_arith  = (operator_i == ALU_SRA);
+  assign shift_ones   =
+      (RV32B != RV32BNone) ? (operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0;
+  assign shift_funnel =
+      (RV32B != RV32BNone) ? (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0;
 
   // shifter structure.
   always_comb begin
-
     // select shifter input
     // for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen.
-    if (shift_sbmode) begin
-      shift_result = 32'h8000_0000; // rev(32'h1)
+    if (RV32B == RV32BNone) begin
+      shift_result = shift_left ? operand_a_rev : operand_a_i;
     end else begin
       unique case (1'b1)
         bfp_op:       shift_result = bfp_mask_rev;
-        shift_left:   shift_result = operand_a_rev;
-        default:      shift_result = operand_a_i;
+        shift_sbmode: shift_result = 32'h8000_0000;
+        default:      shift_result = shift_left ? operand_a_rev : operand_a_i;
       endcase
     end
 
-
     shift_result_ext =
         $signed({shift_ones | (shift_arith & shift_result[31]), shift_result}) >>> shift_amt[4:0];
 
@@ -350,8 +349,8 @@ module ibex_alu #(
       // Logic-with-negate OPs (RV32B Ops)
       ALU_XNOR,
       ALU_ORN,
-      ALU_ANDN: bwlogic_op_b_negate = RV32B ? 1'b1 : 1'b0;
-      ALU_CMIX: bwlogic_op_b_negate = RV32B ? ~instr_first_cycle_i : 1'b0;
+      ALU_ANDN: bwlogic_op_b_negate = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
+      ALU_CMIX: bwlogic_op_b_negate = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 1'b0;
       default:  bwlogic_op_b_negate = 1'b0;
     endcase
   end
@@ -373,19 +372,19 @@ module ibex_alu #(
     endcase
   end
 
+  logic [5:0]  bitcnt_result;
+  logic [31:0] minmax_result;
+  logic [31:0] pack_result;
+  logic [31:0] sext_result;
+  logic [31:0] singlebit_result;
+  logic [31:0] rev_result;
   logic [31:0] shuffle_result;
   logic [31:0] butterfly_result;
   logic [31:0] invbutterfly_result;
-
-  logic [31:0] minmax_result;
-  logic [5:0]  bitcnt_result;
-  logic [31:0] pack_result;
-  logic [31:0] sext_result;
-  logic [31:0] multicycle_result;
-  logic [31:0] singlebit_result;
   logic [31:0] clmul_result;
+  logic [31:0] multicycle_result;
 
-  if (RV32B) begin : g_alu_rvb
+  if (RV32B != RV32BNone) begin : g_alu_rvb
 
     /////////////////
     // Bitcounting //
@@ -404,6 +403,8 @@ module ibex_alu #(
     logic [31:0] bitcnt_mask_op;
     logic [31:0] bitcnt_bit_mask;
     logic [ 5:0] bitcnt_partial [32];
+    logic [31:0] bitcnt_partial_lsb_d;
+    logic [31:0] bitcnt_partial_msb_d;
 
 
     assign bitcnt_ctz    = operator_i == ALU_CTZ;
@@ -427,6 +428,8 @@ module ibex_alu #(
       bitcnt_bit_mask = ~bitcnt_bit_mask;
     end
 
+    assign zbe_op = (operator_i == ALU_BEXT) | (operator_i == ALU_BDEP);
+
     always_comb begin
       case(1'b1)
         zbe_op:      bitcnt_bits = operand_b_i;
@@ -517,524 +520,12 @@ module ibex_alu #(
       end
     end
 
-    ///////////////
-    // Butterfly //
-    ///////////////
-
-    // The butterfly / inverse butterfly network is shared between bext/bdep (zbe)instructions
-    // respectively and grev / gorc instructions (zbp).
-    // For bdep, the control bits mask of a local left region is generated by
-    // the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the number
-    // of ones in the deposit bitmask to the right of the segment. n hereby denotes the width
-    // of the according segment. The bitmask for a pertaining local right region is equal to the
-    // corresponding local left region. Bext uses an analogue inverse process.
-    // Consider the following 8-bit example.  For details, see Hilewitz et al. "Fast Bit Gather,
-    // Bit Scatter and Bit Permuation Instructions for Commodity Microprocessors", (2008).
-
-    // 8-bit example:  (Hilewitz et al.)
-    // Consider the instruction bdep operand_a_i deposit_mask
-    // Let operand_a_i = 8'babcd_efgh
-    //    deposit_mask = 8'b1010_1101
-    //
-    // control bitmask for stage 1:
-    //  - number of ones in the right half of the deposit bitmask: 3
-    //  - width of the segment: 4
-    //  - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000
-    //
-    // control bitmask:   c3 c2  c1 c0  c3 c2  c1 c0
-    //                    1  0   0  0   1  0   0  0
-    //                    <- L ----->   <- R ----->
-    // operand_a_i        a  b   c  d   e  f   g  h
-    //                    :\ |   |  |  /:  |   |  |
-    //                    : +|---|--|-+ :  |   |  |
-    //                    :/ |   |  |  \:  |   |  |
-    // stage 1            e  b   c  d   a  f   g  h
-    //                    <L->   <R->   <L->   <R->
-    // control bitmask:   c3 c2  c3 c2  c1 c0  c1 c0
-    //                    1  1   1  1   1  0   1  0
-    //                    :\ :\ /: /:   :\ |  /:  |
-    //                    : +:-+-:+ :   : +|-+ :  |
-    //                    :/ :/ \: \:   :/ |  \:  |
-    // stage 2            c  d   e  b   g  f   a  h
-    //                    L  R   L  R   L  R   L  R
-    // control bitmask:   c3 c3  c2 c2  c1 c1  c0 c0
-    //                    1  1   0  0   1  1   0  0
-    //                    :\/:   |  |   :\/:   |  |
-    //                    :  :   |  |   :  :   |  |
-    //                    :/\:   |  |   :/\:   |  |
-    // stage 3            d  c   e  b   f  g   a  h
-    // & deposit bitmask: 1  0   1  0   1  1   0  1
-    // result:            d  0   e  0   f  g   0  h
-
-    assign zbe_op = (operator_i == ALU_BEXT) | (operator_i == ALU_BDEP);
-
-    logic [31:0] butterfly_mask_l[5];
-    logic [31:0] butterfly_mask_r[5];
-    logic [31:0] butterfly_mask_not[5];
-    logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap
-
-    // bext / bdep
-    logic [31:0] butterfly_zbe_mask_l[5];
-    logic [31:0] butterfly_zbe_mask_r[5];
-    logic [31:0] butterfly_zbe_mask_not[5];
-
-    // grev / gorc
-    logic [31:0] butterfly_zbp_mask_l[5];
-    logic [31:0] butterfly_zbp_mask_r[5];
-    logic [31:0] butterfly_zbp_mask_not[5];
-
-    logic grev_op;
-    logic gorc_op;
-    logic zbp_op;
-
-    // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage
-    `define _N(stg) (16 >> stg)
-
-    // bext / bdep control bit generation
-    for (genvar stg=0; stg<5; stg++) begin : gen_stage
-      // number of segs: 2** stg
-      for (genvar seg=0; seg<2**stg; seg++) begin : gen_segment
-
-        assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] =
-            {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} <<
-                bitcnt_partial[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0];
-
-        assign butterfly_zbe_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]
-                     = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
-
-        assign butterfly_zbe_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]
-                     = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
-
-        assign butterfly_zbe_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]   = '0;
-        assign butterfly_zbe_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0;
-      end
-    end
-    `undef _N
-
-    for (genvar stg=0; stg<5; stg++) begin : gen_zbe_mask
-      assign butterfly_zbe_mask_not[stg] =
-          ~(butterfly_zbe_mask_l[stg] | butterfly_zbe_mask_r[stg]);
-    end
-
-    // grev / gorc control bit generation
-    assign butterfly_zbp_mask_l[0] = shift_amt[4] ? 32'hffff_0000 : 32'h0000_0000;
-    assign butterfly_zbp_mask_r[0] = shift_amt[4] ? 32'h0000_ffff : 32'h0000_0000;
-    assign butterfly_zbp_mask_not[0] =
-       !shift_amt[4] || (shift_amt[4] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
-
-    assign butterfly_zbp_mask_l[1] = shift_amt[3] ? 32'hff00_ff00 : 32'h0000_0000;
-    assign butterfly_zbp_mask_r[1] = shift_amt[3] ? 32'h00ff_00ff : 32'h0000_0000;
-    assign butterfly_zbp_mask_not[1] =
-       !shift_amt[3] || (shift_amt[3] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
-
-    assign butterfly_zbp_mask_l[2] = shift_amt[2] ? 32'hf0f0_f0f0 : 32'h0000_0000;
-    assign butterfly_zbp_mask_r[2] = shift_amt[2] ? 32'h0f0f_0f0f : 32'h0000_0000;
-    assign butterfly_zbp_mask_not[2] =
-       !shift_amt[2] || (shift_amt[2] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
-
-    assign butterfly_zbp_mask_l[3] = shift_amt[1] ? 32'hcccc_cccc : 32'h0000_0000;
-    assign butterfly_zbp_mask_r[3] = shift_amt[1] ? 32'h3333_3333 : 32'h0000_0000;
-    assign butterfly_zbp_mask_not[3] =
-       !shift_amt[1] || (shift_amt[1] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
-
-    assign butterfly_zbp_mask_l[4] = shift_amt[0] ? 32'haaaa_aaaa : 32'h0000_0000;
-    assign butterfly_zbp_mask_r[4] = shift_amt[0] ? 32'h5555_5555 : 32'h0000_0000;
-    assign butterfly_zbp_mask_not[4] =
-       !shift_amt[0] || (shift_amt[0] && gorc_op) ? 32'hffff_ffff : 32'h0000_0000;
-
-    // grev / gorc instructions
-    assign grev_op = RV32B ? (operator_i == ALU_GREV) : 1'b0;
-    assign gorc_op = RV32B ? (operator_i == ALU_GORC) : 1'b0;
-    assign zbp_op = grev_op | gorc_op;
-
-    // select set of masks:
-    assign butterfly_mask_l   = zbp_op ? butterfly_zbp_mask_l   : butterfly_zbe_mask_l;
-    assign butterfly_mask_r   = zbp_op ? butterfly_zbp_mask_r   : butterfly_zbe_mask_r;
-    assign butterfly_mask_not = zbp_op ? butterfly_zbp_mask_not : butterfly_zbe_mask_not;
-
-    always_comb begin
-      butterfly_result = operand_a_i;
-
-      butterfly_result = butterfly_result & butterfly_mask_not[0] |
-          ((butterfly_result & butterfly_mask_l[0]) >> 16)|
-          ((butterfly_result & butterfly_mask_r[0]) << 16);
-
-      butterfly_result = butterfly_result & butterfly_mask_not[1] |
-          ((butterfly_result & butterfly_mask_l[1]) >> 8)|
-          ((butterfly_result & butterfly_mask_r[1]) << 8);
-
-      butterfly_result = butterfly_result & butterfly_mask_not[2] |
-          ((butterfly_result & butterfly_mask_l[2]) >> 4)|
-          ((butterfly_result & butterfly_mask_r[2]) << 4);
-
-      butterfly_result = butterfly_result & butterfly_mask_not[3] |
-          ((butterfly_result & butterfly_mask_l[3]) >> 2)|
-          ((butterfly_result & butterfly_mask_r[3]) << 2);
-
-      butterfly_result = butterfly_result & butterfly_mask_not[4] |
-          ((butterfly_result & butterfly_mask_l[4]) >> 1)|
-          ((butterfly_result & butterfly_mask_r[4]) << 1);
-
-      if (!zbp_op) begin
-        butterfly_result = butterfly_result & operand_b_i;
-      end
-    end
-
-    always_comb begin
-      invbutterfly_result = operand_a_i & operand_b_i;
-
-      invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] |
-          ((invbutterfly_result & butterfly_mask_l[4]) >> 1)|
-          ((invbutterfly_result & butterfly_mask_r[4]) << 1);
-
-      invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] |
-          ((invbutterfly_result & butterfly_mask_l[3]) >> 2)|
-          ((invbutterfly_result & butterfly_mask_r[3]) << 2);
-
-      invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] |
-          ((invbutterfly_result & butterfly_mask_l[2]) >> 4)|
-          ((invbutterfly_result & butterfly_mask_r[2]) << 4);
-
-      invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] |
-          ((invbutterfly_result & butterfly_mask_l[1]) >> 8)|
-          ((invbutterfly_result & butterfly_mask_r[1]) << 8);
-
-      invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] |
-          ((invbutterfly_result & butterfly_mask_l[0]) >> 16)|
-          ((invbutterfly_result & butterfly_mask_r[0]) << 16);
-    end
-
-    /////////////////////////
-    // Shuffle / Unshuffle //
-    /////////////////////////
-
-    localparam logic [31:0] SHUFFLE_MASK_L [4] =
-        '{32'h4444_4444, 32'h3030_3030, 32'h0f00_0f00, 32'h00ff_0000};
-    localparam logic [31:0] SHUFFLE_MASK_R [4] =
-        '{32'h2222_2222, 32'h0c0c_0c0c, 32'h00f0_00f0, 32'h0000_ff00};
-
-    localparam logic [31:0] FLIP_MASK_L [4] =
-        '{32'h1100_0000, 32'h4411_0000, 32'h0044_0000, 32'h2200_1100};
-    localparam logic [31:0] FLIP_MASK_R [4] =
-        '{32'h0000_0088, 32'h0000_8822, 32'h0000_2200, 32'h0088_0044};
-
-    logic [31:0] SHUFFLE_MASK_NOT [4];
-    for(genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not
-      assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]);
-    end
-
-    logic shuffle_flip;
-    assign shuffle_flip = operator_i == ALU_UNSHFL;
-
-    logic [3:0] shuffle_mode;
-
-    always_comb begin
-      shuffle_result = operand_a_i;
-
-      if (shuffle_flip) begin
-        shuffle_mode[3] = shift_amt[0];
-        shuffle_mode[2] = shift_amt[1];
-        shuffle_mode[1] = shift_amt[2];
-        shuffle_mode[0] = shift_amt[3];
-      end else begin
-        shuffle_mode = shift_amt[3:0];
-      end
-
-      if (shuffle_flip) begin
-        shuffle_result = (shuffle_result & 32'h8822_4411) |
-            ((shuffle_result << 6)  & FLIP_MASK_L[0]) | ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
-            ((shuffle_result << 9)  & FLIP_MASK_L[1]) | ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
-            ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
-            ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]);
-      end
-
-      if (shuffle_mode[3]) begin
-        shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) |
-            (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) |
-            ((shuffle_result >> 8) & SHUFFLE_MASK_R[0]));
-      end
-      if (shuffle_mode[2]) begin
-        shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) |
-            (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) |
-            ((shuffle_result >> 4) & SHUFFLE_MASK_R[1]));
-      end
-      if (shuffle_mode[1]) begin
-        shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) |
-            (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) |
-            ((shuffle_result >> 2) & SHUFFLE_MASK_R[2]));
-      end
-      if (shuffle_mode[0]) begin
-        shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) |
-            (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) |
-            ((shuffle_result >> 1) & SHUFFLE_MASK_R[3]));
-      end
-
-      if (shuffle_flip) begin
-        shuffle_result = (shuffle_result & 32'h8822_4411) |
-            ((shuffle_result << 6)  & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & FLIP_MASK_R[0])  |
-            ((shuffle_result << 9)  & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1])  |
-            ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
-            ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]);
-      end
-
-    end
-    ///////////////////////////////////////////////////
-    // Carry-less Multiply + Cyclic Redundancy Check //
-    ///////////////////////////////////////////////////
-
-    // Carry-less multiplication can be understood as multiplication based on
-    // the addition interpreted as the bit-wise xor operation.
-    //
-    // Example: 1101 X 1011 = 1111111:
-    //
-    //       1011 X 1101
-    //       -----------
-    //              1101
-    //         xor 1101
-    //         ---------
-    //             10111
-    //        xor 0000
-    //        ----------
-    //            010111
-    //       xor 1101
-    //       -----------
-    //           1111111
-    //
-    // Architectural details:
-    //         A 32 x 32-bit array
-    //         [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
-    //         is generated. The entries of the array are pairwise 'xor-ed'
-    //         together in a 5-stage binary tree.
-    //
-    //
-    // Cyclic Redundancy Check:
-    //
-    // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
-    // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
-    // see http://reveng.sourceforge.net/crc-catalogue/all.htm
-    // A useful guide to crc arithmetic and algorithms is given here:
-    // http://www.piclist.com/techref/method/math/crcguide.html.
-    //
-    // The CRC operation solves the following equation using binary polynomial arithmetic:
-    //
-    // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
-    //
-    // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
-    // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation.
-    //
-    // Using barret reduction, one can show that
-    //
-    // M(x) mod P(x) = R(x) =
-    //          (M(x) * x**n) & {deg(P(x)'{1'b1}}) ^ (M(x) x**-(deg(P(x) - n)) cx mu(x) cx P(x),
-    //
-    // Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
-    // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
-    // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
-    //
-    // rd = rev( (rev(rs1) << n)  ^ ((rev(rs1) >> (32-n)) cx mu cx P)
-    //    = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
-    //                       ^-- cycle 0--------------------^
-    //      ^- cycle 1 -------------------------------------------^
-    //
-    // In the last step we used the fact that carry-less multiplication is bit-order agnostic:
-    // rev(a cx b) = rev(a) cx rev(b).
-
-    logic clmul_rmode;
-    logic clmul_hmode;
-    logic [31:0] clmul_op_a;
-    logic [31:0] clmul_op_b;
-    logic [31:0] operand_b_rev;
-    logic [31:0] clmul_and_stage[32];
-    logic [31:0] clmul_xor_stage1[16];
-    logic [31:0] clmul_xor_stage2[8];
-    logic [31:0] clmul_xor_stage3[4];
-    logic [31:0] clmul_xor_stage4[2];
-
-    logic [31:0] clmul_result_raw;
-    logic [31:0] clmul_result_rev;
-
-    for (genvar i=0; i<32; i++) begin: gen_rev_operand_b
-      assign operand_b_rev[i] = operand_b_i[31-i];
-    end
-
-    assign clmul_rmode = operator_i == ALU_CLMULR;
-    assign clmul_hmode = operator_i == ALU_CLMULH;
-
-    // CRC
-    localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
-    localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;
-
-    localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
-    localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;
-
-    logic crc_op;
-    logic crc_hmode;
-    logic crc_bmode;
-
-    logic crc_cpoly;
-
-    logic [31:0] crc_operand;
-    logic [31:0] crc_poly;
-    logic [31:0] crc_mu_rev;
-
-    assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) |
-                    (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) |
-                    (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B);
-
-    assign crc_cpoly = (operator_i == ALU_CRC32C_W) |
-                       (operator_i == ALU_CRC32C_H) |
-                       (operator_i == ALU_CRC32C_B);
-
-    assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
-    assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);
-
-    assign crc_poly   = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
-    assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;
-
-    always_comb begin
-      unique case(1'b1)
-        crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
-        crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
-        default:   crc_operand = operand_a_i;
-      endcase
-    end
-
-    // Select clmul input
-    always_comb begin
-      if (crc_op) begin
-        clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i;
-        clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
-      end else begin
-        clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
-        clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
-      end
-    end
-
-    for (genvar i=0; i<32; i++) begin : gen_clmul_and_op
-      assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0;
-    end
-
-    for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1
-      assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1];
-    end
-
-    for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2
-      assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1];
-    end
-
-    for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3
-      assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1];
-    end
-
-    for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4
-      assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1];
-    end
-
-    assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];
-
-    for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result
-      assign clmul_result_rev[i] = clmul_result_raw[31-i];
-    end
-
-    // clmulr_result = rev(clmul(rev(a), rev(b)))
-    // clmulh_result = clmulr_result >> 1
-    always_comb begin
-      case(1'b1)
-        clmul_rmode: clmul_result = clmul_result_rev;
-        clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]};
-        default:     clmul_result = clmul_result_raw;
-      endcase
-    end
-
-    //////////////////////////////////////
-    // Multicycle Bitmanip Instructions //
-    //////////////////////////////////////
-    // Ternary instructions + Shift Rotations + CRC
-    // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
-    // second cycle. operand_b_i is always tied to rs2.
-
-
-    always_comb begin
-      unique case (operator_i)
-        ALU_CMOV: begin
-            imd_val_d_o = operand_a_i;
-            multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i;
-          if (instr_first_cycle_i) begin
-            imd_val_we_o = 1'b1;
-          end else begin
-            imd_val_we_o = 1'b0;
-          end
-        end
-
-        ALU_CMIX: begin
-          multicycle_result = imd_val_q_i | bwlogic_and_result;
-          imd_val_d_o = bwlogic_and_result;
-          if (instr_first_cycle_i) begin
-            imd_val_we_o = 1'b1;
-          end else begin
-            imd_val_we_o = 1'b0;
-          end
-        end
-
-        ALU_FSR, ALU_FSL,
-        ALU_ROL, ALU_ROR: begin
-          if (shift_amt[4:0] == 5'h0) begin
-            multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i;
-          end else begin
-            multicycle_result = imd_val_q_i | shift_result;
-          end
-          imd_val_d_o = shift_result;
-          if (instr_first_cycle_i) begin
-            imd_val_we_o = 1'b1;
-          end else begin
-            imd_val_we_o = 1'b0;
-          end
-        end
-
-        ALU_CRC32_W, ALU_CRC32C_W,
-        ALU_CRC32_H, ALU_CRC32C_H,
-        ALU_CRC32_B, ALU_CRC32C_B: begin
-          imd_val_d_o = clmul_result_rev;
-          unique case(1'b1)
-            crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
-            crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
-            default:   multicycle_result = clmul_result_rev;
-          endcase
-          if (instr_first_cycle_i) begin
-            imd_val_we_o = 1'b1;
-          end else begin
-            imd_val_we_o = 1'b0;
-          end
-        end
-
-        default: begin
-          imd_val_d_o = operand_a_i;
-          imd_val_we_o = 1'b0;
-          multicycle_result = operand_a_i;
-        end
-      endcase
-    end
-
-    /////////////////////////////
-    // Single-bit Instructions //
-    /////////////////////////////
-
-    always_comb begin
-      unique case (operator_i)
-        ALU_SBSET: singlebit_result = operand_a_i | shift_result;
-        ALU_SBCLR: singlebit_result = operand_a_i & ~shift_result;
-        ALU_SBINV: singlebit_result = operand_a_i ^ shift_result;
-        default:   singlebit_result = {31'h0, shift_result[0]}; // ALU_SBEXT
-      endcase
-    end
-
     ///////////////
     // Min / Max //
     ///////////////
 
     assign minmax_result = cmp_result ? operand_a_i : operand_b_i;
 
-
     //////////
     // Pack //
     //////////
@@ -1059,21 +550,623 @@ module ibex_alu #(
     assign sext_result = (operator_i == ALU_SEXTB) ?
         { {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]};
 
+    /////////////////////////////
+    // Single-bit Instructions //
+    /////////////////////////////
+
+    always_comb begin
+      unique case (operator_i)
+        ALU_SBSET: singlebit_result = operand_a_i | shift_result;
+        ALU_SBCLR: singlebit_result = operand_a_i & ~shift_result;
+        ALU_SBINV: singlebit_result = operand_a_i ^ shift_result;
+        default:   singlebit_result = {31'h0, shift_result[0]}; // ALU_SBEXT
+      endcase
+    end
+
+    ////////////////////////////////////
+    // General Reverse and Or-combine //
+    ////////////////////////////////////
+
+    // Only a subset of the general reverse and or-combine instructions is implemented in the
+    // balanced version of the B extension. Currently rev, rev8 and orc.b are supported in that
+    // configuration.
+
+    logic [4:0] zbp_shift_amt;
+    logic gorc_op;
+
+    assign gorc_op = (operator_i == ALU_GORC);
+    assign zbp_shift_amt[2:0] = (RV32B == RV32BFull) ? shift_amt[2:0] : {3{&shift_amt[2:0]}};
+    assign zbp_shift_amt[4:3] = (RV32B == RV32BFull) ? shift_amt[4:3] : {2{&shift_amt[4:3]}};
+
+    always_comb begin
+      rev_result = operand_a_i;
+
+      if (zbp_shift_amt[0]) begin
+        rev_result = (gorc_op ? rev_result : 32'h0)       |
+                     ((rev_result & 32'h5555_5555) <<  1) |
+                     ((rev_result & 32'haaaa_aaaa) >>  1);
+      end
+
+      if (zbp_shift_amt[1]) begin
+        rev_result = (gorc_op ? rev_result : 32'h0)       |
+                     ((rev_result & 32'h3333_3333) <<  2) |
+                     ((rev_result & 32'hcccc_cccc) >>  2);
+      end
+
+      if (zbp_shift_amt[2]) begin
+        rev_result = (gorc_op ? rev_result : 32'h0)       |
+                     ((rev_result & 32'h0f0f_0f0f) <<  4) |
+                     ((rev_result & 32'hf0f0_f0f0) >>  4);
+      end
+
+      if (zbp_shift_amt[3]) begin
+        rev_result = (gorc_op & (RV32B == RV32BFull) ? rev_result : 32'h0) |
+                     ((rev_result & 32'h00ff_00ff) <<  8) |
+                     ((rev_result & 32'hff00_ff00) >>  8);
+      end
+
+      if (zbp_shift_amt[4]) begin
+        rev_result = (gorc_op & (RV32B == RV32BFull) ? rev_result : 32'h0) |
+                     ((rev_result & 32'h0000_ffff) << 16) |
+                     ((rev_result & 32'hffff_0000) >> 16);
+      end
+    end
+
+    logic crc_hmode;
+    logic crc_bmode;
+    logic [31:0] clmul_result_rev;
+
+    if (RV32B == RV32BFull) begin : gen_alu_rvb_full
+
+      /////////////////////////
+      // Shuffle / Unshuffle //
+      /////////////////////////
+
+      localparam logic [31:0] SHUFFLE_MASK_L [0:3] =
+          '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444};
+      localparam logic [31:0] SHUFFLE_MASK_R [0:3] =
+          '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222};
+
+      localparam logic [31:0] FLIP_MASK_L [0:3] =
+          '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000};
+      localparam logic [31:0] FLIP_MASK_R [0:3] =
+          '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088};
+
+      logic [31:0] SHUFFLE_MASK_NOT [0:3];
+      for(genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not
+        assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]);
+      end
+
+      logic shuffle_flip;
+      assign shuffle_flip = operator_i == ALU_UNSHFL;
+
+      logic [3:0] shuffle_mode;
+
+      always_comb begin
+        shuffle_result = operand_a_i;
+
+        if (shuffle_flip) begin
+          shuffle_mode[3] = shift_amt[0];
+          shuffle_mode[2] = shift_amt[1];
+          shuffle_mode[1] = shift_amt[2];
+          shuffle_mode[0] = shift_amt[3];
+        end else begin
+          shuffle_mode = shift_amt[3:0];
+        end
+
+        if (shuffle_flip) begin
+          shuffle_result = (shuffle_result & 32'h8822_4411) |
+              ((shuffle_result << 6)  & FLIP_MASK_L[0]) | ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
+              ((shuffle_result << 9)  & FLIP_MASK_L[1]) | ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
+              ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
+              ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]);
+        end
+
+        if (shuffle_mode[3]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) |
+              (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) |
+              ((shuffle_result >> 8) & SHUFFLE_MASK_R[0]));
+        end
+        if (shuffle_mode[2]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) |
+              (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) |
+              ((shuffle_result >> 4) & SHUFFLE_MASK_R[1]));
+        end
+        if (shuffle_mode[1]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) |
+              (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) |
+              ((shuffle_result >> 2) & SHUFFLE_MASK_R[2]));
+        end
+        if (shuffle_mode[0]) begin
+          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) |
+              (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) |
+              ((shuffle_result >> 1) & SHUFFLE_MASK_R[3]));
+        end
+
+        if (shuffle_flip) begin
+          shuffle_result = (shuffle_result & 32'h8822_4411) |
+              ((shuffle_result << 6)  & FLIP_MASK_L[0]) | ((shuffle_result >> 6) & FLIP_MASK_R[0])  |
+              ((shuffle_result << 9)  & FLIP_MASK_L[1]) | ((shuffle_result >> 9) & FLIP_MASK_R[1])  |
+              ((shuffle_result << 15) & FLIP_MASK_L[2]) | ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
+              ((shuffle_result << 21) & FLIP_MASK_L[3]) | ((shuffle_result >> 21) & FLIP_MASK_R[3]);
+        end
+      end
+
+      ///////////////
+      // Butterfly //
+      ///////////////
+
+      // The butterfly / inverse butterfly network executing bext/bdep (zbe) instructions.
+      // For bdep, the control bitmask of a local left region is generated by the inverse of
+      // an n-bit left rotate and complement upon wrap (LROTC) operation by the number of ones
+      // in the deposit bitmask to the right of the segment, where n denotes the width of that
+      // segment. The bitmask for the corresponding local right region is equal to that of the
+      // local left region. Bext uses an analogous inverse process.
+      // Consider the following 8-bit example.  For details, see Hilewitz et al. "Fast Bit Gather,
+      // Bit Scatter and Bit Permutation Instructions for Commodity Microprocessors", (2008).
+      //
+      // The bext/bdep instructions are completed in 2 cycles. In the first cycle, the control
+      // bitmask is prepared by executing the parallel prefix bit count. In the second cycle,
+      // the bit swapping is executed according to the control masks.
+
+      // 8-bit example:  (Hilewitz et al.)
+      // Consider the instruction bdep operand_a_i deposit_mask
+      // Let operand_a_i = 8'babcd_efgh
+      //    deposit_mask = 8'b1010_1101
+      //
+      // control bitmask for stage 1:
+      //  - number of ones in the right half of the deposit bitmask: 3
+      //  - width of the segment: 4
+      //  - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000
+      //
+      // control bitmask:   c3 c2  c1 c0  c3 c2  c1 c0
+      //                    1  0   0  0   1  0   0  0
+      //                    <- L ----->   <- R ----->
+      // operand_a_i        a  b   c  d   e  f   g  h
+      //                    :\ |   |  |  /:  |   |  |
+      //                    : +|---|--|-+ :  |   |  |
+      //                    :/ |   |  |  \:  |   |  |
+      // stage 1            e  b   c  d   a  f   g  h
+      //                    <L->   <R->   <L->   <R->
+      // control bitmask:   c3 c2  c3 c2  c1 c0  c1 c0
+      //                    1  1   1  1   1  0   1  0
+      //                    :\ :\ /: /:   :\ |  /:  |
+      //                    : +:-+-:+ :   : +|-+ :  |
+      //                    :/ :/ \: \:   :/ |  \:  |
+      // stage 2            c  d   e  b   g  f   a  h
+      //                    L  R   L  R   L  R   L  R
+      // control bitmask:   c3 c3  c2 c2  c1 c1  c0 c0
+      //                    1  1   0  0   1  1   0  0
+      //                    :\/:   |  |   :\/:   |  |
+      //                    :  :   |  |   :  :   |  |
+      //                    :/\:   |  |   :/\:   |  |
+      // stage 3            d  c   e  b   f  g   a  h
+      // & deposit bitmask: 1  0   1  0   1  1   0  1
+      // result:            d  0   e  0   f  g   0  h
+
+      logic [ 5:0] bitcnt_partial_q [32];
+
+      // First cycle:
+      // Store selected partial bitcnt bits: bit 0 of every count in lsb_d, higher bits in msb_d.
+      for (genvar i=0; i<32; i++) begin : gen_bitcnt_reg_in_lsb
+        assign bitcnt_partial_lsb_d[i] = bitcnt_partial[i][0];
+      end
+
+      for (genvar i=0; i<16; i++) begin : gen_bitcnt_reg_in_b1
+        assign bitcnt_partial_msb_d[i] = bitcnt_partial[2*i+1][1];
+      end
+
+      for (genvar i=0; i<8; i++) begin : gen_bitcnt_reg_in_b2
+        assign bitcnt_partial_msb_d[16+i] = bitcnt_partial[4*i+3][2];
+      end
+
+      for (genvar i=0; i<4; i++) begin : gen_bitcnt_reg_in_b3
+        assign bitcnt_partial_msb_d[24+i] = bitcnt_partial[8*i+7][3];
+      end
+
+      for (genvar i=0; i<2; i++) begin : gen_bitcnt_reg_in_b4
+        assign bitcnt_partial_msb_d[28+i] = bitcnt_partial[16*i+15][4];
+      end
+
+      assign bitcnt_partial_msb_d[30] = bitcnt_partial[31][5];
+      assign bitcnt_partial_msb_d[31] = 1'b0; // unused
+
+      // Second cycle:
+      // Rebuild the partial bitcnts from imd_val_q_i; bits that were not stored are 0.
+      always_comb begin
+        bitcnt_partial_q = '{default: '0};
+
+        for (int unsigned i=0; i<32; i++) begin : gen_bitcnt_reg_out_lsb
+          bitcnt_partial_q[i][0] = imd_val_q_i[0][i];
+        end
+
+        for (int unsigned i=0; i<16; i++) begin : gen_bitcnt_reg_out_b1
+          bitcnt_partial_q[2*i+1][1] = imd_val_q_i[1][i];
+        end
+
+        for (int unsigned i=0; i<8; i++) begin : gen_bitcnt_reg_out_b2
+          bitcnt_partial_q[4*i+3][2] = imd_val_q_i[1][16+i];
+        end
+
+        for (int unsigned i=0; i<4; i++) begin : gen_bitcnt_reg_out_b3
+          bitcnt_partial_q[8*i+7][3] = imd_val_q_i[1][24+i];
+        end
+
+        for (int unsigned i=0; i<2; i++) begin : gen_bitcnt_reg_out_b4
+          bitcnt_partial_q[16*i+15][4] = imd_val_q_i[1][28+i];
+        end
+
+        bitcnt_partial_q[31][5] = imd_val_q_i[1][30];
+      end
+
+      logic [31:0] butterfly_mask_l[5];
+      logic [31:0] butterfly_mask_r[5];
+      logic [31:0] butterfly_mask_not[5];
+      logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap
+
+      // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage
+      `define _N(stg) (16 >> stg)
+
+      // bext / bdep control bit generation
+      for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_ctrl_stage
+        // number of segs: 2** stg
+        for (genvar seg=0; seg<2**stg; seg++) begin : gen_butterfly_ctrl
+
+          assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] =
+              {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} <<
+                bitcnt_partial_q[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0];
+
+          assign butterfly_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]
+                   = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
+
+          assign butterfly_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]
+                   = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];
+
+          assign butterfly_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]   = '0;
+          assign butterfly_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0;
+        end
+      end
+      `undef _N
+
+      for (genvar stg=0; stg<5; stg++) begin : gen_butterfly_not
+        assign butterfly_mask_not[stg] =
+            ~(butterfly_mask_l[stg] | butterfly_mask_r[stg]);
+      end
+
+      always_comb begin
+        butterfly_result = operand_a_i;
+
+        butterfly_result = butterfly_result & butterfly_mask_not[0] | // stage 0: swap distance 16
+            ((butterfly_result & butterfly_mask_l[0]) >> 16)|
+            ((butterfly_result & butterfly_mask_r[0]) << 16);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[1] | // stage 1: swap distance 8
+            ((butterfly_result & butterfly_mask_l[1]) >> 8)|
+            ((butterfly_result & butterfly_mask_r[1]) << 8);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[2] | // stage 2: swap distance 4
+            ((butterfly_result & butterfly_mask_l[2]) >> 4)|
+            ((butterfly_result & butterfly_mask_r[2]) << 4);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[3] | // stage 3: swap distance 2
+            ((butterfly_result & butterfly_mask_l[3]) >> 2)|
+            ((butterfly_result & butterfly_mask_r[3]) << 2);
+
+        butterfly_result = butterfly_result & butterfly_mask_not[4] | // stage 4: swap distance 1
+            ((butterfly_result & butterfly_mask_l[4]) >> 1)|
+            ((butterfly_result & butterfly_mask_r[4]) << 1);
+
+        butterfly_result = butterfly_result & operand_b_i; // keep only bits set in operand_b_i
+      end
+
+      always_comb begin
+        invbutterfly_result = operand_a_i & operand_b_i; // keep only bits set in operand_b_i
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] | // swap distance 1
+            ((invbutterfly_result & butterfly_mask_l[4]) >> 1)|
+            ((invbutterfly_result & butterfly_mask_r[4]) << 1);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] | // swap distance 2
+            ((invbutterfly_result & butterfly_mask_l[3]) >> 2)|
+            ((invbutterfly_result & butterfly_mask_r[3]) << 2);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] | // swap distance 4
+            ((invbutterfly_result & butterfly_mask_l[2]) >> 4)|
+            ((invbutterfly_result & butterfly_mask_r[2]) << 4);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] | // swap distance 8
+            ((invbutterfly_result & butterfly_mask_l[1]) >> 8)|
+            ((invbutterfly_result & butterfly_mask_r[1]) << 8);
+
+        invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] | // swap distance 16
+            ((invbutterfly_result & butterfly_mask_l[0]) >> 16)|
+            ((invbutterfly_result & butterfly_mask_r[0]) << 16);
+      end
+
+      ///////////////////////////////////////////////////
+      // Carry-less Multiply + Cyclic Redundancy Check //
+      ///////////////////////////////////////////////////
+
+      // Carry-less multiplication can be understood as multiplication based on
+      // the addition interpreted as the bit-wise xor operation.
+      //
+      // Example: 1101 X 1011 = 1111111:
+      //
+      //       1011 X 1101
+      //       -----------
+      //              1101
+      //         xor 1101
+      //         ---------
+      //             10111
+      //        xor 0000
+      //        ----------
+      //            010111
+      //       xor 1101
+      //       -----------
+      //           1111111
+      //
+      // Architectural details:
+      //         A 32 x 32-bit array
+      //         [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
+      //         is generated. The entries of the array are pairwise 'xor-ed'
+      //         together in a 5-stage binary tree.
+      //
+      //
+      // Cyclic Redundancy Check:
+      //
+      // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
+      // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
+      // see http://reveng.sourceforge.net/crc-catalogue/all.htm
+      // A useful guide to crc arithmetic and algorithms is given here:
+      // http://www.piclist.com/techref/method/math/crcguide.html.
+      //
+      // The CRC operation solves the following equation using binary polynomial arithmetic:
+      //
+      // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
+      //
+      // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
+      // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation.
+      //
+      // Using Barrett reduction, one can show that
+      //
+      // M(x) mod P(x) = R(x) =
+      //        (M(x) * x**n) & {deg(P(x)){1'b1}} ^ (M(x) * x**-(deg(P(x)) - n)) cx mu(x) cx P(x),
+      //
+      // Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
+      // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
+      // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
+      //
+      // rd = rev( (rev(rs1) << n)  ^ ((rev(rs1) >> (32-n)) cx mu cx P) )
+      //    = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
+      //                       ^-- cycle 0--------------------^
+      //      ^- cycle 1 -------------------------------------------^
+      //
+      // In the last step we used the fact that carry-less multiplication is bit-order agnostic:
+      // rev(a cx b) = rev(a) cx rev(b).
+
+      logic clmul_rmode;
+      logic clmul_hmode;
+      logic [31:0] clmul_op_a;
+      logic [31:0] clmul_op_b;
+      logic [31:0] operand_b_rev;
+      logic [31:0] clmul_and_stage[32];
+      logic [31:0] clmul_xor_stage1[16];
+      logic [31:0] clmul_xor_stage2[8];
+      logic [31:0] clmul_xor_stage3[4];
+      logic [31:0] clmul_xor_stage4[2];
+
+      logic [31:0] clmul_result_raw;
+
+      for (genvar i=0; i<32; i++) begin: gen_rev_operand_b
+        assign operand_b_rev[i] = operand_b_i[31-i];
+      end
+
+      assign clmul_rmode = operator_i == ALU_CLMULR;
+      assign clmul_hmode = operator_i == ALU_CLMULH;
+
+      // CRC
+      localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
+      localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;
+
+      localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
+      localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;
+
+      logic crc_op;
+
+      logic crc_cpoly;
+
+      logic [31:0] crc_operand;
+      logic [31:0] crc_poly;
+      logic [31:0] crc_mu_rev;
+
+      assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) |
+                      (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) |
+                      (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B);
+
+      assign crc_cpoly = (operator_i == ALU_CRC32C_W) |
+                         (operator_i == ALU_CRC32C_H) |
+                         (operator_i == ALU_CRC32C_B);
+
+      assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
+      assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);
+
+      assign crc_poly   = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
+      assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;
+
+      always_comb begin
+        unique case(1'b1)
+          crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
+          crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
+          default:   crc_operand = operand_a_i;
+        endcase
+      end
+
+      // Select clmul input
+      always_comb begin
+        if (crc_op) begin
+          clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i[0];
+          clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
+        end else begin
+          clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
+          clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
+        end
+      end
+
+      for (genvar i=0; i<32; i++) begin : gen_clmul_and_op
+        assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0;
+      end
+
+      for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1
+        assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1];
+      end
+
+      for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2
+        assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1];
+      end
+
+      for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3
+        assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1];
+      end
+
+      for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4
+        assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1];
+      end
+
+      assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];
+
+      for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result
+        assign clmul_result_rev[i] = clmul_result_raw[31-i];
+      end
+
+      // clmulr_result = rev(clmul(rev(a), rev(b)))
+      // clmulh_result = clmulr_result >> 1
+      always_comb begin
+        case(1'b1)
+          clmul_rmode: clmul_result = clmul_result_rev;
+          clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]};
+          default:     clmul_result = clmul_result_raw;
+        endcase
+      end
+    end else begin
+      assign shuffle_result       = '0;
+      assign butterfly_result     = '0;
+      assign invbutterfly_result  = '0;
+      assign clmul_result         = '0;
+      // support signals
+      assign bitcnt_partial_lsb_d = '0;
+      assign bitcnt_partial_msb_d = '0;
+      assign clmul_result_rev     = '0;
+      assign crc_bmode            = '0;
+      assign crc_hmode            = '0;
+    end
+
+    //////////////////////////////////////
+    // Multicycle Bitmanip Instructions //
+    //////////////////////////////////////
+    // Ternary instructions + Shift Rotations + Bit extract/deposit + CRC
+    // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
+    // second cycle. operand_b_i is always tied to rs2.
+
+    always_comb begin
+      unique case (operator_i)
+        ALU_CMOV: begin
+            multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i[0];
+            imd_val_d_o = '{operand_a_i, 32'h0};
+          if (instr_first_cycle_i) begin
+            imd_val_we_o = 2'b01;
+          end else begin
+            imd_val_we_o = 2'b00;
+          end
+        end
+
+        ALU_CMIX: begin
+          multicycle_result = imd_val_q_i[0] | bwlogic_and_result;
+          imd_val_d_o = '{bwlogic_and_result, 32'h0};
+          if (instr_first_cycle_i) begin
+            imd_val_we_o = 2'b01;
+          end else begin
+            imd_val_we_o = 2'b00;
+          end
+        end
+
+        ALU_FSR, ALU_FSL,
+        ALU_ROL, ALU_ROR: begin
+          if (shift_amt[4:0] == 5'h0) begin
+            multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i[0];
+          end else begin
+            multicycle_result = imd_val_q_i[0] | shift_result;
+          end
+          imd_val_d_o = '{shift_result, 32'h0};
+          if (instr_first_cycle_i) begin
+            imd_val_we_o = 2'b01;
+          end else begin
+            imd_val_we_o = 2'b00;
+          end
+        end
+
+        ALU_CRC32_W, ALU_CRC32C_W,
+        ALU_CRC32_H, ALU_CRC32C_H,
+        ALU_CRC32_B, ALU_CRC32C_B: begin
+          if (RV32B == RV32BFull) begin
+            unique case(1'b1)
+              crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
+              crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
+              default:   multicycle_result = clmul_result_rev;
+            endcase
+            imd_val_d_o = '{clmul_result_rev, 32'h0};
+            if (instr_first_cycle_i) begin
+              imd_val_we_o = 2'b01;
+            end else begin
+              imd_val_we_o = 2'b00;
+            end
+          end else begin
+            imd_val_d_o = '{operand_a_i, 32'h0};
+            imd_val_we_o = 2'b00;
+            multicycle_result = '0;
+          end
+        end
+
+        ALU_BEXT, ALU_BDEP: begin
+          if (RV32B == RV32BFull) begin
+            multicycle_result = (operator_i == ALU_BDEP) ? butterfly_result : invbutterfly_result;
+            imd_val_d_o = '{bitcnt_partial_lsb_d, bitcnt_partial_msb_d};
+            if (instr_first_cycle_i) begin
+              imd_val_we_o = 2'b11;
+            end else begin
+              imd_val_we_o = 2'b00;
+            end
+          end else begin
+            imd_val_d_o = '{operand_a_i, 32'h0};
+            imd_val_we_o = 2'b00;
+            multicycle_result = '0;
+          end
+        end
+
+        default: begin
+          imd_val_d_o = '{operand_a_i, 32'h0};
+          imd_val_we_o = 2'b00;
+          multicycle_result = '0;
+        end
+      endcase
+    end
+
+
   end else begin : g_no_alu_rvb
     // RV32B result signals
-    assign minmax_result       = '0;
     assign bitcnt_result       = '0;
+    assign minmax_result       = '0;
     assign pack_result         = '0;
     assign sext_result         = '0;
-    assign multicycle_result   = '0;
     assign singlebit_result    = '0;
+    assign rev_result          = '0;
     assign shuffle_result      = '0;
     assign butterfly_result    = '0;
     assign invbutterfly_result = '0;
     assign clmul_result        = '0;
+    assign multicycle_result   = '0;
     // RV32B support signals
-    assign imd_val_d_o         = '0;
-    assign imd_val_we_o        = '0;
+    assign imd_val_d_o         = '{default: '0};
+    assign imd_val_we_o        = '{default: '0};
   end
 
   ////////////////
@@ -1130,18 +1223,16 @@ module ibex_alu #(
       // Cyclic Redundancy Checks (RV32B)
       ALU_CRC32_W, ALU_CRC32C_W,
       ALU_CRC32_H, ALU_CRC32C_H,
-      ALU_CRC32_B, ALU_CRC32C_B: result_o = multicycle_result;
+      ALU_CRC32_B, ALU_CRC32C_B,
+      // Bit Extract / Deposit (RV32B)
+      ALU_BEXT, ALU_BDEP: result_o = multicycle_result;
 
       // Single-Bit Bitmanip Operations (RV32B)
       ALU_SBSET, ALU_SBCLR,
       ALU_SBINV, ALU_SBEXT: result_o = singlebit_result;
 
-      // Bit Extract / Deposit (RV32B)
-      ALU_BDEP:  result_o = butterfly_result;
-      ALU_BEXT:  result_o = invbutterfly_result;
-
       // General Reverse / Or-combine (RV32B)
-      ALU_GREV, ALU_GORC: result_o = butterfly_result;
+      ALU_GREV, ALU_GORC: result_o = rev_result;
 
       // Bit Field Place (RV32B)
       ALU_BFP: result_o = bfp_result;
diff --git a/rtl/ibex_core.sv b/rtl/ibex_core.sv
index 6fd1b407..1de776bb 100644
--- a/rtl/ibex_core.sv
+++ b/rtl/ibex_core.sv
@@ -9,27 +9,31 @@
 
 `include "prim_assert.sv"
 
+`ifndef RV32B
+  `define RV32B ibex_pkg::RV32BNone
+`endif
+
 /**
  * Top level module of the ibex RISC-V core
  */
 module ibex_core #(
-    parameter bit          PMPEnable                = 1'b0,
-    parameter int unsigned PMPGranularity           = 0,
-    parameter int unsigned PMPNumRegions            = 4,
-    parameter int unsigned MHPMCounterNum           = 0,
-    parameter int unsigned MHPMCounterWidth         = 40,
-    parameter bit          RV32E                    = 1'b0,
-    parameter bit          RV32M                    = 1'b1,
-    parameter bit          RV32B                    = 1'b0,
-    parameter bit          BranchTargetALU          = 1'b0,
-    parameter bit          WritebackStage           = 1'b0,
-    parameter              MultiplierImplementation = "fast",
-    parameter bit          ICache                   = 1'b0,
-    parameter bit          ICacheECC                = 1'b0,
-    parameter bit          DbgTriggerEn             = 1'b0,
-    parameter bit          SecureIbex               = 1'b0,
-    parameter int unsigned DmHaltAddr               = 32'h1A110800,
-    parameter int unsigned DmExceptionAddr          = 32'h1A110808
+    parameter bit               PMPEnable                = 1'b0,
+    parameter int unsigned      PMPGranularity           = 0,
+    parameter int unsigned      PMPNumRegions            = 4,
+    parameter int unsigned      MHPMCounterNum           = 0,
+    parameter int unsigned      MHPMCounterWidth         = 40,
+    parameter bit               RV32E                    = 1'b0,
+    parameter bit               RV32M                    = 1'b1,
+    parameter ibex_pkg::rv32b_e RV32B                    = `RV32B,
+    parameter bit               BranchTargetALU          = 1'b0,
+    parameter bit               WritebackStage           = 1'b0,
+    parameter                   MultiplierImplementation = "fast",
+    parameter bit               ICache                   = 1'b0,
+    parameter bit               ICacheECC                = 1'b0,
+    parameter bit               DbgTriggerEn             = 1'b0,
+    parameter bit               SecureIbex               = 1'b0,
+    parameter int unsigned      DmHaltAddr               = 32'h1A110800,
+    parameter int unsigned      DmExceptionAddr          = 32'h1A110808
 ) (
     // Clock and Reset
     input  logic        clk_i,
@@ -129,9 +133,9 @@ module ibex_core #(
   logic [31:0] pc_if;                          // Program counter in IF stage
   logic [31:0] pc_id;                          // Program counter in ID stage
   logic [31:0] pc_wb;                          // Program counter in WB stage
-  logic [33:0] imd_val_d_ex;                   // Intermediate register for multicycle Ops
-  logic [33:0] imd_val_q_ex;                   // Intermediate register for multicycle Ops
-  logic        imd_val_we_ex;
+  logic [33:0] imd_val_d_ex[2];                // Intermediate register for multicycle Ops
+  logic [33:0] imd_val_q_ex[2];                // Intermediate register for multicycle Ops
+  logic [1:0]  imd_val_we_ex;
 
   logic        data_ind_timing;
   logic        dummy_instr_en;
diff --git a/rtl/ibex_core_tracing.sv b/rtl/ibex_core_tracing.sv
index 1c019d5a..e290c35a 100644
--- a/rtl/ibex_core_tracing.sv
+++ b/rtl/ibex_core_tracing.sv
@@ -2,28 +2,32 @@
 // Licensed under the Apache License, Version 2.0, see LICENSE for details.
 // SPDX-License-Identifier: Apache-2.0
 
+`ifndef RV32B
+  `define RV32B ibex_pkg::RV32BNone
+`endif
 
 /**
  * Top level module of the ibex RISC-V core with tracing enabled
  */
+
 module ibex_core_tracing #(
-    parameter bit          PMPEnable                = 1'b0,
-    parameter int unsigned PMPGranularity           = 0,
-    parameter int unsigned PMPNumRegions            = 4,
-    parameter int unsigned MHPMCounterNum           = 0,
-    parameter int unsigned MHPMCounterWidth         = 40,
-    parameter bit          RV32E                    = 1'b0,
-    parameter bit          RV32M                    = 1'b1,
-    parameter bit          RV32B                    = 1'b0,
-    parameter bit          BranchTargetALU          = 1'b0,
-    parameter bit          WritebackStage           = 1'b0,
-    parameter              MultiplierImplementation = "fast",
-    parameter bit          ICache                   = 1'b0,
-    parameter bit          ICacheECC                = 1'b0,
-    parameter bit          DbgTriggerEn             = 1'b0,
-    parameter bit          SecureIbex               = 1'b0,
-    parameter int unsigned DmHaltAddr               = 32'h1A110800,
-    parameter int unsigned DmExceptionAddr          = 32'h1A110808
+    parameter bit               PMPEnable                = 1'b0,
+    parameter int unsigned      PMPGranularity           = 0,
+    parameter int unsigned      PMPNumRegions            = 4,
+    parameter int unsigned      MHPMCounterNum           = 0,
+    parameter int unsigned      MHPMCounterWidth         = 40,
+    parameter bit               RV32E                    = 1'b0,
+    parameter bit               RV32M                    = 1'b1,
+    parameter ibex_pkg::rv32b_e RV32B                    = `RV32B,
+    parameter bit               BranchTargetALU          = 1'b0,
+    parameter bit               WritebackStage           = 1'b0,
+    parameter                   MultiplierImplementation = "fast",
+    parameter bit               ICache                   = 1'b0,
+    parameter bit               ICacheECC                = 1'b0,
+    parameter bit               DbgTriggerEn             = 1'b0,
+    parameter bit               SecureIbex               = 1'b0,
+    parameter int unsigned      DmHaltAddr               = 32'h1A110800,
+    parameter int unsigned      DmExceptionAddr          = 32'h1A110808
 ) (
     // Clock and Reset
     input  logic        clk_i,
diff --git a/rtl/ibex_decoder.sv b/rtl/ibex_decoder.sv
index b8952754..3b2807eb 100644
--- a/rtl/ibex_decoder.sv
+++ b/rtl/ibex_decoder.sv
@@ -14,10 +14,10 @@
 `include "prim_assert.sv"
 
 module ibex_decoder #(
-    parameter bit RV32E           = 0,
-    parameter bit RV32M           = 1,
-    parameter bit RV32B           = 0,
-    parameter bit BranchTargetALU = 0
+    parameter bit RV32E                = 0,
+    parameter bit RV32M                = 1,
+    parameter bit BranchTargetALU      = 0,
+    parameter ibex_pkg::rv32b_e RV32B  = ibex_pkg::RV32BNone
 ) (
     input  logic                 clk_i,
     input  logic                 rst_ni,
@@ -112,7 +112,8 @@ module ibex_decoder #(
   logic [4:0] instr_rs3;
   logic [4:0] instr_rd;
 
-  logic        use_rs3;
+  logic        use_rs3_d;
+  logic        use_rs3_q;
 
   csr_op_e     csr_op;
 
@@ -139,11 +140,20 @@ module ibex_decoder #(
   // immediate for CSR manipulation (zero extended)
   assign zimm_rs1_type_o = { 27'b0, instr_rs1 }; // rs1
 
+  // the use of rs3 is known one cycle ahead.
+  always_ff  @(posedge clk_i or negedge rst_ni) begin
+    if (!rst_ni) begin
+      use_rs3_q <= 1'b0;
+    end else begin
+      use_rs3_q <= use_rs3_d;
+    end
+  end
+
   // source registers
   assign instr_rs1 = instr[19:15];
   assign instr_rs2 = instr[24:20];
   assign instr_rs3 = instr[31:27];
-  assign rf_raddr_a_o = use_rs3 ? instr_rs3 : instr_rs1; // rs3 / rs1
+  assign rf_raddr_a_o = (use_rs3_q & ~instr_first_cycle_i) ? instr_rs3 : instr_rs1; // rs3 / rs1
   assign rf_raddr_b_o = instr_rs2; // rs2
 
   // destination register
@@ -338,29 +348,29 @@ module ibex_decoder #(
 
           3'b001: begin
             unique case (instr[31:27])
-              5'b0_0000: illegal_insn = 1'b0;                      // slli
-              5'b0_0100,                                           // sloi
-              5'b0_1001,                                           // sbclri
-              5'b0_0101,                                           // sbseti
-              5'b0_1101: illegal_insn = RV32B ? 1'b0 : 1'b1;       // sbinvi
+              5'b0_0000: illegal_insn = 1'b0;                                         // slli
+              5'b0_0100,                                                              // sloi
+              5'b0_1001,                                                              // sbclri
+              5'b0_0101,                                                              // sbseti
+              5'b0_1101: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;           // sbinvi
               5'b0_0001: if (instr[26] == 1'b0) begin
-                illegal_insn = RV32B ? 1'b0 : 1'b1;                // shfl
+                illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1;                    // shfl
               end else begin
                 illegal_insn = 1'b1;
               end
               5'b0_1100: begin
                 unique case(instr[26:20])
-                  7'b000_0000,                                     // clz
-                  7'b000_0001,                                     // ctz
-                  7'b000_0010,                                     // pcnt
-                  7'b000_0100,                                     // sext.b
-                  7'b000_0101,                                     // sext.h
-                  7'b001_0000,                                     // crc32.b
-                  7'b001_0001,                                     // crc32.h
-                  7'b001_0010,                                     // crc32.w
-                  7'b001_1000,                                     // crc32c.b
-                  7'b001_1001,                                     // crc32c.h
-                  7'b001_1010: illegal_insn = RV32B ? 1'b0 : 1'b1; // crc32c.w
+                  7'b000_0000,                                                         // clz
+                  7'b000_0001,                                                         // ctz
+                  7'b000_0010,                                                         // pcnt
+                  7'b000_0100,                                                         // sext.b
+                  7'b000_0101: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;      // sext.h
+                  7'b001_0000,                                                         // crc32.b
+                  7'b001_0001,                                                         // crc32.h
+                  7'b001_0010,                                                         // crc32.w
+                  7'b001_1000,                                                         // crc32c.b
+                  7'b001_1001,                                                         // crc32c.h
+                  7'b001_1010: illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1;      // crc32c.w
 
                   default: illegal_insn = 1'b1;
                 endcase
@@ -371,22 +381,41 @@ module ibex_decoder #(
 
           3'b101: begin
             if (instr[26]) begin
-              illegal_insn = RV32B ? 1'b0 : 1'b1;                  // fsri
+              illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;                       // fsri
             end else begin
               unique case (instr[31:27])
-                5'b0_0000,                                         // srli
-                5'b0_1000: illegal_insn = 1'b0;                    // srai
+                5'b0_0000,                                                             // srli
+                5'b0_1000: illegal_insn = 1'b0;                                        // srai
 
-                5'b0_0100,                                         // sroi
-                5'b0_1100,                                         // rori
-                5'b0_1001: illegal_insn = RV32B ? 1'b0 : 1'b1;     // sbexti
+                5'b0_0100,                                                             // sroi
+                5'b0_1100,                                                             // rori
+                5'b0_1001: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1;          // sbexti
 
-                5'b0_1101,                                         // grevi
-                5'b0_0101: illegal_insn = RV32B ? 1'b0 : 1'b1;     // gorci
-                5'b0_0001: if (instr[26] == 1'b0) begin
-                  illegal_insn = RV32B ? 1'b0 : 1'b1;              // unshfl
-                end else begin
-                  illegal_insn = 1'b1;
+                5'b0_1101: begin
+                  if ((RV32B == RV32BFull)) begin
+                    illegal_insn = 1'b0;                                               // grevi
+                  end else begin
+                    unique case (instr[24:20])
+                      5'b11111,                                                        // rev
+                      5'b11000: illegal_insn = (RV32B == RV32BBalanced) ? 1'b0 : 1'b1; // rev8
+
+                      default: illegal_insn = 1'b1;
+                    endcase
+                  end
+                end
+                5'b0_0101: begin
+                  if ((RV32B == RV32BFull)) begin
+                    illegal_insn = 1'b0;                                              // gorci
+                  end else begin // only orc.b (Balanced) is legal here; all else must trap
+                    illegal_insn = !((RV32B == RV32BBalanced) && (instr[24:20] == 5'b00111));
+                  end
+                end
+                5'b0_0001: begin
+                  if (instr[26] == 1'b0) begin
+                    illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1;                // unshfl
+                  end else begin
+                    illegal_insn = 1'b1;
+                  end
                 end
 
                 default: illegal_insn = 1'b1;
@@ -403,7 +432,7 @@ module ibex_decoder #(
         rf_ren_b_o      = 1'b1;
         rf_we           = 1'b1;
         if ({instr[26], instr[13:12]} == {1'b1, 2'b01}) begin
-          illegal_insn = RV32B ? 1'b0 : 1'b1; // cmix / cmov / fsl / fsr
+          illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // cmix / cmov / fsl / fsr
         end else begin
           unique case ({instr[31:25], instr[14:12]})
             // RV32I ALU operations
@@ -438,6 +467,8 @@ module ibex_decoder #(
             {7'b001_0100, 3'b001}, // sbset
             {7'b011_0100, 3'b001}, // sbinv
             {7'b010_0100, 3'b101}, // sbext
+            // RV32B zbf
+            {7'b010_0100, 3'b111}: illegal_insn = (RV32B != RV32BNone) ? 1'b0 : 1'b1; // bfp
             // RV32B zbe
             {7'b010_0100, 3'b110}, // bdep
             {7'b000_0100, 3'b110}, // bext
@@ -446,12 +477,10 @@ module ibex_decoder #(
             {7'b001_0100, 3'b101}, // gorc
             {7'b000_0100, 3'b001}, // shfl
             {7'b000_0100, 3'b101}, // unshfl
-            // RV32B zbf
-            {7'b010_0100, 3'b111}, // bfp
             // RV32B zbc
             {7'b000_0101, 3'b001}, // clmul
             {7'b000_0101, 3'b010}, // clmulr
-            {7'b000_0101, 3'b011}: illegal_insn = RV32B ? 1'b0 : 1'b1; // clmulh
+            {7'b000_0101, 3'b011}: illegal_insn = (RV32B == RV32BFull) ? 1'b0 : 1'b1; // clmulh
 
             // RV32M instructions
             {7'b000_0001, 3'b000}: begin // mul
@@ -627,7 +656,7 @@ module ibex_decoder #(
 
     opcode_alu         = opcode_e'(instr_alu[6:0]);
 
-    use_rs3            = 1'b0;
+    use_rs3_d          = 1'b0;
     alu_multicycle_o   = 1'b0;
     mult_sel_o         = 1'b0;
     div_sel_o          = 1'b0;
@@ -774,7 +803,7 @@ module ibex_decoder #(
           3'b111: alu_operator_o = ALU_AND;  // And with Immediate
 
           3'b001: begin
-            if (RV32B) begin
+            if (RV32B != RV32BNone) begin
               unique case (instr_alu[31:27])
                 5'b0_0000: alu_operator_o = ALU_SLL;    // Shift Left Logical by Immediate
                 5'b0_0100: alu_operator_o = ALU_SLO;    // Shift Left Ones by Immediate
@@ -785,34 +814,46 @@ module ibex_decoder #(
                 5'b0_0001: if (instr_alu[26] == 0) alu_operator_o = ALU_SHFL;
                 5'b0_1100: begin
                   unique case (instr_alu[26:20])
-                    7'b000_0000: alu_operator_o = ALU_CLZ;      // clz
-                    7'b000_0001: alu_operator_o = ALU_CTZ;      // ctz
-                    7'b000_0010: alu_operator_o = ALU_PCNT;     // pcnt
-                    7'b000_0100: alu_operator_o = ALU_SEXTB;    // sext.b
-                    7'b000_0101: alu_operator_o = ALU_SEXTH;    // sext.h
+                    7'b000_0000: alu_operator_o = ALU_CLZ;   // clz
+                    7'b000_0001: alu_operator_o = ALU_CTZ;   // ctz
+                    7'b000_0010: alu_operator_o = ALU_PCNT;  // pcnt
+                    7'b000_0100: alu_operator_o = ALU_SEXTB; // sext.b
+                    7'b000_0101: alu_operator_o = ALU_SEXTH; // sext.h
                     7'b001_0000: begin
-                      alu_operator_o = ALU_CRC32_B;  // crc32.b
-                      alu_multicycle_o = 1'b1;
+                      if (RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32_B;  // crc32.b
+                        alu_multicycle_o = 1'b1;
+                      end
                     end
                     7'b001_0001: begin
-                      alu_operator_o = ALU_CRC32_H;  // crc32.h
-                      alu_multicycle_o = 1'b1;
+                      if (RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32_H;  // crc32.h
+                        alu_multicycle_o = 1'b1;
+                      end
                     end
                     7'b001_0010: begin
-                      alu_operator_o = ALU_CRC32_W;  // crc32.w
-                      alu_multicycle_o = 1'b1;
+                      if (RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32_W;  // crc32.w
+                        alu_multicycle_o = 1'b1;
+                      end
                     end
                     7'b001_1000: begin
-                      alu_operator_o = ALU_CRC32C_B; // crc32c.b
-                      alu_multicycle_o = 1'b1;
+                      if (RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32C_B; // crc32c.b
+                        alu_multicycle_o = 1'b1;
+                      end
                     end
                     7'b001_1001: begin
-                      alu_operator_o = ALU_CRC32C_H; // crc32c.h
-                      alu_multicycle_o = 1'b1;
+                      if (RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32C_H; // crc32c.h
+                        alu_multicycle_o = 1'b1;
+                      end
                     end
                     7'b001_1010: begin
-                      alu_operator_o = ALU_CRC32C_W; // crc32c.w
-                      alu_multicycle_o = 1'b1;
+                      if (RV32B == RV32BFull) begin
+                        alu_operator_o = ALU_CRC32C_W; // crc32c.w
+                        alu_multicycle_o = 1'b1;
+                      end
                     end
                     default: ;
                   endcase
@@ -821,19 +862,19 @@ module ibex_decoder #(
                 default: ;
               endcase
             end else begin
-              alu_operator_o = ALU_SLL;                 // Shift Left Logical by Immediate
+              alu_operator_o = ALU_SLL; // Shift Left Logical by Immediate
             end
           end
 
           3'b101: begin
-            if (RV32B) begin
+            if (RV32B != RV32BNone) begin
               if (instr_alu[26] == 1'b1) begin
                 alu_operator_o = ALU_FSR;
                 alu_multicycle_o = 1'b1;
                 if (instr_first_cycle_i) begin
-                  use_rs3 = 1'b0;
+                  use_rs3_d = 1'b1;
                 end else begin
-                  use_rs3 = 1'b1;
+                  use_rs3_d = 1'b0;
                 end
               end else begin
                 unique case (instr_alu[31:27])
@@ -842,22 +883,26 @@ module ibex_decoder #(
                   5'b0_0100: alu_operator_o = ALU_SRO;   // Shift Right Ones by Immediate
                   5'b0_1001: alu_operator_o = ALU_SBEXT; // Extract bit specified by immediate.
                   5'b0_1100: begin
-                    alu_operator_o = ALU_ROR;           // Rotate Right by Immediate
+                    alu_operator_o = ALU_ROR;            // Rotate Right by Immediate
                     alu_multicycle_o = 1'b1;
                   end
-                  5'b0_1101: alu_operator_o = ALU_GREV; // General Reverse with Imm Control Val
-                  5'b0_0101: alu_operator_o = ALU_GORC; // General Or-combine with Imm Control Val
+                  5'b0_1101: alu_operator_o = ALU_GREV;  // General Reverse with Imm Control Val
+                  5'b0_0101: alu_operator_o = ALU_GORC;  // General Or-combine with Imm Control Val
                   // Unshuffle with Immediate Control Value
-                  5'b0_0001: if (instr_alu[26] == 1'b0) alu_operator_o = ALU_UNSHFL;
+                  5'b0_0001: begin
+                    if (RV32B == RV32BFull) begin
+                      if (instr_alu[26] == 1'b0) alu_operator_o = ALU_UNSHFL;
+                    end
+                  end
                   default: ;
                 endcase
               end
 
             end else begin
               if (instr_alu[31:27] == 5'b0_0000) begin
-                alu_operator_o = ALU_SRL;              // Shift Right Logical by Immediate
+                alu_operator_o = ALU_SRL;               // Shift Right Logical by Immediate
               end else if (instr_alu[31:27] == 5'b0_1000) begin
-                alu_operator_o = ALU_SRA;              // Shift Right Arithmetically by Immediate
+                alu_operator_o = ALU_SRA;               // Shift Right Arithmetically by Immediate
               end
             end
           end
@@ -871,42 +916,42 @@ module ibex_decoder #(
         alu_op_b_mux_sel_o = OP_B_REG_B;
 
         if (instr_alu[26]) begin
-          if (RV32B) begin
+          if (RV32B != RV32BNone) begin
             unique case ({instr_alu[26:25], instr_alu[14:12]})
               {2'b11, 3'b001}: begin
                 alu_operator_o   = ALU_CMIX; // cmix
                 alu_multicycle_o = 1'b1;
                 if (instr_first_cycle_i) begin
-                  use_rs3 = 1'b0;
+                  use_rs3_d = 1'b1;
                 end else begin
-                  use_rs3 = 1'b1;
+                  use_rs3_d = 1'b0;
                 end
               end
               {2'b11, 3'b101}: begin
                 alu_operator_o   = ALU_CMOV; // cmov
                 alu_multicycle_o = 1'b1;
                 if (instr_first_cycle_i) begin
-                  use_rs3 = 1'b0;
+                  use_rs3_d = 1'b1;
                 end else begin
-                  use_rs3 = 1'b1;
+                  use_rs3_d = 1'b0;
                 end
               end
               {2'b10, 3'b001}: begin
                 alu_operator_o   = ALU_FSL;  // fsl
                 alu_multicycle_o = 1'b1;
                 if (instr_first_cycle_i) begin
-                  use_rs3 = 1'b0;
+                  use_rs3_d = 1'b1;
                 end else begin
-                  use_rs3 = 1'b1;
+                  use_rs3_d = 1'b0;
                 end
               end
               {2'b10, 3'b101}: begin
                 alu_operator_o   = ALU_FSR;  // fsr
                 alu_multicycle_o = 1'b1;
                 if (instr_first_cycle_i) begin
-                  use_rs3 = 1'b0;
+                  use_rs3_d = 1'b1;
                 end else begin
-                  use_rs3 = 1'b1;
+                  use_rs3_d = 1'b0;
                 end
               end
               default: ;
@@ -927,56 +972,67 @@ module ibex_decoder #(
             {7'b010_0000, 3'b101}: alu_operator_o = ALU_SRA;   // Shift Right Arithmetic
 
             // RV32B ALU Operations
-            {7'b001_0000, 3'b001}: if (RV32B) alu_operator_o = ALU_SLO;   // slo
-            {7'b001_0000, 3'b101}: if (RV32B) alu_operator_o = ALU_SRO;   // sro
+            {7'b001_0000, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SLO;   // slo
+            {7'b001_0000, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_SRO;   // sro
             {7'b011_0000, 3'b001}: begin
-              if (RV32B) begin
+              if (RV32B != RV32BNone) begin
                 alu_operator_o = ALU_ROL;   // rol
                 alu_multicycle_o = 1'b1;
               end
             end
             {7'b011_0000, 3'b101}: begin
-              if (RV32B) begin
+              if (RV32B != RV32BNone) begin
                 alu_operator_o = ALU_ROR;   // ror
                 alu_multicycle_o = 1'b1;
               end
             end
 
-            {7'b000_0101, 3'b100}: if (RV32B) alu_operator_o = ALU_MIN;    // min
-            {7'b000_0101, 3'b101}: if (RV32B) alu_operator_o = ALU_MAX;    // max
-            {7'b000_0101, 3'b110}: if (RV32B) alu_operator_o = ALU_MINU;   // minu
-            {7'b000_0101, 3'b111}: if (RV32B) alu_operator_o = ALU_MAXU;   // maxu
+            {7'b000_0101, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_MIN;    // min
+            {7'b000_0101, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_MAX;    // max
+            {7'b000_0101, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_MINU;   // minu
+            {7'b000_0101, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_MAXU;   // maxu
 
-            {7'b000_0100, 3'b100}: if (RV32B) alu_operator_o = ALU_PACK;   // pack
-            {7'b010_0100, 3'b100}: if (RV32B) alu_operator_o = ALU_PACKU;  // packu
-            {7'b000_0100, 3'b111}: if (RV32B) alu_operator_o = ALU_PACKH;  // packh
+            {7'b000_0100, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACK;   // pack
+            {7'b010_0100, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACKU;  // packu
+            {7'b000_0100, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_PACKH;  // packh
 
-            {7'b010_0000, 3'b100}: if (RV32B) alu_operator_o = ALU_XNOR;   // xnor
-            {7'b010_0000, 3'b110}: if (RV32B) alu_operator_o = ALU_ORN;    // orn
-            {7'b010_0000, 3'b111}: if (RV32B) alu_operator_o = ALU_ANDN;   // andn
-
-            // RV32B zbp
-            {7'b011_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_GREV;   // grev
-            {7'b001_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_GORC;   // grev
-            {7'b000_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SHFL;   // shfl
-            {7'b000_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_UNSHFL; // unshfl
+            {7'b010_0000, 3'b100}: if (RV32B != RV32BNone) alu_operator_o = ALU_XNOR;   // xnor
+            {7'b010_0000, 3'b110}: if (RV32B != RV32BNone) alu_operator_o = ALU_ORN;    // orn
+            {7'b010_0000, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_ANDN;   // andn
 
             // RV32B zbs
-            {7'b010_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SBCLR;  // sbclr
-            {7'b001_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SBSET;  // sbset
-            {7'b011_0100, 3'b001}: if (RV32B) alu_operator_o = ALU_SBINV;  // sbinv
-            {7'b010_0100, 3'b101}: if (RV32B) alu_operator_o = ALU_SBEXT;  // sbext
+            {7'b010_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBCLR;  // sbclr
+            {7'b001_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBSET;  // sbset
+            {7'b011_0100, 3'b001}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBINV;  // sbinv
+            {7'b010_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_SBEXT;  // sbext
+
+            // RV32B zbf
+            {7'b010_0100, 3'b111}: if (RV32B != RV32BNone) alu_operator_o = ALU_BFP;    // bfp
+
+            // RV32B zbp
+            {7'b011_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_GREV;   // grev
+            {7'b001_0100, 3'b101}: if (RV32B != RV32BNone) alu_operator_o = ALU_GORC;   // gorc
+            {7'b000_0100, 3'b001}: if (RV32B == RV32BFull) alu_operator_o = ALU_SHFL;   // shfl
+            {7'b000_0100, 3'b101}: if (RV32B == RV32BFull) alu_operator_o = ALU_UNSHFL; // unshfl
 
             // RV32B zbc
-            {7'b000_0101, 3'b001}: if (RV32B) alu_operator_o = ALU_CLMUL;  // clmul
-            {7'b000_0101, 3'b010}: if (RV32B) alu_operator_o = ALU_CLMULR; // clmulr
-            {7'b000_0101, 3'b011}: if (RV32B) alu_operator_o = ALU_CLMULH; // clmulh
+            {7'b000_0101, 3'b001}: if (RV32B == RV32BFull) alu_operator_o = ALU_CLMUL;  // clmul
+            {7'b000_0101, 3'b010}: if (RV32B == RV32BFull) alu_operator_o = ALU_CLMULR; // clmulr
+            {7'b000_0101, 3'b011}: if (RV32B == RV32BFull) alu_operator_o = ALU_CLMULH; // clmulh
 
             // RV32B zbe
-            {7'b010_0100, 3'b110}: if (RV32B) alu_operator_o = ALU_BDEP;   // bdep
-            {7'b000_0100, 3'b110}: if (RV32B) alu_operator_o = ALU_BEXT;   // bext
-            // RV32B zbf
-            {7'b010_0100, 3'b111}: if (RV32B) alu_operator_o = ALU_BFP;    // bfp
+            {7'b010_0100, 3'b110}: begin
+              if (RV32B == RV32BFull) begin
+                alu_operator_o = ALU_BDEP;   // bdep
+                alu_multicycle_o = 1'b1;
+              end
+            end
+            {7'b000_0100, 3'b110}: begin
+              if (RV32B == RV32BFull) begin
+                alu_operator_o = ALU_BEXT;   // bext
+                alu_multicycle_o = 1'b1;
+              end
+            end
 
             // RV32M instructions, all use the same ALU operation
             {7'b000_0001, 3'b000}: begin // mul
diff --git a/rtl/ibex_ex_block.sv b/rtl/ibex_ex_block.sv
index 73ffc888..eccc68e9 100644
--- a/rtl/ibex_ex_block.sv
+++ b/rtl/ibex_ex_block.sv
@@ -9,10 +9,10 @@
  * Execution block: Hosts ALU and MUL/DIV unit
  */
 module ibex_ex_block #(
-    parameter bit RV32M                    = 1,
-    parameter bit RV32B                    = 0,
-    parameter bit BranchTargetALU          = 0,
-    parameter     MultiplierImplementation = "fast"
+    parameter bit               RV32M                    = 1,
+    parameter ibex_pkg::rv32b_e RV32B                    = ibex_pkg::RV32BNone,
+    parameter bit               BranchTargetALU          = 0,
+    parameter                   MultiplierImplementation = "fast"
 ) (
     input  logic                  clk_i,
     input  logic                  rst_ni,
@@ -41,9 +41,9 @@ module ibex_ex_block #(
     input  logic                  data_ind_timing_i,
 
     // intermediate val reg
-    output logic                  imd_val_we_o,
-    output logic [33:0]           imd_val_d_o,
-    input  logic [33:0]           imd_val_q_i,
+    output logic [1:0]            imd_val_we_o,
+    output logic [33:0]           imd_val_d_o[2],
+    input  logic [33:0]           imd_val_q_i[2],
 
     // Outputs
     output logic [31:0]           alu_adder_result_ex_o, // to LSU
@@ -63,10 +63,11 @@ module ibex_ex_block #(
   logic        alu_cmp_result, alu_is_equal_result;
   logic        multdiv_valid;
   logic        multdiv_sel;
-  logic [31:0] alu_imd_val_d;
-  logic        alu_imd_val_we;
-  logic [33:0] multdiv_imd_val_d;
-  logic        multdiv_imd_val_we;
+  logic [31:0] alu_imd_val_q[2];
+  logic [31:0] alu_imd_val_d[2];
+  logic [ 1:0] alu_imd_val_we;
+  logic [33:0] multdiv_imd_val_d[2];
+  logic [ 1:0] multdiv_imd_val_we;
 
   /*
     The multdiv_i output is never selected if RV32M=0
@@ -80,8 +81,11 @@ module ibex_ex_block #(
   end
 
   // Intermediate Value Register Mux
-  assign imd_val_d_o  = multdiv_sel ? multdiv_imd_val_d : {2'b0, alu_imd_val_d};
-  assign imd_val_we_o = multdiv_sel ? multdiv_imd_val_we : alu_imd_val_we;
+  assign imd_val_d_o[0] = multdiv_sel ? multdiv_imd_val_d[0] : {2'b0, alu_imd_val_d[0]};
+  assign imd_val_d_o[1] = multdiv_sel ? multdiv_imd_val_d[1] : {2'b0, alu_imd_val_d[1]};
+  assign imd_val_we_o   = multdiv_sel ? multdiv_imd_val_we : alu_imd_val_we;
+
+  assign alu_imd_val_q = '{imd_val_q_i[0][31:0], imd_val_q_i[1][31:0]};
 
   assign result_ex_o  = multdiv_sel ? multdiv_result : alu_result;
 
@@ -117,7 +121,7 @@ module ibex_ex_block #(
       .operand_a_i         ( alu_operand_a_i         ),
       .operand_b_i         ( alu_operand_b_i         ),
       .instr_first_cycle_i ( alu_instr_first_cycle_i ),
-      .imd_val_q_i         ( imd_val_q_i[31:0]       ),
+      .imd_val_q_i         ( alu_imd_val_q           ),
       .imd_val_we_o        ( alu_imd_val_we          ),
       .imd_val_d_o         ( alu_imd_val_d           ),
       .multdiv_operand_a_i ( multdiv_alu_operand_a   ),
@@ -218,6 +222,6 @@ module ibex_ex_block #(
   // Multiplier/divider may require multiple cycles. The ALU output is valid in the same cycle
   // unless the intermediate result register is being written (which indicates this isn't the
   // final cycle of ALU operation).
-  assign ex_valid_o = multdiv_sel ? multdiv_valid : !alu_imd_val_we;
+  assign ex_valid_o = multdiv_sel ? multdiv_valid : ~(|alu_imd_val_we);
 
 endmodule
diff --git a/rtl/ibex_id_stage.sv b/rtl/ibex_id_stage.sv
index ee63142a..2552b86b 100644
--- a/rtl/ibex_id_stage.sv
+++ b/rtl/ibex_id_stage.sv
@@ -17,13 +17,13 @@
 `include "prim_assert.sv"
 
 module ibex_id_stage #(
-    parameter bit RV32E           = 0,
-    parameter bit RV32M           = 1,
-    parameter bit RV32B           = 0,
-    parameter bit DataIndTiming   = 1'b0,
-    parameter bit BranchTargetALU = 0,
-    parameter bit SpecBranch      = 0,
-    parameter bit WritebackStage  = 0
+    parameter bit               RV32E           = 0,
+    parameter bit               RV32M           = 1,
+    parameter ibex_pkg::rv32b_e RV32B           = ibex_pkg::RV32BNone,
+    parameter bit               DataIndTiming   = 1'b0,
+    parameter bit               BranchTargetALU = 0,
+    parameter bit               SpecBranch      = 0,
+    parameter bit               WritebackStage  = 0
 ) (
     input  logic                      clk_i,
     input  logic                      rst_ni,
@@ -68,9 +68,9 @@ module ibex_id_stage #(
     output logic [31:0]               alu_operand_b_ex_o,
 
     // Multicycle Operation Stage Register
-    input  logic                      imd_val_we_ex_i,
-    input  logic [33:0]               imd_val_d_ex_i,
-    output logic [33:0]               imd_val_q_ex_o,
+    input  logic [1:0]                imd_val_we_ex_i,
+    input  logic [33:0]               imd_val_d_ex_i[2],
+    output logic [33:0]               imd_val_q_ex_o[2],
 
     // Branch target ALU
     output logic [31:0]               bt_a_operand_o,
@@ -247,7 +247,7 @@ module ibex_id_stage #(
   logic        alu_multicycle_dec;
   logic        stall_alu;
 
-  logic [33:0] imd_val_q;
+  logic [33:0] imd_val_q[2];
 
   op_a_sel_e   bt_a_mux_sel;
   imm_b_sel_e  bt_b_mux_sel;
@@ -379,11 +379,13 @@ module ibex_id_stage #(
   // Multicycle Operation Stage Register //
   /////////////////////////////////////////
 
-  always_ff @(posedge clk_i or negedge rst_ni) begin : intermediate_val_reg
-    if (!rst_ni) begin
-      imd_val_q <= '0;
-    end else if (imd_val_we_ex_i) begin
-      imd_val_q <= imd_val_d_ex_i;
+  for (genvar i=0; i<2; i++) begin : gen_intermediate_val_reg // one 34-bit register per slot
+    always_ff @(posedge clk_i or negedge rst_ni) begin : intermediate_val_reg
+      if (!rst_ni) begin
+        imd_val_q[i] <= '0;
+      end else if (imd_val_we_ex_i[i]) begin // each slot has its own write enable bit
+        imd_val_q[i] <= imd_val_d_ex_i[i];
+      end
     end
   end
 
diff --git a/rtl/ibex_multdiv_fast.sv b/rtl/ibex_multdiv_fast.sv
index 53fd6913..617bb516 100644
--- a/rtl/ibex_multdiv_fast.sv
+++ b/rtl/ibex_multdiv_fast.sv
@@ -35,9 +35,9 @@ module ibex_multdiv_fast #(
     output logic [32:0]      alu_operand_a_o,
     output logic [32:0]      alu_operand_b_o,
 
-    input  logic [33:0]      imd_val_q_i,
-    output logic [33:0]      imd_val_d_o,
-    output logic             imd_val_we_o,
+    input  logic [33:0]      imd_val_q_i[2],
+    output logic [33:0]      imd_val_d_o[2],
+    output logic [1:0]       imd_val_we_o,
 
     input  logic             multdiv_ready_id_i,
 
@@ -99,13 +99,11 @@ module ibex_multdiv_fast #(
     if (!rst_ni) begin
       div_counter_q    <= '0;
       md_state_q       <= MD_IDLE;
-      op_denominator_q <= '0;
       op_numerator_q   <= '0;
       op_quotient_q    <= '0;
       div_by_zero_q    <= '0;
     end else if (div_en_internal) begin
       div_counter_q    <= div_counter_d;
-      op_denominator_q <= op_denominator_d;
       op_numerator_q   <= op_numerator_d;
       op_quotient_q    <= op_quotient_d;
       md_state_q       <= md_state_d;
@@ -113,18 +111,24 @@ module ibex_multdiv_fast #(
     end
   end
 
-
   `ASSERT_KNOWN(DivEnKnown, div_en_internal);
   `ASSERT_KNOWN(MultEnKnown, mult_en_internal);
   `ASSERT_KNOWN(MultDivEnKnown, multdiv_en);
 
   assign multdiv_en = mult_en_internal | div_en_internal;
 
-  assign imd_val_d_o = div_sel_i ? op_remainder_d : mac_res_d;
-  assign imd_val_we_o = multdiv_en;
+  // Shared intermediate regs: [0] = remainder (div) or MAC result (mult), [1] = denominator
+  assign imd_val_d_o[0] = div_sel_i ? op_remainder_d : mac_res_d;
+  assign imd_val_we_o[0] = multdiv_en;
+
+  assign imd_val_d_o[1] = {2'b0, op_denominator_d}; // zero-pad 32-bit denominator to 34 bits
+  assign imd_val_we_o[1] = div_en_internal;
+  assign op_denominator_q = imd_val_q_i[1][31:0]; // denominator state is read back from reg 1
+  logic [1:0] unused_imd_val;
+  assign unused_imd_val = imd_val_q_i[1][33:32]; // padding bits of reg 1, written as 2'b0 above
 
   assign signed_mult      = (signed_mode_i != 2'b00);
-  assign multdiv_result_o = div_sel_i ? imd_val_q_i[31:0] : mac_res_d[31:0];
+  assign multdiv_result_o = div_sel_i ? imd_val_q_i[0][31:0] : mac_res_d[31:0];
 
   // The single cycle multiplier uses three 17 bit multipliers to compute MUL instructions in a
   // single cycle and MULH instructions in two cycles.
@@ -170,8 +174,8 @@ module ibex_multdiv_fast #(
     assign mult2_op_b = op_b_i[`OP_H];
 
     // used in MULH
-    assign accum[17:0] = imd_val_q_i[33:16];
-    assign accum[33:18] = {16{signed_mult & imd_val_q_i[33]}};
+    assign accum[17:0] = imd_val_q_i[0][33:16];
+    assign accum[33:18] = {16{signed_mult & imd_val_q_i[0][33]}};
 
     always_comb begin
       // Default values == MULL
@@ -268,7 +272,7 @@ module ibex_multdiv_fast #(
       mult_op_b    = op_b_i[`OP_L];
       sign_a       = 1'b0;
       sign_b       = 1'b0;
-      accum        = imd_val_q_i;
+      accum        = imd_val_q_i[0];
       mac_res_d    = mac_res;
       mult_state_d = mult_state_q;
       mult_valid   = 1'b0;
@@ -293,10 +297,10 @@ module ibex_multdiv_fast #(
           mult_op_b = op_b_i[`OP_H];
           sign_a    = 1'b0;
           sign_b    = signed_mode_i[1] & op_b_i[31];
-          // result of AL*BL (in imd_val_q_i) always unsigned with no carry, so carries_q always 00
-          accum     = {18'b0, imd_val_q_i[31:16]};
+          // result of AL*BL (in imd_val_q_i[0]) always unsigned with no carry, so carries_q always 00
+          accum     = {18'b0, imd_val_q_i[0][31:16]};
           if (operator_i == MD_OP_MULL) begin
-            mac_res_d = {2'b0, mac_res[`OP_L], imd_val_q_i[`OP_L]};
+            mac_res_d = {2'b0, mac_res[`OP_L], imd_val_q_i[0][`OP_L]};
           end else begin
             // MD_OP_MULH
             mac_res_d = mac_res;
@@ -311,15 +315,15 @@ module ibex_multdiv_fast #(
           sign_a    = signed_mode_i[0] & op_a_i[31];
           sign_b    = 1'b0;
           if (operator_i == MD_OP_MULL) begin
-            accum        = {18'b0, imd_val_q_i[31:16]};
-            mac_res_d    = {2'b0, mac_res[15:0], imd_val_q_i[15:0]};
+            accum        = {18'b0, imd_val_q_i[0][31:16]};
+            mac_res_d    = {2'b0, mac_res[15:0], imd_val_q_i[0][15:0]};
             mult_valid   = 1'b1;
 
             // Note no state transition will occur if mult_hold is set
             mult_state_d = ALBL;
             mult_hold    = ~multdiv_ready_id_i;
           end else begin
-            accum        = imd_val_q_i;
+            accum        = imd_val_q_i[0];
             mac_res_d    = mac_res;
             mult_state_d = AHBH;
           end
@@ -332,8 +336,8 @@ module ibex_multdiv_fast #(
           mult_op_b = op_b_i[`OP_H];
           sign_a    = signed_mode_i[0] & op_a_i[31];
           sign_b    = signed_mode_i[1] & op_b_i[31];
-          accum[17: 0]  = imd_val_q_i[33:16];
-          accum[33:18]  = {16{signed_mult & imd_val_q_i[33]}};
+          accum[17: 0]  = imd_val_q_i[0][33:16];
+          accum[33:18]  = {16{signed_mult & imd_val_q_i[0][33]}};
           // result of AH*BL is not signed only if signed_mode_i == 2'b00
           mac_res_d    = mac_res;
           mult_valid   = 1'b1;
@@ -366,7 +370,7 @@ module ibex_multdiv_fast #(
   // Divider
   assign res_adder_h    = alu_adder_ext_i[33:1];
 
-  assign next_remainder = is_greater_equal ? res_adder_h[31:0] : imd_val_q_i[31:0];
+  assign next_remainder = is_greater_equal ? res_adder_h[31:0] : imd_val_q_i[0][31:0];
   assign next_quotient  = is_greater_equal ? {1'b0, op_quotient_q} | {1'b0, one_shift} :
                                              {1'b0, op_quotient_q};
 
@@ -376,10 +380,10 @@ module ibex_multdiv_fast #(
   // Remainder - Divisor. If Remainder - Divisor >= 0, is_greater_equal is equal to 1,
   // the next Remainder is Remainder - Divisor contained in res_adder_h and the
   always_comb begin
-    if ((imd_val_q_i[31] ^ op_denominator_q[31]) == 1'b0) begin
+    if ((imd_val_q_i[0][31] ^ op_denominator_q[31]) == 1'b0) begin
       is_greater_equal = (res_adder_h[31] == 1'b0);
     end else begin
-      is_greater_equal = imd_val_q_i[31];
+      is_greater_equal = imd_val_q_i[0][31];
     end
   end
 
@@ -391,7 +395,7 @@ module ibex_multdiv_fast #(
 
   always_comb begin
     div_counter_d    = div_counter_q - 5'h1;
-    op_remainder_d   = imd_val_q_i;
+    op_remainder_d   = imd_val_q_i[0];
     op_quotient_d    = op_quotient_q;
     md_state_d       = md_state_q;
     op_numerator_d   = op_numerator_q;
@@ -457,13 +461,13 @@ module ibex_multdiv_fast #(
         op_quotient_d   = next_quotient[31:0];
         md_state_d      = (div_counter_q == 5'd1) ? MD_LAST : MD_COMP;
         // Division
-        alu_operand_a_o = {imd_val_q_i[31:0], 1'b1}; // it contains the remainder
+        alu_operand_a_o = {imd_val_q_i[0][31:0], 1'b1}; // it contains the remainder
         alu_operand_b_o = {~op_denominator_q[31:0], 1'b1};  // -denominator two's compliment
       end
 
       MD_LAST: begin
         if (operator_i == MD_OP_DIV) begin
-          // this time we save the quotient in op_remainder_d (i.e. imd_val_q_i) since
+          // this time we save the quotient in op_remainder_d (i.e. imd_val_q_i[0]) since
           // we do not need anymore the remainder
           op_remainder_d = {1'b0, next_quotient};
         end else begin
@@ -471,7 +475,7 @@ module ibex_multdiv_fast #(
           op_remainder_d = {2'b0, next_remainder[31:0]};
         end
         // Division
-        alu_operand_a_o  = {imd_val_q_i[31:0], 1'b1}; // it contains the remainder
+        alu_operand_a_o  = {imd_val_q_i[0][31:0], 1'b1}; // it contains the remainder
         alu_operand_b_o  = {~op_denominator_q[31:0], 1'b1};  // -denominator two's compliment
 
         md_state_d = MD_CHANGE_SIGN;
@@ -480,13 +484,13 @@ module ibex_multdiv_fast #(
       MD_CHANGE_SIGN: begin
         md_state_d  = MD_FINISH;
         if (operator_i == MD_OP_DIV) begin
-          op_remainder_d = (div_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i;
+          op_remainder_d = (div_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i[0];
         end else begin
-          op_remainder_d = (rem_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i;
+          op_remainder_d = (rem_change_sign) ? {2'h0, alu_adder_i} : imd_val_q_i[0];
         end
         // ABS(Quotient) = 0 - Quotient (or Remainder)
         alu_operand_a_o  = {32'h0  , 1'b1};
-        alu_operand_b_o  = {~imd_val_q_i[31:0], 1'b1};
+        alu_operand_b_o  = {~imd_val_q_i[0][31:0], 1'b1};
       end
 
       MD_FINISH: begin
diff --git a/rtl/ibex_multdiv_slow.sv b/rtl/ibex_multdiv_slow.sv
index b3038cb4..bcd04b0f 100644
--- a/rtl/ibex_multdiv_slow.sv
+++ b/rtl/ibex_multdiv_slow.sv
@@ -31,9 +31,9 @@ module ibex_multdiv_slow
     output logic [32:0]      alu_operand_a_o,
     output logic [32:0]      alu_operand_b_o,
 
-    input  logic [33:0]      imd_val_q_i,
-    output logic [33:0]      imd_val_d_o,
-    output logic             imd_val_we_o,
+    input  logic [33:0]      imd_val_q_i[2],
+    output logic [33:0]      imd_val_d_o[2],
+    output logic  [1:0]      imd_val_we_o,
 
     input  logic             multdiv_ready_id_i,
 
@@ -50,7 +50,8 @@ module ibex_multdiv_slow
   md_fsm_e md_state_q, md_state_d;
 
   logic [32:0] accum_window_q, accum_window_d;
-  logic        unused_imd_val;
+  logic        unused_imd_val0;
+  logic [ 1:0] unused_imd_val1;
 
   logic [32:0] res_adder_l;
   logic [32:0] res_adder_h;
@@ -81,11 +82,16 @@ module ibex_multdiv_slow
   // ALU Operand MUX //
   /////////////////////
 
-  // Use shared intermediate value register in id_stage for accum_window
-  assign imd_val_d_o    = {1'b0,accum_window_d};
-  assign imd_val_we_o   = ~multdiv_hold;
-  assign accum_window_q = imd_val_q_i[32:0];
-  assign unused_imd_val = imd_val_q_i[33];
+  // Intermediate value register shared with ALU
+  assign imd_val_d_o[0]  = {1'b0,accum_window_d};
+  assign imd_val_we_o[0] = ~multdiv_hold;
+  assign accum_window_q  = imd_val_q_i[0][32:0];
+  assign unused_imd_val0 = imd_val_q_i[0][33];
+
+  assign imd_val_d_o[1]  = {2'b00, op_numerator_d};
+  assign imd_val_we_o[1] = multdiv_en;
+  assign op_numerator_q  = imd_val_q_i[1][31:0];
+  assign unused_imd_val1 = imd_val_q_i[1][33:32];
 
   always_comb begin
     alu_operand_a_o = accum_window_q;
@@ -328,14 +334,12 @@ module ibex_multdiv_slow
       multdiv_count_q  <= 5'h0;
       op_b_shift_q     <= 33'h0;
       op_a_shift_q     <= 33'h0;
-      op_numerator_q   <= 32'h0;
       md_state_q       <= MD_IDLE;
       div_by_zero_q    <= 1'b0;
     end else if (multdiv_en) begin
       multdiv_count_q  <= multdiv_count_d;
       op_b_shift_q     <= op_b_shift_d;
       op_a_shift_q     <= op_a_shift_d;
-      op_numerator_q   <= op_numerator_d;
       md_state_q       <= md_state_d;
       div_by_zero_q    <= div_by_zero_d;
     end
diff --git a/rtl/ibex_pkg.sv b/rtl/ibex_pkg.sv
index 3ecd4015..bb086ecb 100644
--- a/rtl/ibex_pkg.sv
+++ b/rtl/ibex_pkg.sv
@@ -8,6 +8,15 @@
  */
 package ibex_pkg;
 
+//////////////////////////
+// RV32B Parameter Enum //
+//////////////////////////
+
+typedef enum integer {  // labels get implicit values 0, 1, 2 in order
+  RV32BNone,      // = 0
+  RV32BBalanced,  // = 1
+  RV32BFull       // = 2
+} rv32b_e;
 
 /////////////
 // Opcodes //