// Copyright lowRISC contributors.
// Copyright 2018 ETH Zurich and University of Bologna, see also CREDITS.md.
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0

/**
 * Arithmetic logic unit
 */
module ibex_alu #(
  parameter ibex_pkg::rv32b_e RV32B = ibex_pkg::RV32BNone
) (
  input  ibex_pkg::alu_op_e operator_i,
  input  logic [31:0]       operand_a_i,
  input  logic [31:0]       operand_b_i,

  input  logic              instr_first_cycle_i,

  input  logic [32:0]       multdiv_operand_a_i,
  input  logic [32:0]       multdiv_operand_b_i,

  input  logic              multdiv_sel_i,

  input  logic [31:0]       imd_val_q_i[2],
  output logic [31:0]       imd_val_d_o[2],
  output logic [1:0]        imd_val_we_o,

  output logic [31:0]       adder_result_o,
  output logic [33:0]       adder_result_ext_o,

  output logic [31:0]       result_o,
  output logic              comparison_result_o,
  output logic              is_equal_result_o,

  input  logic              x_mem_lsu_req_i
);
  import ibex_pkg::*;

  logic [31:0] operand_a_rev;
  logic [32:0] operand_b_neg;

  // bit reverse operand_a for left shifts and bit counting
  for (genvar k = 0; k < 32; k++) begin : gen_rev_operand_a
    assign operand_a_rev[k] = operand_a_i[31-k];
  end

  ///////////
  // Adder //
  ///////////

  logic        adder_op_a_shift1;
  logic        adder_op_a_shift2;
  logic        adder_op_a_shift3;
  logic        adder_op_b_negate;
  logic [32:0] adder_in_a, adder_in_b;
  logic [31:0] adder_result;

  always_comb begin
    adder_op_a_shift1 = 1'b0;
    adder_op_a_shift2 = 1'b0;
    adder_op_a_shift3 = 1'b0;
    adder_op_b_negate = 1'b0;
    unique case (operator_i)
      // Adder OPs
      ALU_SUB,

      // Comparator OPs
      ALU_EQ,   ALU_NE,
      ALU_GE,   ALU_GEU,
      ALU_LT,   ALU_LTU,
      ALU_SLT,  ALU_SLTU,

      // MinMax OPs (RV32B Ops)
      ALU_MIN,  ALU_MINU,
      ALU_MAX,  ALU_MAXU: adder_op_b_negate = 1'b1;

      // Address Calculation OPs (RV32B Ops)
      ALU_SH1ADD: if (RV32B != RV32BNone) adder_op_a_shift1 = 1'b1;
      ALU_SH2ADD: if (RV32B != RV32BNone) adder_op_a_shift2 = 1'b1;
      ALU_SH3ADD: if (RV32B != RV32BNone) adder_op_a_shift3 = 1'b1;

      default:;
    endcase
  end

  // prepare operand a
  always_comb begin
    if (x_mem_lsu_req_i) begin
      adder_in_a = {operand_a_i,1'b1};
    end else begin
      unique case(1'b1)
        multdiv_sel_i:     adder_in_a = multdiv_operand_a_i;
        adder_op_a_shift1: adder_in_a = {operand_a_i[30:0],2'b01};
        adder_op_a_shift2: adder_in_a = {operand_a_i[29:0],3'b001};
        adder_op_a_shift3: adder_in_a = {operand_a_i[28:0],4'b0001};
        default:           adder_in_a = {operand_a_i,1'b1};
      endcase
    end
  end

  // prepare operand b
  assign operand_b_neg = {operand_b_i,1'b0} ^ {33{1'b1}};
  always_comb begin
    if (x_mem_lsu_req_i) begin
      adder_in_b = {operand_b_i, 1'b0};
    end else begin
      unique case (1'b1)
        multdiv_sel_i:     adder_in_b = multdiv_operand_b_i;
        adder_op_b_negate: adder_in_b = operand_b_neg;
        default:           adder_in_b = {operand_b_i, 1'b0};
      endcase
    end
  end

  // actual adder
  assign adder_result_ext_o = $unsigned(adder_in_a) + $unsigned(adder_in_b);

  assign adder_result       = adder_result_ext_o[32:1];

  assign adder_result_o     = adder_result;

  ////////////////
  // Comparison //
  ////////////////

  logic is_equal;
  logic is_greater_equal;  // handles both signed and unsigned forms
  logic cmp_signed;

  always_comb begin
    unique case (operator_i)
      ALU_GE,
      ALU_LT,
      ALU_SLT,
      // RV32B only
      ALU_MIN,
      ALU_MAX: cmp_signed = 1'b1;

      default: cmp_signed = 1'b0;
    endcase
  end

  assign is_equal = (adder_result == 32'b0);
  assign is_equal_result_o = is_equal;

  // Is greater equal
  always_comb begin
    if ((operand_a_i[31] ^ operand_b_i[31]) == 1'b0) begin
      is_greater_equal = (adder_result[31] == 1'b0);
    end else begin
      is_greater_equal = operand_a_i[31] ^ (cmp_signed);
    end
  end

  // GTE unsigned:
  // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
  // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
  // (a[31] == 1 && b[31] == 0) => 1
  // (a[31] == 0 && b[31] == 1) => 0

  // GTE signed:
  // (a[31] == 1 && b[31] == 1) => adder_result[31] == 0
  // (a[31] == 0 && b[31] == 0) => adder_result[31] == 0
  // (a[31] == 1 && b[31] == 0) => 0
  // (a[31] == 0 && b[31] == 1) => 1

  // generate comparison result
  logic cmp_result;

  always_comb begin
    unique case (operator_i)
      ALU_EQ:             cmp_result =  is_equal;
      ALU_NE:             cmp_result = ~is_equal;
      ALU_GE,   ALU_GEU,
      ALU_MAX,  ALU_MAXU: cmp_result = is_greater_equal; // RV32B only
      ALU_LT,   ALU_LTU,
      ALU_MIN,  ALU_MINU, //RV32B only
      ALU_SLT,  ALU_SLTU: cmp_result = ~is_greater_equal;

      default: cmp_result = is_equal;
    endcase
  end

  assign comparison_result_o = cmp_result;

  ///////////
  // Shift //
  ///////////

  // The shifter structure consists of a 33-bit shifter: 32-bit operand + 1 bit extension for
  // arithmetic shifts and one-shift support.
  // Rotations and funnel shifts are implemented as multi-cycle instructions.
  // The shifter is also used for single-bit instructions and bit-field place as detailed below.
  //
  // Standard Shifts
  // ===============
  // For standard shift instructions, the direction of the shift is to the right by default. For
  // left shifts, the signal shift_left signal is set. If so, the operand is initially reversed,
  // shifted to the right by the specified amount and shifted back again. For arithmetic- and
  // one-shifts the 33rd bit of the shifter operand can is set accordingly.
  //
  // Multicycle Shifts
  // =================
  //
  // Rotation
  // --------
  // For rotations, the operand signals operand_a_i and operand_b_i are kept constant to rs1 and
  // rs2 respectively.
  //
  // Rotation pseudocode:
  //   shift_amt = rs2 & 31;
  //   multicycle_result = (rs1 >> shift_amt) | (rs1 << (32 - shift_amt));
  //                       ^-- cycle 0 -----^ ^-- cycle 1 --------------^
  //
  // Funnel Shifts
  // -------------
  // For funnel shifs, operand_a_i is tied to rs1 in the first cycle and rs3 in the
  // second cycle. operand_b_i is always tied to rs2. The order of applying the shift amount or
  // its complement is determined by bit [5] of shift_amt.
  //
  // Funnel shift Pseudocode: (fsl)
  //  shift_amt = rs2 & 63;
  //  shift_amt_compl = 32 - shift_amt[4:0]
  //  if (shift_amt >=33):
  //     multicycle_result = (rs1 >> shift_amt_compl[4:0]) | (rs3 << shift_amt[4:0]);
  //                         ^-- cycle 0 ----------------^ ^-- cycle 1 ------------^
  //  else if (shift_amt <= 31 && shift_amt > 0):
  //     multicycle_result = (rs1 << shift_amt[4:0]) | (rs3 >> shift_amt_compl[4:0]);
  //                         ^-- cycle 0 ----------^ ^-- cycle 1 -------------------^
  //  For shift_amt == 0, 32, both shift_amt[4:0] and shift_amt_compl[4:0] == '0.
  //  these cases need to be handled separately outside the shifting structure:
  //  else if (shift_amt == 32):
  //     multicycle_result = rs3
  //  else if (shift_amt == 0):
  //     multicycle_result = rs1.
  //
  // Single-Bit Instructions
  // =======================
  // Single bit instructions operate on bit operand_b_i[4:0] of operand_a_i.

  // The operations bset, bclr and binv are implemented by generation of a bit-mask using the
  // shifter structure. This is done by left-shifting the operand 32'h1 by the required amount.
  // The signal shift_sbmode multiplexes the shifter input and sets the signal shift_left.
  // Further processing is taken care of by a separate structure.
  //
  // For bext, the bit defined by operand_b_i[4:0] is to be returned. This is done by simply
  // shifting operand_a_i to the right by the required amount and returning bit [0] of the result.
  //
  // Bit-Field Place
  // ===============
  // The shifter structure is shared to compute bfp_mask << bfp_off.

  logic       shift_left;
  logic       shift_ones;
  logic       shift_arith;
  logic       shift_funnel;
  logic       shift_sbmode;
  logic [5:0] shift_amt;
  logic [5:0] shift_amt_compl; // complementary shift amount (32 - shift_amt)

  logic        [31:0] shift_operand;
  logic signed [32:0] shift_result_ext_signed;
  logic        [32:0] shift_result_ext;
  logic               unused_shift_result_ext;
  logic        [31:0] shift_result;
  logic        [31:0] shift_result_rev;

  // zbf
  logic bfp_op;
  logic [4:0]  bfp_len;
  logic [4:0]  bfp_off;
  logic [31:0] bfp_mask;
  logic [31:0] bfp_mask_rev;
  logic [31:0] bfp_result;

  // bfp: shares the shifter structure to compute bfp_mask << bfp_off
  assign bfp_op = (RV32B != RV32BNone) ? (operator_i == ALU_BFP) : 1'b0;
  assign bfp_len = {~(|operand_b_i[27:24]), operand_b_i[27:24]}; // len = 0 encodes for len = 16
  assign bfp_off = operand_b_i[20:16];
  assign bfp_mask = (RV32B != RV32BNone) ? ~(32'hffff_ffff << bfp_len) : '0;
  for (genvar i = 0; i < 32; i++) begin : gen_rev_bfp_mask
    assign bfp_mask_rev[i] = bfp_mask[31-i];
  end

  assign bfp_result =(RV32B != RV32BNone) ?
      (~shift_result & operand_a_i) | ((operand_b_i & bfp_mask) << bfp_off) : '0;

  // bit shift_amt[5]: word swap bit: only considered for FSL/FSR.
  // if set, reverse operations in first and second cycle.
  assign shift_amt[5] = operand_b_i[5] & shift_funnel;
  assign shift_amt_compl = 32 - operand_b_i[4:0];

  always_comb begin
    if (bfp_op) begin
      shift_amt[4:0] = bfp_off;  // length field of bfp control word
    end else begin
      shift_amt[4:0] = instr_first_cycle_i ?
          (operand_b_i[5] && shift_funnel ? shift_amt_compl[4:0] : operand_b_i[4:0]) :
          (operand_b_i[5] && shift_funnel ? operand_b_i[4:0] : shift_amt_compl[4:0]);
    end
  end

  // single-bit mode: shift
  assign shift_sbmode = (RV32B != RV32BNone) ?
      (operator_i == ALU_BSET) | (operator_i == ALU_BCLR) | (operator_i == ALU_BINV) : 1'b0;

  // left shift if this is:
  // * a standard left shift (slo, sll)
  // * a rol in the first cycle
  // * a ror in the second cycle
  // * fsl: without word-swap bit: first cycle, else: second cycle
  // * fsr: without word-swap bit: second cycle, else: first cycle
  // * a single-bit instruction: bclr, bset, binv (excluding bext)
  // * bfp: bfp_mask << bfp_off
  always_comb begin
    unique case (operator_i)
      ALU_SLL: shift_left = 1'b1;
      ALU_SLO: shift_left = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? 1'b1 : 1'b0;
      ALU_BFP: shift_left = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
      ALU_ROL: shift_left = (RV32B != RV32BNone) ? instr_first_cycle_i : 0;
      ALU_ROR: shift_left = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 0;
      ALU_FSL: shift_left = (RV32B != RV32BNone) ?
        (shift_amt[5] ? ~instr_first_cycle_i : instr_first_cycle_i) : 1'b0;
      ALU_FSR: shift_left = (RV32B != RV32BNone) ?
          (shift_amt[5] ? instr_first_cycle_i : ~instr_first_cycle_i) : 1'b0;
      default: shift_left = 1'b0;
    endcase
    if (shift_sbmode) begin
      shift_left = 1'b1;
    end
  end

  assign shift_arith  = (operator_i == ALU_SRA);
  assign shift_ones   = (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ?
      (operator_i == ALU_SLO) | (operator_i == ALU_SRO) : 1'b0;
  assign shift_funnel = (RV32B != RV32BNone) ?
      (operator_i == ALU_FSL) | (operator_i == ALU_FSR) : 1'b0;

  // shifter structure.
  always_comb begin
    // select shifter input
    // for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen.
    if (RV32B == RV32BNone) begin
      shift_operand = shift_left ? operand_a_rev : operand_a_i;
    end else begin
      unique case (1'b1)
        bfp_op:       shift_operand = bfp_mask_rev;
        shift_sbmode: shift_operand = 32'h8000_0000;
        default:      shift_operand = shift_left ? operand_a_rev : operand_a_i;
      endcase
    end

    shift_result_ext_signed =
        $signed({shift_ones | (shift_arith & shift_operand[31]), shift_operand}) >>> shift_amt[4:0];
    shift_result_ext = $unsigned(shift_result_ext_signed);

    shift_result            = shift_result_ext[31:0];
    unused_shift_result_ext = shift_result_ext[32];

    for (int unsigned i = 0; i < 32; i++) begin
      shift_result_rev[i] = shift_result[31-i];
    end

    shift_result = shift_left ? shift_result_rev : shift_result;

  end

  ///////////////////
  // Bitwise Logic //
  ///////////////////

  logic bwlogic_or;
  logic bwlogic_and;
  logic [31:0] bwlogic_operand_b;
  logic [31:0] bwlogic_or_result;
  logic [31:0] bwlogic_and_result;
  logic [31:0] bwlogic_xor_result;
  logic [31:0] bwlogic_result;

  logic bwlogic_op_b_negate;

  always_comb begin
    unique case (operator_i)
      // Logic-with-negate OPs (RV32B Ops)
      ALU_XNOR,
      ALU_ORN,
      ALU_ANDN: bwlogic_op_b_negate = (RV32B != RV32BNone) ? 1'b1 : 1'b0;
      ALU_CMIX: bwlogic_op_b_negate = (RV32B != RV32BNone) ? ~instr_first_cycle_i : 1'b0;
      default:  bwlogic_op_b_negate = 1'b0;
    endcase
  end

  assign bwlogic_operand_b = bwlogic_op_b_negate ? operand_b_neg[32:1] : operand_b_i;

  assign bwlogic_or_result  = operand_a_i | bwlogic_operand_b;
  assign bwlogic_and_result = operand_a_i & bwlogic_operand_b;
  assign bwlogic_xor_result = operand_a_i ^ bwlogic_operand_b;

  assign bwlogic_or  = (operator_i == ALU_OR)  | (operator_i == ALU_ORN);
  assign bwlogic_and = (operator_i == ALU_AND) | (operator_i == ALU_ANDN);

  always_comb begin
    unique case (1'b1)
      bwlogic_or:  bwlogic_result = bwlogic_or_result;
      bwlogic_and: bwlogic_result = bwlogic_and_result;
      default:     bwlogic_result = bwlogic_xor_result;
    endcase
  end

  logic [5:0]  bitcnt_result;
  logic [31:0] minmax_result;
  logic [31:0] pack_result;
  logic [31:0] sext_result;
  logic [31:0] singlebit_result;
  logic [31:0] rev_result;
  logic [31:0] shuffle_result;
  logic [31:0] xperm_result;
  logic [31:0] butterfly_result;
  logic [31:0] invbutterfly_result;
  logic [31:0] clmul_result;
  logic [31:0] multicycle_result;

  if (RV32B != RV32BNone) begin : g_alu_rvb

    /////////////////
    // Bitcounting //
    /////////////////

    // The bit-counter structure computes the number of set bits in its operand. Partial results
    // (from left to right) are needed to compute the control masks for computation of
    // bcompress/bdecompress by the butterfly network, if implemented.
    // For cpop, clz and ctz, only the end result is used.

    logic        zbe_op;
    logic        bitcnt_ctz;
    logic        bitcnt_clz;
    logic        bitcnt_cz;
    logic [31:0] bitcnt_bits;
    logic [31:0] bitcnt_mask_op;
    logic [31:0] bitcnt_bit_mask;
    logic [ 5:0] bitcnt_partial [32];
    logic [31:0] bitcnt_partial_lsb_d;
    logic [31:0] bitcnt_partial_msb_d;


    assign bitcnt_ctz    = operator_i == ALU_CTZ;
    assign bitcnt_clz    = operator_i == ALU_CLZ;
    assign bitcnt_cz     = bitcnt_ctz | bitcnt_clz;
    assign bitcnt_result = bitcnt_partial[31];

    // Bit-mask generation for clz and ctz:
    // The bit mask is generated by spreading the lowest-order set bit in the operand to all
    // higher order bits. The resulting mask is inverted to cover the lowest order zeros. In order
    // to create the bit mask for leading zeros, the input operand needs to be reversed.
    assign bitcnt_mask_op = bitcnt_clz ? operand_a_rev : operand_a_i;

    always_comb begin
      bitcnt_bit_mask = bitcnt_mask_op;
      bitcnt_bit_mask |= bitcnt_bit_mask << 1;
      bitcnt_bit_mask |= bitcnt_bit_mask << 2;
      bitcnt_bit_mask |= bitcnt_bit_mask << 4;
      bitcnt_bit_mask |= bitcnt_bit_mask << 8;
      bitcnt_bit_mask |= bitcnt_bit_mask << 16;
      bitcnt_bit_mask = ~bitcnt_bit_mask;
    end

    assign zbe_op = (operator_i == ALU_BCOMPRESS) | (operator_i == ALU_BDECOMPRESS);

    always_comb begin
      case (1'b1)
        zbe_op:      bitcnt_bits = operand_b_i;
        bitcnt_cz:   bitcnt_bits = bitcnt_bit_mask & ~bitcnt_mask_op; // clz / ctz
        default:     bitcnt_bits = operand_a_i; // cpop
      endcase
    end

    // The parallel prefix counter is of the structure of a Brent-Kung Adder. In the first
    // log2(width) stages, the sum of the n preceding bit lines is computed for the bit lines at
    // positions 2**n-1 (power-of-two positions) where n denotes the current stage.
    // In stage n=log2(width), the count for position width-1 (the MSB) is finished.
    // For the intermediate values, an inverse adder tree then computes the bit counts for the bit
    // lines at positions
    // m = 2**(n-1) + i*2**(n-2), where i = [1 ... width / 2**(n-1)-1] and n = [log2(width) ... 2].
    // Thus, at every subsequent stage the result of two previously unconnected sub-trees is
    // summed, starting at the node summing bits [width/2-1 : 0] and [3*width/4-1: width/2]
    // and moving to iteratively sum up all the sub-trees.
    // The inverse adder tree thus features log2(width) - 1 stages the first of these stages is a
    // single addition at position 3*width/4 - 1. It does not interfere with the last
    // stage of the primary adder tree. These stages can thus be folded together, resulting in a
    // total of 2*log2(width)-2 stages.
    // For more details refer to R. Brent, H. T. Kung, "A Regular Layout for Parallel Adders",
    // (1982).
    // For a bitline at position p, only bits
    // bitcnt_partial[max(i, such that p % log2(i) == 0)-1 : 0] are needed for generation of the
    // butterfly network control signals. The adders in the intermediate value adder tree thus need
    // not be full 5-bit adders. We leave the optimization to the synthesis tools.
    //
    // Consider the following 8-bit example for illustraton.
    //
    // let bitcnt_bits = 8'babcdefgh.
    //
    //                   a  b  c  d  e  f  g  h
    //                   | /:  | /:  | /:  | /:
    //                   |/ :  |/ :  |/ :  |/ :
    // stage 1:          +  :  +  :  +  :  +  :
    //                   |  : /:  :  |  : /:  :
    //                   |,--+ :  :  |,--+ :  :
    // stage 2:          +  :  :  :  +  :  :  :
    //                   |  :  |  : /:  :  :  :
    //                   |,-----,--+ :  :  :  : ^-primary adder tree
    // stage 3:          +  :  +  :  :  :  :  : -------------------------
    //                   :  | /| /| /| /| /|  : ,-intermediate adder tree
    //                   :  |/ |/ |/ |/ |/ :  :
    // stage 4           :  +  +  +  +  +  :  :
    //                   :  :  :  :  :  :  :  :
    // bitcnt_partial[i] 7  6  5  4  3  2  1  0

    always_comb begin
      bitcnt_partial = '{default: '0};
      // stage 1
      for (int unsigned i = 1; i < 32; i += 2) begin
        bitcnt_partial[i] = {5'h0, bitcnt_bits[i]} + {5'h0, bitcnt_bits[i-1]};
      end
      // stage 2
      for (int unsigned i = 3; i < 32; i += 4) begin
        bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i];
      end
      // stage 3
      for (int unsigned i = 7; i < 32; i += 8) begin
        bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i];
      end
      // stage 4
      for (int unsigned i = 15; i < 32; i += 16) begin
        bitcnt_partial[i] = bitcnt_partial[i-8] + bitcnt_partial[i];
      end
      // stage 5
      bitcnt_partial[31] = bitcnt_partial[15] + bitcnt_partial[31];
      // ^- primary adder tree
      // -------------------------------
      // ,-intermediate value adder tree
      bitcnt_partial[23] = bitcnt_partial[15] + bitcnt_partial[23];

      // stage 6
      for (int unsigned i = 11; i < 32; i += 8) begin
        bitcnt_partial[i] = bitcnt_partial[i-4] + bitcnt_partial[i];
      end

      // stage 7
      for (int unsigned i = 5; i < 32; i += 4) begin
        bitcnt_partial[i] = bitcnt_partial[i-2] + bitcnt_partial[i];
      end
      // stage 8
      bitcnt_partial[0] = {5'h0, bitcnt_bits[0]};
      for (int unsigned i = 2; i < 32; i += 2) begin
        bitcnt_partial[i] = bitcnt_partial[i-1] + {5'h0, bitcnt_bits[i]};
      end
    end

    ///////////////
    // Min / Max //
    ///////////////

    assign minmax_result = cmp_result ? operand_a_i : operand_b_i;

    //////////
    // Pack //
    //////////

    logic packu;
    logic packh;
    assign packu = operator_i == ALU_PACKU;
    assign packh = operator_i == ALU_PACKH;

    always_comb begin
      unique case (1'b1)
        packu:   pack_result = {operand_b_i[31:16], operand_a_i[31:16]};
        packh:   pack_result = {16'h0, operand_b_i[7:0], operand_a_i[7:0]};
        default: pack_result = {operand_b_i[15:0], operand_a_i[15:0]};
      endcase
    end

    //////////
    // Sext //
    //////////

    assign sext_result = (operator_i == ALU_SEXTB) ?
        { {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]};

    /////////////////////////////
    // Single-bit Instructions //
    /////////////////////////////

    always_comb begin
      unique case (operator_i)
        ALU_BSET: singlebit_result = operand_a_i | shift_result;
        ALU_BCLR: singlebit_result = operand_a_i & ~shift_result;
        ALU_BINV: singlebit_result = operand_a_i ^ shift_result;
        default:  singlebit_result = {31'h0, shift_result[0]}; // ALU_BEXT
      endcase
    end

    ////////////////////////////////////
    // General Reverse and Or-combine //
    ////////////////////////////////////

    // Only a subset of the general reverse and or-combine instructions are implemented in the
    // balanced version of the B extension. Currently rev8 (shift_amt = 5'b11000) and orc.b
    // (shift_amt = 5'b00111) are supported in the base extension.

    logic [4:0] zbp_shift_amt;
    logic gorc_op;

    assign gorc_op = (operator_i == ALU_GORC);
    assign zbp_shift_amt[2:0] =
        (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? shift_amt[2:0] : {3{shift_amt[0]}};
    assign zbp_shift_amt[4:3] =
        (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) ? shift_amt[4:3] : {2{shift_amt[3]}};

    always_comb begin
      rev_result = operand_a_i;

      if (zbp_shift_amt[0]) begin
        rev_result = (gorc_op ? rev_result : 32'h0)       |
                     ((rev_result & 32'h5555_5555) <<  1) |
                     ((rev_result & 32'haaaa_aaaa) >>  1);
      end

      if (zbp_shift_amt[1]) begin
        rev_result = (gorc_op ? rev_result : 32'h0)       |
                     ((rev_result & 32'h3333_3333) <<  2) |
                     ((rev_result & 32'hcccc_cccc) >>  2);
      end

      if (zbp_shift_amt[2]) begin
        rev_result = (gorc_op ? rev_result : 32'h0)       |
                     ((rev_result & 32'h0f0f_0f0f) <<  4) |
                     ((rev_result & 32'hf0f0_f0f0) >>  4);
      end

      if (zbp_shift_amt[3]) begin
        rev_result = ((RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) &&
                      gorc_op ? rev_result : 32'h0) |
                     ((rev_result & 32'h00ff_00ff) <<  8) |
                     ((rev_result & 32'hff00_ff00) >>  8);
      end

      if (zbp_shift_amt[4]) begin
        rev_result = ((RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) &&
                      gorc_op ? rev_result : 32'h0) |
                     ((rev_result & 32'h0000_ffff) << 16) |
                     ((rev_result & 32'hffff_0000) >> 16);
      end
    end

    logic crc_hmode;
    logic crc_bmode;
    logic [31:0] clmul_result_rev;

    if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin : gen_alu_rvb_otearlgrey_full

      /////////////////////////
      // Shuffle / Unshuffle //
      /////////////////////////

      localparam logic [31:0] SHUFFLE_MASK_L [4] =
          '{32'h00ff_0000, 32'h0f00_0f00, 32'h3030_3030, 32'h4444_4444};
      localparam logic [31:0] SHUFFLE_MASK_R [4] =
          '{32'h0000_ff00, 32'h00f0_00f0, 32'h0c0c_0c0c, 32'h2222_2222};

      localparam logic [31:0] FLIP_MASK_L [4] =
          '{32'h2200_1100, 32'h0044_0000, 32'h4411_0000, 32'h1100_0000};
      localparam logic [31:0] FLIP_MASK_R [4] =
          '{32'h0088_0044, 32'h0000_2200, 32'h0000_8822, 32'h0000_0088};

      logic [31:0] SHUFFLE_MASK_NOT [4];
      for(genvar i = 0; i < 4; i++) begin : gen_shuffle_mask_not
        assign SHUFFLE_MASK_NOT[i] = ~(SHUFFLE_MASK_L[i] | SHUFFLE_MASK_R[i]);
      end

      logic shuffle_flip;
      assign shuffle_flip = operator_i == ALU_UNSHFL;

      logic [3:0] shuffle_mode;

      always_comb begin
        shuffle_result = operand_a_i;

        if (shuffle_flip) begin
          shuffle_mode[3] = shift_amt[0];
          shuffle_mode[2] = shift_amt[1];
          shuffle_mode[1] = shift_amt[2];
          shuffle_mode[0] = shift_amt[3];
        end else begin
          shuffle_mode = shift_amt[3:0];
        end

        if (shuffle_flip) begin
          shuffle_result = (shuffle_result & 32'h8822_4411) |
              ((shuffle_result << 6)  & FLIP_MASK_L[0]) |
              ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
              ((shuffle_result << 9)  & FLIP_MASK_L[1]) |
              ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
              ((shuffle_result << 15) & FLIP_MASK_L[2]) |
              ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
              ((shuffle_result << 21) & FLIP_MASK_L[3]) |
              ((shuffle_result >> 21) & FLIP_MASK_R[3]);
        end

        if (shuffle_mode[3]) begin
          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[0]) |
              (((shuffle_result << 8) & SHUFFLE_MASK_L[0]) |
              ((shuffle_result >> 8) & SHUFFLE_MASK_R[0]));
        end
        if (shuffle_mode[2]) begin
          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[1]) |
              (((shuffle_result << 4) & SHUFFLE_MASK_L[1]) |
              ((shuffle_result >> 4) & SHUFFLE_MASK_R[1]));
        end
        if (shuffle_mode[1]) begin
          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[2]) |
              (((shuffle_result << 2) & SHUFFLE_MASK_L[2]) |
              ((shuffle_result >> 2) & SHUFFLE_MASK_R[2]));
        end
        if (shuffle_mode[0]) begin
          shuffle_result = (shuffle_result & SHUFFLE_MASK_NOT[3]) |
              (((shuffle_result << 1) & SHUFFLE_MASK_L[3]) |
              ((shuffle_result >> 1) & SHUFFLE_MASK_R[3]));
        end

        if (shuffle_flip) begin
          shuffle_result = (shuffle_result & 32'h8822_4411) |
              ((shuffle_result << 6)  & FLIP_MASK_L[0]) |
              ((shuffle_result >> 6)  & FLIP_MASK_R[0]) |
              ((shuffle_result << 9)  & FLIP_MASK_L[1]) |
              ((shuffle_result >> 9)  & FLIP_MASK_R[1]) |
              ((shuffle_result << 15) & FLIP_MASK_L[2]) |
              ((shuffle_result >> 15) & FLIP_MASK_R[2]) |
              ((shuffle_result << 21) & FLIP_MASK_L[3]) |
              ((shuffle_result >> 21) & FLIP_MASK_R[3]);
        end
      end

      //////////////
      // Crossbar //
      //////////////
      // The crossbar permutation instructions xperm.[nbh] (Zbp) can be implemented using 8
      // parallel 4-bit-wide, 8-input crossbars. Basically, we permute the 8 nibbles of operand_a_i
      // based on operand_b_i.

      // Generate selector indices and valid signals.
      // - sel_n[x] indicates which nibble of operand_a_i is selected for output nibble x.
      // - vld_n[x] indicates if the selection is valid.
      logic  [7:0][2:0] sel_n; // nibbles
      logic  [7:0]      vld_n; // nibbles
      logic  [3:0][1:0] sel_b; // bytes
      logic  [3:0]      vld_b; // bytes
      logic  [1:0][0:0] sel_h; // half words
      logic  [1:0]      vld_h; // half words

      // Per nibble, 3 bits are needed for the selection. Other bits must be zero.
      // sel_n bit mask: 32'b0111_0111_0111_0111_0111_0111_0111_0111
      // vld_n bit mask: 32'b1000_1000_1000_1000_1000_1000_1000_1000
      for (genvar i = 0; i < 8; i++) begin : gen_sel_vld_n
        assign sel_n[i] =   operand_b_i[i*4     +: 3];
        assign vld_n[i] = ~|operand_b_i[i*4 + 3 +: 1];
      end

      // Per byte, 2 bits are needed for the selection. Other bits must be zero.
      // sel_b bit mask: 32'b0000_0011_0000_0011_0000_0011_0000_0011
      // vld_b bit mask: 32'b1111_1100_1111_1100_1111_1100_1111_1100
      for (genvar i = 0; i < 4; i++) begin : gen_sel_vld_b
        assign sel_b[i] =   operand_b_i[i*8     +: 2];
        assign vld_b[i] = ~|operand_b_i[i*8 + 2 +: 6];
      end

      // Per half word, 1 bit is needed for the selection only. All other bits must be zero.
      // sel_h bit mask: 32'b0000_0000_0000_0001_0000_0000_0000_0001
      // vld_h bit mask: 32'b1111_1111_1111_1110_1111_1111_1111_1110
      for (genvar i = 0; i < 2; i++) begin : gen_sel_vld_h
        assign sel_h[i] =   operand_b_i[i*16     +: 1];
        assign vld_h[i] = ~|operand_b_i[i*16 + 1 +: 15];
      end

      // Convert selector indices and valid signals to control the nibble-based
      // crossbar logic.
      logic [7:0][2:0] sel;
      logic [7:0]      vld;
      always_comb begin
        unique case (operator_i)
          ALU_XPERM_N: begin
            // No conversion needed.
            sel = sel_n;
            vld = vld_n;
          end

          ALU_XPERM_B: begin
            // Convert byte to nibble indicies.
            for (int b = 0; b < 4; b++) begin
              sel[b*2 +  0] =   {sel_b[b], 1'b0};
              sel[b*2 +  1] =   {sel_b[b], 1'b1};
              vld[b*2 +: 2] = {2{vld_b[b]}};
            end
          end

          ALU_XPERM_H: begin
            // Convert half-word to nibble indices.
            for (int h = 0; h < 2; h++) begin
              sel[h*4 +  0] =   {sel_h[h], 2'b00};
              sel[h*4 +  1] =   {sel_h[h], 2'b01};
              sel[h*4 +  2] =   {sel_h[h], 2'b10};
              sel[h*4 +  3] =   {sel_h[h], 2'b11};
              vld[h*4 +: 4] = {4{vld_h[h]}};
            end
          end

          default: begin
            // Tie valid to zero to disable the crossbar unless we need it.
            sel = sel_n;
            vld = '0;
          end
        endcase
      end

      // The actual nibble-based crossbar logic.
      logic [7:0][3:0] val_n;
      logic [7:0][3:0] xperm_n;
      assign val_n = operand_a_i;
      for (genvar i = 0; i < 8; i++) begin : gen_xperm_n
        assign xperm_n[i] = vld[i] ? val_n[sel[i]] : '0;
      end
      assign xperm_result = xperm_n;

      ///////////////////////////////////////////////////
      // Carry-less Multiply + Cyclic Redundancy Check //
      ///////////////////////////////////////////////////

      // Carry-less multiplication can be understood as multiplication based on
      // the addition interpreted as the bit-wise xor operation.
      //
      // Example: 1101 X 1011 = 1111111:
      //
      //       1011 X 1101
      //       -----------
      //              1101
      //         xor 1101
      //         ---------
      //             10111
      //        xor 0000
      //        ----------
      //            010111
      //       xor 1101
      //       -----------
      //           1111111
      //
      // Architectural details:
      //         A 32 x 32-bit array
      //         [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
      //         is generated. The entries of the array are pairwise 'xor-ed'
      //         together in a 5-stage binary tree.
      //
      //
      // Cyclic Redundancy Check:
      //
      // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
      // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
      // see http://reveng.sourceforge.net/crc-catalogue/all.htm
      // A useful guide to crc arithmetic and algorithms is given here:
      // http://www.piclist.com/techref/method/math/crcguide.html.
      //
      // The CRC operation solves the following equation using binary polynomial arithmetic:
      //
      // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
      //
      // where P denotes lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
      // of a, n = 8,16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation.
      //
      // Using barret reduction, one can show that
      //
      // M(x) mod P(x) = R(x) =
      //          (M(x) * x**n) & {deg(P(x)'{1'b1}}) ^ (M(x) x**-(deg(P(x) - n)) cx mu(x) cx P(x),
      //
      // Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
      // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
      // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
      //
      // rd = rev( (rev(rs1) << n)  ^ ((rev(rs1) >> (32-n)) cx mu cx P)
      //    = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
      //                       ^-- cycle 0--------------------^
      //      ^- cycle 1 -------------------------------------------^
      //
      // In the last step we used the fact that carry-less multiplication is bit-order agnostic:
      // rev(a cx b) = rev(a) cx rev(b).

      logic clmul_rmode;
      logic clmul_hmode;
      logic [31:0] clmul_op_a;
      logic [31:0] clmul_op_b;
      logic [31:0] operand_b_rev;
      logic [31:0] clmul_and_stage[32];
      logic [31:0] clmul_xor_stage1[16];
      logic [31:0] clmul_xor_stage2[8];
      logic [31:0] clmul_xor_stage3[4];
      logic [31:0] clmul_xor_stage4[2];

      logic [31:0] clmul_result_raw;

      for (genvar i = 0; i < 32; i++) begin : gen_rev_operand_b
        assign operand_b_rev[i] = operand_b_i[31-i];
      end

      assign clmul_rmode = operator_i == ALU_CLMULR;
      assign clmul_hmode = operator_i == ALU_CLMULH;

      // CRC
      localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
      localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;

      localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
      localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;

      logic crc_op;

      logic crc_cpoly;

      logic [31:0] crc_operand;
      logic [31:0] crc_poly;
      logic [31:0] crc_mu_rev;

      assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) |
                      (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) |
                      (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B);

      assign crc_cpoly = (operator_i == ALU_CRC32C_W) |
                         (operator_i == ALU_CRC32C_H) |
                         (operator_i == ALU_CRC32C_B);

      assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
      assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);

      assign crc_poly   = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
      assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;

      always_comb begin
        unique case (1'b1)
          crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
          crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
          default:   crc_operand = operand_a_i;
        endcase
      end

      // Select clmul input
      always_comb begin
        if (crc_op) begin
          clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i[0];
          clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
        end else begin
          clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
          clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
        end
      end

      for (genvar i = 0; i < 32; i++) begin : gen_clmul_and_op
        assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0;
      end

      for (genvar i = 0; i < 16; i++) begin : gen_clmul_xor_op_l1
        assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1];
      end

      for (genvar i = 0; i < 8; i++) begin : gen_clmul_xor_op_l2
        assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1];
      end

      for (genvar i = 0; i < 4; i++) begin : gen_clmul_xor_op_l3
        assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1];
      end

      for (genvar i = 0; i < 2; i++) begin : gen_clmul_xor_op_l4
        assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1];
      end

      assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];

      for (genvar i = 0; i < 32; i++) begin : gen_rev_clmul_result
        assign clmul_result_rev[i] = clmul_result_raw[31-i];
      end

      // clmulr_result = rev(clmul(rev(a), rev(b)))
      // clmulh_result = clmulr_result >> 1
      always_comb begin
        case (1'b1)
          clmul_rmode: clmul_result = clmul_result_rev;
          clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]};
          default:     clmul_result = clmul_result_raw;
        endcase
      end
    end else begin : gen_alu_rvb_not_otearlgrey_full
      assign shuffle_result       = '0;
      assign xperm_result         = '0;
      assign clmul_result         = '0;
      // support signals
      assign clmul_result_rev     = '0;
      assign crc_bmode            = '0;
      assign crc_hmode            = '0;
    end

    if (RV32B == RV32BFull) begin : gen_alu_rvb_full

      ///////////////
      // Butterfly //
      ///////////////

      // The butterfly / inverse butterfly network executing bcompress/bdecompress (zbe)
      // instructions. For bdecompress, the control bits mask of a local left region is generated
      // by the inverse of a n-bit left rotate and complement upon wrap (LROTC) operation by the
      // number of ones in the deposit bitmask to the right of the segment. n hereby denotes the
      // width of the according segment. The bitmask for a pertaining local right region is equal
      // to the corresponding local left region. Bcompress uses an analogue inverse process.
      // Consider the following 8-bit example.  For details, see Hilewitz et al. "Fast Bit Gather,
      // Bit Scatter and Bit Permuation Instructions for Commodity Microprocessors", (2008).
      //
      // The bcompress/bdecompress instructions are completed in 2 cycles. In the first cycle, the
      // control bitmask is prepared by executing the parallel prefix bit count. In the second
      // cycle, the bit swapping is executed according to the control masks.

      // 8-bit example:  (Hilewitz et al.)
      // Consider the instruction bdecompress operand_a_i deposit_mask
      // Let operand_a_i = 8'babcd_efgh
      //    deposit_mask = 8'b1010_1101
      //
      // control bitmask for stage 1:
      //  - number of ones in the right half of the deposit bitmask: 3
      //  - width of the segment: 4
      //  - control bitmask = ~LROTC(4'b0, 3)[3:0] = 4'b1000
      //
      // control bitmask:   c3 c2  c1 c0  c3 c2  c1 c0
      //                    1  0   0  0   1  0   0  0
      //                    <- L ----->   <- R ----->
      // operand_a_i        a  b   c  d   e  f   g  h
      //                    :\ |   |  |  /:  |   |  |
      //                    : +|---|--|-+ :  |   |  |
      //                    :/ |   |  |  \:  |   |  |
      // stage 1            e  b   c  d   a  f   g  h
      //                    <L->   <R->   <L->   <R->
      // control bitmask:   c3 c2  c3 c2  c1 c0  c1 c0
      //                    1  1   1  1   1  0   1  0
      //                    :\ :\ /: /:   :\ |  /:  |
      //                    : +:-+-:+ :   : +|-+ :  |
      //                    :/ :/ \: \:   :/ |  \:  |
      // stage 2            c  d   e  b   g  f   a  h
      //                    L  R   L  R   L  R   L  R
      // control bitmask:   c3 c3  c2 c2  c1 c1  c0 c0
      //                    1  1   0  0   1  1   0  0
      //                    :\/:   |  |   :\/:   |  |
      //                    :  :   |  |   :  :   |  |
      //                    :/\:   |  |   :/\:   |  |
      // stage 3            d  c   e  b   f  g   a  h
      // & deposit bitmask: 1  0   1  0   1  1   0  1
      // result:            d  0   e  0   f  g   0  h

      logic [ 5:0] bitcnt_partial_q [32];

      // first cycle
      // Store partial bitcnts
      for (genvar i = 0; i < 32; i++) begin : gen_bitcnt_reg_in_lsb
        assign bitcnt_partial_lsb_d[i] = bitcnt_partial[i][0];
      end

      for (genvar i = 0; i < 16; i++) begin : gen_bitcnt_reg_in_b1
        assign bitcnt_partial_msb_d[i] = bitcnt_partial[2*i+1][1];
      end

      for (genvar i = 0; i < 8; i++) begin : gen_bitcnt_reg_in_b2
        assign bitcnt_partial_msb_d[16+i] = bitcnt_partial[4*i+3][2];
      end

      for (genvar i = 0; i < 4; i++) begin : gen_bitcnt_reg_in_b3
        assign bitcnt_partial_msb_d[24+i] = bitcnt_partial[8*i+7][3];
      end

      for (genvar i = 0; i < 2; i++) begin : gen_bitcnt_reg_in_b4
        assign bitcnt_partial_msb_d[28+i] = bitcnt_partial[16*i+15][4];
      end

      assign bitcnt_partial_msb_d[30] = bitcnt_partial[31][5];
      assign bitcnt_partial_msb_d[31] = 1'b0; // unused

      // Second cycle
      // Load partial bitcnts
      always_comb begin
        bitcnt_partial_q = '{default: '0};

        for (int unsigned i = 0; i < 32; i++) begin : gen_bitcnt_reg_out_lsb
          bitcnt_partial_q[i][0] = imd_val_q_i[0][i];
        end

        for (int unsigned i = 0; i < 16; i++) begin : gen_bitcnt_reg_out_b1
          bitcnt_partial_q[2*i+1][1] = imd_val_q_i[1][i];
        end

        for (int unsigned i = 0; i < 8; i++) begin : gen_bitcnt_reg_out_b2
          bitcnt_partial_q[4*i+3][2] = imd_val_q_i[1][16+i];
        end

        for (int unsigned i = 0; i < 4; i++) begin : gen_bitcnt_reg_out_b3
          bitcnt_partial_q[8*i+7][3] = imd_val_q_i[1][24+i];
        end

        for (int unsigned i = 0; i < 2; i++) begin : gen_bitcnt_reg_out_b4
          bitcnt_partial_q[16*i+15][4] = imd_val_q_i[1][28+i];
        end

        bitcnt_partial_q[31][5] = imd_val_q_i[1][30];
      end

      logic [31:0] butterfly_mask_l[5];
      logic [31:0] butterfly_mask_r[5];
      logic [31:0] butterfly_mask_not[5];
      logic [31:0] lrotc_stage [5]; // left rotate and complement upon wrap

      // number of bits in local r = 32 / 2**(stage + 1) = 16/2**stage
      `define _N(stg) (16 >> stg)

      // bcompress / bdecompress control bit generation
      for (genvar stg = 0; stg < 5; stg++) begin : gen_butterfly_ctrl_stage
        // number of segs: 2** stg
        for (genvar seg=0; seg<2**stg; seg++) begin : gen_butterfly_ctrl

          assign lrotc_stage[stg][2*`_N(stg)*(seg+1)-1 : 2*`_N(stg)*seg] =
              {{`_N(stg){1'b0}},{`_N(stg){1'b1}}} <<
                bitcnt_partial_q[`_N(stg)*(2*seg+1)-1][$clog2(`_N(stg)):0];

          assign butterfly_mask_l[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)]
                   = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];

          assign butterfly_mask_r[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]
                   = ~lrotc_stage[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)];

          assign butterfly_mask_l[stg][`_N(stg)*(2*seg+1)-1 : `_N(stg)*(2*seg)]   = '0;
          assign butterfly_mask_r[stg][`_N(stg)*(2*seg+2)-1 : `_N(stg)*(2*seg+1)] = '0;
        end
      end
      `undef _N

      for (genvar stg = 0; stg < 5; stg++) begin : gen_butterfly_not
        assign butterfly_mask_not[stg] =
            ~(butterfly_mask_l[stg] | butterfly_mask_r[stg]);
      end

      always_comb begin
        butterfly_result = operand_a_i;

        butterfly_result = butterfly_result & butterfly_mask_not[0] |
            ((butterfly_result & butterfly_mask_l[0]) >> 16)|
            ((butterfly_result & butterfly_mask_r[0]) << 16);

        butterfly_result = butterfly_result & butterfly_mask_not[1] |
            ((butterfly_result & butterfly_mask_l[1]) >> 8)|
            ((butterfly_result & butterfly_mask_r[1]) << 8);

        butterfly_result = butterfly_result & butterfly_mask_not[2] |
            ((butterfly_result & butterfly_mask_l[2]) >> 4)|
            ((butterfly_result & butterfly_mask_r[2]) << 4);

        butterfly_result = butterfly_result & butterfly_mask_not[3] |
            ((butterfly_result & butterfly_mask_l[3]) >> 2)|
            ((butterfly_result & butterfly_mask_r[3]) << 2);

        butterfly_result = butterfly_result & butterfly_mask_not[4] |
            ((butterfly_result & butterfly_mask_l[4]) >> 1)|
            ((butterfly_result & butterfly_mask_r[4]) << 1);

        butterfly_result = butterfly_result & operand_b_i;
      end

      always_comb begin
        invbutterfly_result = operand_a_i & operand_b_i;

        invbutterfly_result = invbutterfly_result & butterfly_mask_not[4] |
            ((invbutterfly_result & butterfly_mask_l[4]) >> 1)|
            ((invbutterfly_result & butterfly_mask_r[4]) << 1);

        invbutterfly_result = invbutterfly_result & butterfly_mask_not[3] |
            ((invbutterfly_result & butterfly_mask_l[3]) >> 2)|
            ((invbutterfly_result & butterfly_mask_r[3]) << 2);

        invbutterfly_result = invbutterfly_result & butterfly_mask_not[2] |
            ((invbutterfly_result & butterfly_mask_l[2]) >> 4)|
            ((invbutterfly_result & butterfly_mask_r[2]) << 4);

        invbutterfly_result = invbutterfly_result & butterfly_mask_not[1] |
            ((invbutterfly_result & butterfly_mask_l[1]) >> 8)|
            ((invbutterfly_result & butterfly_mask_r[1]) << 8);

        invbutterfly_result = invbutterfly_result & butterfly_mask_not[0] |
            ((invbutterfly_result & butterfly_mask_l[0]) >> 16)|
            ((invbutterfly_result & butterfly_mask_r[0]) << 16);
      end
    end else begin : gen_alu_rvb_not_full
      logic [31:0] unused_imd_val_q_1;
      assign unused_imd_val_q_1   = imd_val_q_i[1];
      assign butterfly_result     = '0;
      assign invbutterfly_result  = '0;
      // support signals
      assign bitcnt_partial_lsb_d = '0;
      assign bitcnt_partial_msb_d = '0;
    end

    //////////////////////////////////////
    // Multicycle Bitmanip Instructions //
    //////////////////////////////////////
    // Ternary instructions + Shift Rotations + Bit Compress/Decompress + CRC
    // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
    // second cycle. operand_b_i is always tied to rs2.

    always_comb begin
      unique case (operator_i)
        ALU_CMOV: begin
          multicycle_result = (operand_b_i == 32'h0) ? operand_a_i : imd_val_q_i[0];
          imd_val_d_o = '{operand_a_i, 32'h0};
          if (instr_first_cycle_i) begin
            imd_val_we_o = 2'b01;
          end else begin
            imd_val_we_o = 2'b00;
          end
        end

        ALU_CMIX: begin
          multicycle_result = imd_val_q_i[0] | bwlogic_and_result;
          imd_val_d_o = '{bwlogic_and_result, 32'h0};
          if (instr_first_cycle_i) begin
            imd_val_we_o = 2'b01;
          end else begin
            imd_val_we_o = 2'b00;
          end
        end

        ALU_FSR, ALU_FSL,
        ALU_ROL, ALU_ROR: begin
          if (shift_amt[4:0] == 5'h0) begin
            multicycle_result = shift_amt[5] ? operand_a_i : imd_val_q_i[0];
          end else begin
            multicycle_result = imd_val_q_i[0] | shift_result;
          end
          imd_val_d_o = '{shift_result, 32'h0};
          if (instr_first_cycle_i) begin
            imd_val_we_o = 2'b01;
          end else begin
            imd_val_we_o = 2'b00;
          end
        end

        ALU_CRC32_W, ALU_CRC32C_W,
        ALU_CRC32_H, ALU_CRC32C_H,
        ALU_CRC32_B, ALU_CRC32C_B: begin
          if (RV32B == RV32BOTEarlGrey || RV32B == RV32BFull) begin
            unique case (1'b1)
              crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
              crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
              default:   multicycle_result = clmul_result_rev;
            endcase
            imd_val_d_o = '{clmul_result_rev, 32'h0};
            if (instr_first_cycle_i) begin
              imd_val_we_o = 2'b01;
            end else begin
              imd_val_we_o = 2'b00;
            end
          end else begin
            imd_val_d_o = '{operand_a_i, 32'h0};
            imd_val_we_o = 2'b00;
            multicycle_result = '0;
          end
        end

        ALU_BCOMPRESS, ALU_BDECOMPRESS: begin
          if (RV32B == RV32BFull) begin
            multicycle_result = (operator_i == ALU_BDECOMPRESS) ? butterfly_result :
                                                                  invbutterfly_result;
            imd_val_d_o = '{bitcnt_partial_lsb_d, bitcnt_partial_msb_d};
            if (instr_first_cycle_i) begin
              imd_val_we_o = 2'b11;
            end else begin
              imd_val_we_o = 2'b00;
            end
          end else begin
            imd_val_d_o = '{operand_a_i, 32'h0};
            imd_val_we_o = 2'b00;
            multicycle_result = '0;
          end
        end

        default: begin
          imd_val_d_o = '{operand_a_i, 32'h0};
          imd_val_we_o = 2'b00;
          multicycle_result = '0;
        end
      endcase
    end


  end else begin : g_no_alu_rvb
    logic [31:0] unused_imd_val_q[2];
    assign unused_imd_val_q           = imd_val_q_i;
    logic [31:0] unused_butterfly_result;
    assign unused_butterfly_result    = butterfly_result;
    logic [31:0] unused_invbutterfly_result;
    assign unused_invbutterfly_result = invbutterfly_result;
    // RV32B result signals
    assign bitcnt_result       = '0;
    assign minmax_result       = '0;
    assign pack_result         = '0;
    assign sext_result         = '0;
    assign singlebit_result    = '0;
    assign rev_result          = '0;
    assign shuffle_result      = '0;
    assign xperm_result        = '0;
    assign butterfly_result    = '0;
    assign invbutterfly_result = '0;
    assign clmul_result        = '0;
    assign multicycle_result   = '0;
    // RV32B support signals
    assign imd_val_d_o         = '{default: '0};
    assign imd_val_we_o        = '{default: '0};
  end

  ////////////////
  // Result mux //
  ////////////////

  always_comb begin
    result_o   = '0;

    unique case (operator_i)
      // Bitwise Logic Operations (negate: RV32B)
      ALU_XOR,  ALU_XNOR,
      ALU_OR,   ALU_ORN,
      ALU_AND,  ALU_ANDN: result_o = bwlogic_result;

      // Adder Operations
      ALU_ADD,  ALU_SUB,
      // RV32B
      ALU_SH1ADD, ALU_SH2ADD,
      ALU_SH3ADD: result_o = adder_result;

      // Shift Operations
      ALU_SLL,  ALU_SRL,
      ALU_SRA,
      // RV32B
      ALU_SLO,  ALU_SRO: result_o = shift_result;

      // Shuffle Operations (RV32B)
      ALU_SHFL, ALU_UNSHFL: result_o = shuffle_result;

      // Crossbar Permutation Operations (RV32B)
      ALU_XPERM_N, ALU_XPERM_B, ALU_XPERM_H: result_o = xperm_result;

      // Comparison Operations
      ALU_EQ,   ALU_NE,
      ALU_GE,   ALU_GEU,
      ALU_LT,   ALU_LTU,
      ALU_SLT,  ALU_SLTU: result_o = {31'h0,cmp_result};

      // MinMax Operations (RV32B)
      ALU_MIN,  ALU_MAX,
      ALU_MINU, ALU_MAXU: result_o = minmax_result;

      // Bitcount Operations (RV32B)
      ALU_CLZ, ALU_CTZ,
      ALU_CPOP: result_o = {26'h0, bitcnt_result};

      // Pack Operations (RV32B)
      ALU_PACK, ALU_PACKH,
      ALU_PACKU: result_o = pack_result;

      // Sign-Extend (RV32B)
      ALU_SEXTB, ALU_SEXTH: result_o = sext_result;

      // Ternary Bitmanip Operations (RV32B)
      ALU_CMIX, ALU_CMOV,
      ALU_FSL,  ALU_FSR,
      // Rotate Shift (RV32B)
      ALU_ROL, ALU_ROR,
      // Cyclic Redundancy Checks (RV32B)
      ALU_CRC32_W, ALU_CRC32C_W,
      ALU_CRC32_H, ALU_CRC32C_H,
      ALU_CRC32_B, ALU_CRC32C_B,
      // Bit Compress / Decompress (RV32B)
      ALU_BCOMPRESS, ALU_BDECOMPRESS: result_o = multicycle_result;

      // Single-Bit Bitmanip Operations (RV32B)
      ALU_BSET, ALU_BCLR,
      ALU_BINV, ALU_BEXT: result_o = singlebit_result;

      // General Reverse / Or-combine (RV32B)
      ALU_GREV, ALU_GORC: result_o = rev_result;

      // Bit Field Place (RV32B)
      ALU_BFP: result_o = bfp_result;

      // Carry-less Multiply Operations (RV32B)
      ALU_CLMUL, ALU_CLMULR,
      ALU_CLMULH: result_o = clmul_result;

      default: ;
    endcase
  end

  logic unused_shift_amt_compl;
  assign unused_shift_amt_compl = shift_amt_compl[5];

endmodule