Merge branch 'main' of https://github.com/openhwgroup/cvw into main

2025-04-20 03:47:20 -04:00 · 2024-06-19 09:25:39 -07:00 · 2024-06-19 09:25:39 -07:00 · ab1af0fabf
commit ab1af0fabf
parent b77fcd70e6 1f569ed6f8
171 changed files with 501 additions and 590 deletions
--- a/bin/lint-wally
+++ b/bin/lint-wally
@ -26,7 +26,10 @@ fi

 for config in ${configs[@]}; do
 #    echo "$config linting..."
-    if !($verilator --lint-only --quiet --top-module wallywrapper "-I$basepath/config/shared" "-I$basepath/config/$config" "-I$basepath/config/deriv/$config" $basepath/src/cvw.sv $basepath/testbench/wallywrapper.sv $basepath/src/*/*.sv $basepath/src/*/*/*.sv --relative-includes ); then
+    if !($verilator --lint-only --quiet --top-module wallywrapper \
+         "-I$basepath/config/shared" "-I$basepath/config/$config" "-I$basepath/config/deriv/$config" \
+         $basepath/src/cvw.sv $basepath/testbench/wallywrapper.sv $basepath/src/*/*.sv $basepath/src/*/*/*.sv \
+         -Wall -Wno-UNUSEDSIGNAL -Wno-UNUSEDPARAM -Wno-VARHIDDEN -Wno-GENUNNAMED -Wno-PINCONNECTEMPTY); then
        if [ "$1" == "-nightly" ]; then
            echo -e "${RED}$config failed lint${NC}"
            fails=$((fails+1))
@ -48,4 +51,5 @@ echo -e "${GREEN}All ${#configs[@]} lints run with no errors or warnings"
 # -I points to the include directory where files such as `include config.vh  are found

 # For more exhaustive (and sometimes spurious) warnings, add -Wall to the Verilator command
+# verilator --lint-only -Wall --quiet --top-module wallywrapper -Iconfig/shared -Iconfig/rv64gc src/cvw.sv testbench/wallywrapper.sv src/*/*.sv src/*/*/*.sv -Wno-UNUSEDPARAM -Wno-VARHIDDEN -Wno-GENUNNAMED -Wno-PINCONNECTEMPTY
 # Unfortunately, this produces a bunch of UNUSED and UNDRIVEN signal warnings in blocks that are configured to not exist.
--- a/bin/regression-wally
+++ b/bin/regression-wally
@ -423,6 +423,7 @@ def main():
    """Run the tests and count the failures"""
    global configs, coverage
    os.chdir(regressionDir)
+    os.system('rm -rf questa/wkdir')
    for d in ["questa/logs", "questa/wkdir", "verilator/logs", "verilator/wkdir", "vcs/logs", "vcs/wkdir"]:
        try:
            os.mkdir(d)
--- a/bin/wally-tool-chain-install.sh
+++ b/bin/wally-tool-chain-install.sh
@ -112,7 +112,6 @@ make install
 # Verilator needs to be built from scratch to get the latest version
 # apt-get install verilator installs version 4.028 as of 6/8/23
 sudo apt-get install -y perl g++ ccache help2man libgoogle-perftools-dev numactl perl-doc zlib1g 
-sudo apt-get install -y perl g++ ccache help2man libgoogle-perftools-dev numactl perl-doc zlib1g 
 cd $RISCV
 git clone https://github.com/verilator/verilator   # Only first time
 # unsetenv VERILATOR_ROOT  # For csh; ignore error if on bash
--- a/config/shared/config-shared.vh
+++ b/config/shared/config-shared.vh
@ -110,20 +110,10 @@ localparam CVTLEN = (ZFA_SUPPORTED & D_SUPPORTED) ? `max(BASECVTLEN, 32'd84) : B
 localparam LLEN = `max($unsigned(FLEN), $unsigned(XLEN));
 localparam LOGCVTLEN = $unsigned($clog2(CVTLEN+1));

-// size of FMA output
+// size of FMA output in U(NF+4).(3NF+2) format
 localparam FMALEN = 3*NF + 6;

 // NORMSHIFTSIZE is the bits out of the normalization shifter
-// RV32F: max(32+23+1, 2(23)+4, 3(23)+6) = 3*23+6 = 75
-// RV64F: max(64+23+1, 64 + 23 + 2, 3*23+6) = 89
-// RV64D: max(84+52+1, 64+52+2, 3*52+6) = 162
-// *** DH 5/10/24 testbench_fp f_ieee_div_2_1_rv64gc cvtint was failing for fcvt.lu.s
-//     with CVTLEN+NF+1.  Changing to CVTLEN+NF+1+2 fixes failures
-//     This same failure occurred for any test with IDIV_ON_FPU = 0, FLEN=32, XLEN=64
-//     because NORMSHIFTSZ becomes limited by convert rather than divider
-//     The two extra bits are necessary because shiftcorrection dropped them for fcvt.
-//     May be possible to remove these two bits by modifying shiftcorrection
-//localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1+2), (DIVb + 1 + NF + 1)), (FMALEN + 2));
 localparam NORMSHIFTSZ = `max(`max((CVTLEN+NF+1), (DIVb + 1 + NF + 1)), (FMALEN + 2));

 localparam LOGNORMSHIFTSZ = ($clog2(NORMSHIFTSZ));                  // log_2(NORMSHIFTSZ)
--- a/sim/vcs/run_vcs
+++ b/sim/vcs/run_vcs
@ -86,7 +86,7 @@ INCLUDE_DIRS=$(find ${SRC} -type d | xargs -I {} echo -n "{} ")
 INCLUDE_PATH="+incdir+${CFG}/${CONFIG_VARIANT} +incdir+${CFG}/deriv/${CONFIG_VARIANT} +incdir+${CFG}/shared +incdir+../../tests +define+ +incdir+${TB} ${SRC}/cvw.sv +incdir+${SRC}"

 # Prepare RTL files avoiding certain paths
-RTL_FILES="$INCLUDE_DIRS $(find ${SRC} -name "*.sv" ! -path "${SRC}/generic/clockgater.sv" ! -path "${SRC}/generic/mem/rom1p1r_128x64.sv" ! -path "${SRC}/generic/mem/ram2p1r1wbe_128x64.sv" ! -path "${SRC}/generic/mem/rom1p1r_128x32.sv" ! -path "${SRC}/generic/mem/ram2p1r1wbe_512x64.sv")  ${TB}/testbench.sv $(find ${TB}/common -name "*.sv" ! -path "${TB}/common/wallyTracer.sv")"
+RTL_FILES="$INCLUDE_DIRS $(find ${SRC} -name "*.sv" ! -path "${SRC}/generic/clockgater.sv" ! -path "${SRC}/generic/mem/rom1p1r_128x64.sv" ! -path "${SRC}/generic/mem/ram2p1r1wbe_128x64.sv" ! -path "${SRC}/generic/mem/rom1p1r_128x32.sv" ! -path "${SRC}/generic/mem/ram2p1r1wbe_2048x64.sv")  ${TB}/testbench.sv $(find ${TB}/common -name "*.sv" ! -path "${TB}/common/wallyTracer.sv")"

 # Simulation and Coverage Commands
 OUTPUT="sim_out"
--- a/sim/verilator/wrapper.c
+++ b/sim/verilator/wrapper.c
@ -3,5 +3,9 @@
 #include "Vtestbench__Dpi.h"

 const char *getenvval(const char *pszName) {
+    const char *pszValue = getenv(pszName);
+    if (pszValue == NULL) {
+        return "";
+    }
    return ((const char *) getenv(pszName));
 }
--- a/src/cache/cache.sv
+++ b/src/cache/cache.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Implements the I$ and D$. Interfaces with requests from IEU and HPTW and ahbcacheinterface
 //
-// Documentation: RISC-V System on Chip Design Chapter 7 (Figures 7.9, 7.10, and 7.19)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -87,14 +87,13 @@ module cache import cvw::*; #(parameter cvw_t P,
  logic                          LineDirty, HitLineDirty;
  logic [TAGLEN-1:0]             TagWay [NUMWAYS-1:0];
  logic [TAGLEN-1:0]             Tag;
-  logic [SETLEN-1:0]             FlushAdr, NextFlushAdr, FlushAdrP1;
+  logic [SETLEN-1:0]             FlushAdr;
  logic                          FlushAdrCntEn, FlushCntRst;
  logic                          FlushAdrFlag, FlushWayFlag;
  logic [NUMWAYS-1:0]            FlushWay, NextFlushWay;
  logic                          FlushWayCntEn;
  logic                          SelWriteback;
  logic                          LRUWriteEn;
-  logic                          ResetOrFlushCntRst;
  logic [LINELEN-1:0]            ReadDataLine, ReadDataLineCache;
  logic                          SelFetchBuffer;
  logic                          CacheEn;
@ -128,7 +127,7 @@ module cache import cvw::*; #(parameter cvw_t P,
  if(NUMWAYS > 1) begin:vict
    cacheLRU #(NUMWAYS, SETLEN, OFFSETLEN, NUMSETS) cacheLRU(
      .clk, .reset, .FlushStage, .CacheEn, .HitWay, .ValidWay, .VictimWay, .CacheSetTag, .LRUWriteEn,
-      .SetValid, .ClearValid, .PAdr(PAdr[SETTOP-1:OFFSETLEN]), .InvalidateCache);
+      .SetValid, .PAdr(PAdr[SETTOP-1:OFFSETLEN]), .InvalidateCache);
  end else 
    assign VictimWay = 1'b1; // one hot.

@ -201,6 +200,9 @@ module cache import cvw::*; #(parameter cvw_t P,
  /////////////////////////////////////////////////////////////////////////////////////////////

  if (!READ_ONLY_CACHE) begin:flushlogic // D$ can be flushed
+    logic                          ResetOrFlushCntRst;
+    logic [SETLEN-1:0]             NextFlushAdr, FlushAdrP1;
+
    // Flush address (line number)
    assign ResetOrFlushCntRst = reset | FlushCntRst;
    flopenr #(SETLEN) FlushAdrReg(clk, ResetOrFlushCntRst, FlushAdrCntEn, FlushAdrP1, NextFlushAdr);
--- a/src/cache/cacheLRU.sv
+++ b/src/cache/cacheLRU.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Implements Pseudo LRU. Tested for Powers of 2.
 //
-// Documentation: RISC-V System on Chip Design Chapter 7 (Figures 7.8 and 7.15 to 7.18)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -40,7 +40,6 @@ module cacheLRU
  input  logic [SETLEN-1:0]   PAdr,            // Physical address 
  input  logic                LRUWriteEn,      // Update the LRU state
  input  logic                SetValid,        // Set the dirty bit in the selected way and set
-  input  logic                ClearValid,      // Clear the dirty bit in the selected way and set
  input  logic                InvalidateCache, // Clear all valid bits
  output logic [NUMWAYS-1:0]  VictimWay        // LRU selects a victim to evict
 );
@ -48,12 +47,12 @@ module cacheLRU
  localparam                           LOGNUMWAYS = $clog2(NUMWAYS);

  logic [NUMWAYS-2:0]                  LRUMemory [NUMSETS-1:0];
-  logic [NUMWAYS-2:0]                  CurrLRU;
-  logic [NUMWAYS-2:0]                  NextLRU;
+  logic [NUMWAYS-2:0]                  CurrLRU, NextLRU, ReadLRU, BypassedLRU;
  logic [LOGNUMWAYS-1:0]               HitWayEncoded, Way;
  logic [NUMWAYS-2:0]                  WayExpanded;
  logic                                AllValid;
-  
+  logic                                ForwardLRU;
+ 
  genvar                               row;

  /* verilator lint_off UNOPTFLAT */
@ -131,29 +130,22 @@ module cacheLRU
    assign Intermediate[node] = CurrLRU[node] ? int1[LOGNUMWAYS-1:0] : int0[LOGNUMWAYS-1:0];
  end

-  
  priorityonehot #(NUMWAYS) FirstZeroEncoder(~ValidWay, FirstZero);
  binencoder #(NUMWAYS) FirstZeroWayEncoder(FirstZero, FirstZeroWay);
  mux2 #(LOGNUMWAYS) VictimMux(FirstZeroWay, Intermediate[NUMWAYS-2], AllValid, VictimWayEnc);
  decoder #(LOGNUMWAYS) decoder (VictimWayEnc, VictimWay);

-  // LRU storage must be reset for modelsim to run. However the reset value does not actually matter in practice.
-  // This is a two port memory.
-  // Every cycle must read from CacheSetTag and each load/store must write the new LRU.
-
-  // note: Verilator lint doesn't like <= for array initialization (https://verilator.org/warn/BLKLOOPINIT?v=5.021)
-  // Move to = to keep Verilator happy and simulator running fast
-  always_ff @(posedge clk) begin
+  // LRU memory must be reset for Questa to run. The reset value does not matter but it is best to be deterministic.
+  always_ff @(posedge clk)
    if (reset | (InvalidateCache & ~FlushStage)) 
-      for (int set = 0; set < NUMSETS; set++) LRUMemory[set] = '0; // exclusion-tag: initialize
-    else if(CacheEn) begin
-      // Because we are using blocking assignments, change to LRUMemory must occur after LRUMemory is used so we get the proper value
-      if(LRUWriteEn & (PAdr == CacheSetTag)) CurrLRU = NextLRU;
-      else                                   CurrLRU = LRUMemory[CacheSetTag];
-      if(LRUWriteEn)                         LRUMemory[PAdr] = NextLRU;
-    end
-  end
+      for (int set = 0; set < NUMSETS; set++) LRUMemory[set] <= '0; // exclusion-tag: initialize
+    else if (CacheEn & LRUWriteEn) LRUMemory[PAdr] <= NextLRU;

+  // LRU read path with write forwarding
+  assign ReadLRU = LRUMemory[CacheSetTag];
+  assign ForwardLRU = LRUWriteEn & (PAdr == CacheSetTag);
+  mux2 #(NUMWAYS-1) ReadLRUmux(ReadLRU, NextLRU, ForwardLRU, BypassedLRU);
+  flop #(NUMWAYS-1) CurrLRUReg(clk, BypassedLRU, CurrLRU);
 endmodule


--- a/src/cache/cachefsm.sv
+++ b/src/cache/cachefsm.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Controller for the cache fsm
 //
-// Documentation: RISC-V System on Chip Design Chapter 7 (Figure 7.14 and Table 7.1)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/cache/cacheway.sv
+++ b/src/cache/cacheway.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Storage and read/write access to data cache data, tag valid, dirty, and replacement.
 // 
-// Documentation: RISC-V System on Chip Design Chapter 7 (Figure 7.11)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -76,7 +76,6 @@ module cacheway import cvw::*; #(parameter cvw_t P,
  logic                               ClearValidWay;
  logic                               SetDirtyWay;
  logic                               ClearDirtyWay;
-  logic                               SelNonHit;
  logic                               SelectedWay;
  logic                               InvalidateCacheDelay;
  
--- a/src/cache/subcachelineread.sv
+++ b/src/cache/subcachelineread.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Muxes the cache line down to the word size.  Also include possible save/restore registers/muxes.
 //
-// Documentation: RISC-V System on Chip Design Chapter 7
+// Documentation: RISC-V System on Chip Design

 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ebu/ahbcacheinterface.sv
+++ b/src/ebu/ahbcacheinterface.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Translates cache bus requests and uncached ieu memory requests into AHB transactions.
 //
-// Documentation: RISC-V System on Chip Design Chapter 9 (Figure 9.8)
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -79,8 +79,7 @@ module ahbcacheinterface import cvw::*; #(
  logic [P.PA_BITS-1:0]         LocalHADDR;                             // Address after selecting between cached and uncached operation
  logic [AHBWLOGBWPL-1:0]     BeatCountDelayed;                       // Beat within the cache line in the second (Data) cache stage
  logic                       CaptureEn;                              // Enable updating the Fetch buffer with valid data from HRDATA
-  logic [P.AHBW/8-1:0]          BusByteMaskM;                           // Byte enables within a word. For cache request all 1s
-  logic [P.AHBW-1:0]            PreHWDATA;                              // AHB Address phase write data
+ logic [P.AHBW-1:0]            PreHWDATA;                              // AHB Address phase write data
  logic [P.PA_BITS-1:0]         PAdrZero;

  genvar                      index;
@ -114,11 +113,14 @@ module ahbcacheinterface import cvw::*; #(
    .s(~(CacheableOrFlushCacheM)), .y(PreHWDATA));
  flopen #(P.AHBW) wdreg(HCLK, HREADY, PreHWDATA, HWDATA); // delay HWDATA by 1 cycle per spec

-  // *** bummer need a second byte mask for bus as it is AHBW rather than LLEN.
-  // probably can merge by muxing PAdrM's LLEN/8-1 index bit based on HTRANS being != 0.
-  swbytemask #(P.AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(P.AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended());
-  
-  flopen #(P.AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[P.AHBW/8-1:0], HWSTRB);
+  if (READ_ONLY_CACHE) begin
+    assign HWSTRB = '0;
+  end else begin // compute byte mask for AHB transaction based on size and address.  AHBW may be different than LLEN
+    logic [P.AHBW/8-1:0]          BusByteMaskM;                           // Byte enables within a word. For cache request all 1s
+     
+    swbytemask #(P.AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(P.AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended());
+    flopen #(P.AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[P.AHBW/8-1:0], HWSTRB);
+  end
  
  buscachefsm #(BeatCountThreshold, AHBWLOGBWPL, READ_ONLY_CACHE, P.BURST_EN) AHBBuscachefsm(
    .HCLK, .HRESETn, .Flush, .BusRW, .BusAtomic, .Stall, .BusCommitted, .BusStall, .CaptureEn, .SelBusBeat,
--- a/src/ebu/ahbinterface.sv
+++ b/src/ebu/ahbinterface.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Translates LSU simple memory requests into AHB transactions (NON_SEQ).
 // 
-// Documentation: RISC-V System on Chip Design Chapter 6 (Figure 6.21)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ebu/buscachefsm.sv
+++ b/src/ebu/buscachefsm.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Controller for cache to AHB bus interface
 // 
-// Documentation: RISC-V System on Chip Design Chapter 9 (Figure 9.9)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ebu/busfsm.sv
+++ b/src/ebu/busfsm.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Simple NON_SEQ (no burst) AHB controller.
 //
-// Documentation: RISC-V System on Chip Design Chapter 6 (Figure 6.23)
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ebu/controllerinput.sv
+++ b/src/ebu/controllerinput.sv
@ -11,7 +11,7 @@
 //          Connects core to peripherals and I/O pins on SOC
 //          Bus width presently matches XLEN
 // 
-// Documentation: RISC-V System on Chip Design Chapter 6 (Figure 6.25)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ebu/ebu.sv
+++ b/src/ebu/ebu.sv
@ -11,7 +11,7 @@
 //          Connects core to peripherals and I/O pins on SOC
 //          Bus width presently matches XLEN
 // 
-// Documentation: RISC-V System on Chip Design Chapter 6 (Figures 6.25 and 6.26)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -110,7 +110,7 @@ module ebu import cvw::*;  #(parameter cvw_t P) (
    .HWRITEOut(LSUHWRITEOut), .HSIZEOut(LSUHSIZEOut), .HBURSTOut(LSUHBURSTOut),
    .HTRANSOut(LSUHTRANSOut), .HADDROut(LSUHADDROut), .HREADYIn(HREADY));

-  // output mux //*** switch to structural implementation
+  // output mux 
  assign HADDR = LSUSelect ? LSUHADDROut : IFUSelect ? IFUHADDROut : '0;
  assign HSIZE = LSUSelect ? LSUHSIZEOut : IFUSelect ? IFUHSIZEOut: '0; 
  assign HBURST = LSUSelect ? LSUHBURSTOut : IFUSelect ? IFUHBURSTOut : '0; // If doing memory accesses, use LSUburst, else use Instruction burst.
--- a/src/ebu/ebufsmarb.sv
+++ b/src/ebu/ebufsmarb.sv
@ -8,7 +8,7 @@
 // Purpose: Arbitrates requests from instruction and data streams
 //          LSU has priority.
 // 
-// Documentation: RISC-V System on Chip Design Chapter 6 (Figures 6.25 and 6.26)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fclassify.sv
+++ b/src/fpu/fclassify.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating-point classify unit
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fcmp.sv
+++ b/src/fpu/fcmp.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating-point comparison unit
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fctrl.sv
+++ b/src/fpu/fctrl.sv
@ -6,7 +6,7 @@
 //
 // Purpose: floating-point control unit
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fcvt.sv
+++ b/src/fpu/fcvt.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating point conversions of configurable size
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // Int component of the Wally configurable RISC-V project.
 // 
@ -190,7 +190,7 @@ module fcvt import cvw::*;  #(parameter cvw_t P) (
  // shifter
  ///////////////////////////////////////////////////////////////////////////

-  // kill the shift if it's negative
+  // kill the shift if it is negative
  // select the amount to shift by
  //      fp -> int: 
  //          - shift left by CalcExp - essentially shifting until the unbiased exponent = 0
@ -201,10 +201,10 @@ module fcvt import cvw::*;  #(parameter cvw_t P) (
  //          - shift left by LeadingZeros - to shift till the result is normalized
   //              - only shift fp -> fp if the initial value is subnormal
  //                  - this is a problem because the input to the lzc was the fraction rather than the mantissa
-  //                  - rather have a few and-gates than an extra bit in the priority encoder??? *** is this true?
+  //                  - rather have a few and-gates than an extra bit in the priority encoder???
  always_comb
      if(ToInt)                       ShiftAmt = Ce[P.LOGCVTLEN-1:0]&{P.LOGCVTLEN{~Ce[P.NE]}};
-      else if (ResSubnormUf)  ShiftAmt = (P.LOGCVTLEN)'(P.NF-1)+Ce[P.LOGCVTLEN-1:0];
+      else if (ResSubnormUf)          ShiftAmt = (P.LOGCVTLEN)'(P.NF-1)+Ce[P.LOGCVTLEN-1:0];
      else                            ShiftAmt = LeadingZeros;
      
  ///////////////////////////////////////////////////////////////////////////
--- a/src/fpu/fdivsqrt/fdivsqrt.sv
+++ b/src/fpu/fdivsqrt/fdivsqrt.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Combined Divide and Square Root Floating Point and Integer Unit
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -65,11 +65,9 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
  logic                        WZeroE;                       // Early termination flag
  logic [P.DURLEN-1:0]         CyclesE;                      // FSM cycles
  logic                        SpecialCaseM;                 // Divide by zero, square root of negative, etc.
-  logic                        DivStartE;                    // Enable signal for flops during stall
                                                            
  // Integer div/rem signals                                
  logic                        BZeroM;                       // Denominator is zero
-  logic                        IntDivM;                      // Integer operation
   logic [P.DIVBLEN-1:0]        IntNormShiftM;                // Integer normalization shift amount
  logic                        ALTBM, AsM, BsM, W64M;        // Special handling for postprocessor
  logic [P.XLEN-1:0]           AM;                           // Original Numerator for postprocessor
@ -80,8 +78,7 @@ module fdivsqrt import cvw::*;  #(parameter cvw_t P) (
    .FmtE, .Bias(BiasE), .Nf(NfE), .SqrtE, .XZeroE, .Funct3E, .UeM, .X, .D, .CyclesE,
    // Int-specific 
    .ForwardedSrcAE, .ForwardedSrcBE, .IntDivE, .W64E, .ISpecialCaseE,
-    .BZeroM, .IntNormShiftM, .AM, 
-    .IntDivM, .W64M, .ALTBM, .AsM, .BsM);
+    .BZeroM, .IntNormShiftM, .AM, .W64M, .ALTBM, .AsM, .BsM);

  fdivsqrtfsm #(P) fdivsqrtfsm(                                  // FSM
    .clk, .reset, .XInfE, .YInfE, .XZeroE, .YZeroE, .XNaNE, .YNaNE, 
--- a/src/fpu/fdivsqrt/fdivsqrtcycles.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtcycles.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Determine number of cycles for divsqrt
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -28,9 +28,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module fdivsqrtcycles import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.FMTBITS-1:0] FmtE,
  input  logic [P.LOGFLEN-1:0] Nf,          // Number of fractional bits in selected format
-  input  logic                 SqrtE,
  input  logic                 IntDivE,
  input  logic [P.DIVBLEN-1:0] IntResultBitsE,    
  output logic [P.DURLEN-1:0]  CyclesE
--- a/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtexpcalc.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Exponent calculation for divide and square root
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen2.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Radix 2 F Addend Generator
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfgen4.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Radix 4 F Addend Generator
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtfsm.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtfsm.sv
@ -6,7 +6,7 @@
 //
 // Purpose: divsqrt state machine for multi-cycle operations
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@ -6,7 +6,7 @@
 //
 // Purpose: k stages of divsqrt logic, plus registers
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -44,7 +44,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
  logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb
  logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.DIVb
  logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.DIVb
-  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb // *** probably Q not U.  See Table 16.26 notes
+  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb
  logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.DIVb
  logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.DIVb
  logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.DIVb
--- a/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpostproc.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Divide/Square root postprocessing
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -131,5 +131,6 @@ module fdivsqrtpostproc import cvw::*;  #(parameter cvw_t P) (
        W64M, FIntDivResultM);
    end else 
      assign FIntDivResultM = IntDivResultM[P.XLEN-1:0];
-  end
+  end else
+    assign FIntDivResultM = '0;
 endmodule
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Divide/Square root preprocessing: integer absolute value and W64, normalization shift
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -47,7 +47,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  output logic                 ISpecialCaseE,
  output logic [P.DURLEN-1:0]  CyclesE,
  output logic [P.DIVBLEN-1:0] IntNormShiftM,
-  output logic                 ALTBM, IntDivM, W64M,
+  output logic                 ALTBM, W64M,
  output logic                 AsM, BsM, BZeroM,
  output logic [P.XLEN-1:0]    AM
 );
@ -58,7 +58,6 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  logic [P.DIVb:0]             IFX, IFD;                            // Correctly-sized inputs for iterator, selected from int or fp input
  logic [P.DIVBLEN-1:0]        mE, ell;                             // Leading zeros of inputs
  logic [P.DIVBLEN-1:0]        IntResultBitsE;                      // bits in integer result
-  logic                        NumerZeroE;                          // Numerator is zero (X or A)
  logic                        AZeroE, BZeroE;                      // A or B is Zero for integer division
  logic                        SignedDivE;                          // signed division
  logic                        AsE, BsE;                            // Signs of integer inputs
@ -96,11 +95,9 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
    // Select integer or floating point inputs
    mux2 #(P.DIVb+1) ifxmux({Xm, {(P.DIVb-P.NF){1'b0}}}, {PosA, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFX);
    mux2 #(P.DIVb+1) ifdmux({Ym, {(P.DIVb-P.NF){1'b0}}}, {PosB, {(P.DIVb-P.XLEN+1){1'b0}}}, IntDivE, IFD);
-    mux2 #(1)    numzmux(XZeroE, AZeroE, IntDivE, NumerZeroE);
  end else begin // Int not supported
    assign IFX = {Xm, {(P.DIVb-P.NF){1'b0}}};
    assign IFD = {Ym, {(P.DIVb-P.NF){1'b0}}};
-    assign NumerZeroE = XZeroE;
  end

  //////////////////////////////////////////////////////
@ -147,7 +144,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
      assign DivXShifted = DivX;
    end
  end else begin
-    assign ISpecialCaseE = 1'b0;
+    assign {ISpecialCaseE, IntResultBitsE} = '0;
  end

  //////////////////////////////////////////////////////
@ -174,7 +171,6 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  // 4          2(x)-4 = 4(x/2 - 1))  2(x/2)-4 = 4(x/4 - 1)
  // Summary: PreSqrtX = r(x/2or4 - 1)

-  logic [P.DIVb:0] PreSqrtX;
  assign EvenExp = Xe[0] ^ ell[0]; // effective unbiased exponent after normalization is even
  mux2 #(P.DIVb+4) sqrtxmux({4'b0,Xnorm[P.DIVb:1]}, {5'b00, Xnorm[P.DIVb:2]}, EvenExp, SqrtX); // X/2 if exponent odd, X/4 if exponent even

@ -215,21 +211,20 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
  flopen #(P.NE+2) expreg(clk, IFDivStartE, UeE, UeM);

  // Number of FSM cycles (to FSM)
-  fdivsqrtcycles #(P) cyclecalc(.FmtE, .Nf, .SqrtE, .IntDivE, .IntResultBitsE, .CyclesE);
+  fdivsqrtcycles #(P) cyclecalc(.Nf, .IntDivE, .IntResultBitsE, .CyclesE);

  if (P.IDIV_ON_FPU) begin:intpipelineregs
    logic [P.DIVBLEN-1:0] IntDivNormShiftE, IntRemNormShiftE, IntNormShiftE;
    logic               RemOpE;

    /* verilator lint_off WIDTH */
-    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  n = (Cycles * k - 1)
    assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
    /* verilator lint_on WIDTH */
    assign RemOpE = Funct3E[1];
    mux2 #(P.DIVBLEN) normshiftmux(IntDivNormShiftE, IntRemNormShiftE, RemOpE, IntNormShiftE);

    // pipeline registers
-    flopen #(1)          mdureg(clk, IFDivStartE, IntDivE,  IntDivM);
    flopen #(1)         altbreg(clk, IFDivStartE, ALTBE,    ALTBM);
    flopen #(1)        bzeroreg(clk, IFDivStartE, BZeroE,   BZeroM);
    flopen #(1)        asignreg(clk, IFDivStartE, AsE,      AsM);
@ -238,7 +233,8 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
    flopen #(P.XLEN)    srcareg(clk, IFDivStartE, AE,       AM);
    if (P.XLEN==64) 
      flopen #(1)        w64reg(clk, IFDivStartE, W64E,     W64M);
-  end
+  end else
+    assign {ALTBM, W64M, AsM, BsM, BZeroM, AM, IntNormShiftM} = 0;

 endmodule

--- a/src/fpu/fdivsqrt/fdivsqrtstage2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage2.sv
@ -6,7 +6,7 @@
 //
 // Purpose: radix-2 divsqrt recurrence stage
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@ -6,7 +6,7 @@
 //
 // Purpose: radix-4 divsqrt recurrence stage
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -52,7 +52,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (

  // Digit Selection logic
  assign j0     = ~C[P.DIVb+1];             // first step of R digit selection: C = 00...0
-  assign j1     = C[P.DIVb] & ~C[P.DIVb-1]; // second step of R digit selection: C = 1100...0; *** could simplify to ~C[P.DIVb-1] because j=0 case takes priority
+  assign j1     = ~C[P.DIVb-1]; // second step of R digit selection: C = 1100...0; simplified from  C[P.DIVb] & ~C[P.DIVb-1] because j=0 case takes priority
  assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
--- a/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuotfc2.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Radix 2 unified on-the-fly converter
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuotfc4.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Radix 4 unified on-the-fly converter
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc2.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Radix 2 Unified Quotient/Square Root Digit Selection
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Table-based Radix 4 Unified Quotient/Square Root Digit Selection
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Comparator-based Radix 4 Unified Quotient/Square Root Digit Selection 
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -47,7 +47,7 @@ module fdivsqrtuslc4cmp (
  // Wmsbs = |        |

  logic [6:0] mk2, mk1, mk0, mkm1;
-  logic [6:0] mkj2, mkj1, mkj0, mkjm1;
+  logic [6:0] mkj2, mkj1;
  logic [6:0] mks2[7:0], mks1[7:0], mks0[7:0], mksm1[7:0];
  logic sqrtspecial;

@ -95,7 +95,7 @@ module fdivsqrtuslc4cmp (
  // Choose A for current operation 
 always_comb
    if (SqrtE) begin 
-      if (Smsbs[4]) A = 3'b111; // for S = 1.0000  *** can we optimize away this case?
+      if (Smsbs[4]) A = 3'b111; // for S = 1.0000
      else A = Smsbs[2:0];
    end else A = Dmsbs;
    
@ -108,7 +108,7 @@ module fdivsqrtuslc4cmp (

 /* Nannarelli12 design to exploit symmetry is slower because of negation and mux for special case of A = 000
  assign mk0 = -mk1;
-  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide from critical path
+  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table
  */
 
  // Compare residual W to selection constants to choose digit
@ -117,5 +117,5 @@ module fdivsqrtuslc4cmp (
    else if ($signed(Wmsbs) >= $signed(mk1))  udigit = 4'b0100; // choose 1
    else if ($signed(Wmsbs) >= $signed(mk0))  udigit = 4'b0000; // choose 0
    else if ($signed(Wmsbs) >= $signed(mkm1)) udigit = 4'b0010; // choose -1
-    else                                      udigit = 4'b0001; // choose -2  
+    else                                      udigit = 4'b0001; // choose -2
 endmodule
--- a/src/fpu/fhazard.sv
+++ b/src/fpu/fhazard.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Determine forwarding, stalls and flushes for the FPU
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fli.sv
+++ b/src/fpu/fli.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating-point float immediate
 // 
-// Documentation: RISC-V System on Chip Design Chapter 16
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fma/fma.sv
+++ b/src/fpu/fma/fma.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating point multiply-accumulate of configurable size
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Figure 13.7, 9)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fma/fmaadd.sv
+++ b/src/fpu/fma/fmaadd.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FMA significand adder
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Figure 13.11)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fma/fmaalign.sv
+++ b/src/fpu/fma/fmaalign.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FMA alignment shift
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Table 13.10)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fma/fmaexpadd.sv
+++ b/src/fpu/fma/fmaexpadd.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FMA exponent addition
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Table 13.9)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fma/fmalza.sv
+++ b/src/fpu/fma/fmalza.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Leading Zero Anticipator
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Figure 13.14)
+// Documentation: RISC-V System on Chip Design
 //    See also [Schmookler & Nowka, Leading zero anticipation and detection, IEEE Sym. Computer Arithmetic, 2001]
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
--- a/src/fpu/fma/fmamult.sv
+++ b/src/fpu/fma/fmamult.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FMA Significand Multiplier
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Table 13.7)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fma/fmasign.sv
+++ b/src/fpu/fma/fmasign.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FMA Sign Logic
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13 (Table 13.8)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fmtparams.sv
+++ b/src/fpu/fmtparams.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Look up bias of exponent and number of fractional bits for the selected format
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fpu.sv
+++ b/src/fpu/fpu.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating Point Unit Top-Level Interface
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -218,7 +218,6 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
                              {{P.FLEN-P.H_LEN{1'b1}}, 2'b0, {P.H_NE-1{1'b1}}, (P.H_NF)'(0)}, 
                              {2'b0, {P.NE-1{1'b1}}, (P.NF)'(0)}, FmtE, BoxedOneE); // NaN boxing zeroes
  assign FmaAddSubE = OpCtrlE[2]&OpCtrlE[1]&(PostProcSelE==2'b10);
-  // ***simplified from appearently redundant assign FmaAddSubE = OpCtrlE[2]&OpCtrlE[1]&(FResSelE==2'b01)&(PostProcSelE==2'b10);
  mux2  #(P.FLEN)  fyaddmux (PreYE, BoxedOneE, FmaAddSubE, YE); // Force Y to be 1 for add/subtract
  
  // Select NAN-boxed value of Z = 0.0 in proper format for FMA for multiply X*Y+Z
@ -280,7 +279,7 @@ module fpu import cvw::*;  #(parameter cvw_t P) (
    logic [P.FLEN-1:0]           FliResE;                            // Zfa Floating-point load immediate value

    // fround
-    fround #(P) fround(.X(XE), .Xs(XsE), .Xe(XeE), .Xm(XmE), 
+    fround #(P) fround(.Xs(XsE), .Xe(XeE), .Xm(XmE), 
                       .XNaN(XNaNE), .XSNaN(XSNaNE), .Fmt(FmtE), .Frm(FrmE), .Nf(NfE), 
                       .ZfaFRoundNX(ZfaFRoundNXE),
                       .FRound(FRoundE), .FRoundNV(FRoundNVE), .FRoundNX(FRoundNXE));
--- a/src/fpu/fregfile.sv
+++ b/src/fpu/fregfile.sv
@ -6,7 +6,7 @@
 //
 // Purpose: 3R1W 4-port register file for FPU
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/fround.sv
+++ b/src/fpu/fround.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Floating-point round to integer for Zfa
 // 
-// Documentation: RISC-V System on Chip Design Chapter 16
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -28,7 +28,6 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module fround import cvw::*;  #(parameter cvw_t P) (
-  input  logic [P.FLEN-1:0]       X,            // input before unpacking
  input  logic                    Xs,           // input's sign
  input  logic [P.NE-1:0]         Xe,           // input's exponent
  input  logic [P.NF:0]           Xm,           // input's fraction with leading integer bit (U1.NF)
@ -45,7 +44,7 @@ module fround import cvw::*;  #(parameter cvw_t P) (

  logic [P.NE-1:0] E, Xep1;
  logic [P.NF:0] IMask, Tmasknonneg, Tmaskneg, Tmask, HotE, HotEP1, Trunc, Rnd;
-  logic [P.FLEN-1:0] W, PackedW;
+  logic [P.FLEN-1:0] W;
  logic Elt0, Eeqm1, Lnonneg, Lp, Rnonneg, Rp, Tp, RoundUp, Two, EgeNf;

  // Unbiased exponent
--- a/src/fpu/fsgninj.sv
+++ b/src/fpu/fsgninj.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FPU Sign Injection instructions
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/packoutput.sv
+++ b/src/fpu/packoutput.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Pack the output of the FPU
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -100,4 +100,4 @@ module packoutput import cvw::*;  #(parameter cvw_t P) (
      endcase
    end
  end
-endmodule
+endmodule
--- a/src/fpu/postproc/cvtshiftcalc.sv
+++ b/src/fpu/postproc/cvtshiftcalc.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Conversion shift calculation
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/divshiftcalc.sv
+++ b/src/fpu/postproc/divshiftcalc.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Division shift calculation
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/flags.sv
+++ b/src/fpu/postproc/flags.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Post-Processing flag calculation
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/fmashiftcalc.sv
+++ b/src/fpu/postproc/fmashiftcalc.sv
@ -6,7 +6,7 @@
 //
 // Purpose: FMA shift calculation
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/negateintres.sv
+++ b/src/fpu/postproc/negateintres.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Negate integer result
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/normshift.sv
+++ b/src/fpu/postproc/normshift.sv
@ -6,7 +6,7 @@
 //
 // Purpose: normalization shifter
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/postprocess.sv
+++ b/src/fpu/postproc/postprocess.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Post-Processing: normalization, rounding, sign, flags, special cases
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -157,11 +157,11 @@ module postprocess import cvw::*;  #(parameter cvw_t P) (
      end
      2'b00: begin // cvt
        ShiftAmt = {{P.LOGNORMSHIFTSZ-$clog2(P.CVTLEN+1){1'b0}}, CvtShiftAmt};
-        ShiftIn  =  {CvtShiftIn, {P.NORMSHIFTSZ-P.CVTLEN-P.NF-1{1'b0}}};
+        ShiftIn  =  {CvtShiftIn, {P.NORMSHIFTSZ-(P.CVTLEN+P.NF+1){1'b0}}};
      end
      2'b01: begin //divsqrt
        ShiftAmt = DivShiftAmt;
-        ShiftIn  = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-P.DIVb-1-P.NF{1'b0}}};
+        ShiftIn  = {{P.NF{1'b0}}, DivUm, {P.NORMSHIFTSZ-(P.DIVb+1+P.NF){1'b0}}};
      end
      default: begin 
        ShiftAmt = {P.LOGNORMSHIFTSZ{1'bx}}; 
--- a/src/fpu/postproc/resultsign.sv
+++ b/src/fpu/postproc/resultsign.sv
@ -6,7 +6,7 @@
 //
 // Purpose: calculating the result's sign
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/round.sv
+++ b/src/fpu/postproc/round.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Rounder
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/roundsign.sv
+++ b/src/fpu/postproc/roundsign.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Sign calculation for rounding
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/postproc/shiftcorrection.sv
+++ b/src/fpu/postproc/shiftcorrection.sv
@ -6,7 +6,7 @@
 //
 // Purpose: shift correction
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -45,13 +45,12 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  output logic [P.NE+1:0]          Ue                      // corrected exponent for divider
 );

-  logic [P.NORMSHIFTSZ-1:0]        CorrShifted;         // the shifted sum after LZA correction
  logic                            ResSubnorm;             // is the result Subnormal
  logic                            LZAPlus1;               // add one or two to the sum's exponent due to LZA correction
  logic                            LeftShiftQm;            // should the divsqrt result be shifted one to the left
  logic                            RightShift;             // shift right by 1

-  // *** 4/16/24 this code is a mess and needs cleaning and explaining
+  // dh 4/16/24 this code is a mess and needs cleaning and explaining
  // define bit widths
  // seems to shift by 0, 1, or 2.  right and left shift is confusing
  
@ -61,20 +60,20 @@ module shiftcorrection import cvw::*;  #(parameter cvw_t P) (
  //  - a one has to propagate all the way through a sum. so we can leave the bottom statement alone
  assign LZAPlus1 = Shifted[P.NORMSHIFTSZ-1];

-
  // correct the shifting of the divsqrt caused by producing a result in (0.5, 2) range
  // condition: if the msb is 1 or the exponent was one, but the shifted quotient was < 1 (Subnorm)
-  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1));
- 
-  assign RightShift = FmaOp ? LZAPlus1 : LeftShiftQm;
+  assign LeftShiftQm = (LZAPlus1|(DivUe==1&~LZAPlus1)); 

-  // one bit right shift for FMA or division
-  mux2 #(P.NORMSHIFTSZ) corrmux({Shifted[P.NORMSHIFTSZ-3:0], 2'b00}, {Shifted[P.NORMSHIFTSZ-2:1], 2'b00}, RightShift, CorrShifted);
-  
+  // Determine the shift for either FMA or divsqrt
+  assign RightShift = FmaOp ? LZAPlus1 : LeftShiftQm;
+ 
+  // possible one bit right shift for FMA or division
  // if the result of the divider was calculated to be subnormal, then the result was correctly normalized, so select the top shifted bits
  always_comb
-    if (FmaOp | (DivOp & ~DivResSubnorm))  Mf = CorrShifted;
-    else                                   Mf = Shifted[P.NORMSHIFTSZ-1:0];
+    if (FmaOp | (DivOp & ~DivResSubnorm))  // one bit shift for FMA or divsqrt
+      if (RightShift)                      Mf = {Shifted[P.NORMSHIFTSZ-2:1], 2'b00};
+      else                                 Mf = {Shifted[P.NORMSHIFTSZ-3:0], 2'b00};
+    else                                   Mf =  Shifted[P.NORMSHIFTSZ-1:0];  // convert and subnormal division result
    
  // Determine sum's exponent
  //  main exponent issues: 
--- a/src/fpu/postproc/specialcase.sv
+++ b/src/fpu/postproc/specialcase.sv
@ -6,7 +6,7 @@
 //
 // Purpose: special case selection
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/fpu/unpack.sv
+++ b/src/fpu/unpack.sv
@ -6,7 +6,7 @@
 //
 // Purpose: unpack X, Y, Z floating-point inputs
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -46,23 +46,21 @@ module unpack import cvw::*;  #(parameter cvw_t P) (
  output logic [P.LOGFLEN-1:0]    Nf                    // Number of fractional bits
 );

-  logic XExpNonZero, YExpNonZero, ZExpNonZero;          // is the exponent of XYZ non-zero
-  logic XFracZero, YFracZero, ZFracZero;                // is the fraction zero
  logic YExpMax, ZExpMax;                               // is the exponent all 1s

  unpackinput #(P) unpackinputX (.A(X), .Fmt, .Sgn(Xs), .Exp(Xe), .Man(Xm), .En(XEn), .FPUActive,
-                          .NaN(XNaN), .SNaN(XSNaN), .ExpNonZero(XExpNonZero),
-                          .Zero(XZero), .Inf(XInf), .ExpMax(XExpMax), .FracZero(XFracZero), 
+                          .NaN(XNaN), .SNaN(XSNaN),
+                          .Zero(XZero), .Inf(XInf), .ExpMax(XExpMax),
                          .Subnorm(XSubnorm), .PostBox(XPostBox));

  unpackinput #(P) unpackinputY (.A(Y), .Fmt, .Sgn(Ys), .Exp(Ye), .Man(Ym), .En(YEn), .FPUActive,
-                          .NaN(YNaN), .SNaN(YSNaN), .ExpNonZero(YExpNonZero),
-                          .Zero(YZero), .Inf(YInf), .ExpMax(YExpMax), .FracZero(YFracZero), 
+                          .NaN(YNaN), .SNaN(YSNaN),
+                          .Zero(YZero), .Inf(YInf), .ExpMax(YExpMax),
                          .Subnorm(), .PostBox());

  unpackinput #(P) unpackinputZ (.A(Z), .Fmt, .Sgn(Zs), .Exp(Ze), .Man(Zm), .En(ZEn), .FPUActive,
-                          .NaN(ZNaN), .SNaN(ZSNaN), .ExpNonZero(ZExpNonZero),
-                          .Zero(ZZero), .Inf(ZInf), .ExpMax(ZExpMax), .FracZero(ZFracZero), 
+                          .NaN(ZNaN), .SNaN(ZSNaN),
+                          .Zero(ZZero), .Inf(ZInf), .ExpMax(ZExpMax),
                          .Subnorm(), .PostBox());
 
  // look up bias and fractional bits for the given format
--- a/src/fpu/unpackinput.sv
+++ b/src/fpu/unpackinput.sv
@ -6,7 +6,7 @@
 //
 // Purpose: unpack input: extract sign, exponent, significand, characteristics
 // 
-// Documentation: RISC-V System on Chip Design Chapter 13
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -39,8 +39,6 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (
  output logic                     SNaN,       // is the number a signaling NaN
  output logic                     Zero,       // is the number zero
  output logic                     Inf,        // is the number infinity
-  output logic                     ExpNonZero, // is the exponent not zero
-  output logic                     FracZero,   // is the fraction zero
  output logic                     ExpMax,     // does In have the maximum exponent (NaN or Inf)
  output logic                     Subnorm,    // is the number subnormal
  output logic [P.FLEN-1:0]        PostBox     // Number reboxed correctly as a NaN
@ -48,6 +46,8 @@ module unpackinput import cvw::*;  #(parameter cvw_t P) (

  logic [P.NF-1:0] Frac;        // Fraction of XYZ
  logic            BadNaNBox;   // incorrectly NaN Boxed
+  logic            FracZero;    // is the fraction zero
+  logic            ExpNonZero;  // is the exponent non-zero
  logic [P.FLEN-1:0] In;

  // Gate input when FPU is not active to save power and simulation
--- a/src/generic/aplusbeq0.sv
+++ b/src/generic/aplusbeq0.sv
@ -34,7 +34,7 @@ module aplusbeq0 #(parameter WIDTH = 8) (
  logic [WIDTH-1:0] orshift;

  // The sum is zero if the bitwise XOR is equal to the bitwise OR shifted left by 1, for all columns
-  // *** explain, cite book
+  // See J. A. Prabhu and G. Zyner, "167 MHz radix-8 divide and square root using overlapped radix-2 stages," IEEE Symp. Computer Arithmetic, 1995, pp. 155-162.

  assign x = a ^ b;
  assign orshift = {a[WIDTH-2:0] | b[WIDTH-2:0], 1'b0};
--- a/src/generic/decoder.sv
+++ b/src/generic/decoder.sv
@ -29,8 +29,5 @@ module decoder #(parameter BINARY_BITS = 3) (
  output logic [(2**BINARY_BITS)-1:0] onehot
 );

-  // *** Double check whether this synthesizes as expected
-  //     -- Ben @ May 4: only warning is that "signed to unsigned assignment occurs"; that said, I haven't checked the netlists
  assign onehot = 1 << binary;
-
 endmodule
--- a/src/generic/mem/ram1p1rwbe.sv
+++ b/src/generic/mem/ram1p1rwbe.sv
@ -44,11 +44,9 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
  output logic [WIDTH-1:0]        dout
 );

-  bit [WIDTH-1:0]               RAM[DEPTH-1:0];
-
-  // ***************************************************************************
+  ///////////////////////////////////////////////////////////////////////////////
  // TRUE SRAM macro
-  // ***************************************************************************
+  ///////////////////////////////////////////////////////////////////////////////
  if ((USE_SRAM == 1) & (WIDTH == 128) & (DEPTH == 64)) begin // Cache data subarray
    genvar index;
    // 64 x 128-bit SRAM
@ -79,11 +77,11 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
      .A(addr), .D(din), 
      .BWEB(~BitWriteMask), .Q(dout));     
    
-    // ***************************************************************************
+    ///////////////////////////////////////////////////////////////////////////////
    // READ first SRAM model
-    // ***************************************************************************
+    ///////////////////////////////////////////////////////////////////////////////
  end else begin: ram
-    integer i;
+    bit [WIDTH-1:0] RAM[DEPTH-1:0];

    if (PRELOAD_ENABLED) begin
      initial begin
@ -103,11 +101,13 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
    // Write divided into part for bytes and part for extra msbs
    // Questa sim version 2022.3_2 does not allow multiple drivers for RAM when using always_ff.
    // Therefore these always blocks use the older always @(posedge clk) 
-    if(WIDTH >= 8) 
+    if(WIDTH >= 8) begin
+      integer i;
      always @(posedge clk) 
        if (ce & we) 
          for(i = 0; i < WIDTH/8; i++) 
            if(bwe[i]) RAM[addr][i*8 +: 8] <= din[i*8 +: 8];
+    end
  
    if (WIDTH%8 != 0) // handle msbs if width not a multiple of 8
      always @(posedge clk) 
--- a/src/generic/mem/ram1p1rwe.sv
+++ b/src/generic/mem/ram1p1rwe.sv
@ -41,11 +41,9 @@ module ram1p1rwe import cvw::* ; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44) (
  output logic [WIDTH-1:0]        dout
 );

-  bit [WIDTH-1:0]               RAM[DEPTH-1:0];
-
-  // ***************************************************************************
+  //////////////////////////////////////////////////////////////////////////////
  // TRUE SRAM macro
-  // ***************************************************************************
+  //////////////////////////////////////////////////////////////////////////////
  if ((USE_SRAM == 1) & (WIDTH == 128) & (DEPTH == 64)) begin // Cache data subarray
    // 64 x 128-bit SRAM
    ram1p1rwbe_64x128 sram1A (.CLK(clk), .CEB(~ce), .WEB(~we),
@ -64,13 +62,14 @@ module ram1p1rwe import cvw::* ; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44) (
      .A(addr), .D(din), 
      .BWEB('0), .Q(dout));     
    
-    // ***************************************************************************
+    //////////////////////////////////////////////////////////////////////////////
    // READ first SRAM model
-    // ***************************************************************************
+    //////////////////////////////////////////////////////////////////////////////
  end else begin: ram
-    // *** Vivado is not implementing this as block ram for some reason.
+    // Vivado is not implementing this as block ram for some reason.
    // The version with byte write enables it correctly infers block ram.
-    integer i;
+
+    bit [WIDTH-1:0]               RAM[DEPTH-1:0];

    // Combinational read: register address and read after clock edge
    logic [$clog2(DEPTH)-1:0] addrd;
--- a/src/generic/mem/ram2p1r1wbe.sv
+++ b/src/generic/mem/ram2p1r1wbe.sv
@ -44,13 +44,12 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
  output logic [WIDTH-1:0]         rd1
 );

-  bit [WIDTH-1:0]                 mem[DEPTH-1:0];
  localparam                      SRAMWIDTH = 32;
  localparam                      SRAMNUMSETS = SRAMWIDTH/WIDTH;      

-  // ***************************************************************************
-  // TRUE Smem macro
-  // ***************************************************************************
+  ///////////////////////////////////////////////////////////////////////////////
+  // TRUE SRAM macro
+  ///////////////////////////////////////////////////////////////////////////////

  if ((USE_SRAM == 1) & (WIDTH == 68) & (DEPTH == 1024)) begin
    
@ -105,39 +104,35 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
      .QA(SRAMReadData),
      .QB());

-  end else begin
+  end else begin:ram
    
-    // ***************************************************************************
+    ///////////////////////////////////////////////////////////////////////////////
    // READ first SRAM model
-    // ***************************************************************************
-    integer i;
-/*    
-    initial begin // initialize memory for simulation only; not needed because done in the testbench now
-      integer j;
-      for (j=0; j < DEPTH; j++) 
-        mem[j] = '0;
-    end 
-*/
+    ///////////////////////////////////////////////////////////////////////////////
+
+    bit [WIDTH-1:0] RAM[DEPTH-1:0];

    // Read
    logic [$clog2(DEPTH)-1:0] ra1d;
    flopen #($clog2(DEPTH)) adrreg(clk, ce1, ra1, ra1d);
-    assign rd1 = mem[ra1d];
+    assign rd1 = RAM[ra1d];
    
    // Write divided into part for bytes and part for extra msbs
    // coverage off     
    //   when byte write enables are tied high, the last IF is always taken
-    if(WIDTH >= 8) 
+    if(WIDTH >= 8) begin
+      integer i;
      always @(posedge clk) 
        if (ce2 & we2) 
          for(i = 0; i < WIDTH/8; i++) 
-            if(bwe2[i]) mem[wa2][i*8 +: 8] <= wd2[i*8 +: 8];
+            if(bwe2[i]) RAM[wa2][i*8 +: 8] <= wd2[i*8 +: 8];
+    end
    // coverage on
  
    if (WIDTH%8 != 0) // handle msbs if width not a multiple of 8
      always @(posedge clk) 
        if (ce2 & we2 & bwe2[WIDTH/8])
-          mem[wa2][WIDTH-1:WIDTH-WIDTH%8] <= wd2[WIDTH-1:WIDTH-WIDTH%8];
+          RAM[wa2][WIDTH-1:WIDTH-WIDTH%8] <= wd2[WIDTH-1:WIDTH-WIDTH%8];
  end
  
 endmodule
--- a/src/generic/mem/ram2p1r1wbe_2048x64.sv
+++ b/src/generic/mem/ram2p1r1wbe_2048x64.sv
--- a/src/hazard/hazard.sv
+++ b/src/hazard/hazard.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Determine stalls and flushes
 // 
-// Documentation: RISC-V System on Chip Design Chapter 4, Figure 13.54
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/aes/aes64ks1i.sv
+++ b/src/ieu/aes/aes64ks1i.sv
@ -26,11 +26,11 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module aes64ks1i(
-   input  logic [3:0]  round,
-   input  logic [63:0] rs1,
-   input  logic [31:0] Sbox0Out,
-   output logic [31:0] SboxKIn,
-   output logic [63:0] result
+   input  logic [3:0]   round,
+   input  logic [63:32] rs1,
+   input  logic [31:0]  Sbox0Out,
+   output logic [31:0]  SboxKIn,
+   output logic [63:0]  result
 );                 
                 
   logic 			        finalround;
--- a/src/ieu/aes/aes64ks2.sv
+++ b/src/ieu/aes/aes64ks2.sv
@ -26,9 +26,9 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module aes64ks2(
-   input  logic [63:0] rs2,
-   input  logic [63:0] rs1,
-   output logic [63:0] result
+   input  logic [63:0]  rs2,
+   input  logic [63:32] rs1,
+   output logic [63:0]  result
 );
   
   logic [31:0] 		    w0, w1;
--- a/src/ieu/aes/aesinvshiftrows64.sv
+++ b/src/ieu/aes/aesinvshiftrows64.sv
@ -26,7 +26,9 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module aesinvshiftrows64(
-   input  logic [127:0] a, 
+   /* verilator lint_off UNUSEDSIGNAL */
+   input  logic [127:0] a,
+   /* verilator lint_on UNUSEDSIGNAL */
   output logic [63:0]  y
 );

--- a/src/ieu/aes/aesshiftrows64.sv
+++ b/src/ieu/aes/aesshiftrows64.sv
@ -26,7 +26,9 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module aesshiftrows64(
+   /* verilator lint_off UNUSEDSIGNAL */
   input  logic [127:0] a, 
+   /* verilator lint_on UNUSEDSIGNAL */
   output logic [63:0] y
 );
 		    
--- a/src/ieu/aes/aesshiftrows64.xv
+++ b/src/ieu/aes/aesshiftrows64.xv
@ -1,35 +0,0 @@
-///////////////////////////////////////////
-// aesshiftrows64.sv
-//
-// Written: ryan.swann@okstate.edu, james.stine@okstate.edu
-// Created: 20 February 2024
-//
-// Purpose: aesshiftrow for taking in first Data line
-//
-// A component of the CORE-V-WALLY configurable RISC-V project.
-// https://github.com/openhwgroup/cvw
-// 
-// Copyright (C) 2021-24 Harvey Mudd College & Oklahoma State University
-//
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
-//
-// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
-// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
-// may obtain a copy of the License at
-//
-// https://solderpad.org/licenses/SHL-2.1/
-//
-// Unless required by applicable law or agreed to in writing, any work distributed under the 
-// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
-// either express or implied. See the License for the specific language governing permissions 
-// and limitations under the License.
-////////////////////////////////////////////////////////////////////////////////////////////////
-
-module aesshiftrows64(
-   input  logic [127:0] a, 
-   output logic [63:0]  y
-);
-		    
-   assign y = {a[31:24],   a[119:112], a[79:72],   a[39:32],
-               a[127:120], a[87:80],   a[47:40],   a[7:0]};   
-endmodule
--- a/src/ieu/alu.sv
+++ b/src/ieu/alu.sv
@ -7,7 +7,7 @@
 //
 // Purpose: RISC-V Arithmetic/Logic Unit
 //
-// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.4)
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/bmu/bitmanipalu.sv
+++ b/src/ieu/bmu/bitmanipalu.sv
@ -7,7 +7,7 @@
 //
 // Purpose: RISC-V Arithmetic/Logic Unit Bit-Manipulation Extension and K extension
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -49,7 +49,6 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P) (
  logic [P.XLEN-1:0]        ZBBResult;               // ZBB Result
  logic [P.XLEN-1:0]        ZBCResult;               // ZBC Result   
  logic [P.XLEN-1:0] 	      ZBKBResult;              // ZBKB Result
-  logic [P.XLEN-1:0]        ZBKCResult;              // ZBKC Result
  logic [P.XLEN-1:0]        ZBKXResult;              // ZBKX Result      
  logic [P.XLEN-1:0]        ZKNHResult;              // ZKNH Result
  logic [P.XLEN-1:0]        ZKNDEResult;             // ZKNE or ZKND Result   
@ -93,7 +92,7 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P) (

  // ZBC and ZBKCUnit
  if (P.ZBC_SUPPORTED | P.ZBKC_SUPPORTED) begin: zbc
-    zbc #(P) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3, .ZBCResult);
+    zbc #(P) ZBC(.A(ABMU), .RevA, .B(BBMU), .Funct3(Funct3[1:0]), .ZBCResult);
  end else assign ZBCResult = '0;

  // ZBB Unit
@ -108,7 +107,7 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P) (

  // ZBKB Unit
  if (P.ZBKB_SUPPORTED) begin: zbkb
-    zbkb #(P.XLEN) ZBKB(.A(ABMU), .B(BBMU), .Funct3, .ZBKBSelect(ZBBSelect[2:0]), .ZBKBResult);
+    zbkb #(P.XLEN) ZBKB(.A(ABMU), .B(BBMU[P.XLEN/2-1:0]), .Funct3, .ZBKBSelect(ZBBSelect[2:0]), .ZBKBResult);
  end else assign ZBKBResult = '0;

  // ZBKX Unit
@ -125,7 +124,7 @@ module bitmanipalu import cvw::*; #(parameter cvw_t P) (
  // ZKNH Unit
  if (P.ZKNH_SUPPORTED) begin: zknh
    if (P.XLEN == 32) zknh32 ZKNH32(.A(ABMU), .B(BBMU), .ZKNHSelect(ZBBSelect), .ZKNHResult(ZKNHResult));
-    else              zknh64 ZKNH64(.A(ABMU), .B(BBMU), .ZKNHSelect(ZBBSelect), .ZKNHResult(ZKNHResult));
+    else              zknh64 ZKNH64(.A(ABMU),           .ZKNHSelect(ZBBSelect), .ZKNHResult(ZKNHResult));
  end else assign ZKNHResult = '0;

  // Result Select Mux
--- a/src/ieu/bmu/bitreverse.sv
+++ b/src/ieu/bmu/bitreverse.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Bit reverse submodule
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/bmu/bmuctrl.sv
+++ b/src/ieu/bmu/bmuctrl.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Top level bit manipulation instruction decoder
 // 
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -31,11 +31,8 @@
 module bmuctrl import cvw::*;  #(parameter cvw_t P) (
  input  logic        clk, reset,
  // Decode stage control signals
-  input  logic        StallD, FlushD,          // Stall, flush Decode stage
  input  logic [31:0] InstrD,                  // Instruction in Decode stage
  input  logic        ALUOpD,                  // Regular ALU Operation
-  output logic [3:0]  BSelectD,                // Indicates if ZBA_ZBB_ZBC_ZBS instruction in one-hot encoding in Decode stage
-  output logic [3:0]  ZBBSelectD,              // ZBB mux select signal in Decode stage NOTE: do we need this in decode?
  output logic        BRegWriteD,              // Indicates if it is a R type B instruction in Decode Stage
  output logic        BALUSrcBD,               // Indicates if it is an I/IW (non auipc) type B instruction in Decode Stage
  output logic        BW64D,                   // Indiciates if it is a W type B instruction in Decode Stage
@ -46,7 +43,6 @@ module bmuctrl import cvw::*;  #(parameter cvw_t P) (
  output logic [2:0]  ALUSelectD,              // ALU select
  output logic [3:0]  BSelectE,                // Indicates if ZBA_ZBB_ZBC_ZBS instruction in one-hot encoding
  output logic [3:0]  ZBBSelectE,              // ZBB mux select signal
-  output logic        BRegWriteE,              // Indicates if it is a R type B instruction in Execute
  output logic [2:0]  BALUControlE,            // ALU Control signals for B instructions in Execute Stage
  output logic        BMUActiveE               // Bit manipulation instruction being executed
 );
@ -61,6 +57,8 @@ module bmuctrl import cvw::*;  #(parameter cvw_t P) (
  logic [2:0] BALUControlD;                    // ALU Control signals for B instructions
  logic [2:0] BALUSelectD;                     // ALU Mux select signal in Decode Stage for BMU operations
  logic       BALUOpD;                         // Indicates if it is an ALU B instruction in Decode Stage
+  logic [3:0] BSelectD;                        // Indicates if ZBA_ZBB_ZBC_ZBS instruction in one-hot encoding in Decode stage
+  logic [3:0] ZBBSelectD;                      // ZBB mux select signal in Decode stage

  `define BMUCTRLW 20

@ -285,5 +283,5 @@ module bmuctrl import cvw::*;  #(parameter cvw_t P) (
  assign ALUSelectD = BALUOpD ? BALUSelectD : (ALUOpD ? Funct3D : 3'b000);

  // BMU Execute stage pipieline control register
-  flopenrc #(13) controlregBMU(clk, reset, FlushE, ~StallE, {BSelectD, ZBBSelectD, BRegWriteD, BALUControlD, ~IllegalBitmanipInstrD}, {BSelectE, ZBBSelectE, BRegWriteE, BALUControlE, BMUActiveE});
+  flopenrc #(12) controlregBMU(clk, reset, FlushE, ~StallE, {BSelectD, ZBBSelectD, BALUControlD, ~IllegalBitmanipInstrD}, {BSelectE, ZBBSelectE, BALUControlE, BMUActiveE});
 endmodule
--- a/src/ieu/bmu/byteop.sv
+++ b/src/ieu/bmu/byteop.sv
@ -7,7 +7,7 @@
 //
 // Purpose: RISCV bitmanip byte-wise operation unit
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/bmu/clmul.sv
+++ b/src/ieu/bmu/clmul.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Carry-Less multiplication unit
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/bmu/cnt.sv
+++ b/src/ieu/bmu/cnt.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Count Instruction Submodule
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/bmu/ext.sv
+++ b/src/ieu/bmu/ext.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Sign/Zero Extension Submodule
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -29,7 +29,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module ext #(parameter WIDTH = 32) (
-  input  logic [WIDTH-1:0] A,            // Operands
+  input  logic [15:0]      A,            // Operand to extend
  input  logic [1:0]       ExtSelect,    // B[2], B[0] of immediate
  output logic [WIDTH-1:0] ExtResult);   // Extend Result

--- a/src/ieu/bmu/popcnt.sv
+++ b/src/ieu/bmu/popcnt.sv
@ -5,7 +5,7 @@
 //
 // Purpose: Population Count
 // 
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/bmu/zbb.sv
+++ b/src/ieu/bmu/zbb.sv
@ -7,7 +7,7 @@
 //
 // Purpose: RISC-V ZBB top level unit
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -46,7 +46,7 @@ module zbb #(parameter WIDTH=32) (
  mux2 #(1) ltmux(LT, LTU, BUnsigned , lt);
  cnt #(WIDTH) cnt(.A, .RevA, .B(B[1:0]), .W64, .CntResult);
  byteop #(WIDTH) bu(.A, .ByteSelect(B[0]), .ByteResult);
-  ext #(WIDTH) ext(.A, .ExtSelect({~B[2], {B[2] & B[0]}}), .ExtResult);
+  ext #(WIDTH) ext(.A(A[15:0]), .ExtSelect({~B[2], {B[2] & B[0]}}), .ExtResult);

  // ZBBSelect[2] differentiates between min(u) vs max(u) instruction
  mux2 #(WIDTH) minmaxmux(B, A, ZBBSelect[2]^lt, MinMaxResult);
--- a/src/ieu/bmu/zbc.sv
+++ b/src/ieu/bmu/zbc.sv
@ -7,7 +7,7 @@
 //
 // Purpose: RISC-V ZBC top-level unit
 //
-// Documentation: RISC-V System on Chip Design Chapter 15
+// Documentation: RISC-V System on Chip Design
 // 
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -30,7 +30,7 @@

 module zbc import cvw::*; #(parameter cvw_t P) (
  input  logic [P.XLEN-1:0] A, RevA, B,       // Operands
-  input  logic [2:0]        Funct3,           // Indicates operation to perform
+  input  logic [1:0]        Funct3,           // Indicates operation to perform
  output logic [P.XLEN-1:0] ZBCResult);       // ZBC result

  logic [P.XLEN-1:0] ClmulResult, RevClmulResult;
--- a/src/ieu/comparator.sv
+++ b/src/ieu/comparator.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Branch comparison
 // 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.7)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/controller.sv
+++ b/src/ieu/controller.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Top level controller module
 // 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Section 4.1.4, Figure 4.8, Table 4.5)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -53,16 +53,13 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  output logic        ALUSrcAE, ALUSrcBE,      // ALU operands
  output logic        ALUResultSrcE,           // Selects result to pass on to Memory stage
  output logic [2:0]  ALUSelectE,              // ALU mux select signal
-  output logic        MemReadE, CSRReadE,      // Instruction reads memory, reads a CSR (needed for Hazard unit)
  output logic [2:0]  Funct3E,                 // Instruction's funct3 field
  output logic [6:0]  Funct7E,                 // Instruction's funct7 field
  output logic        IntDivE,                 // Integer divide
-  output logic        MDUE,                    // MDU (multiply/divide) operatio
  output logic        W64E,                    // RV64 W-type operation
  output logic        SubArithE,               // Subtraction or arithmetic shift
  output logic        JumpE,                   // jump instruction
  output logic        BranchE,                 // Branch instruction
-  output logic        SCE,                     // Store Conditional instruction
  output logic        BranchSignedE,           // Branch comparison operands are signed (if it's a branch)
  output logic [3:0]  BSelectE,                // One-Hot encoding of if it's ZBA_ZBB_ZBC_ZBS instruction
  output logic [3:0]  ZBBSelectE,              // ZBB mux select signal in Execute stage
@ -81,7 +78,6 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  output logic        CSRReadM, CSRWriteM, PrivilegedM, // CSR read, write, or privileged instruction
  output logic [1:0]  AtomicM,                 // Atomic (AMO) instruction
  output logic [2:0]  Funct3M,                 // Instruction's funct3 field
-  output logic        RegWriteM,               // Instruction writes a register (needed for Hazard unit)
  output logic        InvalidateICacheM, FlushDCacheM, // Invalidate I$, flush D$
  output logic        InstrValidD, InstrValidE, InstrValidM, // Instruction is valid
  output logic        FWriteIntM,              // FPU controller writes integer register file
@ -122,6 +118,9 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  logic        FenceXD;                        // Fence instruction
  logic        CMOD;                           // Cache management instruction
  logic        InvalidateICacheD, FlushDCacheD;// Invalidate I$, flush D$
+  logic        MemReadE, CSRReadE;             // Instruction reads memory, reads a CSR (needed for Hazard unit)
+  logic        MDUE;                           // MDU (multiply/divide) operation
+  logic        SCE;                            // Store Conditional instruction
  logic        CSRWriteD, CSRWriteE;           // CSR write
  logic        PrivilegedD, PrivilegedE;       // Privileged instruction
  logic        InvalidateICacheE, FlushDCacheE;// Invalidate I$, flush D$
@ -133,14 +132,12 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  logic        unused; 
  logic        BranchFlagE;                    // Branch flag to use (chosen between eq or lt)
  logic        IEURegWriteE;                   // Register write 
-  logic        BRegWriteE;                     // Register write from BMU controller in Execute Stage
  logic        IllegalERegAdrD;                // RV32E attempts to write upper 16 registers
  logic [1:0]  AtomicE;                        // Atomic instruction 
  logic        FenceD, FenceE;                 // Fence instruction
  logic        SFenceVmaD;                     // sfence.vma instruction
  logic        IntDivM;                        // Integer divide instruction
-  logic [3:0]  BSelectD;                       // One-Hot encoding if it's ZBA_ZBB_ZBC_ZBS instruction in decode stage
-  logic [3:0]  ZBBSelectD;                     // ZBB Mux Select Signal
+  logic        RegWriteM;                      // Instruction writes a register (needed for Hazard unit)
  logic [1:0]  CZeroD;
  logic        IFunctD, RFunctD, MFunctD;      // Detect I, R, and M-type RV32IM/Rv64IM instructions
  logic        LFunctD, SFunctD, BFunctD;      // Detect load, store, branch instructions
@ -158,7 +155,6 @@ module controller import cvw::*;  #(parameter cvw_t P) (
  logic [3:0]  CMOpD, CMOpE;                   // which CMO instruction 1: cbo.inval; 2: cbo.flush; 4: cbo.clean; 8: cbo.zero
  logic        IFUPrefetchD;                   // instruction prefetch
  logic        LSUPrefetchD, LSUPrefetchE;     // data prefetch
-  logic        CMOStallD;                      // Structural hazards from cache management ops
  logic        MatchDE;                        // Match between a source register in Decode stage and destination register in Execute stage
  logic        FCvtIntStallD, MDUStallD, CSRRdStallD; // Stall due to conversion, load, multiply/divide, CSR read 
  logic        FunctCZeroD;                    // Funct7 and Funct3 indicate czero.* (not including Op check)
@ -329,9 +325,9 @@ module controller import cvw::*;  #(parameter cvw_t P) (
    logic BSubArithD;                     // TRUE for BMU ext, clr, andn, orn, xnor
    logic BALUSrcBD;                      // BMU alu src select signal

-    bmuctrl #(P) bmuctrl(.clk, .reset, .StallD, .FlushD, .InstrD, .ALUOpD, .BSelectD, .ZBBSelectD, 
+    bmuctrl #(P) bmuctrl(.clk, .reset, .InstrD, .ALUOpD,
      .BRegWriteD, .BALUSrcBD, .BW64D, .BSubArithD, .IllegalBitmanipInstrD, .StallE, .FlushE, 
-      .ALUSelectD(PreALUSelectD), .BSelectE, .ZBBSelectE, .BRegWriteE, .BALUControlE, .BMUActiveE);
+      .ALUSelectD(PreALUSelectD), .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE);
    if (P.ZBA_SUPPORTED) begin
      // ALU Decoding is more comprehensive when ZBA is supported. slt and slti conflicts with sh1add, sh1add.uw
      assign sltD = (Funct3D == 3'b010 & (~(Funct7D[4]) | ~OpD[5])) ;
@ -357,7 +353,6 @@ module controller import cvw::*;  #(parameter cvw_t P) (

    // tie off unused bit manipulation signals
    assign BSelectE = 4'b0000;
-    assign BSelectD = 4'b0000;
    assign ZBBSelectE = 4'b0000;
    assign BALUControlE = 3'b0;
    assign BMUActiveE = 1'b0;
--- a/src/ieu/datapath.sv
+++ b/src/ieu/datapath.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Wally Integer Datapath
 // 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.12)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -80,7 +80,6 @@ module datapath import cvw::*;  #(parameter cvw_t P) (
  // Decode stage signals
  logic [P.XLEN-1:0] R1D, R2D;                       // Read data from Rs1 (RD1), Rs2 (RD2)
  logic [P.XLEN-1:0] ImmExtD;                        // Extended immediate in Decode stage
-  logic [4:0]        RdD;                            // Destination register in Decode stage
  // Execute stage signals
  logic [P.XLEN-1:0] R1E, R2E;                       // Source operands read from register file
  logic [P.XLEN-1:0] ImmExtE;                        // Extended immediate in Execute stage 
--- a/src/ieu/extend.sv
+++ b/src/ieu/extend.sv
@ -7,7 +7,7 @@
 //
 // Purpose: Produce sign-extended immediates from various formats
 // 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.3)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
--- a/src/ieu/ieu.sv
+++ b/src/ieu/ieu.sv
@ -6,7 +6,7 @@
 //
 // Purpose: Integer Execution Unit: datapath and controller
 // 
-// Documentation: RISC-V System on Chip Design Chapter 4 (Figure 4.12)
+// Documentation: RISC-V System on Chip Design
 //
 // A component of the CORE-V-WALLY configurable RISC-V project.
 // https://github.com/openhwgroup/cvw
@ -87,7 +87,6 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
  logic [2:0] ResultSrcW;                                    // Selects result in Writeback stage
  logic       ALUResultSrcE;                                 // Selects ALU result to pass on to Memory stage
  logic [2:0] ALUSelectE;                                    // ALU select mux signal
-  logic       SCE;                                           // Store Conditional instruction
  logic       FWriteIntM;                                    // FPU writing to integer register file
  logic       IntDivW;                                       // Integer divide instruction
  logic [3:0] BSelectE;                                      // Indicates if ZBA_ZBB_ZBC_ZBS instruction in one-hot encoding
@ -99,12 +98,10 @@ module ieu import cvw::*;  #(parameter cvw_t P) (

  // Forwarding signals
  logic [4:0] Rs1D, Rs2D;
-  logic [4:0] Rs2E;                                    // Source registers
+  logic [4:0] Rs2E;                                          // Source registers
  logic [1:0] ForwardAE, ForwardBE;                          // Select signals for forwarding multiplexers
-  logic       RegWriteM, RegWriteW;                          // Register will be written in Memory, Writeback stages
-  logic       MemReadE, CSRReadE;                            // Load, CSRRead instruction
+  logic       RegWriteW;                                     // Register will be written in Writeback stage
  logic       BranchSignedE;                                 // Branch does signed comparison on operands
-  logic       MDUE;                                          // Multiply/divide instruction
  logic       BMUActiveE;                                    // Bit manipulation instruction being executed
  logic [1:0] CZeroE;                                        // {czero.nez, czero.eqz} instructions active
           
@ -113,12 +110,12 @@ module ieu import cvw::*;  #(parameter cvw_t P) (
    .IllegalIEUFPUInstrD, .IllegalBaseInstrD, 
    .StructuralStallD, .LoadStallD, .StoreStallD, .Rs1D, .Rs2D,  .Rs2E,
    .StallE, .FlushE, .FlagsE, .FWriteIntE,
-    .PCSrcE, .ALUSrcAE, .ALUSrcBE, .ALUResultSrcE, .ALUSelectE, .MemReadE, .CSRReadE, 
-    .Funct3E, .Funct7E, .IntDivE, .MDUE, .W64E, .SubArithE, .BranchD, .BranchE, .JumpD, .JumpE, .SCE, 
+    .PCSrcE, .ALUSrcAE, .ALUSrcBE, .ALUResultSrcE, .ALUSelectE,
+    .Funct3E, .Funct7E, .IntDivE, .W64E, .SubArithE, .BranchD, .BranchE, .JumpD, .JumpE,
    .BranchSignedE, .BSelectE, .ZBBSelectE, .BALUControlE, .BMUActiveE, .CZeroE, .MDUActiveE, 
    .FCvtIntE, .ForwardAE, .ForwardBE, .CMOpM, .IFUPrefetchE, .LSUPrefetchM,
    .StallM, .FlushM, .MemRWE, .MemRWM, .CSRReadM, .CSRWriteM, .PrivilegedM, .AtomicM, .Funct3M,
-    .RegWriteM, .FlushDCacheM, .InstrValidM, .InstrValidE, .InstrValidD, .FWriteIntM,
+    .FlushDCacheM, .InstrValidM, .InstrValidE, .InstrValidD, .FWriteIntM,
    .StallW, .FlushW, .RegWriteW, .IntDivW, .ResultSrcW, .CSRWriteFenceM, .InvalidateICacheM,
    .RdW, .RdE, .RdM);

--- a/src/ieu/kmu/packer.sv
+++ b/src/ieu/kmu/packer.sv
@ -26,7 +26,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module packer #(parameter WIDTH=32) (
-  input  logic [WIDTH-1:0] A, B,
+  input  logic [WIDTH/2-1:0] A, B,
  input  logic [2:0] 	  PackSelect, 
  output logic [WIDTH-1:0] PackResult
 );
--- a/src/ieu/kmu/zbkb.sv
+++ b/src/ieu/kmu/zbkb.sv
@ -26,10 +26,11 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module zbkb #(parameter WIDTH=32) (
-   input  logic [WIDTH-1:0] A, B,
-  input  logic [2:0] 	    Funct3,
-   input  logic [2:0] 	    ZBKBSelect,
-   output logic [WIDTH-1:0] ZBKBResult
+  input  logic [WIDTH-1:0]   A, 
+  input  logic [WIDTH/2-1:0] B,
+  input  logic [2:0] 	     Funct3,
+  input  logic [2:0] 	     ZBKBSelect,
+  output logic [WIDTH-1:0]   ZBKBResult
 );
   
   logic [WIDTH-1:0] 	     Brev8Result;  // rev8, brev8
@ -42,8 +43,8 @@ module zbkb #(parameter WIDTH=32) (
      for (j=0; j<8; j=j+1) 
         assign Brev8Result[i*8+j] = A[i*8+7-j];
   
-   packer #(WIDTH) pack(.A, .B, .PackSelect({ZBKBSelect[2], Funct3[1:0]}), .PackResult);
-   zipper #(WIDTH) zip(.A, .ZipSelect(Funct3[2]), .ZipResult);
+   packer #(WIDTH) pack(.A(A[WIDTH/2-1:0]), .B(B[WIDTH/2-1:0]), .PackSelect({ZBKBSelect[2], Funct3[1:0]}), .PackResult);
+   zipper #(WIDTH) zipper(.A, .ZipSelect(Funct3[2]), .ZipResult);
   
   // ZBKB Result Select Mux
   mux3 #(WIDTH) zbkbresultmux(Brev8Result, PackResult, ZipResult, ZBKBSelect[1:0], ZBKBResult);   
--- a/src/ieu/kmu/zbkx.sv
+++ b/src/ieu/kmu/zbkx.sv
@ -31,8 +31,10 @@ module zbkx #(parameter WIDTH=32) (
   output logic [WIDTH-1:0] ZBKXResult
 );
   
-   logic [WIDTH-1:0] 	     xperm4, xperm4lookup;
-   logic [WIDTH-1:0] 	     xperm8, xperm8lookup;
+   logic [WIDTH-1:0] 	     xperm4, xperm8;
+   /* verilator lint_off UNUSEDSIGNAL */
+   logic [WIDTH-1:0]         xperm4lookup, xperm8lookup; // not all bits are used
+   /* verilator lint_on UNUSEDSIGNAL */
   int 		     i;
   
   always_comb begin
--- a/src/ieu/kmu/zknde64.sv
+++ b/src/ieu/kmu/zknde64.sv
@ -48,8 +48,8 @@ module zknde64 import cvw::*; #(parameter cvw_t P) (
    aessbox32 sbox(Sbox0In, Sbox0Out);                       // Substitute bytes of value obtained for tmp2 using Rijndael sbox

    // Both ZKND and ZKNE support aes64ks1i and aes64ks2 instructions
-    aes64ks1i aes64ks1i(.round, .rs1(A), .Sbox0Out, .SboxKIn, .result(aes64ks1iRes));
-    aes64ks2  aes64ks2(.rs2(B), .rs1(A), .result(aes64ks2Res));
+    aes64ks1i aes64ks1i(.round, .rs1(A[63:32]), .Sbox0Out, .SboxKIn, .result(aes64ks1iRes));
+    aes64ks2  aes64ks2(.rs2(B), .rs1(A[63:32]), .result(aes64ks2Res));
   
    // Choose among decrypt, encrypt, key schedule 1, key schedule 2 results
    mux4 #(64) zkndmux(aes64dRes, aes64eRes, aes64ks1iRes, aes64ks2Res, ZKNSelect[1:0], ZKNDEResult);
--- a/src/ieu/kmu/zknh64.sv
+++ b/src/ieu/kmu/zknh64.sv
@ -26,7 +26,7 @@
 ////////////////////////////////////////////////////////////////////////////////////////////////

 module zknh64 (
-   input  logic [63:0] A, B, 
+   input  logic [63:0] A,
   input  logic [3:0]  ZKNHSelect,
   output logic [63:0] ZKNHResult
 );
--- a/Show more
+++ b/Show more