diff --git a/src/ebu/ahbcacheinterface.sv b/src/ebu/ahbcacheinterface.sv
index f033b40cc..5316e215f 100644
--- a/src/ebu/ahbcacheinterface.sv
+++ b/src/ebu/ahbcacheinterface.sv
@@ -114,11 +114,12 @@ module ahbcacheinterface import cvw::*; #(
     .s(~(CacheableOrFlushCacheM)), .y(PreHWDATA));
   flopen #(P.AHBW) wdreg(HCLK, HREADY, PreHWDATA, HWDATA); // delay HWDATA by 1 cycle per spec
 
-  // *** bummer need a second byte mask for bus as it is AHBW rather than LLEN.
-  // probably can merge by muxing PAdrM's LLEN/8-1 index bit based on HTRANS being != 0.
-  swbytemask #(P.AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(P.AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended());
-  
-  flopen #(P.AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[P.AHBW/8-1:0], HWSTRB);
+  if (READ_ONLY_CACHE) begin
+    assign HWSTRB = '0;
+  end else begin // compute byte mask for AHB transaction based on size and address.  AHBW may be different than LLEN
+    swbytemask #(P.AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(P.AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended());
+    flopen #(P.AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[P.AHBW/8-1:0], HWSTRB);
+  end
   
   buscachefsm #(BeatCountThreshold, AHBWLOGBWPL, READ_ONLY_CACHE, P.BURST_EN) AHBBuscachefsm(
     .HCLK, .HRESETn, .Flush, .BusRW, .BusAtomic, .Stall, .BusCommitted, .BusStall, .CaptureEn, .SelBusBeat,
diff --git a/src/fpu/fdivsqrt/fdivsqrtiter.sv b/src/fpu/fdivsqrt/fdivsqrtiter.sv
index 4bfcebcd1..0f092706a 100644
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@@ -44,7 +44,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
   logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb
   logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.DIVb
   logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.DIVb
-  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb // *** probably Q not U.  See Table 16.26 notes
+  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb
   logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.DIVb
   logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.DIVb
   logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.DIVb
diff --git a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
index 802ac92dc..ffc62b5cc 100644
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@@ -222,7 +222,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
     logic               RemOpE;
 
     /* verilator lint_off WIDTH */
-    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r, so n = Cycles * k - 1
     assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
     /* verilator lint_on WIDTH */
     assign RemOpE = Funct3E[1];
diff --git a/src/fpu/fdivsqrt/fdivsqrtstage4.sv b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
index 856273a5e..47b1d4b26 100644
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@@ -52,7 +52,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (
 
   // Digit Selection logic
   assign j0     = ~C[P.DIVb+1];             // first step of R digit selection: C = 00...0
-  assign j1     = C[P.DIVb] & ~C[P.DIVb-1]; // second step of R digit selection: C = 1100...0; *** could simplify to ~C[P.DIVb-1] because j=0 case takes priority
+  assign j1     = ~C[P.DIVb-1];             // second step of R digit selection: C = 1100...0; simplified from C[P.DIVb] & ~C[P.DIVb-1] because the j=0 case takes priority
   assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
   assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
   assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
diff --git a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
index fd1092497..bf75532b3 100644
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@@ -95,7 +95,7 @@ module fdivsqrtuslc4cmp (
   // Choose A for current operation 
  always_comb
     if (SqrtE) begin 
-      if (Smsbs[4]) A = 3'b111; // for S = 1.0000  *** can we optimize away this case?
+      if (Smsbs[4]) A = 3'b111; // for S = 1.0000
       else A = Smsbs[2:0];
     end else A = Dmsbs;
     
@@ -108,7 +108,7 @@ module fdivsqrtuslc4cmp (
 
 /* Nannarelli12 design to exploit symmetry is slower because of negation and mux for special case of A = 000
   assign mk0 = -mk1;
-  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide from critical path
+  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table
   */
  
   // Compare residual W to selection constants to choose digit
@@ -117,5 +117,5 @@ module fdivsqrtuslc4cmp (
     else if ($signed(Wmsbs) >= $signed(mk1))  udigit = 4'b0100; // choose 1
     else if ($signed(Wmsbs) >= $signed(mk0))  udigit = 4'b0000; // choose 0
     else if ($signed(Wmsbs) >= $signed(mkm1)) udigit = 4'b0010; // choose -1
-    else                                      udigit = 4'b0001; // choose -2  
+    else                                      udigit = 4'b0001; // choose -2
 endmodule
diff --git a/src/generic/mem/ram1p1rwbe.sv b/src/generic/mem/ram1p1rwbe.sv
index 010e55a30..2c15716d9 100644
--- a/src/generic/mem/ram1p1rwbe.sv
+++ b/src/generic/mem/ram1p1rwbe.sv
@@ -44,8 +44,6 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
   output logic [WIDTH-1:0]        dout
 );
 
-  bit [WIDTH-1:0]               RAM[DEPTH-1:0];
-
   ///////////////////////////////////////////////////////////////////////////////
   // TRUE SRAM macro
   ///////////////////////////////////////////////////////////////////////////////
@@ -83,6 +81,7 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
     // READ first SRAM model
     ///////////////////////////////////////////////////////////////////////////////
   end else begin: ram
+    bit [WIDTH-1:0] RAM[DEPTH-1:0];
     integer i;
 
     if (PRELOAD_ENABLED) begin
diff --git a/src/generic/mem/ram1p1rwe.sv b/src/generic/mem/ram1p1rwe.sv
index a030d2aab..240af6db1 100644
--- a/src/generic/mem/ram1p1rwe.sv
+++ b/src/generic/mem/ram1p1rwe.sv
@@ -41,11 +41,9 @@ module ram1p1rwe import cvw::* ; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44) (
   output logic [WIDTH-1:0]        dout
 );
 
-  bit [WIDTH-1:0]               RAM[DEPTH-1:0];
-
-  // ***************************************************************************
+  //////////////////////////////////////////////////////////////////////////////
   // TRUE SRAM macro
-  // ***************************************************************************
+  //////////////////////////////////////////////////////////////////////////////
   if ((USE_SRAM == 1) & (WIDTH == 128) & (DEPTH == 64)) begin // Cache data subarray
     // 64 x 128-bit SRAM
     ram1p1rwbe_64x128 sram1A (.CLK(clk), .CEB(~ce), .WEB(~we),
@@ -64,12 +62,15 @@ module ram1p1rwe import cvw::* ; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44) (
       .A(addr), .D(din), 
       .BWEB('0), .Q(dout));     
     
-    // ***************************************************************************
+    //////////////////////////////////////////////////////////////////////////////
     // READ first SRAM model
-    // ***************************************************************************
+    //////////////////////////////////////////////////////////////////////////////
   end else begin: ram
     // *** Vivado is not implementing this as block ram for some reason.
     // The version with byte write enables it correctly infers block ram.
+
+    bit [WIDTH-1:0]               RAM[DEPTH-1:0];
+
     integer i;
 
     // Combinational read: register address and read after clock edge
diff --git a/src/generic/mem/ram2p1r1wbe.sv b/src/generic/mem/ram2p1r1wbe.sv
index ba6919958..5a677ffaa 100644
--- a/src/generic/mem/ram2p1r1wbe.sv
+++ b/src/generic/mem/ram2p1r1wbe.sv
@@ -44,7 +44,6 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
   output logic [WIDTH-1:0]         rd1
 );
 
-  bit [WIDTH-1:0]                 mem[DEPTH-1:0];
   localparam                      SRAMWIDTH = 32;
   localparam                      SRAMNUMSETS = SRAMWIDTH/WIDTH;      
 
@@ -105,24 +104,26 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
       .QA(SRAMReadData),
       .QB());
 
-  end else begin
+  end else begin:ram
     
     ///////////////////////////////////////////////////////////////////////////////
     // READ first SRAM model
     ///////////////////////////////////////////////////////////////////////////////
+
+    bit [WIDTH-1:0] RAM[DEPTH-1:0];
     integer i;
 /*    
     initial begin // initialize memory for simulation only; not needed because done in the testbench now
       integer j;
       for (j=0; j < DEPTH; j++) 
-        mem[j] = '0;
+        RAM[j] = '0;
     end 
 */
 
     // Read
     logic [$clog2(DEPTH)-1:0] ra1d;
     flopen #($clog2(DEPTH)) adrreg(clk, ce1, ra1, ra1d);
-    assign rd1 = mem[ra1d];
+    assign rd1 = RAM[ra1d];
     
     // Write divided into part for bytes and part for extra msbs
     // coverage off     
@@ -131,13 +132,13 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
       always @(posedge clk) 
         if (ce2 & we2) 
           for(i = 0; i < WIDTH/8; i++) 
-            if(bwe2[i]) mem[wa2][i*8 +: 8] <= wd2[i*8 +: 8];
+            if(bwe2[i]) RAM[wa2][i*8 +: 8] <= wd2[i*8 +: 8];
     // coverage on
   
     if (WIDTH%8 != 0) // handle msbs if width not a multiple of 8
       always @(posedge clk) 
         if (ce2 & we2 & bwe2[WIDTH/8])
-          mem[wa2][WIDTH-1:WIDTH-WIDTH%8] <= wd2[WIDTH-1:WIDTH-WIDTH%8];
+          RAM[wa2][WIDTH-1:WIDTH-WIDTH%8] <= wd2[WIDTH-1:WIDTH-WIDTH%8];
   end
   
 endmodule
diff --git a/testbench/common/DCacheFlushFSM.sv b/testbench/common/DCacheFlushFSM.sv
index ed9d56342..affb10b10 100644
--- a/testbench/common/DCacheFlushFSM.sv
+++ b/testbench/common/DCacheFlushFSM.sv
@@ -64,13 +64,13 @@ module DCacheFlushFSM import cvw::*; #(parameter cvw_t P)
           .loglinebytelen(loglinebytelen), .sramlen(sramlen))
           copyShadow(.clk,
           .start,
-          .tag(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.RAM[index][P.PA_BITS-1-tagstart:0]),
+          .tag(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.ram.RAM[index][P.PA_BITS-1-tagstart:0]),
           .valid(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].ValidBits[index]),
           .dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].DirtyBits[index]),
                            // these dirty bit selections would be needed if dirty is moved inside the tag array.
           //.dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].dirty.DirtyMem.RAM[index]),
           //.dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.RAM[index][P.PA_BITS+tagstart]),
-          .data(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].word[cacheWord].wordram.CacheDataMem.RAM[index]),
+          .data(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].word[cacheWord].wordram.CacheDataMem.ram.RAM[index]),
           .index(index),
           .cacheWord(cacheWord),
           .CacheData(CacheData[way][index][cacheWord]),
diff --git a/testbench/testbench.sv b/testbench/testbench.sv
index 17def063c..1ab9522b8 100644
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@@ -499,10 +499,10 @@ module testbench;
           readResult = $fread(dut.uncoregen.uncore.bootrom.bootrom.memory.ROM, memFile);
           $fclose(memFile);
           memFile = $fopen(memfilename, "rb");
-          readResult = $fread(dut.uncoregen.uncore.ram.ram.memory.RAM, memFile);
+          readResult = $fread(dut.uncoregen.uncore.ram.ram.memory.ram.RAM, memFile);
           $fclose(memFile);
         end else 
-          $readmemh(memfilename, dut.uncoregen.uncore.ram.ram.memory.RAM);
+          $readmemh(memfilename, dut.uncoregen.uncore.ram.ram.memory.ram.RAM);
         if (TEST == "embench") $display("Read memfile %s", memfilename);
       end
       if (CopyRAM) begin
@@ -511,7 +511,7 @@ module testbench;
         EndIndex = (end_signature_addr >> LogXLEN) + 8;
         BaseIndex = P.UNCORE_RAM_BASE >> LogXLEN;
         for(ShadowIndex = StartIndex; ShadowIndex <= EndIndex; ShadowIndex++) begin
-          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.uncoregen.uncore.ram.ram.memory.RAM[ShadowIndex - BaseIndex];
+          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.uncoregen.uncore.ram.ram.memory.ram.RAM[ShadowIndex - BaseIndex];
         end
       end
     end
@@ -519,7 +519,7 @@ module testbench;
   if (P.DTIM_SUPPORTED) begin
     always @(posedge clk) begin
       if (LoadMem) begin
-        $readmemh(memfilename, dut.core.lsu.dtim.dtim.ram.RAM);
+        $readmemh(memfilename, dut.core.lsu.dtim.dtim.ram.ram.RAM);
         $display("Read memfile %s", memfilename);
       end
       if (CopyRAM) begin
@@ -528,7 +528,7 @@ module testbench;
         EndIndex = (end_signature_addr >> LogXLEN) + 8;
         BaseIndex = P.UNCORE_RAM_BASE >> LogXLEN;
         for(ShadowIndex = StartIndex; ShadowIndex <= EndIndex; ShadowIndex++) begin
-          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.core.lsu.dtim.dtim.ram.RAM[ShadowIndex - BaseIndex];
+          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.core.lsu.dtim.dtim.ram.ram.RAM[ShadowIndex - BaseIndex];
         end
       end
     end
@@ -539,7 +539,7 @@ module testbench;
     always @(posedge clk) 
       if (ResetMem)  // program memory is sometimes reset (e.g. for CoreMark, which needs zeroed memory)
         for (adrindex=0; adrindex<(P.UNCORE_RAM_RANGE>>1+(P.XLEN/32)); adrindex = adrindex+1) 
-          dut.uncoregen.uncore.ram.ram.memory.RAM[adrindex] = '0;
+          dut.uncoregen.uncore.ram.ram.memory.ram.RAM[adrindex] = '0;
 
   ////////////////////////////////////////////////////////////////////////////////
   // Actual hardware