Code cleanup: RAM, fdivsqrt

2025-06-28 01:32:49 -04:00 · 2024-06-14 03:35:05 -07:00 · 2024-06-14 03:35:05 -07:00 · b1c9450b4a
commit b1c9450b4a
parent 6789f32154
10 changed files with 35 additions and 33 deletions
--- a/src/ebu/ahbcacheinterface.sv
+++ b/src/ebu/ahbcacheinterface.sv
@ -114,11 +114,12 @@ module ahbcacheinterface import cvw::*; #(
    .s(~(CacheableOrFlushCacheM)), .y(PreHWDATA));
  flopen #(P.AHBW) wdreg(HCLK, HREADY, PreHWDATA, HWDATA); // delay HWDATA by 1 cycle per spec

-  // *** bummer need a second byte mask for bus as it is AHBW rather than LLEN.
-  // probably can merge by muxing PAdrM's LLEN/8-1 index bit based on HTRANS being != 0.
-  swbytemask #(P.AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(P.AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended());
-  
-  flopen #(P.AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[P.AHBW/8-1:0], HWSTRB);
+  if (READ_ONLY_CACHE) begin
+    assign HWSTRB = '0;
+  end else begin // compute byte mask for AHB transaction based on size and address.  AHBW may be different than LLEN
+    swbytemask #(P.AHBW) busswbytemask(.Size(HSIZE), .Adr(HADDR[$clog2(P.AHBW/8)-1:0]), .ByteMask(BusByteMaskM), .ByteMaskExtended());
+    flopen #(P.AHBW/8) HWSTRBReg(HCLK, HREADY, BusByteMaskM[P.AHBW/8-1:0], HWSTRB);
+  end
  
  buscachefsm #(BeatCountThreshold, AHBWLOGBWPL, READ_ONLY_CACHE, P.BURST_EN) AHBBuscachefsm(
    .HCLK, .HRESETn, .Flush, .BusRW, .BusAtomic, .Stall, .BusCommitted, .BusStall, .CaptureEn, .SelBusBeat,
--- a/src/fpu/fdivsqrt/fdivsqrtiter.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtiter.sv
@ -44,7 +44,7 @@ module fdivsqrtiter import cvw::*;  #(parameter cvw_t P) (
  logic [P.DIVb+3:0]      WCNext[P.DIVCOPIES-1:0]; // Q4.DIVb
  logic [P.DIVb+3:0]      WS[P.DIVCOPIES:0];       // Q4.DIVb
  logic [P.DIVb+3:0]      WC[P.DIVCOPIES:0];       // Q4.DIVb
-  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb // *** probably Q not U.  See Table 16.26 notes
+  logic [P.DIVb:0]        U[P.DIVCOPIES:0];        // U1.DIVb
  logic [P.DIVb:0]        UM[P.DIVCOPIES:0];       // U1.DIVb
  logic [P.DIVb:0]        UNext[P.DIVCOPIES-1:0];  // U1.DIVb
  logic [P.DIVb:0]        UMNext[P.DIVCOPIES-1:0]; // U1.DIVb
--- a/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtpreproc.sv
@ -222,7 +222,7 @@ module fdivsqrtpreproc import cvw::*;  #(parameter cvw_t P) (
    logic               RemOpE;

    /* verilator lint_off WIDTH */
-    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  rn = Cycles * r * k - r ***explain
+    assign IntDivNormShiftE = P.DIVb - (CyclesE * P.RK - P.LOGR); // b - rn, used for integer normalization right shift.  n = (Cycles * k - 1)
    assign IntRemNormShiftE = mE + (P.DIVb-(P.XLEN-1));           // m + b - (N-1) for remainder normalization shift
    /* verilator lint_on WIDTH */
    assign RemOpE = Funct3E[1];
--- a/src/fpu/fdivsqrt/fdivsqrtstage4.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtstage4.sv
@ -52,7 +52,7 @@ module fdivsqrtstage4 import cvw::*;  #(parameter cvw_t P) (

  // Digit Selection logic
  assign j0     = ~C[P.DIVb+1];             // first step of R digit selection: C = 00...0
-  assign j1     = C[P.DIVb] & ~C[P.DIVb-1]; // second step of R digit selection: C = 1100...0; *** could simplify to ~C[P.DIVb-1] because j=0 case takes priority
+  assign j1     = ~C[P.DIVb-1];             // second step of R digit selection: C = 1100...0; simplified from C[P.DIVb] & ~C[P.DIVb-1] because the j=0 case takes priority
  assign Smsbs  = U[P.DIVb:P.DIVb-4];       // U1.4 most significant bits of square root
  assign Dmsbs  = D[P.DIVb-1:P.DIVb-3];     // U0.3 most significant fractional bits of divisor after leading 1
  assign WCmsbs = WC[P.DIVb+3:P.DIVb-4];    // Q4.4 most significant bits of residual
--- a/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
+++ b/src/fpu/fdivsqrt/fdivsqrtuslc4cmp.sv
@ -95,7 +95,7 @@ module fdivsqrtuslc4cmp (
  // Choose A for current operation 
 always_comb
    if (SqrtE) begin 
-      if (Smsbs[4]) A = 3'b111; // for S = 1.0000  *** can we optimize away this case?
+      if (Smsbs[4]) A = 3'b111; // for S = 1.0000
      else A = Smsbs[2:0];
    end else A = Dmsbs;
    
@ -108,7 +108,7 @@ module fdivsqrtuslc4cmp (

 /* Nannarelli12 design to exploit symmetry is slower because of negation and mux for special case of A = 000
  assign mk0 = -mk1;
-  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table *** can we hide from critical path
+  assign mkm1 = (A == 3'b000) ? -13 : -mk2; // asymmetry in table
  */
 
  // Compare residual W to selection constants to choose digit
@ -117,5 +117,5 @@ module fdivsqrtuslc4cmp (
    else if ($signed(Wmsbs) >= $signed(mk1))  udigit = 4'b0100; // choose 1
    else if ($signed(Wmsbs) >= $signed(mk0))  udigit = 4'b0000; // choose 0
    else if ($signed(Wmsbs) >= $signed(mkm1)) udigit = 4'b0010; // choose -1
-    else                                      udigit = 4'b0001; // choose -2  
+    else                                      udigit = 4'b0001; // choose -2
 endmodule
--- a/src/generic/mem/ram1p1rwbe.sv
+++ b/src/generic/mem/ram1p1rwbe.sv
@ -44,8 +44,6 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
  output logic [WIDTH-1:0]        dout
 );

-  bit [WIDTH-1:0]               RAM[DEPTH-1:0];
-
  ///////////////////////////////////////////////////////////////////////////////
  // TRUE SRAM macro
  ///////////////////////////////////////////////////////////////////////////////
@ -83,6 +81,7 @@ module ram1p1rwbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44, PRE
    // READ first SRAM model
    ///////////////////////////////////////////////////////////////////////////////
  end else begin: ram
+    bit [WIDTH-1:0] RAM[DEPTH-1:0];
    integer i;

    if (PRELOAD_ENABLED) begin
--- a/src/generic/mem/ram1p1rwe.sv
+++ b/src/generic/mem/ram1p1rwe.sv
@ -41,11 +41,9 @@ module ram1p1rwe import cvw::* ; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44) (
  output logic [WIDTH-1:0]        dout
 );

-  bit [WIDTH-1:0]               RAM[DEPTH-1:0];
-
-  // ***************************************************************************
+  //////////////////////////////////////////////////////////////////////////////
  // TRUE SRAM macro
-  // ***************************************************************************
+  //////////////////////////////////////////////////////////////////////////////
  if ((USE_SRAM == 1) & (WIDTH == 128) & (DEPTH == 64)) begin // Cache data subarray
    // 64 x 128-bit SRAM
    ram1p1rwbe_64x128 sram1A (.CLK(clk), .CEB(~ce), .WEB(~we),
@ -64,12 +62,15 @@ module ram1p1rwe import cvw::* ; #(parameter USE_SRAM=0, DEPTH=64, WIDTH=44) (
      .A(addr), .D(din), 
      .BWEB('0), .Q(dout));     
    
-    // ***************************************************************************
+    //////////////////////////////////////////////////////////////////////////////
    // READ first SRAM model
-    // ***************************************************************************
+    //////////////////////////////////////////////////////////////////////////////
  end else begin: ram
    // *** Vivado is not implementing this as block ram for some reason.
    // The version with byte write enables is correctly inferred as block ram.
+
+    bit [WIDTH-1:0]               RAM[DEPTH-1:0];
+
    integer i;

    // Combinational read: register address and read after clock edge
--- a/src/generic/mem/ram2p1r1wbe.sv
+++ b/src/generic/mem/ram2p1r1wbe.sv
@ -44,7 +44,6 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
  output logic [WIDTH-1:0]         rd1
 );

-  bit [WIDTH-1:0]                 mem[DEPTH-1:0];
  localparam                      SRAMWIDTH = 32;
  localparam                      SRAMNUMSETS = SRAMWIDTH/WIDTH;      

@ -105,24 +104,26 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
      .QA(SRAMReadData),
      .QB());

-  end else begin
+  end else begin:ram
    
    ///////////////////////////////////////////////////////////////////////////////
    // READ first SRAM model
    ///////////////////////////////////////////////////////////////////////////////
+
+    bit [WIDTH-1:0] RAM[DEPTH-1:0];
    integer i;
 /*    
    initial begin // initialize memory for simulation only; not needed because done in the testbench now
      integer j;
      for (j=0; j < DEPTH; j++) 
-        mem[j] = '0;
+        RAM[j] = '0;
    end 
 */

    // Read
    logic [$clog2(DEPTH)-1:0] ra1d;
    flopen #($clog2(DEPTH)) adrreg(clk, ce1, ra1, ra1d);
-    assign rd1 = mem[ra1d];
+    assign rd1 = RAM[ra1d];
    
    // Write divided into part for bytes and part for extra msbs
    // coverage off     
@ -131,13 +132,13 @@ module ram2p1r1wbe import cvw::*; #(parameter USE_SRAM=0, DEPTH=1024, WIDTH=68)
      always @(posedge clk) 
        if (ce2 & we2) 
          for(i = 0; i < WIDTH/8; i++) 
-            if(bwe2[i]) mem[wa2][i*8 +: 8] <= wd2[i*8 +: 8];
+            if(bwe2[i]) RAM[wa2][i*8 +: 8] <= wd2[i*8 +: 8];
    // coverage on
  
    if (WIDTH%8 != 0) // handle msbs if width not a multiple of 8
      always @(posedge clk) 
        if (ce2 & we2 & bwe2[WIDTH/8])
-          mem[wa2][WIDTH-1:WIDTH-WIDTH%8] <= wd2[WIDTH-1:WIDTH-WIDTH%8];
+          RAM[wa2][WIDTH-1:WIDTH-WIDTH%8] <= wd2[WIDTH-1:WIDTH-WIDTH%8];
  end
  
 endmodule
--- a/testbench/common/DCacheFlushFSM.sv
+++ b/testbench/common/DCacheFlushFSM.sv
@ -64,13 +64,13 @@ module DCacheFlushFSM import cvw::*; #(parameter cvw_t P)
          .loglinebytelen(loglinebytelen), .sramlen(sramlen))
          copyShadow(.clk,
          .start,
-          .tag(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.RAM[index][P.PA_BITS-1-tagstart:0]),
+          .tag(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.ram.RAM[index][P.PA_BITS-1-tagstart:0]),
          .valid(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].ValidBits[index]),
          .dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].DirtyBits[index]),
                           // these dirty bit selections would be needed if dirty is moved inside the tag array.
          //.dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].dirty.DirtyMem.RAM[index]),
          //.dirty(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].CacheTagMem.ram.RAM[index][P.PA_BITS+tagstart]),
-          .data(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].word[cacheWord].wordram.CacheDataMem.RAM[index]),
+          .data(testbench.dut.core.lsu.bus.dcache.dcache.CacheWays[way].word[cacheWord].wordram.CacheDataMem.ram.RAM[index]),
          .index(index),
          .cacheWord(cacheWord),
          .CacheData(CacheData[way][index][cacheWord]),
--- a/testbench/testbench.sv
+++ b/testbench/testbench.sv
@ -499,10 +499,10 @@ module testbench;
          readResult = $fread(dut.uncoregen.uncore.bootrom.bootrom.memory.ROM, memFile);
          $fclose(memFile);
          memFile = $fopen(memfilename, "rb");
-          readResult = $fread(dut.uncoregen.uncore.ram.ram.memory.RAM, memFile);
+          readResult = $fread(dut.uncoregen.uncore.ram.ram.memory.ram.RAM, memFile);
          $fclose(memFile);
        end else 
-          $readmemh(memfilename, dut.uncoregen.uncore.ram.ram.memory.RAM);
+          $readmemh(memfilename, dut.uncoregen.uncore.ram.ram.memory.ram.RAM);
        if (TEST == "embench") $display("Read memfile %s", memfilename);
      end
      if (CopyRAM) begin
@ -511,7 +511,7 @@ module testbench;
        EndIndex = (end_signature_addr >> LogXLEN) + 8;
        BaseIndex = P.UNCORE_RAM_BASE >> LogXLEN;
        for(ShadowIndex = StartIndex; ShadowIndex <= EndIndex; ShadowIndex++) begin
-          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.uncoregen.uncore.ram.ram.memory.RAM[ShadowIndex - BaseIndex];
+          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.uncoregen.uncore.ram.ram.memory.ram.RAM[ShadowIndex - BaseIndex];
        end
      end
    end
@ -519,7 +519,7 @@ module testbench;
  if (P.DTIM_SUPPORTED) begin
    always @(posedge clk) begin
      if (LoadMem) begin
-        $readmemh(memfilename, dut.core.lsu.dtim.dtim.ram.RAM);
+        $readmemh(memfilename, dut.core.lsu.dtim.dtim.ram.ram.RAM);
        $display("Read memfile %s", memfilename);
      end
      if (CopyRAM) begin
@ -528,7 +528,7 @@ module testbench;
        EndIndex = (end_signature_addr >> LogXLEN) + 8;
        BaseIndex = P.UNCORE_RAM_BASE >> LogXLEN;
        for(ShadowIndex = StartIndex; ShadowIndex <= EndIndex; ShadowIndex++) begin
-          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.core.lsu.dtim.dtim.ram.RAM[ShadowIndex - BaseIndex];
+          testbench.DCacheFlushFSM.ShadowRAM[ShadowIndex] = dut.core.lsu.dtim.dtim.ram.ram.RAM[ShadowIndex - BaseIndex];
        end
      end
    end
@ -539,7 +539,7 @@ module testbench;
    always @(posedge clk) 
      if (ResetMem)  // program memory is sometimes reset (e.g. for CoreMark, which needs zeroed memory)
        for (adrindex=0; adrindex<(P.UNCORE_RAM_RANGE>>1+(P.XLEN/32)); adrindex = adrindex+1) 
-          dut.uncoregen.uncore.ram.ram.memory.RAM[adrindex] = '0;
+          dut.uncoregen.uncore.ram.ram.memory.ram.RAM[adrindex] = '0;

  ////////////////////////////////////////////////////////////////////////////////
  // Actual hardware