Merge pull request #1433 from davidharrishmc/dev

PMP granularity, trick box support, exercise solutions
2025-06-27 17:01:20 -04:00 · 2025-06-04 17:59:20 -07:00 · 2025-06-04 17:59:20 -07:00 · d006d017ed
commit d006d017ed
parent fb0451e7ce 55df24f00b
49 changed files with 984 additions and 112 deletions
--- a/bin/regression-wally
+++ b/bin/regression-wally
@ -317,7 +317,6 @@ class bcolors:
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

-
 def addTests(testList, sim, coverStr, configs):
    sim_logdir = f"{regressionDir}/{sim}/logs/"
    for test in testList:
@ -489,12 +488,11 @@ def selectTests(args, sims, coverStr):
        addTests(tests_buildrootbootlockstep, lockstepsim, coverStr, configs) # lockstep with Questa and ImperasDV runs overnight
    # only run RV64GC tests on in code coverage mode
    if args.ccov:
-        addTestsByDir(f"{archVerifDir}/tests/rv64/", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1)
        addTestsByDir(f"{archVerifDir}/tests/priv/rv64/", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1) # doesn't help coverage much dh 4/12/25
-        addTestsByDir(WALLY+"/tests/coverage/", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1)
-        # Extra tests from riscv-arch-test that should be run as part of the functional coverage suite
        addTestsByDir(f"{WALLY}/tests/riscof/work/riscv-arch-test/rv64i_m/pmp", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1)
        addTestsByDir(f"{WALLY}/tests/riscof/work/wally-riscv-arch-test/rv64i_m/privilege", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1)
+        addTestsByDir(f"{archVerifDir}/tests/rv64/", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1)
+        addTestsByDir(WALLY+"/tests/coverage/", "rv64gc", coveragesim, coverStr, configs, lockstepMode=1)
    # run tests in lockstep in functional coverage mode
    if args.fcov or args.nightly:
        addTestsByDir(f"{archVerifDir}/tests/rv32/", "rv32gc", coveragesim, coverStr, configs, lockstepMode=1)
--- a/config/rv32e/config.vh
+++ b/config/rv32e/config.vh
@ -137,6 +137,7 @@ localparam logic IDIV_ON_FPU = 0;

 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd0;
+localparam PMP_G = 32'b0; // grain of 4 bytes is supported for uncached RV32

 // Address space
 localparam logic [63:0] RESET_VECTOR = 64'h80000000;
--- a/config/rv32gc/config.vh
+++ b/config/rv32gc/config.vh
@ -137,6 +137,9 @@ localparam logic IDIV_ON_FPU = 0;

 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd16;
+// grain size should be a full cache line to avoid problems with accesses within a cache line
+// that span grain boundaries but are handled without a spill
+localparam PMP_G = 32'd4; // 64 bytes for 512-bit cache line

 // Address space
 localparam logic [63:0] RESET_VECTOR = 64'h80000000;
--- a/config/rv32gc/coverage.svh
+++ b/config/rv32gc/coverage.svh
@ -32,6 +32,7 @@
 // Note: Zmmul is a subset of M, so usually only one or the other would be used.
 `define ZMMUL_COVERAGE
 `define ZICOND_COVERAGE
+`define ZIFENCEI_COVERAGE
 `define ZCA_COVERAGE
 `define ZCB_COVERAGE
 `define ZCF_COVERAGE
@ -58,10 +59,12 @@
 `define ENDIANU_COVERAGE
 `define ENDIANS_COVERAGE
 `define ENDIANM_COVERAGE
+`define ENDIANZALRSC_COVERAGE
+`define ENDIANZAAMO_COVERAGE
 `define EXCEPTIONSM_COVERAGE
 `define EXCEPTIONSS_COVERAGE
 `define EXCEPTIONSU_COVERAGE
-`define EXCEPTIONSV_COVERAGE
+//`define EXCEPTIONSV_COVERAGE
 `define EXCEPTIONSF_COVERAGE
 `define EXCEPTIONSZC_COVERAGE
 `define EXCEPTIONSZAAMO_COVERAGE
--- a/config/rv32gc/imperas.ic
+++ b/config/rv32gc/imperas.ic
@ -45,7 +45,9 @@

 # PMP Configuration
 --override cpu/PMP_registers=16
--override cpu/PMP_undefined=T
+--override cpu/PMP_grain=4   # 64-byte grains to match cache line width
+--override cpu/PMP_decompose=T  # unaligned accesses are decomposed into separate aligned accesses
+--override cpu/PMP_undefined=T # access to unimplemented PMP registers cause illegal instruction exception

 # PMA Settings
 # 'r': read access allowed
--- a/config/rv32i/config.vh
+++ b/config/rv32i/config.vh
@ -137,6 +137,7 @@ localparam logic IDIV_ON_FPU = 0;

 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd0;
+localparam PMP_G = 32'b0; // grain of 4 bytes is supported for uncached RV32

 // Address space
 localparam logic [63:0] RESET_VECTOR = 64'h80000000;
--- a/config/rv32imc/config.vh
+++ b/config/rv32imc/config.vh
@ -137,6 +137,7 @@ localparam logic IDIV_ON_FPU = 0;

 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd0;
+localparam PMP_G = 32'b0; // grain of 4 bytes is supported for uncached RV32

 // Address space
 localparam logic [63:0] RESET_VECTOR = 64'h80000000;
--- a/config/rv64gc/config.vh
+++ b/config/rv64gc/config.vh
@ -138,6 +138,10 @@ localparam logic IDIV_ON_FPU = 1;
 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd16;

+// grain size should be a full cache line to avoid problems with accesses within a cache line
+// that span grain boundaries but are handled without a spill
+localparam PMP_G = 32'd4;  //e.g. 4 for 64-byte grains (512-bit cache lines)
+
 // Address space
 localparam logic [63:0] RESET_VECTOR = 64'h0000000080000000;

--- a/config/rv64gc/coverage.svh
+++ b/config/rv64gc/coverage.svh
@ -32,6 +32,7 @@
 // Note: Zmmul is a subset of M, so usually only one or the other would be used.
 `define ZMMUL_COVERAGE
 `define ZICOND_COVERAGE
+`define ZIFENCEI_COVERAGE
 `define ZCA_COVERAGE
 `define ZCB_COVERAGE
 `define ZCD_COVERAGE
@ -55,10 +56,12 @@
 `define ENDIANU_COVERAGE
 `define ENDIANS_COVERAGE
 `define ENDIANM_COVERAGE
+`define ENDIANZALRSC_COVERAGE
+`define ENDIANZAAMO_COVERAGE
 `define EXCEPTIONSM_COVERAGE
 `define EXCEPTIONSS_COVERAGE
 `define EXCEPTIONSU_COVERAGE
-`define EXCEPTIONSV_COVERAGE
+//`define EXCEPTIONSV_COVERAGE
 `define EXCEPTIONSF_COVERAGE
 `define EXCEPTIONSZC_COVERAGE
 `define EXCEPTIONSVM_COVERAGE
--- a/config/rv64gc/imperas.ic
+++ b/config/rv64gc/imperas.ic
@ -48,7 +48,9 @@

 # PMP Configuration
 --override cpu/PMP_registers=16
--override cpu/PMP_undefined=T
+--override cpu/PMP_grain=4   # 64-byte grains to match cache line width
+--override cpu/PMP_decompose=T  # unaligned accesses are decomposed into separate aligned accesses
+--override cpu/PMP_undefined=T # access to unimplemented PMP registers cause illegal instruction exception

 # PMA Settings
 # 'r': read access allowed
--- a/config/rv64i/config.vh
+++ b/config/rv64i/config.vh
@ -137,6 +137,7 @@ localparam logic IDIV_ON_FPU = 0;

 // Legal number of PMP entries are 0, 16, or 64
 localparam PMP_ENTRIES = 32'd0;
+localparam PMP_G = 32'b0; // G = 0 selects a 2^(G+2) = 4-byte grain; NOTE(review): an 8-byte grain for uncached RV64 would need PMP_G = 1 -- confirm intent (moot while PMP_ENTRIES = 0)

 // Address space
 localparam logic [63:0] RESET_VECTOR = 64'h0000000080000000;
--- a/config/shared/parameter-defs.vh
+++ b/config/shared/parameter-defs.vh
@ -48,6 +48,7 @@ localparam cvw_t P = '{
  IDIV_BITSPERCYCLE :        IDIV_BITSPERCYCLE,
  IDIV_ON_FPU :        IDIV_ON_FPU,
  PMP_ENTRIES :        PMP_ENTRIES,
+  PMP_G :               PMP_G,
  RESET_VECTOR :        RESET_VECTOR,
  WFI_TIMEOUT_BIT :        WFI_TIMEOUT_BIT,
  DTIM_SUPPORTED :        DTIM_SUPPORTED,
--- a/docs/testplans/testplan.md
+++ b/docs/testplans/testplan.md
@ -7,13 +7,15 @@ CORE-V Wally is functionally tested in the following ways.  Each test is run in
 | Verilator Lint      | 5.3            | All configs  | rv64gc | lint-wally            | PASS   | regression-wally --nightly |
 | Instructions        | 3.7            | All configs  | rv64gc | riscv-arch-test       | PASS   | regression-wally --nightly |
 | Privileged          | 3.7            | All configs  | rv64gc | wally-riscv-arch-test | PASS   | regression-wally --nightly |
-| Floating-point      | 5.11.7, 16.5.3 | rv{32/64}gc + derived | rv64gc    | TestFloat | FAIL   | regression-wally --nightly |
-| CoreMark            | 21.1           | Many configs | rv64gc | CoreMark              |        | regression-wally --nightly |
-| Embench             | 21.2           | rv32*        | n/a    | Embench               |        | regression-wally --nightly |
-| Cache PV            | 21.3.1         | rv{32/64}gc  | rv64gc | TBD                   | TBD    | TBD |
-| Cache PV            | 21.3.2         | rv{32/64}gc  | rv64gc | TBD                   | TBD    | TBD |
-| Linux Boot          | 22.3.2         | rv64gc       | rv64gc | TBD                   | TBD    | TBD |
-| FPGA Linux Boot     | 23.2           |              | rv64gc | TBD                   | TBD    | TBD |
-| Code Coverage       | 5.11.10        |              | rv64gc | TBD                   | TBD    | TBD |
-| Functional Coverage | 5.11.11        |              | rv64gc | TBD                   | TBD    | TBD |
+| Floating-point      | 5.11.7, 16.5.3 | rv{32/64}gc + derived | rv64gc    | TestFloat | PASS   | regression-wally --nightly |
+| CoreMark            | 21.1           | Many configs | rv64gc | CoreMark              | PASS       | regression-wally --nightly |
+| Embench             | 21.2           | rv32*        | n/a    | Embench               | PASS       | regression-wally --nightly |
+| Cache PV            | 21.3.1         | rv{32/64}gc  | rv64gc | CacheSim                   | TBD    | TBD |
+| Branch PV            | 21.3.2         | rv{32/64}gc  | rv64gc | BP simulator                   | TBD    | TBD |
+| Linux Boot          | 22.3.2         | rv64gc       | rv64gc | buildroot                  | PASS    | regression-wally --nightly |
+| FPGA Linux Boot     | 23.2           |              | rv64gc | Lab test                   | PASS    | TBD |
+| Code Coverage       | 5.11.10        |              | rv64gc | multiple tests                   | TBD    | regression-wally --nightly |
+| Functional Coverage | 5.11.11        |              | rv64gc | cvw-arch-verif                   | TBD    | regression-wally --nightly |

+Functional Verification plans are in
+https://drive.google.com/drive/folders/11hTR2Yl48kOMODxhwrSsC-eXYtM_rJJE?usp=sharing
--- a/examples/exercises/11p6/11p6.S
+++ b/examples/exercises/11p6/11p6.S
@ -0,0 +1,77 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+
+    # set up PMP so all of memory is accessible and we don't trap when entering supervisor mode
+    # Define region 0 to cover all addresses as RWX
+    nop
+    csrw pmpcfg0, 0xF   # configure PMP0 to TOR RWX
+    li t0, 0xFFFFFFFF   
+    csrw pmpaddr0, t0   # configure PMP0 top of range to 0xFFFFFFFF to allow all 32-bit addresses
+
+    # switch to supervisor mode
+    # Set mstatus.MPP to 01, set mepc to the address where supervisor code should begin, then do mret
+    la t0, supervisorstart
+    csrw mepc, t0           # mepc = address where supervisor code starts
+    li t0, 1                
+    slli t1, t0, 11         # 1 in bit 11
+    csrs mstatus, t1        # set mstatus bit 11 (low bit of MPP)
+    slli t1, t0, 12         # 1 in bit 12
+    csrc mstatus, t1        # clear mstatus bit 12, leaving mstatus.MPP = 01 (for supervisor mode)
+    mret                    # enter supervisor mode at supervisorstart
+    nop
+
+supervisorstart:
+    la t0, pagetable        # get address of root page table
+    srli t0, t0, 12         # extract PPN of root page table
+    li t1, 1
+    slli t1, t1, 31         # 1 in bit 31
+    or t0, t0, t1           # satp value to enable SV32 with root page table
+    csrw satp, t0           # enable virtual memory
+    
+    # now we are executing on virtual page 0x80000, which is also physical page 0x80000
+    li t0, 0x90000300       
+    li t1, 42
+    sw t1, 0(t0)
+
+
+    la t0, testcode         # address of a routine to run
+    lui t1, 0x10000         
+    add t0, t0, t1          # address of testcode on virtual page 0x90000 
+    jr t0                   # jump to the testcode on Virtual page 0x90000, 
+                            #   which still points to same code on physical page 0x80000
+    nop                     # shouldn't be executed
+
+testcode:
+    li t0, 42                # do something
+
+write_tohost:
+    la s1, tohost           # terminate with write tohost
+    li t0, 1                # 1 for success, 3 for failure
+    sw t0, 0(s1)            # send success code
+    sw zero, 4(s1)          # not obvious why Sail isn't terminating unless this write is done
+
+self_loop:
+    j self_loop
+
+tohost:
+    .word 0
+
+.data
+
+
+.align 16
+# root (L1) Page table situated at 0x80010000
+pagetable:
+    .space 2048 # skip over addresses below 0x80000000
+    .4byte 0x20004401 # VPN1 = 512 (VA = 0x80000000) points to L0 page table at 80011000
+    .space 252 # skip over addresses below 0x90000000
+    .4byte 0x20004401 # VPN 576 (VA = 0x90000000) points to L0 page table at 0x80011000
+
+.align 12
+# L0 page table situated at 0x80011000
+    .4byte 0x200000CF # VPN 0 points to physical kilopage at 0x80000000 with Dirty, Access, XWR, Valid  
+
+
+
--- a/examples/exercises/11p6/Makefile
+++ b/examples/exercises/11p6/Makefile
@ -0,0 +1,17 @@
+TARGET = 11p6
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32i_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate in ImperasDV lockstep
+sim: $(TARGET).elf.objdump
+	wsim rv32gc $(TARGET).elf --lockstepverbose
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p3/8p3.S
+++ b/examples/exercises/8p3/8p3.S
@ -0,0 +1,9 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+    li t0, 32			# 1 in bit 5
+    csrs mstatush, t0	# set bit 5 (mstatush.MBE)
+
+self_loop:
+    j self_loop
--- a/examples/exercises/8p3/Makefile
+++ b/examples/exercises/8p3/Makefile
@ -0,0 +1,17 @@
+TARGET = 8p3
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32i_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate in ImperasDV lockstep
+sim: $(TARGET).elf.objdump
+	wsim rv32gc $(TARGET).elf --lockstepverbose
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p4/8p4.S
+++ b/examples/exercises/8p4/8p4.S
@ -0,0 +1,20 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+    csrr a0, cycle      # read cycle register before computation
+
+    # add up numbers from 1 to 10.  Place result in s0
+    li s0, 0            # initialize sum to 0 in s0
+    li s1, 1            # initialize loop variable i to 1 in s1
+    li t0, 10           # temporary for maximum number
+loop:
+    add s0, s0, s1      # sum = sum + i
+    addi s1, s1, 1      # i = i + 1
+    ble s1, t0, loop    # repeat while i <= 10
+
+    csrr a1, cycle      # read cycle register after computation
+    sub a0, a1, a0      # compute difference in a0
+
+self_loop:
+    j self_loop
--- a/examples/exercises/8p4/Makefile
+++ b/examples/exercises/8p4/Makefile
@ -0,0 +1,17 @@
+TARGET = 8p4
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32i_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate in ImperasDV lockstep
+sim: $(TARGET).elf.objdump
+	wsim rv32gc $(TARGET).elf --lockstepverbose
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p5/8p5.S
+++ b/examples/exercises/8p5/8p5.S
@ -0,0 +1,29 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+
+    # set up trap handler
+    la t0, trap_handler # address of trap handler
+    csrw mtvec, t0      # mtvec = pointer to trap handler
+    la t0, trapstack    # address of trap stack
+    csrw mscratch, t0   # mscratch = pointer to trap stack
+
+    lw t0, 1(zero)      # cause access or misaligned load fault to invoke trap handler
+
+self_loop:
+    j self_loop
+
+
+trap_handler:
+    csrrw tp, mscratch, tp  # swap tp and mscratch to put a trap stack pointer in tp
+    sw t0, 0(tp)            # save t0 on trap stack
+    csrr t0, mepc           # read mepc
+    addi t0, t0, 4          # mepc + 4
+    csrw mepc, t0           # mepc = mepc + 4 (return to next instruction)
+    lw t0, 0(tp)            # restore t0 from trap stack
+    csrrw tp, mscratch, tp  # restore tp and trap stack pointer
+    mret
+
+trapstack:
+    .word 0                 # room to save a register
--- a/examples/exercises/8p5/Makefile
+++ b/examples/exercises/8p5/Makefile
@ -0,0 +1,17 @@
+TARGET = 8p5
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32i_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate in ImperasDV lockstep
+sim: $(TARGET).elf.objdump
+	wsim rv32gc $(TARGET).elf --lockstepverbose
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p6/8p6.S
+++ b/examples/exercises/8p6/8p6.S
@ -0,0 +1,164 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+
+    # set up trap handler
+    la t0, trap_handler # address of trap handler
+    csrw mtvec, t0      # mtvec = pointer to trap handler
+    la t0, trapstack    # address of trap stack
+    csrw mscratch, t0   # mscratch = pointer to trap stack
+
+    li t0, 7        
+    li t1, 9
+    mul t2, t0, t1      # try 7 * 9.  It will trap and invoke trap handler, which emulates the multiply
+
+self_loop:
+    j self_loop
+
+
+trap_handler:
+    csrrw tp, mscratch, tp  # swap tp and mscratch to put a trap stack pointer in tp
+
+    # save all registers on trap stack.  We will need to index into them to find the arguments to emulate multiply
+    sw x0, 0(tp)            # x0 is 0, but we might want to use it
+    sw x1, 4(tp)
+    sw x2, 8(tp)
+    sw x3, 12(tp)
+    sw x4, 16(tp)
+    sw x5, 20(tp)
+    sw x6, 24(tp)
+    sw x7, 28(tp)
+    sw x8, 32(tp)
+    sw x9, 36(tp)
+    sw x10, 40(tp)
+    sw x11, 44(tp)
+    sw x12, 48(tp)
+    sw x13, 52(tp)
+    sw x14, 56(tp)
+    sw x15, 60(tp)
+    sw x16, 64(tp)
+    sw x17, 68(tp)
+    sw x18, 72(tp)
+    sw x19, 76(tp)
+    sw x20, 80(tp)
+    sw x21, 84(tp)
+    sw x22, 88(tp)
+    sw x23, 92(tp)
+    sw x24, 96(tp)
+    sw x25, 100(tp)
+    sw x26, 104(tp)
+    sw x27, 108(tp)
+    sw x28, 112(tp)
+    sw x29, 116(tp)
+    sw x30, 120(tp)
+    sw x31, 124(tp)
+
+    csrr t0, mcause         # check cause of trap
+    li t1, 2                # cause 2 is illegal instruction
+    bne t0, t1, exit        # exit for any other trap than illegal instruction
+
+    # check if instruction is mul (op = 0110011, funct3 = 000, funct7 = 0000001)
+    csrr t0, mtval          # fetch instruction that caused trap
+    andi t1, t0, 127        # get op field (instr[6:0])
+    xori t1, t1, 0b0110011  # set to 0 if op is 0110011
+    srli t2, t0, 12         # get funct3 field (instr[14:12])
+    andi t2, t2, 7          # mask off other bits.  Should be 0 if funct3 = 000
+    srli t3, t0, 25         # get funct7 field (instr[31:25]).  No need to mask
+    xori t3, t3, 0b0000001  # set to 0 if funct7 = 0000001
+    or t1, t1, t2           # nonzero if op or funct3 mismatch
+    or t1, t1, t3           # nonzero if instruction is not mul
+    bnez t1, exit           # exit for any other instruction than mul
+
+    # emulate mul: fetch arguments
+    srli t1, t0, 15         # extract rs1 from instr[19:15]
+    andi t1, t1, 31         # mask off other bits
+    slli t1, t1, 2          # multiply rs1 by 4 to make it a word index
+    add t1, tp, t1          # find location of rs1 on trap stack
+    lw t1, 0(t1)            # read value of rs1
+    srli t2, t0, 20         # extract rs2 from instr[24:20]
+    andi t2, t2, 31         # mask off other bits
+    slli t2, t2, 2          # multiply rs2 by 4 to make it a word index
+    add t2, tp, t2          # find location of rs2 on trap stack
+    lw t2, 0(t2)            # read value of rs2
+
+    # emulate mul p = x * y: shift and add
+    # x in t1, y in t2, p in t3
+    // p = 0
+    // while (y != 0) {      # iterate until all bits of y are consumed
+    //   if (y%2) p = p + x  # add x to running total
+    //   y = y >> 1          # go on to next bit
+    //   x = x << 1          # shift x to double
+    // }
+
+    li t3, 0                # p = 0
+mulloop:
+    beqz t2, muldone        # done if y == 0
+    andi t4, t2, 1          # t4 = y % 2
+    beqz t4, skipadd        # don't increment p if y%2 == 0
+    add t3, t3, t1          # otherwise p = p + x
+skipadd:
+    srli t2, t2, 1          # y = y >> 1
+    slli t1, t1, 1          # x = x << 1
+    j mulloop               # repeat until done
+muldone:
+
+    # find rd and put result there
+    srli t1, t0, 7          # extract rd from instr[11:7]
+    andi t1, t1, 31         # mask off other bits
+    slli t1, t1, 2          # multiply rd by 4 to make it a word index
+    add t1, tp, t1          # find location of rd on trap stack
+    sw t3, 0(t1)            # store result into rd storage on trap stack
+
+    # return to next instruction
+
+    csrr t0, mepc           # read mepc
+    addi t0, t0, 4          # mepc + 4
+    csrw mepc, t0           # mepc = mepc + 4 (return to next instruction)
+    # restore all of the registers from the trap stack (rd could be in any one)
+    lw x1, 4(tp)
+    lw x2, 8(tp)
+    lw x3, 12(tp)
+    lw x4, 16(tp)
+    lw x5, 20(tp)
+    lw x6, 24(tp)
+    lw x7, 28(tp)
+    lw x8, 32(tp)
+    lw x9, 36(tp)
+    lw x10, 40(tp)
+    lw x11, 44(tp)
+    lw x12, 48(tp)
+    lw x13, 52(tp)
+    lw x14, 56(tp)
+    lw x15, 60(tp)
+    lw x16, 64(tp)
+    lw x17, 68(tp)
+    lw x18, 72(tp)
+    lw x19, 76(tp)
+    lw x20, 80(tp)
+    lw x21, 84(tp)
+    lw x22, 88(tp)
+    lw x23, 92(tp)
+    lw x24, 96(tp)
+    lw x25, 100(tp)
+    lw x26, 104(tp)
+    lw x27, 108(tp)
+    lw x28, 112(tp)
+    lw x29, 116(tp)
+    lw x30, 120(tp)
+    lw x31, 124(tp)
+    csrrw tp, mscratch, tp  # restore tp and trap stack pointer
+    mret
+
+exit:
+    la t1, tohost
+    li t0, 3            # 1 for success, 3 for failure
+    sw t0, 0(t1)        # send fail code
+    j self_loop         # wait
+    
+.section .tohost 
+tohost:                 # write to HTIF
+    .dword 0
+
+trapstack:
+    .fill 32, 4             # room to save registers
--- a/examples/exercises/8p6/Makefile
+++ b/examples/exercises/8p6/Makefile
@ -0,0 +1,17 @@
+TARGET = 8p6
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32im_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate with Spike
+sim: $(TARGET).elf.objdump
+	spike --isa=rv32i_zicsr -d $(TARGET).elf
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p7/8p7.S
+++ b/examples/exercises/8p7/8p7.S
@ -0,0 +1,134 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+
+    # set up trap handler
+    la t0, trap_handler # address of trap handler
+    csrw mtvec, t0      # mtvec = pointer to trap handler
+    la t0, trapstack    # address of trap stack
+    csrw mscratch, t0   # mscratch = pointer to trap stack
+
+    la t0, destination  # get address to make a load
+    lw t0, 3(t0)        # misaligned load will invoke trap handler
+    # should return 0x23456789 in t0 (bytes 3..6 of the little-endian dword at destination)
+
+self_loop:
+    j self_loop
+
+
+trap_handler:
+    csrrw tp, mscratch, tp  # swap tp and mscratch to put a trap stack pointer in tp
+
+    # save all registers on trap stack.  We will need to index into them to write the destination register of the emulated load
+    sw x0, 0(tp)            # x0 is 0, but we might want to use it
+    sw x1, 4(tp)
+    sw x2, 8(tp)
+    sw x3, 12(tp)
+    sw x4, 16(tp)
+    sw x5, 20(tp)
+    sw x6, 24(tp)
+    sw x7, 28(tp)
+    sw x8, 32(tp)
+    sw x9, 36(tp)
+    sw x10, 40(tp)
+    sw x11, 44(tp)
+    sw x12, 48(tp)
+    sw x13, 52(tp)
+    sw x14, 56(tp)
+    sw x15, 60(tp)
+    sw x16, 64(tp)
+    sw x17, 68(tp)
+    sw x18, 72(tp)
+    sw x19, 76(tp)
+    sw x20, 80(tp)
+    sw x21, 84(tp)
+    sw x22, 88(tp)
+    sw x23, 92(tp)
+    sw x24, 96(tp)
+    sw x25, 100(tp)
+    sw x26, 104(tp)
+    sw x27, 108(tp)
+    sw x28, 112(tp)
+    sw x29, 116(tp)
+    sw x30, 120(tp)
+    sw x31, 124(tp)
+
+    csrr t0, mcause         # check cause of trap
+    li t1, 4                # cause 4 is misaligned load
+    bne t0, t1, trap_return # return for any other cause
+
+    # check if instruction is lw (op=0000011, funct3 = 010)
+    csrr t0, mepc           # address of faulting instruction
+    lw t3, 0(t0)            # fetch the instruction.  It must have been a load.
+    srli t1, t3, 12         # get funct3 field (instr[14:12])
+    andi t1, t1, 7          # mask off other bits.
+    xori t1, t1, 0b010      # should produce 0 if funct3 = 010
+    bnez t1, trap_return    # return if any other kind of load
+
+    # emulate lw by performing four byte loads
+    csrr t0, mtval          # faulting (misaligned) data address of the load; the instruction address came from mepc
+    lbu t1, 0(t0)           # read zeroth byte
+    lbu t2, 1(t0)           # read the first byte
+    slli t2, t2, 8          # shift into position
+    or t1, t1, t2           # merge with zeroth byte
+    lbu t2, 2(t0)           # read the second byte
+    slli t2, t2, 16         # shift into position
+    or t1, t1, t2           # merge with previous two bytes
+    lbu t2, 3(t0)           # read the third byte
+    slli t2, t2, 24         # shift into position
+    or t2, t1, t2           # merge with previous three bytes; t2 now holds the assembled word
+
+    # find rd and put result there
+    srli t1, t3, 7          # extract rd from instr[11:7]
+    andi t1, t1, 31         # mask off other bits
+    slli t1, t1, 2          # multiply rd by 4 to make it a word index
+    add t1, tp, t1          # find location of rd on trap stack
+    sw t2, 0(t1)            # store result into rd storage on trap stack
+
+    # return to next instruction
+
+trap_return:
+    csrr t0, mepc           # read mepc
+    addi t0, t0, 4          # mepc + 4
+    csrw mepc, t0           # mepc = mepc + 4 (return to next instruction)
+    # restore all of the registers from the trap stack (rd could be in any one)
+    lw x1, 4(tp)
+    lw x2, 8(tp)
+    lw x3, 12(tp)
+    lw x4, 16(tp)
+    lw x5, 20(tp)
+    lw x6, 24(tp)
+    lw x7, 28(tp)
+    lw x8, 32(tp)
+    lw x9, 36(tp)
+    lw x10, 40(tp)
+    lw x11, 44(tp)
+    lw x12, 48(tp)
+    lw x13, 52(tp)
+    lw x14, 56(tp)
+    lw x15, 60(tp)
+    lw x16, 64(tp)
+    lw x17, 68(tp)
+    lw x18, 72(tp)
+    lw x19, 76(tp)
+    lw x20, 80(tp)
+    lw x21, 84(tp)
+    lw x22, 88(tp)
+    lw x23, 92(tp)
+    lw x24, 96(tp)
+    lw x25, 100(tp)
+    lw x26, 104(tp)
+    lw x27, 108(tp)
+    lw x28, 112(tp)
+    lw x29, 116(tp)
+    lw x30, 120(tp)
+    lw x31, 124(tp)
+    csrrw tp, mscratch, tp  # restore tp and trap stack pointer
+    mret
+
+destination:
+    .dword 0x0123456789ABCDEF   # fill destination with some stuff
+
+trapstack:
+    .fill 32, 4             # room to save registers
--- a/examples/exercises/8p7/Makefile
+++ b/examples/exercises/8p7/Makefile
@ -0,0 +1,17 @@
+TARGET = 8p7
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32im_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate with Spike
+sim: $(TARGET).elf.objdump
+	spike --isa=rv32i_zicsr -d $(TARGET).elf
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p8/8p8.S
+++ b/examples/exercises/8p8/8p8.S
@ -0,0 +1,56 @@
+.section .text.init
+.globl rvtest_entry_point
+
+rvtest_entry_point:
+
+    # set up trap handler
+    la t0, trap_handler # address of trap handler
+    csrw mtvec, t0      # mtvec = pointer to trap handler
+    la t0, trapstack    # address of trap stack
+    csrw mscratch, t0   # mscratch = pointer to trap stack
+    sw zero, 12(t0)     # buffer length (kept at offset 12 of the trap stack) starts at 0
+
+wait:
+    nop
+    j wait              # wait for uart communication
+
+
+self_loop:
+    j self_loop
+
+
+trap_handler:
+    csrrw tp, mscratch, tp  # swap tp and mscratch to put a trap stack pointer in tp
+    # Append one UART character to buffer.  Trap stack layout: t0-t2 at 0..8, buffer length at 12
+    # save the registers used below on the trap stack
+    sw t0, 0(tp)
+    sw t1, 4(tp)
+    sw t2, 8(tp)
+    lw t0, 12(tp)           # get current length of buffer
+    li t1, 0x10000000       # UART base address
+    lbu t1, 0(t1)           # fetch next character
+    la t2, buffer           # base of the character buffer (not the trap stack, which holds saved state)
+    add t2, t2, t0          # address in buffer = buffer + length
+    sb t1, 0(t2)            # store character in buffer
+    li t2, 79               # maximum buffer length
+    beq t0, t2, skip        # is buffer full?  if so, leave length alone and reuse the last slot
+    addi t0, t0, 1          # increase buffer length
+skip:
+    sw t0, 12(tp)           # update buffer length
+
+trap_return:                # return to next instruction
+    csrr t0, mepc           # read mepc
+    addi t0, t0, 4          # mepc + 4; NOTE(review): if this trap is an interrupt, resuming at mepc itself may be intended -- confirm
+    csrw mepc, t0           # mepc = mepc + 4 (return to next instruction)
+    # restore the three saved registers from the trap stack
+    lw t0, 0(tp)
+    lw t1, 4(tp)
+    lw t2, 8(tp)
+    csrrw tp, mscratch, tp  # restore tp and trap stack pointer
+    mret
+
+buffer:
+   .fill 80, 1              # room for buffer
+
+trapstack:
+    .fill 34, 4             # room to save registers and buffer length
--- a/examples/exercises/8p8/Makefile
+++ b/examples/exercises/8p8/Makefile
@ -0,0 +1,17 @@
+TARGET = 8p8
+
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+# build the ELF; only one -o is given so the output name is unambiguous
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32im_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate in Spike
+sim: $(TARGET).elf.objdump
+	spike --isa=rv32i_zicsr -d $(TARGET).elf
+
+clean:
+	rm -f $(TARGET).elf*
+
+
--- a/examples/exercises/8p9/8p9.S
+++ b/examples/exercises/8p9/8p9.S
@ -0,0 +1,61 @@
+.section .text.init
+.globl rvtest_entry_point
+
+# register to write for GPIO output pins
+.equ GPIO_OUTPUT_VAL, 0x1006000C
+.equ CLINT_MTIMECMP, 0x02004000
+.equ PERIOD, 500
+
+# register use:
+# s0: address of GPIO_OUTPUT_VAL
+# s1: address of CLINT_MTIMECMP
+# s2: PERIOD
+
+rvtest_entry_point:
+   
+    # initialize LED to off
+    li s0, GPIO_OUTPUT_VAL
+    sw zero, 0(s0)      # LEDs off
+
+    # configure timer interrupt
+    li s2, PERIOD       
+    csrr t0, time       # read lower 32 bits of timer
+    csrr t1, timeh      # read upper 32 bits of timer
+    add t0, t0, s2      # increment by PERIOD
+    li s1, CLINT_MTIMECMP # set timer for next toggle
+    sw t0, 0(s1)        # CLINT_MTIMECMP = time + PERIOD
+    sw zero, 4(s1)      # upper word = 0 (this is only because program is just starting)
+    # csrci mstatus, 8    # clear mstatus.MIE so interrupts are globally disabled
+    li t0, 128          # 1 in mie.MTIE
+    csrw mie, t0        # enable timer interrupts
+    li s3, 4            # loop counter
+
+/*
+    # enter user mode
+    li t0, 0b11         # 3
+    slli t0, t0, 11     # 11 in bits 12:11
+    csrc mstatus, t0    # mstatus.MPP = 00 (for user mode)
+    la t0, user_start   # 
+    csrw mepc, t0       # where to go when entering user mode
+    mret
+*/
+
+#user_start:             # loop with wfi
+wait_loop:
+    csrr t0, time       # check time before timer fires
+    wfi                 # wait until timer interrupt fires.
+    csrr t0, time       # check time again after timer fires for debugging
+    # interrupts are globally disabled, so when the timer fires, 
+    # wfi will advance here rather than going to an interrupt handler
+    lw t0, 0(s0)        # read GPIO_OUTPUT_VAL
+    xori t0, t0, 1      # toggle least significant bit
+    sw t0, 0(s0)        # update GPIO_OUTPUT_VAL to turn LED off->on or on->off
+    lw t0, 0(s1)        # read low word of CLINT_MTIMECMP
+    add t0, t0, s2      # add PERIOD
+    sw t0, 0(s1)        # CLINT_MTIMECMP = CLINT_MTIMECMP + PERIOD (low word only; no carry into the upper word)
+    addi s3, s3, -1     # decrement loop counter
+    bnez s3, wait_loop  # repeat
+
+self_loop:
+    j self_loop
+
--- a/examples/exercises/8p9/Makefile
+++ b/examples/exercises/8p9/Makefile
@ -0,0 +1,17 @@
+# Build, disassemble, and simulate the bare-metal program named by TARGET.
+TARGET = 8p9
+
+# sim and clean never produce files with those names
+.PHONY: sim clean
+
+# Default goal: disassembly of the linked ELF
+$(TARGET).elf.objdump: $(TARGET).elf
+	riscv64-unknown-elf-objdump -D $(TARGET).elf > $(TARGET).elf.objdump
+
+# Assemble and link $(TARGET).S; the Makefile is a prerequisite so flag changes force a rebuild.
+# A single -o names the output so it always matches this rule's target.
+$(TARGET).elf: $(TARGET).S Makefile
+	riscv64-unknown-elf-gcc -g -march=rv32im_zicsr -mabi=ilp32 -mcmodel=medany \
+	    -nostartfiles -T../../link/link.ld $(TARGET).S -o $(TARGET).elf
+
+# simulate in ImperasDV lockstep; simulator output is captured in ./log
+sim: $(TARGET).elf.objdump
+	wsim rv32gc $(TARGET).elf --lockstepverbose > log
+
+# remove the build products and the simulation log
+clean:
+	rm -f $(TARGET).elf* log
+
+
--- a/sim/questa/GetLineNum.do
+++ b/sim/questa/GetLineNum.do
@ -14,5 +14,5 @@ proc GetLineNum {fname target} {
     }
    close $f
    return -code error \
-         "target string not found"
+         "target string \"$target\" not found by GetLineNum.do for coverage exclusion in $fname"
 }
--- a/sim/questa/coverage-exclusions-rv64gc.do
+++ b/sim/questa/coverage-exclusions-rv64gc.do
@ -347,11 +347,6 @@ coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker -linerange [GetLi
 coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker -linerange [GetLineNum ${SRC}/mmu/pmpchecker.sv "exclusion-tag: immu-pmpcboz"] 
 coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker -linerange [GetLineNum ${SRC}/mmu/pmpchecker.sv "exclusion-tag: immu-pmpcboaccess"] 

-# IMMU PMP only makes 4-byte accesses
-coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker -linerange [GetLineNum ${SRC}/mmu/pmpchecker.sv "SizeBytesMinus1 = 3'd0"]  -item bs 1
-coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker -linerange [GetLineNum ${SRC}/mmu/pmpchecker.sv "SizeBytesMinus1 = 3'd1"]  -item bs 1
-coverage exclude -scope /dut/core/ifu/immu/immu/pmp/pmpchecker -linerange [GetLineNum ${SRC}/mmu/pmpchecker.sv "SizeBytesMinus1 = 3'd7"]  -item bs 1
-
 # No irom
 set line [GetLineNum ${SRC}/ifu/ifu.sv "~ITLBMissF & ~CacheableF & ~SelIROM"] 
 coverage exclude -scope /dut/core/ifu -linerange $line-$line -item c 1 -feccondrow 6
--- a/src/cvw.sv
+++ b/src/cvw.sv
@ -95,6 +95,7 @@ typedef struct packed {

 // Legal number of PMP entries are 0, 16, or 64
  int           PMP_ENTRIES;
+  int           PMP_G; // PMP granularity G: grain size is 2^(G+2) bytes, so G = 0 means 4-byte grains

 // Address space
  logic [63:0]  RESET_VECTOR;
--- a/src/mmu/pmpadrdec.sv
+++ b/src/mmu/pmpadrdec.sv
@ -40,31 +40,22 @@ module pmpadrdec import cvw::*;  #(parameter cvw_t P) (
  input  logic                  PAgePMPAdrIn,
  output logic                  PAgePMPAdrOut,
  output logic                  Match, 
-  output logic [P.PA_BITS-1:0]  PMPTop,
  output logic                  L, X, W, R
 );
  
  // define PMP addressing mode codes
  localparam                    TOR   = 2'b01;
-  localparam                    NA4   = 2'b10;
  localparam                    NAPOT = 2'b11;

  logic                         TORMatch, NAMatch;
  logic                         PAltPMPAdr;
-  logic [P.PA_BITS-1:0]         CurrentAdrFull;
  logic [1:0]                   AdrMode;
-  logic [P.PA_BITS-1:0]         PMPTop1, PMPTopTOR, PMPTopNaturallyAligned;
 
  assign AdrMode = PMPCfg[4:3];

-  // The two lsb of the physical address don't matter for this checking.
-  // The following code includes them, but hardwires the PMP checker lsbs to 00
-  // and masks them later.  Logic synthesis should optimize away these bottom bits.
- 
  // Top-of-range (TOR)
  // Append two implicit trailing 0's to PMPAdr value
-  assign CurrentAdrFull  = {PMPAdr,  2'b00};
-  assign PAltPMPAdr = {1'b0, PhysicalAddress} < {1'b0, CurrentAdrFull}; // unsigned comparison
+  assign PAltPMPAdr = {1'b0, PhysicalAddress} < {1'b0, PMPAdr, 2'b00}; // unsigned comparison
  assign PAgePMPAdrOut = ~PAltPMPAdr;
  assign TORMatch = PAgePMPAdrIn & PAltPMPAdr; // exclusion-tag: PAgePMPAdrIn

@ -80,15 +71,8 @@ module pmpadrdec import cvw::*;  #(parameter cvw_t P) (

  // finally pick the appropriate match for the access type
  assign Match = (AdrMode == TOR) ? TORMatch : 
-                 (AdrMode == NA4 | AdrMode == NAPOT) ? NAMatch :
-                 1'b0;
-
-  // Report top of region for first matching region
-  // PMP should match but fail if the size is too big (8-byte accesses spanning to TOR or NA4 region)
-  assign PMPTopTOR = {PMPAdr-1,  2'b11}; // TOR goes to (pmpaddr << 2) - 1
-  assign PMPTopNaturallyAligned = {PMPAdr,2'b00} | NAMask; // top of the pmp region for NA4 and NAPOT.  All 1s in the lower bits.  Used to check the address doesn't pass the top
-  assign PMPTop1 = (AdrMode == TOR) ? PMPTopTOR : PMPTopNaturallyAligned;
-  assign PMPTop = FirstMatch ? PMPTop1 : '0; // AND portion of distributed AND-OR mux (OR portion in pmpchhecker)
+                 (AdrMode[1]) ? NAMatch : // NA4 or NAPOT
+                 1'b0; // OFF never matches

  assign L = PMPCfg[7];
  assign X = PMPCfg[2];
--- a/src/mmu/pmpchecker.sv
+++ b/src/mmu/pmpchecker.sv
@ -41,7 +41,7 @@ module pmpchecker import cvw::*;  #(parameter cvw_t P) (
  // keyword, the compiler warns us that it's interpreting the signal as a var,
  // which we might not intend.
  input  var logic [7:0]           PMPCFG_ARRAY_REGW[P.PMP_ENTRIES-1:0],
-  input  var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW [P.PMP_ENTRIES-1:0],
+  input  var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW[P.PMP_ENTRIES-1:0],
  input  logic                     ExecuteAccessF, WriteAccessM, ReadAccessM,
  input  logic [1:0]               Size,
  input  logic [3:0]               CMOpM,
@ -56,12 +56,10 @@ module pmpchecker import cvw::*;  #(parameter cvw_t P) (
  logic [P.PMP_ENTRIES-1:0]        FirstMatch; // onehot encoding for the first pmpaddr to match the current address.
  logic [P.PMP_ENTRIES-1:0]        L, X, W, R; // PMP matches and has flag set
  logic [P.PMP_ENTRIES-1:0]        PAgePMPAdr; // for TOR PMP matching, PhysicalAddress > PMPAdr[i]
-  logic [P.PA_BITS-1:0]            PMPTop[P.PMP_ENTRIES-1:0];     // Upper end of each region, for checking that the access is fully within the region
  logic                            PMPCMOAccessFault, PMPCBOMAccessFault, PMPCBOZAccessFault;
  logic [2:0]                      SizeBytesMinus1;
  logic                            MatchingR, MatchingW, MatchingX, MatchingL;
-  logic [P.PA_BITS-1:0]            MatchingPMPTop, PhysicalAddressTop;
-  logic                            TooBig;
+

  if (P.PMP_ENTRIES > 0) begin: pmp // prevent complaints about array of no elements when PMP_ENTRIES = 0
    pmpadrdec #(P) pmpadrdecs[P.PMP_ENTRIES-1:0](
@ -72,31 +70,16 @@ module pmpchecker import cvw::*;  #(parameter cvw_t P) (
      .FirstMatch,
      .PAgePMPAdrIn({PAgePMPAdr[P.PMP_ENTRIES-2:0], 1'b1}),
      .PAgePMPAdrOut(PAgePMPAdr),
-      .Match, .PMPTop, .L, .X, .W, .R);
+      .Match, .L, .X, .W, .R);
  end

  priorityonehot #(P.PMP_ENTRIES) pmppriority(.a(Match), .y(FirstMatch)); // combine the match signal from all the address decoders to find the first one that matches.

  // Distributed AND-OR mux to select the first matching results
-  // If the access does not match all bytes of the PMP region, it is too big and the matches are disabled
-  assign MatchingR = |(R & FirstMatch) & ~TooBig;
-  assign MatchingW = |(W & FirstMatch) & ~TooBig;
-  assign MatchingX = |(X & FirstMatch) & ~TooBig;
+  assign MatchingR = |(R & FirstMatch);
+  assign MatchingW = |(W & FirstMatch);
+  assign MatchingX = |(X & FirstMatch);
  assign MatchingL = |(L & FirstMatch);
-  or_rows #(P.PMP_ENTRIES, P.PA_BITS) PTEOr(PMPTop, MatchingPMPTop);
-
-  // Matching PMP entry must match all bytes of an access, or the access fails (Priv Spec 3.7.1.3)
-  // First find the size of the access in terms of the offset to the most significant byte
-  always_comb
-    case (Size)
-      2'b00: SizeBytesMinus1 = 3'd0;
-      2'b01: SizeBytesMinus1 = 3'd1;
-      2'b10: SizeBytesMinus1 = 3'd3;
-      2'b11: SizeBytesMinus1 = 3'd7;
-    endcase
-  // Then find the top of the access and see if it is beyond the top of the region
-  assign PhysicalAddressTop = PhysicalAddress + {{P.PA_BITS-3{1'b0}}, SizeBytesMinus1}; // top of the access range
-  assign TooBig = PhysicalAddressTop > MatchingPMPTop; // check if the access goes beyond the top of the PMP region

  // Only enforce PMP checking for effective S and U modes (accounting for mstatus.MPRV) or in Machine mode when L bit is set in selected region
  assign EnforcePMP = (EffectivePrivilegeModeW != P.M_MODE) | MatchingL;
--- a/src/privileged/csrm.sv
+++ b/src/privileged/csrm.sv
@ -46,14 +46,15 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
  output logic [15:0]              MEDELEG_REGW,
  output logic [11:0]              MIDELEG_REGW,
  /* verilator lint_off UNDRIVEN */ // PMP registers are only used when PMP_ENTRIES > 0
+  output var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW[P.PMP_ENTRIES-1:0],
  output var logic [7:0]           PMPCFG_ARRAY_REGW[P.PMP_ENTRIES-1:0],
-  output var logic [P.PA_BITS-3:0] PMPADDR_ARRAY_REGW [P.PMP_ENTRIES-1:0],
  /* verilator lint_on UNDRIVEN */
  output logic                     WriteMSTATUSM, WriteMSTATUSHM,
  output logic                     IllegalCSRMAccessM, IllegalCSRMWriteReadonlyM,
  output logic [63:0]              MENVCFG_REGW
 );

+  logic [P.PA_BITS-3:0]            PMPADDR_ARRAY_PREGRAIN_REGW[P.PMP_ENTRIES-1:0];
  logic [P.XLEN-1:0]               MISA_REGW, MHARTID_REGW;
  logic [P.XLEN-1:0]               MSCRATCH_REGW, MTVAL_REGW, MCAUSE_REGW;
  logic [P.XLEN-1:0]               MENVCFGH_REGW;
@ -103,6 +104,7 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
  // when compressed instructions are supported, there can't be misaligned instructions
  localparam MEDELEG_MASK  = P.ZCA_SUPPORTED ? 16'hB3FE : 16'hB3FF;
  localparam MIDELEG_MASK  = 12'h222; // we choose to not make machine interrupts delegable
+  localparam Gm1 = P.PMP_G > 0 ? P.PMP_G - 1 : 0; // max(G-1, 0)

 // There are PMP_ENTRIES = 0, 16, or 64 PMPADDR registers, each of which has its own flop
  genvar i;
@ -112,8 +114,9 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
    logic [7:0]               CSRPMPWriteValM[P.PMP_ENTRIES-1:0];
    logic [7:0]               CSRPMPLegalizedWriteValM[P.PMP_ENTRIES-1:0];
    logic [1:0]               CSRPMPWRLegalizedWriteValM[P.PMP_ENTRIES-1:0]; 
+    logic [1:0]               CSRPMPALegalizedWriteValM[P.PMP_ENTRIES-1:0]; 
    logic [P.PMP_ENTRIES-1:0] ADDRLocked, CFGLocked;
-    for(i=0; i<P.PMP_ENTRIES; i++) begin
+    for(i=0; i<P.PMP_ENTRIES; i++) begin:pmp
      // when the lock bit is set, don't allow writes to the PMPCFG or PMPADDR
      // also, when the lock bit of the next entry is set and the next entry is TOR, don't allow writes to this entry PMPADDR
      assign CFGLocked[i] = PMPCFG_ARRAY_REGW[i][7];
@ -123,7 +126,8 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
        assign ADDRLocked[i] = PMPCFG_ARRAY_REGW[i][7] | (PMPCFG_ARRAY_REGW[i+1][7] & PMPCFG_ARRAY_REGW[i+1][4:3] == 2'b01);

      assign WritePMPADDRM[i] = (CSRMWriteM & (CSRAdrM == (PMPADDR0+i))) & ~ADDRLocked[i];
-      flopenr #(P.PA_BITS-2) PMPADDRreg(clk, reset, WritePMPADDRM[i], CSRWriteValM[P.PA_BITS-3:0], PMPADDR_ARRAY_REGW[i]);
+      // PMPADDR_ARRAY_PREGRAIN_REGW flip-flops hold all the bits even though all but G-1 lsbs can be controlled by PMP mode and granularity
+      flopenr #(P.PA_BITS-2) PMPADDRreg(clk, reset, WritePMPADDRM[i], CSRWriteValM[P.PA_BITS-3:0], PMPADDR_ARRAY_PREGRAIN_REGW[i]);
      if (P.XLEN==64) begin
        assign WritePMPCFGM[i] = (CSRMWriteM & (CSRAdrM == (PMPCFG0+2*(i/8)))) & ~CFGLocked[i];
        assign CSRPMPWriteValM[i] = CSRWriteValM[(i%8)*8+7:(i%8)*8];
@ -132,8 +136,9 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
        assign CSRPMPWriteValM[i] = CSRWriteValM[(i%4)*8+7:(i%4)*8];
      end

+      assign CSRPMPALegalizedWriteValM[i] = ((P.PMP_G > 0) & (CSRPMPWriteValM[i][4:3] == 2'b10)) ? PMPCFG_ARRAY_REGW[i][4:3] : CSRPMPWriteValM[i][4:3]; // WARL A field keeps its old value when attempting to write unselectable NA4 mode
      assign CSRPMPWRLegalizedWriteValM[i] = {(CSRPMPWriteValM[i][1] & CSRPMPWriteValM[i][0]), CSRPMPWriteValM[i][0]}; // legalize WR fields (reserved 10 written as 00)
-      assign CSRPMPLegalizedWriteValM[i] = {CSRPMPWriteValM[i][7], 2'b00, CSRPMPWriteValM[i][4:2], CSRPMPWRLegalizedWriteValM[i]};
+      assign CSRPMPLegalizedWriteValM[i] = {CSRPMPWriteValM[i][7], 2'b00, CSRPMPALegalizedWriteValM[i], CSRPMPWriteValM[i][2], CSRPMPWRLegalizedWriteValM[i]};
      flopenr #(8) PMPCFGreg(clk, reset, WritePMPCFGM[i], CSRPMPLegalizedWriteValM[i], PMPCFG_ARRAY_REGW[i]);
    end
  end
@ -214,6 +219,15 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
    assign MENVCFGH_REGW = '0;
  end

+  // Grain alignment for PMPADDR read values.
+  for(i=0; i<P.PMP_ENTRIES; i++) 
+    always_comb begin
+      logic [P.XLEN-1:0] pmpaddr;
+      pmpaddr = {{(P.XLEN-(P.PA_BITS-2)){1'b0}}, PMPADDR_ARRAY_PREGRAIN_REGW[i]}; // raw value in PMP registers
+      if (PMPCFG_ARRAY_REGW[i][4]) PMPADDR_ARRAY_REGW[i] = {pmpaddr[P.PA_BITS-3:Gm1],     {Gm1    {1'b1}}}; // in NAPOT/NA4, bottom G-1 bits read as all 1s (but no bits affected for NA4)
+      else                         PMPADDR_ARRAY_REGW[i] = {pmpaddr[P.PA_BITS-3:P.PMP_G], {P.PMP_G{1'b0}}}; // in TOR/OFF, bottom G bits read as 0s
+    end
+
  // Read machine mode CSRs
  // verilator lint_off WIDTH
  logic [5:0] entry;
@ -221,8 +235,8 @@ module csrm  import cvw::*;  #(parameter cvw_t P) (
    entry = '0;
    CSRMReadValM = '0;
    IllegalCSRMAccessM = !(P.S_SUPPORTED) & (CSRAdrM == MEDELEG | CSRAdrM == MIDELEG); // trap on DELEG register access when no S or N-mode
-    if ($unsigned(CSRAdrM) >= PMPADDR0 & $unsigned(CSRAdrM) < PMPADDR0 + P.PMP_ENTRIES) // reading a PMP entry
-      CSRMReadValM = {{(P.XLEN-(P.PA_BITS-2)){1'b0}}, PMPADDR_ARRAY_REGW[CSRAdrM - PMPADDR0]};
+    if ($unsigned(CSRAdrM) >= PMPADDR0 & $unsigned(CSRAdrM) < PMPADDR0 + P.PMP_ENTRIES) 
+      CSRMReadValM = {{(P.XLEN-(P.PA_BITS-2)){1'b0}}, PMPADDR_ARRAY_REGW[CSRAdrM - PMPADDR0]}; // read PMPADDR entry with lsbs aligned to grain based on NAPOT vs. TOR
    else if ($unsigned(CSRAdrM) >= PMPCFG0 & $unsigned(CSRAdrM) < PMPCFG0 + P.PMP_ENTRIES/4 & (P.XLEN==32 | CSRAdrM[0] == 0)) begin
      // only odd-numbered PMPCFG entries exist in RV64
      if (P.XLEN==64) begin
--- a/src/uncore/trickbox_apb.sv
+++ b/src/uncore/trickbox_apb.sv
@ -0,0 +1,162 @@
+///////////////////////////////////////////
+// trickbox_apb.sv
+//
+// Written: David_Harris@hmc.edu 20 May 2025
+// Modified: 
+//
+// Purpose: Trickbox, superset of CLINT
+//  https://docs.google.com/document/d/1erHBVchBtwmgZ0bCNjb88spYfN7CpRbhmSNFH6cO8CY/edit?tab=t.0
+//
+// Documentation: RISC-V System on Chip Design
+//
+// A component of the CORE-V-WALLY configurable RISC-V project.
+// https://github.com/openhwgroup/cvw
+// 
+// Copyright (C) 2021-23 Harvey Mudd College & Oklahoma State University
+//
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1
+//
+// Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file 
+// except in compliance with the License, or, at your option, the Apache License version 2.0. You 
+// may obtain a copy of the License at
+//
+// https://solderpad.org/licenses/SHL-2.1/
+//
+// Unless required by applicable law or agreed to in writing, any work distributed under the 
+// License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, 
+// either express or implied. See the License for the specific language governing permissions 
+// and limitations under the License.
+////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Trickbox APB peripheral.
+// Holds software-writable copies of MSIP, SSIP, MEIP, SEIP, MTIMECMP, MTIME, HGEIP, and TOHOST,
+// plus a COM1 port that prints written bytes to the simulation console.
+// Each bit of TRICKEN selects whether the matching *_OUT port is driven from the Trickbox copy
+// or passed through from the corresponding *_IN port (TOHOST_OUT is 0 when not selected).
+module trickbox_apb import cvw::*;  #(parameter XLEN = 64, NUM_HARTS = 1) (
+  input  logic                PCLK, PRESETn,
+  input  logic                PSEL,
+  input  logic [15:0]         PADDR, 
+  input  logic [XLEN-1:0]     PWDATA,
+  input  logic [XLEN/8-1:0]   PSTRB,
+  input  logic                PWRITE,
+  input  logic                PENABLE,
+  output logic [XLEN-1:0]     PRDATA,
+  output logic                PREADY,
+  input  logic [63:0]         MTIME_IN,
+  input  logic [NUM_HARTS-1:0] MTIP_IN, MSIP_IN, SSIP_IN, MEIP_IN, SEIP_IN,
+  input  var logic [XLEN-1:0] HGEIP_IN[NUM_HARTS-1:0],
+  output logic [63:0]         MTIME_OUT, 
+  output logic [NUM_HARTS-1:0] MTIP_OUT, MSIP_OUT, SSIP_OUT, MEIP_OUT, SEIP_OUT,
+  output var logic  [XLEN-1:0] HGEIP_OUT[NUM_HARTS-1:0],
+  output logic [XLEN-1:0]     TOHOST_OUT
+);
+
+  // register map
+  // PADDR[15:13] selects the register group and PADDR[12:3] selects the hart
+  // (or, in group 3'b101, the miscellaneous register):
+  //   000 MSIP   001 SSIP   010 MTIMECMP   011 MEIP   100 SEIP   110 HGEIP
+  //   101 misc: 0 = TOHOST, 1 = COM1, 2 = TRICKEN, 0x3FF = MTIME
+  // The CLINT byte offsets below are listed for reference; the decode uses the fields above.
+  localparam CLINT_MSIP     = 16'h0000;
+  localparam CLINT_MTIMECMP = 16'h4000;
+  localparam CLINT_MTIME    = 16'hBFF8;
+
+  logic [63:0]                MTIMECMP[NUM_HARTS-1:0];
+  logic [7:0]                 TRICKEN;                // per-output select: 1 = drive output from Trickbox copy
+  logic [63:0]                MTIME;
+  logic [NUM_HARTS-1:0]       MTIP, MSIP, SSIP, MEIP, SEIP;
+  logic [XLEN-1:0]            TOHOST;
+  logic [XLEN-1:0]            HGEIP[NUM_HARTS-1:0];
+  logic [9:0]                 hart;                   // which hart is being accessed
+  logic                       memwrite;
+  logic [63:0]                RD;
+  genvar                      i;
+  
+  assign memwrite = PWRITE & PENABLE & PSEL;  // only write in access phase
+  assign PREADY   = 1'b1;                     // Trickbox never takes >1 cycle to respond
+  assign hart = PADDR[12:3];                  // middle bits of address allow control of up to 1024 harts 
+  
+  // read circuitry
+  // 64-bit accesses, then reduce to 32-bit for RV32
+  always_ff @(posedge PCLK) begin
+    case (PADDR[15:13])
+      3'b000: RD <= {63'b0, MSIP[hart]};     // *** memory map
+      3'b001: RD <= {63'b0, SSIP[hart]};
+      3'b010: RD <= MTIMECMP[hart];
+      3'b011: RD <= {63'b0, MEIP[hart]};
+      3'b100: RD <= {63'b0, SEIP[hart]};
+      3'b101: case (hart) 
+        10'b0000000000: RD <= TOHOST;
+        10'b0000000001: RD <= '0; // Reading COM1 has no effect; busy bit not yet implemented.  Later add busy bit
+        10'b0000000010: RD <= {56'b0, TRICKEN};
+        10'b1111111111: RD <= MTIME;
+        default: RD <= '0;
+      endcase
+      3'b110: RD <= HGEIP[hart];
+      default: RD <= '0;
+    endcase
+  end
+
+  // word aligned reads
+  if (XLEN == 64) assign PRDATA = RD;
+  else            assign PRDATA = RD[PADDR[2]*32 +: 32]; // 32-bit register access to upper or lower half
+  
+  // write circuitry for the single-bit per-hart registers, TOHOST, COM1, and TRICKEN
+  always_ff @(posedge PCLK)
+    if (~PRESETn) begin
+      MSIP <= '0;
+      SSIP <= '0;
+      MEIP <= '0;
+      SEIP <= '0;
+      TOHOST <= '0;
+      TRICKEN <= '0;
+    end else if (memwrite) begin
+      case (PADDR[15:13])
+        3'b000: MSIP[hart] <= PWDATA[0];
+        3'b001: SSIP[hart] <= PWDATA[0];
+        3'b011: MEIP[hart] <= PWDATA[0];
+        3'b100: SEIP[hart] <= PWDATA[0];
+        3'b101: case (hart) 
+          10'b0000000000: TOHOST <= PWDATA;
+          10'b0000000001: $display("%c", PWDATA[7:0]); // COM1 prints to simulation console.  Eventually allow it to be redirected to a UART, and provide a busy bit.
+          10'b0000000010: TRICKEN <= PWDATA[7:0];
+          default: ; // MTIME is written by its own process below; other addresses are ignored
+        endcase
+        default: ; // MTIMECMP and HGEIP are written by the per-hart processes below
+      endcase
+    end
+
+  // generate loop write circuits for MTIMECMP and HGEIP
+  // Each hart's process writes only its own element, selected by the constant genvar index i,
+  // so no two always_ff processes write the same variable when NUM_HARTS > 1.
+  for (i=0; i<NUM_HARTS; i++) 
+    always_ff @(posedge PCLK) 
+      if (~PRESETn) begin
+        MTIMECMP[i] <= 64'hFFFFFFFFFFFFFFFF; // Spec says MTIMECMP is not reset, but we reset to maximum value to prevent spurious timer interrupts
+        HGEIP[i] <= 0;
+      end else if (memwrite & (hart == i)) begin
+        if (PADDR[15:13] == 3'b010) begin
+          if (XLEN == 64) MTIMECMP[i] <= PWDATA; // 64-bit write
+          else            MTIMECMP[i][PADDR[2]*32 +: 32] <= PWDATA[31:0]; // 32-bit write to upper or lower half
+        end else if (PADDR[15:13] == 3'b110) begin
+          HGEIP[i] <= PWDATA;
+        end
+      end 
+
+  // mtime register: free-running counter that software can overwrite
+  always_ff @(posedge PCLK) 
+    if (~PRESETn) begin
+      MTIME <= '0;
+    end else if (memwrite & (PADDR[15:13] == 3'b101 && hart == 10'b1111111111)) begin
+      if (XLEN == 64) MTIME <= PWDATA; // 64-bit write
+      else            MTIME[PADDR[2]*32 +: 32] <= PWDATA[31:0]; // 32-bit write to upper or lower half; other half holds
+    end else          MTIME <= MTIME + 1; 
+
+  // timer interrupt when MTIME >= MTIMECMP (unsigned)
+  for (i=0;i<NUM_HARTS;i++) 
+    assign MTIP[i] = ({1'b0, MTIME} >= {1'b0, MTIMECMP[i]}); 
+
+  // TRICKEN controls whether outputs come from TrickBox or are daisy-chained from elsewhere 
+  always_comb begin
+    MSIP_OUT = TRICKEN[0] ? MSIP : MSIP_IN;
+    SSIP_OUT = TRICKEN[1] ? SSIP : SSIP_IN;
+    MEIP_OUT = TRICKEN[2] ? MEIP : MEIP_IN; 
+    SEIP_OUT = TRICKEN[3] ? SEIP : SEIP_IN;
+    MTIP_OUT = TRICKEN[4] ? MTIP : MTIP_IN;
+    MTIME_OUT = TRICKEN[5] ? MTIME : MTIME_IN;
+    TOHOST_OUT = TRICKEN[7] ? TOHOST : '0;
+    // NO COM1
+  end
+
+  // TRICKEN[6] selects the HGEIP source for every hart
+  for (i=0; i<NUM_HARTS;i++) 
+    assign HGEIP_OUT[i] = TRICKEN[6] ? HGEIP[i] : HGEIP_IN[i];
+
+endmodule
+
--- a/testbench/common/riscvassertions.sv
+++ b/testbench/common/riscvassertions.sv
@ -22,13 +22,17 @@
 module riscvassertions import cvw::*; #(parameter cvw_t P);
  initial begin
    assert (P.PMP_ENTRIES == 0 | P.PMP_ENTRIES==16 | P.PMP_ENTRIES==64) else $fatal(1, "Illegal number of PMP entries: PMP_ENTRIES must be 0, 16, or 64");
-    assert (P.S_SUPPORTED | P.VIRTMEM_SUPPORTED == 0) else $fatal(1, "Virtual memory requires S mode support");
+    assert (P.PMP_G > 0 | P.XLEN == 32 | P.PMP_ENTRIES == 0) else $fatal(1, "RV64 requires PMP_G at least 1 to avoid checking for 8-byte accesses to 4-byte region");
+    assert ((P.PMP_G >= $clog2(P.DCACHE_LINELENINBITS/8)-2) | !P.ZICCLSM_SUPPORTED | P.PMP_ENTRIES == 0) else $fatal(1, "Systems that support misaligned data with PMP must have grain size of at least one cache line so accesses that span grains will also cause spills");
+    assert ((P.PMP_G >= $clog2(P.ICACHE_LINELENINBITS/8)-2) | !P.ZCA_SUPPORTED | (P.PMP_ENTRIES == 0) | !P.ICACHE_SUPPORTED) else $fatal(1, "Systems that support compressed instructions with PMP must have grain size of at least one cache line so fetches that span grains will also cause spills");
+    assert (P.PMP_G < P.PA_BITS-2 | P.PMP_ENTRIES == 0) else $fatal(1, "PMP granularity must be less than the number of physical address bits");
    assert (P.IDIV_BITSPERCYCLE == 1 | P.IDIV_BITSPERCYCLE==2 | P.IDIV_BITSPERCYCLE==4) else $fatal(1, "Illegal number of divider bits/cycle: IDIV_BITSPERCYCLE must be 1, 2, or 4");
-    assert (P.F_SUPPORTED | ~P.D_SUPPORTED) else $fatal(1, "Can't support double fp (D) without supporting float (F)");
-    assert (P.D_SUPPORTED | ~P.Q_SUPPORTED) else $fatal(1, "Can't support quad fp (Q) without supporting double (D)");
-    assert (P.F_SUPPORTED | ~P.ZFH_SUPPORTED) else $fatal(1, "Can't support half-precision fp (ZFH) without supporting float (F)");
-    assert (P.DCACHE_SUPPORTED | ~P.F_SUPPORTED | P.FLEN <= P.XLEN) else $fatal(1, "Data cache required to support FLEN > XLEN because AHB/DTIM bus width is XLEN");
+    assert (P.F_SUPPORTED | !P.D_SUPPORTED) else $fatal(1, "Can't support double fp (D) without supporting float (F)");
+    assert (P.D_SUPPORTED | !P.Q_SUPPORTED) else $fatal(1, "Can't support quad fp (Q) without supporting double (D)");
+    assert (P.F_SUPPORTED | !P.ZFH_SUPPORTED) else $fatal(1, "Can't support half-precision fp (ZFH) without supporting float (F)");
+    assert (P.DCACHE_SUPPORTED | !P.F_SUPPORTED | P.FLEN <= P.XLEN) else $fatal(1, "Data cache required to support FLEN > XLEN because AHB/DTIM bus width is XLEN");
    assert (P.I_SUPPORTED ^ P.E_SUPPORTED) else $fatal(1, "Exactly one of I and E must be supported");
+    assert (P.S_SUPPORTED | P.VIRTMEM_SUPPORTED == 0) else $fatal(1, "Virtual memory requires S mode support");
    assert (P.DCACHE_WAYSIZEINBYTES <= 4096 | (!P.DCACHE_SUPPORTED) | P.VIRTMEM_SUPPORTED == 0) else $fatal(1, "DCACHE_WAYSIZEINBYTES cannot exceed 4 KiB when caches and virtual memory is enabled (to prevent aliasing)");
    assert (P.DCACHE_LINELENINBITS >= 128 | (!P.DCACHE_SUPPORTED)) else $fatal(1, "DCACHE_LINELENINBITS must be at least 128 when caches are enabled");
    assert (P.DCACHE_LINELENINBITS < P.DCACHE_WAYSIZEINBYTES*8) else $fatal(1, "DCACHE_LINELENINBITS must be smaller than way size");
--- a/testbench/common/wallyTracer.sv
+++ b/testbench/common/wallyTracer.sv
@ -280,7 +280,7 @@ module wallyTracer import cvw::*; #(parameter cvw_t P) (rvviTrace rvvi);

  // PMPADDR CSRs 3B0 to 3EF
    for(genvar pmpAddrID = 0; pmpAddrID < P.PMP_ENTRIES; pmpAddrID++) begin
-      `CONNECT_CSR(PMPADDR``pmpAddrID, 12'h3B0 + pmpAddrID, testbench.dut.core.priv.priv.csr.csrm.PMPADDR_ARRAY_REGW[pmpAddrID]);
+      `CONNECT_CSR(PMPADDR``pmpAddrID, 12'h3B0 + pmpAddrID, testbench.dut.core.priv.priv.csr.csrm.PMPADDR_ARRAY_REGW[pmpAddrID]); // aligned to grain
    end
  end

--- a/testbench/tests.vh
+++ b/testbench/tests.vh
@ -153,25 +153,25 @@ string arch32pmp[] = '{
  `RISCVARCHTEST,
  "rv32i_m/pmp32/src/pmp-CFG-reg.S",
  "rv32i_m/pmp32/src/pmp-CSR-access.S",
-  "rv32i_m/pmp32/src/pmp-NA4-R-priority-level-2.S",
-  "rv32i_m/pmp32/src/pmp-NA4-R-priority.S",
-  "rv32i_m/pmp32/src/pmp-NA4-R.S",
-  "rv32i_m/pmp32/src/pmp-NA4-RW-priority-level-2.S",
-  "rv32i_m/pmp32/src/pmp-NA4-RW-priority.S",
-  "rv32i_m/pmp32/src/pmp-NA4-RW.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-R-priority-level-2.S", *** restore when BLOCKED is removed after tests work with G > 0
+  //"rv32i_m/pmp32/src/pmp-NA4-R-priority.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-R.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-RW-priority-level-2.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-RW-priority.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-RW.S",
  "rv32i_m/pmp32/src/pmp-NA4-RWX.S",
-  "rv32i_m/pmp32/src/pmp-NA4-RX-priority-level-2.S",
-  "rv32i_m/pmp32/src/pmp-NA4-RX-priority.S",
-  "rv32i_m/pmp32/src/pmp-NA4-RX.S",
-  "rv32i_m/pmp32/src/pmp-NA4-X-priority-level-2.S",
-  "rv32i_m/pmp32/src/pmp-NA4-X-priority.S",
-  "rv32i_m/pmp32/src/pmp-NA4-X.S",
-  "rv32i_m/pmp32/src/pmp-NAPOT-R-priority-level-2.S",
-  "rv32i_m/pmp32/src/pmp-NAPOT-R-priority.S",
-  "rv32i_m/pmp32/src/pmp-NAPOT-R.S",
-  "rv32i_m/pmp32/src/pmp-NAPOT-RW-priority-level-2.S",
-  "rv32i_m/pmp32/src/pmp-NAPOT-RW-priority.S",
-  "rv32i_m/pmp32/src/pmp-NAPOT-RW.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-RX-priority-level-2.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-RX-priority.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-RX.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-X-priority-level-2.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-X-priority.S",
+  //"rv32i_m/pmp32/src/pmp-NA4-X.S",
+  //"rv32i_m/pmp32/src/pmp-NAPOT-R-priority-level-2.S",
+  //"rv32i_m/pmp32/src/pmp-NAPOT-R-priority.S",
+  //"rv32i_m/pmp32/src/pmp-NAPOT-R.S",
+  //"rv32i_m/pmp32/src/pmp-NAPOT-RW-priority-level-2.S",
+  //"rv32i_m/pmp32/src/pmp-NAPOT-RW-priority.S",
+  //"rv32i_m/pmp32/src/pmp-NAPOT-RW.S",
  "rv32i_m/pmp32/src/pmp-NAPOT-RWX.S",
  "rv32i_m/pmp32/src/pmp-NAPOT-RX-priority-level-2.S",
  "rv32i_m/pmp32/src/pmp-NAPOT-RX-priority.S",
@ -197,10 +197,10 @@ string arch32pmp[] = '{
 string arch64pmp[] = '{
  `RISCVARCHTEST,
  "rv64i_m/pmp/src/pmp64-CSR-ALL-MODES.S",
-  "rv64i_m/pmp/src/pmp64-NA4-M.S",
-  "rv64i_m/pmp/src/pmp64-NA4-S.S",
-  "rv64i_m/pmp/src/pmp64-NA4-U.S",
-  "rv64i_m/pmp/src/pmp64-NAPOT-M.S",
+  //"rv64i_m/pmp/src/pmp64-NA4-M.S", *** restore when PMP tests work with G > 0
+  //"rv64i_m/pmp/src/pmp64-NA4-S.S",
+  //"rv64i_m/pmp/src/pmp64-NA4-U.S",
+  //"rv64i_m/pmp/src/pmp64-NAPOT-M.S",
  "rv64i_m/pmp/src/pmp64-NAPOT-S.S",
  "rv64i_m/pmp/src/pmp64-NAPOT-U.S",
  "rv64i_m/pmp/src/pmp64-TOR-M.S",
@ -213,8 +213,8 @@ string arch32vm_sv32[] = '{
  "rv32i_m/vm_sv32/src/mstatus_tvm_test.S",
  "rv32i_m/vm_sv32/src/pmp_check_on_pa_S_mode.S",
  "rv32i_m/vm_sv32/src/pmp_check_on_pa_U_mode.S",
-  "rv32i_m/vm_sv32/src/pmp_check_on_pte_S_mode.S",
-  "rv32i_m/vm_sv32/src/pmp_check_on_pte_U_mode.S",
+  //"rv32i_m/vm_sv32/src/pmp_check_on_pte_S_mode.S", *** restore when PMP tests work with G > 0
+  //"rv32i_m/vm_sv32/src/pmp_check_on_pte_U_mode.S",
  "rv32i_m/vm_sv32/src/satp_access_tests.S",
  "rv32i_m/vm_sv32/src/vm_A_and_D_S_mode.S",
  "rv32i_m/vm_sv32/src/vm_A_and_D_U_mode.S",
--- a/tests/coverage/WALLY-init-lib.h
+++ b/tests/coverage/WALLY-init-lib.h
@ -39,6 +39,11 @@ rvtest_entry_point:
    csrw mtvec, t0      # Initialize MTVEC to trap_handler
    csrw mideleg, zero  # Don't delegate interrupts
    csrw medeleg, zero  # Don't delegate exceptions
+    # The following three lines are needed to initialize the timer for Spike to run correctly,
+    # but are not necessary for Wally to run lockstep.
+    # Unfortunately, they throw off the program addresses for tests/coverage/pmp.S,
+    # causing it to fail.  Ideally, pmp.S would become more robust or be replaced by
+    # functional coverage tests, and these three lines will be restored.
 #    li t0, -1           # set mtimecmp to biggest number so it doesnt interrupt again
 #    li t1, 0x02004000   # MTIMECMP in CLINT
 #    sd t0, 0(t1)      
--- a/tests/coverage/fpu.S
+++ b/tests/coverage/fpu.S
@ -32,6 +32,11 @@ main:
    bseti t0, zero, 14  # turn on FPU
    csrs mstatus, t0

+    # fsqrt with Y = 0 to check divby0 flags
+    fcvt.s.w f0, zero
+    fli.s f1, 1
+    fsqrt.s f2, f1
+
    #Pull denormalized FP number from memory and pass it to fclass.S for coverage
    la t0, TestData1
    flw ft0, 0(t0)
--- a/tests/coverage/pmp.S
+++ b/tests/coverage/pmp.S
@ -17,7 +17,7 @@ main:
 // Configuration

  # | Reg   | pmpaddr     | pmpcfg    | L | A    | X | W | R | Comments
-  # |0      | 0x2000003f  | 0x83      | 1 | 00   | 0 | 1 | 1 | 0
+  # |0      | 0x20000040  | 0x83      | 1 | 00   | 0 | 1 | 1 | 0
  # |1      | 0x2000007f  | 0x8b      | 1 | 01   | 0 | 1 | 1 | 1
  # |2      | 0x200000be  | 0x93      | 1 | 10   | 0 | 1 | 1 | 2
  # |3      | 0x2000011e  | 0x9b      | 1 | 11   | 0 | 1 | 1 | 3
@ -34,7 +34,10 @@ main:
  # |14     | 0x200003be  | 0x10      | 0 | 10   | 0 | 0 | 0 | 14
  # |15     | 0x2000041e  | 0x18      | 0 | 11   | 0 | 0 | 0 | 15
 # configure the pmp address of register 0 in mode 0
-li t5, 536870975
+ # changed from 2000003F to 20000040 5/22/25 david_harris@hmc.edu 
+ # to prevent access fault to trap handler when grain size is large
+ # may result in lower coverage.
+li t5, 0x20000040
 csrw pmpaddr0, t5

 # configure the pmp address of register 1 in mode 1
@ -101,6 +104,7 @@ csrw pmpaddr15, t5
  # write pmpcfg0, output 0x191109019b938b83
 li t4, 1806234828062034819
 csrw pmpcfg0, t4
+csrr t5, pmpcfg0

  # write pmpcfg2, output 0x181008001c140c04
 li t4, 1733894653101739012
@ -923,6 +927,7 @@ li t4, 1806234828062034819
 csrw pmpcfg2, t4


+
 // Testing

 // END Configuration and Testing Starting at Register: 8
--- a/tests/riscof/sail_cSim/rv32gc.json
+++ b/tests/riscof/sail_cSim/rv32gc.json
@ -10,7 +10,7 @@
  },
  "memory": {
    "pmp": {
-      "grain": 0,
+      "grain": 4,
      "count": 16
    },
    "misaligned": {
--- a/tests/riscof/sail_cSim/rv64gc.json
+++ b/tests/riscof/sail_cSim/rv64gc.json
@ -10,7 +10,7 @@
  },
  "memory": {
    "pmp": {
-      "grain": 0,
+      "grain": 4,
      "count": 16
    },
    "misaligned": {
--- a/tests/riscof/spike/riscof_spike.py
+++ b/tests/riscof/spike/riscof_spike.py
@ -143,6 +143,12 @@ class spike(pluginTemplate):
      #TODO: The following assumes you are using the riscv-gcc toolchain. If
      #      not please change appropriately
      self.compile_cmd = self.compile_cmd+' -mabi='+('lp64 ' if 64 in ispec['supported_xlen'] else ('ilp32e ' if "E" in ispec["ISA"] else 'ilp32 '))
+      if 'pmp-grain' in ispec['PMP']:
+          # if the PMP granularity is specified in the isa yaml, then we use that value
+          # convert from G to bytes: g = 2^(G+2) bytes
+          self.granularity = pow(2, ispec['PMP']['pmp-grain']+2)
+      else:
+        self.granularity = 4  # default granularity is 4 bytes 

    def runTests(self, testList):

@ -198,7 +204,7 @@ class spike(pluginTemplate):
                reference_output = re.sub("/src/","/references/", re.sub(".S",".reference_output", test))
                simcmd = f'cut -c-{8:g} {reference_output} > {sig_file}' #use cut to remove comments when copying
            else:
-                simcmd = self.dut_exe + f' {"--misaligned" if self.xlen == "64" else ""} --isa={self.isa} +signature={sig_file} +signature-granularity=4 {elf}'
+                simcmd = self.dut_exe + f' {"--misaligned" if self.xlen == "64" else ""} --isa={self.isa} --pmpgranularity={self.granularity} +signature={sig_file} +signature-granularity=4 {elf}'
          else:
            simcmd = 'echo "NO RUN"'

--- a/tests/riscof/spike/spike_rv32gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv32gc_isa.yaml
@ -28,7 +28,7 @@ hart0:
              - Unchanged
  PMP:
    implemented: True
-    pmp-grain: 0
+    pmp-grain: 4
    pmp-count: 16
    pmp-writable: 12

--- a/tests/riscof/spike/spike_rv64gc_isa.yaml
+++ b/tests/riscof/spike/spike_rv64gc_isa.yaml
@ -30,6 +30,6 @@ hart0:
              - Unchanged
  PMP:
    implemented: True
-    pmp-grain: 0
+    pmp-grain: 4
    pmp-count: 16
    pmp-writable: 12
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-pmp-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv32i_m/privilege/src/WALLY-pmp-01.S
@ -73,7 +73,7 @@ test_cases:

 # write pmpcfg regs with the information in the table above. this should also write the value of these registers to the output.
 .4byte 0x0, 0x0009001F, write_pmpcfg_0 # write pmpcfg0, output 0x0009001F
-.4byte 0x1, 0x0018900C, write_pmpcfg_1 # write pmpcfg1, output 0x0018900C
+.4byte 0x1, 0x0018900C, write_pmpcfg_1 # write pmpcfg1, output 0x0018980C because NA4 reads as NAPOT with G > 0
 # pmpcfg2 is zeroed out, so it doesn't need a write
 .4byte 0x3, 0x1F000000, write_pmpcfg_3 # write pmpcfg3, output 0x1F000000

@ -117,8 +117,8 @@ test_cases:
 .4byte 0x80100010, 0x600DAA, read32_test # read correct value out

 # test read and write fault on region with no access
-.4byte 0x80100208, 0x600D15, write32_test # Write fault on no-access range (PMP6)
-.4byte 0x80100208, 0x600D15, read32_test # read fault on no-access range (PMP6)
+.4byte 0x80100208, 0x600D17, write32_test # Write fault on no-access range (PMP4)
+.4byte 0x80100208, 0x600D17, read32_test # read fault on no-access range (PMP4)

 # test jalr to region with X=0 causes access fault
 .4byte 0x80100020, 0xbad, executable_test # execute fault on no-execute range (PMP2)
--- a/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-pmp-01.S
+++ b/tests/wally-riscv-arch-test/riscv-test-suite/rv64i_m/privilege/src/WALLY-pmp-01.S
@ -71,7 +71,7 @@ test_cases:
 .8byte 0xF, 0x2FFFFFFF, write_pmpaddr_15 # | 15    | 0x2FFFFFFF  | 1F        | 0 | NAPOT | 1 | 1 | 1 | Main mem 80000000-FFFFFFFF RWX|

 # write pmpcfg regs with the information in the table above. this should also write the value of these registers to the output.
-.8byte 0x0, 0x0018900C0009001F, write_pmpcfg_0 # write pmpcfg0, output 0x0018900C0009001F
+.8byte 0x0, 0x0018900C0009001F, write_pmpcfg_0 # write pmpcfg0, output 0x0018980C0009001F because NA4 reads back as NAPOT when G > 0
 .8byte 0x2, 0x1F00000000000000, write_pmpcfg_2 # write pmpcfg2, output 0x1F00000000000000

 # write known values to memory where W=0. This should be possible since we're in machine mode.