From d04d2afed2db51fbd84ebea596a473d4a2a93070 Mon Sep 17 00:00:00 2001 From: Ross Thompson Date: Fri, 21 Jul 2023 13:06:27 -0500 Subject: [PATCH] Modified the LSU/IFU and caches to improve critical path. Arty A7 went from 15 to 17 MHz. I believe we can push all the way to 20+ MHz with relatively little effort. Along the way I'm fixing up the scripts that build the Linux images for the flash card. --- fpga/constraints/constraints-ArtyA7.xdc | 4 +++- linux/devicetree/wally-artya7.dts | 6 +++--- src/cache/cache.sv | 3 ++- src/ifu/ifu.sv | 4 +++- src/lsu/lsu.sv | 17 +++++++++++------ 5 files changed, 22 insertions(+), 12 deletions(-) diff --git a/fpga/constraints/constraints-ArtyA7.xdc b/fpga/constraints/constraints-ArtyA7.xdc index b757107d9..292fd9f5e 100644 --- a/fpga/constraints/constraints-ArtyA7.xdc +++ b/fpga/constraints/constraints-ArtyA7.xdc @@ -106,7 +106,9 @@ set_output_delay -clock [get_clocks clk_out3_xlnx_mmcm] -max -add_delay 6.000 [g set_output_delay -clock [get_clocks clk_out3_xlnx_mmcm] 0.000 [get_ports SDCCLK] -set_multicycle_path -from [get_pins xlnx_ddr3_c0/u_xlnx_ddr3_mig/u_memc_ui_top_axi/mem_intfc0/ddr_phy_top0/u_ddr_calib_top/init_calib_complete_reg/C] -to [get_pins xlnx_proc_sys_reset_0/U0/EXT_LPF/lpf_int_reg/D] 10 +#set_multicycle_path -from [get_pins xlnx_ddr3_c0/u_xlnx_ddr3_mig/u_memc_ui_top_axi/mem_intfc0/ddr_phy_top0/u_ddr_calib_top/init_calib_complete_reg/C] -to [get_pins xlnx_proc_sys_reset_0/U0/EXT_LPF/lpf_int_reg/D] 10 + +set_max_delay -datapath_only -from [get_pins xlnx_ddr3_c0/u_xlnx_ddr3_mig/u_memc_ui_top_axi/mem_intfc0/ddr_phy_top0/u_ddr_calib_top/init_calib_complete_reg/C] -to [get_pins xlnx_proc_sys_reset_0/U0/EXT_LPF/lpf_int_reg/D] 20.000 # ********************************* #set_property DCI_CASCADE {64} [get_iobanks 65] diff --git a/linux/devicetree/wally-artya7.dts b/linux/devicetree/wally-artya7.dts index 82e48d70f..f2d0b3359 100644 --- a/linux/devicetree/wally-artya7.dts +++ b/linux/devicetree/wally-artya7.dts @@ -21,8 +21,8 @@ 
cpus { #address-cells = <0x01>; #size-cells = <0x00>; - clock-frequency = <0xE4E1C0>; - timebase-frequency = <0xE4E1C0>; + clock-frequency = <0x1036640>; + timebase-frequency = <0x1036640>; cpu@0 { phandle = <0x01>; @@ -51,7 +51,7 @@ uart@10000000 { interrupts = <0x0a>; interrupt-parent = <0x03>; - clock-frequency = <0xE4E1C0>; + clock-frequency = <0x1036640>; reg = <0x00 0x10000000 0x00 0x100>; compatible = "ns16550a"; }; diff --git a/src/cache/cache.sv b/src/cache/cache.sv index 1275d0cbc..d4d03a302 100644 --- a/src/cache/cache.sv +++ b/src/cache/cache.sv @@ -33,6 +33,7 @@ module cache import cvw::*; #(parameter cvw_t P, input logic reset, input logic Stall, // Stall the cache, preventing new accesses. In-flight access finished but does not return to READY input logic FlushStage, // Pipeline flush of second stage (prevent writes and bus operations) + input logic IgnoreRequestTLB, // // cpu side input logic [1:0] CacheRW, // [1] Read, [0] Write input logic [1:0] CacheAtomic, // Atomic operation @@ -210,7 +211,7 @@ module cache import cvw::*; #(parameter cvw_t P, ///////////////////////////////////////////////////////////////////////////////////////////// // Cache FSM ///////////////////////////////////////////////////////////////////////////////////////////// - + cachefsm #(READ_ONLY_CACHE) cachefsm(.clk, .reset, .CacheBusRW, .CacheBusAck, .FlushStage, .CacheRW, .CacheAtomic, .Stall, .CacheHit, .LineDirty, .CacheStall, .CacheCommitted, diff --git a/src/ifu/ifu.sv b/src/ifu/ifu.sv index de0200058..db588441f 100644 --- a/src/ifu/ifu.sv +++ b/src/ifu/ifu.sv @@ -229,13 +229,14 @@ module ifu import cvw::*; #(parameter cvw_t P) ( logic [P.PA_BITS-1:0] ICacheBusAdr; logic ICacheBusAck; logic [1:0] CacheBusRW, BusRW, CacheRWF; + logic [1:0] CacheBusRWTemp; assign BusRW = ~ITLBMissF & ~CacheableF & ~SelIROM ? IFURWF : '0; assign CacheRWF = ~ITLBMissF & CacheableF & ~SelIROM ? 
IFURWF : '0; cache #(.P(P), .PA_BITS(P.PA_BITS), .XLEN(P.XLEN), .LINELEN(P.ICACHE_LINELENINBITS), .NUMLINES(P.ICACHE_WAYSIZEINBYTES*8/P.ICACHE_LINELENINBITS), .NUMWAYS(P.ICACHE_NUMWAYS), .LOGBWPL(LOGBWPL), .WORDLEN(32), .MUXINTERVAL(16), .READ_ONLY_CACHE(1)) - icache(.clk, .reset, .FlushStage(FlushD), .Stall(GatedStallD), + icache(.clk, .reset, .FlushStage(FlushD), .IgnoreRequestTLB(1'b0), .Stall(GatedStallD), .FetchBuffer, .CacheBusAck(ICacheBusAck), .CacheBusAdr(ICacheBusAdr), .CacheStall(ICacheStallF), .CacheBusRW, @@ -249,6 +250,7 @@ module ifu import cvw::*; #(parameter cvw_t P) ( .NextSet(PCSpillNextF[11:0]), .PAdr(PCPF), .CacheCommitted(CacheCommittedF), .InvalidateCache(InvalidateICacheM)); + ahbcacheinterface #(P.AHBW, P.LLEN, P.PA_BITS, WORDSPERLINE, LOGBWPL, LINELEN, LLENPOVERAHBW, 1) ahbcacheinterface(.HCLK(clk), .HRESETn(~reset), .HRDATA, diff --git a/src/lsu/lsu.sv b/src/lsu/lsu.sv index 9fda87cd0..4782b8aa2 100644 --- a/src/lsu/lsu.sv +++ b/src/lsu/lsu.sv @@ -255,26 +255,31 @@ module lsu import cvw::*; #(parameter cvw_t P) ( logic [1:0] CacheRWM; // Cache read (10), write (01), AMO (11) logic [1:0] CacheAtomicM; // Cache AMO logic FlushDCache; // Suppress d cache flush if there is an ITLB miss. + logic CacheStall; + logic [1:0] CacheBusRWTemp; assign BusRW = ~CacheableM & ~IgnoreRequestTLB & ~SelDTIM ? LSURWM : '0; assign CacheableOrFlushCacheM = CacheableM | FlushDCacheM; - assign CacheRWM = CacheableM & ~IgnoreRequestTLB & ~SelDTIM ? LSURWM : '0; - assign CacheAtomicM = CacheableM & ~IgnoreRequestTLB & ~SelDTIM ? LSUAtomicM : '0; - assign FlushDCache = FlushDCacheM & ~(IgnoreRequestTLB | SelHPTW); + assign CacheRWM = CacheableM & ~SelDTIM ? LSURWM : '0; + assign CacheAtomicM = CacheableM & ~SelDTIM ? 
LSUAtomicM : '0; + assign FlushDCache = FlushDCacheM & ~(SelHPTW); // *** need RT to add support for CMOpM and LSUPrefetchM (DH 7/2/23) // *** prefetch can just act as a read operation cache #(.P(P), .PA_BITS(P.PA_BITS), .XLEN(P.XLEN), .LINELEN(P.DCACHE_LINELENINBITS), .NUMLINES(P.DCACHE_WAYSIZEINBYTES*8/LINELEN), .NUMWAYS(P.DCACHE_NUMWAYS), .LOGBWPL(LLENLOGBWPL), .WORDLEN(P.LLEN), .MUXINTERVAL(P.LLEN), .READ_ONLY_CACHE(0)) dcache( - .clk, .reset, .Stall(GatedStallW), .SelBusBeat, .FlushStage(FlushW), .CacheRW(CacheRWM), .CacheAtomic(CacheAtomicM), + .clk, .reset, .Stall(GatedStallW), .SelBusBeat, .FlushStage(FlushW | IgnoreRequestTLB), .IgnoreRequestTLB, .CacheRW(CacheRWM), .CacheAtomic(CacheAtomicM), .FlushCache(FlushDCache), .NextSet(IEUAdrE[11:0]), .PAdr(PAdrM), .ByteMask(ByteMaskM), .BeatCount(BeatCount[AHBWLOGBWPL-1:AHBWLOGBWPL-LLENLOGBWPL]), .CacheWriteData(LSUWriteDataM), .SelHPTW, - .CacheStall(DCacheStallM), .CacheMiss(DCacheMiss), .CacheAccess(DCacheAccess), + .CacheStall, .CacheMiss(DCacheMiss), .CacheAccess(DCacheAccess), .CacheCommitted(DCacheCommittedM), .CacheBusAdr(DCacheBusAdr), .ReadDataWord(DCacheReadDataWordM), - .FetchBuffer, .CacheBusRW, + .FetchBuffer, .CacheBusRW(CacheBusRWTemp), .CacheBusAck(DCacheBusAck), .InvalidateCache(1'b0)); + + assign DCacheStallM = CacheStall & ~IgnoreRequestTLB; + assign CacheBusRW = IgnoreRequestTLB ? 2'b0 : CacheBusRWTemp; ahbcacheinterface #(.AHBW(P.AHBW), .LLEN(P.LLEN), .PA_BITS(P.PA_BITS), .BEATSPERLINE(BEATSPERLINE), .AHBWLOGBWPL(AHBWLOGBWPL), .LINELEN(LINELEN), .LLENPOVERAHBW(LLENPOVERAHBW), .READ_ONLY_CACHE(0)) ahbcacheinterface( .HCLK(clk), .HRESETn(~reset), .Flush(FlushW),