diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 97f8d30ad..daa6a1154 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -1,23 +1,23 @@
 before_script:
-  - export CXX=g++-7 CC=gcc-7
   # paths to local or network installations (the riscv toolchain and 
   # verilator are not built in the ci job as in travis)
-  - export QUESTASIM_HOME=
-  - export QUESTASIM_VERSION=
-  - export QUESTASIM_FLAGS=
-  - export RISCV=/scratch/$USER/projects/riscv_install
-  - export VERILATOR_ROOT=/scratch/$USER/projects/verilator-3.924
+  - export QUESTASIM_HOME=/usr/pack/modelsim-10.6b-kgf/questasim/
+  - export QUESTASIM_VERSION=-10.6b
+  - export QUESTASIM_FLAGS=-noautoldlibpath
+  - export CXX=g++-7.2.0 CC=gcc-7.2.0
+  - export RISCV=/usr/scratch2/larain1/gitlabci/riscv_install
+  - export VERILATOR_ROOT=/usr/scratch2/larain1/gitlabci/verilator-3.924
   # setup dependent paths
   - export PATH=${RISCV}/bin:$VERILATOR_ROOT/bin:${PATH}
   - export LIBRARY_PATH=$RISCV/lib
-  - export LD_LIBRARY_PATH=$RISCV/lib
-  - export C_INCLUDE_PATH=$RISCV/include:$VERILATOR_ROOT/include
-  - export CPLUS_INCLUDE_PATH=$RISCV/include:$VERILATOR_ROOT/include
+  - export LD_LIBRARY_PATH=$RISCV/lib:/usr/pack/gcc-7.2.0-af/linux-x64/lib64/
+  - export C_INCLUDE_PATH=$RISCV/include:$VERILATOR_ROOT/include:/usr/pack/gcc-7.2.0-af/linux-x64/include
+  - export CPLUS_INCLUDE_PATH=$RISCV/include:$VERILATOR_ROOT/include:/usr/pack/gcc-7.2.0-af/linux-x64/include
   # number of parallel jobs to use for make commands and simulation
   - export NUM_JOBS=4
   - ci/make-tmp.sh
   - git submodule update --init --recursive
-
+   
 variables:
   GIT_SUBMODULE_STRATEGY: recursive
 
@@ -52,33 +52,9 @@ run-benchmarks-questa:
   dependencies:
     - build    
 
-# rv64ui-p-* tests
-run-asm-tests1-verilator:
-  stage: test_std
-  script:
-    - make -j${NUM_JOBS} run-asm-tests1-verilator 
-  dependencies:
-    - build
-
-# rv64ui-v-* tests
-run-asm-tests2-verilator:
-  stage: test_std
-  script:
-    - make -j${NUM_JOBS} run-asm-tests2-verilator 
-  dependencies:
-    - build    
-
-run-benchmarks-verilator:
-  stage: test_std
-  script:
-    - make -j${NUM_JOBS} run-benchmarks-verilator 
-  dependencies:
-    - build
-
 torture:
   stage: test_std
   script:
     - make torture-rtest
-    - make torture-rtest-verilator
   dependencies:
     - build
diff --git a/.gitmodules b/.gitmodules
index 38c356f40..d25c969d3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,12 +4,21 @@
 [submodule "src/axi_node"]
 	path = src/axi_node
 	url = https://github.com/pulp-platform/axi_node.git
+[submodule "src/fpu"]
+	path = src/fpu
+	url = https://github.com/pulp-platform/fpnew.git
 [submodule "src/fpga-support"]
 	path = src/fpga-support
 	url = https://github.com/pulp-platform/fpga-support.git
 [submodule "src/common_cells"]
 	path = src/common_cells
-	url = https://github.com/pulp-platform/common_cells.git
+  url = https://github.com/pulp-platform/common_cells.git
 [submodule "src/axi"]
 	path = src/axi
 	url = https://github.com/pulp-platform/axi.git
+[submodule "src/fpu_div_sqrt_mvp"]
+	path = src/fpu_div_sqrt_mvp
+	url = https://github.com/pulp-platform/fpu_div_sqrt_mvp.git
+[submodule "src/tech_cells_generic"]
+	path = src/tech_cells_generic
+	url = https://github.com/pulp-platform/tech_cells_generic.git
diff --git a/Bender.yml b/Bender.yml
index 1b12de061..5b169a76d 100644
--- a/Bender.yml
+++ b/Bender.yml
@@ -3,15 +3,48 @@ package:
   authors: [ "Florian Zaruba <zarubaf@iis.ee.ethz.ch>" ]
 
 dependencies:
-  axi:                { git: "git@iis-git.ee.ethz.ch:sasa/axi.git",                     rev: master      }
-  axi2per:            { git: "git@iis-git.ee.ethz.ch:pulp-open/axi2per.git",            rev: master      }
-  axi_mem_if:         { git: "git@github.com:pulp-platform/axi_mem_if.git",             rev: master      }
-  axi_node:           { git: "git@iis-git.ee.ethz.ch:pulp-open/axi_node.git",           version: v1.1.0  }
-  axi_slice:          { git: "git@iis-git.ee.ethz.ch:sasa/axi_slice.git",               version: 1.1.2   }
-  tech_cells_generic: { git: "git@iis-git.ee.ethz.ch:pulp-open/tech_cells_generic.git", rev: master      }
-  common_cells:       { git: "git@iis-git.ee.ethz.ch:sasa/common_cells.git",            version: v1.7.4  }
-  fpga-support:       { git: "https://github.com/pulp-platform/fpga-support.git",       version: v0.3.2  }
+  axi:                { git: "https://github.com/pulp-platform/axi.git",                version: 0.4.5 }
+  axi_mem_if:         { git: "https://github.com/pulp-platform/axi_mem_if.git",         version: 0.2.0 }
+  axi_node:           { git: "https://github.com/pulp-platform/axi_node.git",           version: 1.1.1 }
+  tech_cells_generic: { git: "https://github.com/pulp-platform/tech_cells_generic.git", version: 0.1.1 }
+  common_cells:       { git: "https://github.com/pulp-platform/common_cells.git",       version: 1.7.5 }
+  fpga-support:       { git: "https://github.com/pulp-platform/fpga-support.git",       version: 0.3.2 }
+
 sources:
+  - src/fpu_div_sqrt_mvp/hdl/fpu_ff.sv
+  - src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv
+  - src/fpu_div_sqrt_mvp/hdl/control_mvp.sv
+  - src/fpu_div_sqrt_mvp/hdl/div_sqrt_mvp_wrapper.sv
+  - src/fpu_div_sqrt_mvp/hdl/div_sqrt_top_mvp.sv
+  - src/fpu_div_sqrt_mvp/hdl/iteration_div_sqrt_mvp.sv
+  - src/fpu_div_sqrt_mvp/hdl/norm_div_sqrt_mvp.sv
+  - src/fpu_div_sqrt_mvp/hdl/nrbd_nrsc_mvp.sv
+  - src/fpu_div_sqrt_mvp/hdl/preprocess_mvp.sv
+  - src/fpu/src/pkg/fpnew_pkg.vhd
+  - src/fpu/src/pkg/fpnew_fmts_pkg.vhd
+  - src/fpu/src/pkg/fpnew_comps_pkg.vhd
+  - src/fpu/src/pkg/fpnew_pkg_constants.vhd
+  - src/fpu/src/utils/fp_pipe.vhd
+  - src/fpu/src/utils/fp_rounding.vhd
+  - src/fpu/src/utils/fp_arbiter.vhd
+  - src/fpu/src/ops/fma_core.vhd
+  - src/fpu/src/ops/fp_fma.vhd
+  - src/fpu/src/ops/fp_divsqrt_multi.vhd
+  - src/fpu/src/ops/fp_noncomp.vhd
+  - src/fpu/src/ops/fp_f2fcasts_fmt.vhd
+  - src/fpu/src/ops/fp_f2icasts_fmt.vhd
+  - src/fpu/src/ops/fp_i2fcasts_fmt.vhd
+  - src/fpu/src/subunits/addmul_fmt_slice.vhd
+  - src/fpu/src/subunits/addmul_block.vhd
+  - src/fpu/src/subunits/divsqrt_multifmt_slice.vhd
+  - src/fpu/src/subunits/divsqrt_block.vhd
+  - src/fpu/src/subunits/noncomp_fmt_slice.vhd
+  - src/fpu/src/subunits/noncomp_block.vhd
+  - src/fpu/src/subunits/conv_fmt_slice.vhd
+  - src/fpu/src/subunits/conv_ifmt_slice.vhd
+  - src/fpu/src/subunits/conv_block.vhd
+  - src/fpu/src/fpnew.vhd
+  - src/fpu/src/fpnew_top.vhd
   - include/riscv_pkg.sv
   - src/debug/dm_pkg.sv
   - include/ariane_pkg.sv
@@ -21,49 +54,43 @@ sources:
       - src/util/instruction_tracer_pkg.sv
       - src/util/instruction_tracer_if.sv
   - src/alu.sv
+  - src/fpu_wrap.sv
   - src/ariane.sv
   - src/branch_unit.sv
-  - src/cache_ctrl.sv
-  - src/commit_stage.sv
   - src/compressed_decoder.sv
   - src/controller.sv
   - src/csr_buffer.sv
   - src/csr_regfile.sv
   - src/decoder.sv
   - src/ex_stage.sv
-  - src/frontend/btb.sv,
-  - src/frontend/bht.sv,
-  - src/frontend/ras.sv,
-  - src/frontend/instr_scan.sv,
+  - src/frontend/btb.sv
+  - src/frontend/bht.sv
+  - src/frontend/ras.sv
+  - src/frontend/instr_scan.sv
   - src/frontend/frontend.sv
-  - src/icache.sv
   - src/id_stage.sv
   - src/instr_realigner.sv
   - src/issue_read_operands.sv
   - src/issue_stage.sv
-  - src/lfsr.sv
   - src/load_unit.sv
   - src/lsu_arbiter.sv
   - src/lsu.sv
-  - src/miss_handler.sv
   - src/mmu.sv
   - src/mult.sv
-  - src/nbdcache.sv
-  - src/vdregs.sv
   - src/perf_counters.sv
   - src/ptw.sv
-  - src/std_cache_subsystem.sv
-  - src/sram_wrapper.sv
-  # - src/ariane_regfile_ff.sv
-  - src/ariane_regfile.sv
+  - src/ariane_regfile_ff.sv
+  # - src/ariane_regfile.sv
   - src/re_name.sv
   - src/scoreboard.sv
   - src/store_buffer.sv
+  - src/amo_buffer.sv
   - src/store_unit.sv
   - src/tlb.sv
   - src/commit_stage.sv
   - src/axi_adapter.sv
   - src/cache_subsystem/cache_ctrl.sv
+  - src/cache_subsystem/amo_alu.sv
   - src/cache_subsystem/miss_handler.sv
   - src/cache_subsystem/std_cache_subsystem.sv
   - src/cache_subsystem/std_icache.sv
@@ -76,4 +103,5 @@ sources:
   - src/debug/dm_top.sv
   - src/debug/dmi_cdc.sv
   - src/debug/dmi_jtag.sv
+  - src/debug/dm_sba.sv
   - src/debug/dmi_jtag_tap.sv
diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 000000000..e3285d0ad
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1 @@
+*  @zarubaf @msfschaffner
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 44337a124..084df0aa1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -20,6 +20,7 @@ See [style-guidlines](https://github.com/pulp-platform/style-guidelines)
     * :fire: `:fire` Removing code or files.
     * :memo: `:memo:` When writing docs
     * :bug: `:bug:` When fixing a bug
+    * :fire: `:fire:` When removing code or files
     * :wastebasket: `:wastebasket:` When removing code or files
     * :green_heart: `:green_heart:` When fixing the CI build
     * :construction_worker: `:construction_worker:` Adding CI build system
@@ -28,17 +29,16 @@ See [style-guidlines](https://github.com/pulp-platform/style-guidelines)
     * :arrow_up: `:arrow_up:` When upgrading dependencies
     * :arrow_down: `:arrow_down:` When downgrading dependencies
     * :rotating_light: `:rotating_light:` When removing linter warnings
-    * :pencil2: `:pencil2:` Fixing typos
-    * :recycle: `:recycle:` Refactoring code.
+    * :pencil2: `:pencil2:` Fixing typos
+    * :recycle: `:recycle:` Refactoring code.
     * :boom: `:boom:` Introducing breaking changes
-    * :truck: `:truck:` Moving or renaming files.
+    * :truck: `:truck:` Moving or renaming files.
     * :space_invader: `:space_invader:` When fixing something synthesis related
     * :beers: `:beer:` Writing code drunkenly.
-    * :ok_hand: `:ok_hand:` Updating code due to code review changes
+    * :ok_hand: `:ok_hand:` Updating code due to code review changes
     * :building_construction: `:building_construction:` Making architectural changes.
-    * :wrench: `:wrench:` Tooling
-    * :construction: `:construction:` Work In Progress WIP
-    * :bookmark: `:bookmark:` version tag
+
+For a detailed explanation of why and how, please refer to one of the many [resources](https://chris.beams.io/posts/git-commit/) on writing git commit messages.
 
 If you use `vi` for your commit message, consider to put the following snippet inside your `~/.vimrc`:
 
diff --git a/Makefile b/Makefile
index a76620921..1445cb108 100755
--- a/Makefile
+++ b/Makefile
@@ -29,18 +29,23 @@ torture-logs := -log
 
 # Sources
 # Package files -> compile first
-ariane_pkg := include/riscv_pkg.sv       \
-              src/debug/dm_pkg.sv        \
-              include/ariane_pkg.sv      \
-              include/std_cache_pkg.sv   \
-              src/axi/src/axi_pkg.sv     \
-              include/axi_intf.sv
+ariane_pkg := include/riscv_pkg.sv                          \
+              src/debug/dm_pkg.sv                           \
+              include/ariane_pkg.sv                         \
+              include/std_cache_pkg.sv                      \
+              src/axi/src/axi_pkg.sv                        \
+              include/axi_intf.sv                           \
+              src/fpu/src/pkg/fpnew_pkg.vhd                 \
+              src/fpu/src/pkg/fpnew_fmts_pkg.vhd            \
+              src/fpu/src/pkg/fpnew_comps_pkg.vhd           \
+              src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv \
+              src/fpu/src/pkg/fpnew_pkg_constants.vhd
 
 # utility modules
-util := $(wildcard src/util/*.svh)         \
-        src/util/instruction_tracer_pkg.sv \
-        src/util/instruction_tracer_if.sv  \
-        src/util/cluster_clock_gating.sv   \
+util := $(wildcard src/util/*.svh)                            \
+        src/util/instruction_tracer_pkg.sv                    \
+        src/util/instruction_tracer_if.sv                     \
+        src/tech_cells_generic/src/cluster_clock_gating.sv    \
         src/util/sram.sv
 
 # Test packages
@@ -51,6 +56,11 @@ dpi := $(patsubst tb/dpi/%.cc,${dpi-library}/%.o,$(wildcard tb/dpi/*.cc))
 dpi_hdr := $(wildcard tb/dpi/*.h)
 # this list contains the standalone components
 src :=  $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv))      \
+        $(wildcard src/fpu/src/utils/*.vhd)                            \
+        $(wildcard src/fpu/src/ops/*.vhd)                              \
+        $(wildcard src/fpu/src/subunits/*.vhd)                         \
+        $(filter-out src/fpu_div_sqrt_mvp/hdl/defs_div_sqrt_mvp.sv,    \
+        $(wildcard src/fpu_div_sqrt_mvp/hdl/*.sv))                     \
         $(wildcard src/frontend/*.sv)                                  \
         $(wildcard src/cache_subsystem/*.sv)                           \
         $(wildcard bootrom/*.sv)                                       \
@@ -59,6 +69,12 @@ src :=  $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv))      \
         $(wildcard src/axi_mem_if/src/*.sv)                            \
         $(filter-out src/debug/dm_pkg.sv, $(wildcard src/debug/*.sv))  \
         $(wildcard src/debug/debug_rom/*.sv)                           \
+        src/fpu/src/fpnew.vhd                                          \
+        src/fpu/src/fpnew_top.vhd                                      \
+        src/common_cells/src/deprecated/generic_fifo.sv                \
+        src/common_cells/src/deprecated/pulp_sync.sv                   \
+        src/common_cells/src/deprecated/find_first_one.sv              \
+        src/common_cells/src/rstgen_bypass.sv                          \
         src/axi/src/axi_cut.sv                                         \
         src/axi/src/axi_join.sv                                        \
         src/fpga-support/rtl/SyncSpRamBeNx64.sv                        \
@@ -71,12 +87,12 @@ src :=  $(filter-out src/ariane_regfile.sv, $(wildcard src/*.sv))      \
         src/common_cells/src/lzc.sv                                    \
         src/common_cells/src/rrarbiter.sv                              \
         src/common_cells/src/lfsr_8bit.sv                              \
+        src/tech_cells_generic/src/cluster_clock_inverter.sv           \
+        src/tech_cells_generic/src/pulp_clock_mux2.sv                  \
         tb/ariane_testharness.sv                                       \
         tb/common/SimDTM.sv                                            \
         tb/common/SimJTAG.sv
 
-
-
 # root path
 root-dir := $(shell pwd)
 # look for testbenches
@@ -96,6 +112,7 @@ riscv-test ?= rv64ui-p-add
 incdir :=
 # Compile and sim flags
 compile_flag += +cover=bcfst+/dut -incr -64 -nologo -quiet -suppress 13262 -permissive +define+$(defines)
+compile_flag_vhd += -64 -nologo -quiet -2008
 uvm-flags    += +UVM_NO_RELNOTES
 # Iterate over all include directories and write them with +incdir+ prefixed
 # +incdir+ works for Verilator and QuestaSim
@@ -113,9 +130,11 @@ build: $(library) $(library)/.build-srcs $(library)/.build-tb $(dpi-library)/ari
 # src files
 $(library)/.build-srcs: $(ariane_pkg) $(util) $(src) $(library)
 	vlog$(questa_version) $(compile_flag) -work $(library) $(filter %.sv,$(ariane_pkg)) $(list_incdir) -suppress 2583
+	vcom$(questa_version) $(compile_flag_vhd) -work $(library) -pedanticerrors $(filter %.vhd,$(ariane_pkg))
 	vlog$(questa_version) $(compile_flag) -work $(library) $(filter %.sv,$(util)) $(list_incdir) -suppress 2583
 	# Suppress message that always_latch may not be checked thoroughly by QuestaSim.
-	vlog$(questa_version) $(compile_flag) -work $(library) -pedanticerrors $(src) $(list_incdir) -suppress 2583
+	vcom$(questa_version) $(compile_flag_vhd) -work $(library) -pedanticerrors $(filter %.vhd,$(src))
+	vlog$(questa_version) $(compile_flag) -work $(library) -pedanticerrors $(filter %.sv,$(src)) $(list_incdir) -suppress 2583
 	touch $(library)/.build-srcs
 
 # build TBs
@@ -137,19 +156,25 @@ $(dpi-library)/ariane_dpi.so: $(dpi)
 	# Compile C-code and generate .so file
 	$(CXX) -shared -m64 -o $(dpi-library)/ariane_dpi.so $? -lfesvr
 
-
 sim: build
-	vsim${questa_version} +permissive -64 -lib ${library} +max-cycles=$(max_cycles) +UVM_TESTNAME=${test_case}    \
-	+BASEDIR=$(riscv-test-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug  +jtag_rbb_enable=0        \
-	$(QUESTASIM_FLAGS)                                                                                            \
-	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi -do " log -r /*; run -all; exit"            \
+	vsim${questa_version} +permissive -64 -lib ${library} +max-cycles=$(max_cycles) +UVM_TESTNAME=${test_case}        \
+	+BASEDIR=$(riscv-test-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug  +jtag_rbb_enable=0            \
+	$(QUESTASIM_FLAGS)                                                                                                \
+	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi -do " set StdArithNoWarnings 1; set NumericStdNoWarnings 1; log -r /*; run -all; exit"  \
     ${top_level}_optimized +permissive-off ++$(riscv-test-dir)/$(riscv-test) ++$(target-options)
 
 simc: build
 	vsim${questa_version} +permissive -64 -c -lib ${library} +max-cycles=$(max_cycles) +UVM_TESTNAME=${test_case} \
 	+BASEDIR=$(riscv-test-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug +jtag_rbb_enable=0         \
 	$(QUESTASIM_FLAGS)                                                                                            \
-	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi -do " run -all; exit"                       \
+	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi -do " set StdArithNoWarnings 1; set NumericStdNoWarnings 1; run -all; exit"                       \
+    ${top_level}_optimized +permissive-off ++$(riscv-test-dir)/$(riscv-test) ++$(target-options)
+
+simc-log: build
+	vsim${questa_version} +permissive -64 -c -lib ${library} +max-cycles=$(max_cycles) +UVM_TESTNAME=${test_case} \
+	+BASEDIR=$(riscv-test-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug +jtag_rbb_enable=0         \
+	$(QUESTASIM_FLAGS)                                                                                            \
+	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi -do " set StdArithNoWarnings 1; set NumericStdNoWarnings 1; log -r /*; run -all; exit"                       \
     ${top_level}_optimized +permissive-off ++$(riscv-test-dir)/$(riscv-test) ++$(target-options)
 
 $(riscv-asm-tests): build
@@ -157,7 +182,7 @@ $(riscv-asm-tests): build
 	+BASEDIR=$(riscv-test-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug +jtag_rbb_enable=0         \
 	$(QUESTASIM_FLAGS)                                                                                            \
 	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi                                             \
-	-do "coverage save -onexit tmp/$@.ucdb; run -a; quit -code [coverage attribute -name TESTSTATUS -concise]"    \
+	-do " set StdArithNoWarnings 1; set NumericStdNoWarnings 1; coverage save -onexit tmp/$@.ucdb; run -a; quit -code [coverage attribute -name TESTSTATUS -concise]"    \
 	${top_level}_optimized +permissive-off ++$(riscv-test-dir)/$@ ++$(target-options) | tee tmp/riscv-asm-tests-$@.log
 
 $(riscv-benchmarks): build
@@ -165,27 +190,27 @@ $(riscv-benchmarks): build
 	+BASEDIR=$(riscv-benchmarks-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug +jtag_rbb_enable=0   \
 	$(QUESTASIM_FLAGS)                                                                                            \
 	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi                                             \
-	-do "coverage save -onexit tmp/$@.ucdb; run -a; quit -code [coverage attribute -name TESTSTATUS -concise]"    \
+	-do " set StdArithNoWarnings 1; set NumericStdNoWarnings 1; coverage save -onexit tmp/$@.ucdb; run -a; quit -code [coverage attribute -name TESTSTATUS -concise]"    \
 	${top_level}_optimized +permissive-off ++$(riscv-benchmarks-dir)/$@ ++$(target-options) | tee tmp/riscv-benchmarks-$@.log
 
 # can use -jX to run ci tests in parallel using X processes
 run-asm-tests: $(riscv-asm-tests)
-	make check-asm-tests
+	$(MAKE) check-asm-tests
 
 check-asm-tests:
 	ci/check-tests.sh tmp/riscv-asm-tests- $(shell wc -l $(riscv-asm-tests-list) | awk -F " " '{ print $1 }')
 
 # can use -jX to run ci tests in parallel using X processes
 run-benchmarks: $(riscv-benchmarks)
-	make check-benchmarks
+	$(MAKE) check-benchmarks
 
 check-benchmarks:
 	ci/check-tests.sh tmp/riscv-benchmarks- $(shell wc -l $(riscv-benchmarks-list) | awk -F " " '{ print $1 }')
 
 # verilator-specific
 verilate_command := $(verilator)                                                           \
-                    $(ariane_pkg)                                                          \
-                    $(filter-out tb/ariane_bt.sv,$(src))                                   \
+                    $(filter-out %.vhd, $(ariane_pkg))                                     \
+                    $(filter-out src/fpu_wrap.sv, $(filter-out %.vhd, $(src)))             \
                     +define+$(defines)                                                     \
                     src/util/sram.sv                                                       \
                     +incdir+src/axi_node                                                   \
@@ -209,7 +234,7 @@ verilate_command := $(verilator)
 # User Verilator, at some point in the future this will be auto-generated
 verilate:
 	$(verilate_command)
-	cd $(ver-library) && make -j${NUM_JOBS} -f Variane_testharness.mk
+	cd $(ver-library) && $(MAKE) -j${NUM_JOBS} -f Variane_testharness.mk
 
 $(addsuffix -verilator,$(riscv-asm-tests)): verilate
 	$(ver-library)/Variane_testharness $(riscv-test-dir)/$(subst -verilator,,$@)
@@ -235,29 +260,29 @@ torture-itest:
 	cd $(riscv-torture-dir) && $(riscv-torture-bin) 'testrun/run -a output/test.S'
 
 torture-rtest: build
-	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && make run-torture$(torture-logs) defines=$(defines) test-location=$(test-location)" > call.sh && chmod +x call.sh
+	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && $(MAKE) run-torture$(torture-logs) defines=$(defines) test-location=$(test-location)" > call.sh && chmod +x call.sh
 	cd $(riscv-torture-dir) && $(riscv-torture-bin) 'testrun/run -r ./call.sh -a $(test-location).S' | tee $(test-location).log
 	make check-torture test-location=$(test-location)
 
 torture-dummy: build
-	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && make run-torture defines=$(defines) test-location=\$${@: -1}" > call.sh
+	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && $(MAKE) run-torture defines=$(defines) test-location=\$${@: -1}" > call.sh
 
 torture-rnight: build
-	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && make run-torture$(torture-logs) defines=$(defines) test-location=\$${@: -1}" > call.sh && chmod +x call.sh
+	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && $(MAKE) run-torture$(torture-logs) defines=$(defines) test-location=\$${@: -1}" > call.sh && chmod +x call.sh
 	cd $(riscv-torture-dir) && $(riscv-torture-bin) 'overnight/run -r ./call.sh -g none' | tee output/overnight.log
-	make check-torture
+	$(MAKE) check-torture
 
 torture-rtest-verilator: verilate
-	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && make run-torture-verilator defines=$(defines)" > call.sh && chmod +x call.sh
+	cd $(riscv-torture-dir) && printf "#!/bin/sh\ncd $(root-dir) && $(MAKE) run-torture-verilator defines=$(defines)" > call.sh && chmod +x call.sh
 	cd $(riscv-torture-dir) && $(riscv-torture-bin) 'testrun/run -r ./call.sh -a output/test.S' | tee output/test.log
-	make check-torture
+	$(MAKE) check-torture
 
 run-torture: build
 	vsim${questa_version} +permissive -64 -c -lib ${library} +max-cycles=$(max_cycles)+UVM_TESTNAME=${test_case}  \
 	+BASEDIR=$(riscv-torture-dir) $(uvm-flags) "+UVM_VERBOSITY=LOW" -coverage -classdebug +jtag_rbb_enable=0      \
 	$(QUESTASIM_FLAGS)                                                                                            \
 	-gblso $(RISCV)/lib/libfesvr.so -sv_lib $(dpi-library)/ariane_dpi                                             \
-	-do "coverage save -onexit tmp/$@.ucdb; run -a; quit -code [coverage attribute -name TESTSTATUS -concise]"    \
+	-do " set StdArithNoWarnings 1; set NumericStdNoWarnings 1; coverage save -onexit tmp/$@.ucdb; run -a; quit -code [coverage attribute -name TESTSTATUS -concise]"    \
 	${top_level}_optimized +permissive-off                                                                        \
 	+signature=$(riscv-torture-dir)/$(test-location).rtlsim.sig ++$(riscv-torture-dir)/$(test-location) ++$(target-options)
 
@@ -271,6 +296,7 @@ run-torture-log: build
 	+signature=$(riscv-torture-dir)/$(test-location).rtlsim.sig ++$(riscv-torture-dir)/$(test-location) ++$(target-options)
 	cp vsim.wlf $(riscv-torture-dir)/$(test-location).wlf
 	cp trace_core_00_0.log $(riscv-torture-dir)/$(test-location).trace
+	cp trace_core_00_0_commit.log $(riscv-torture-dir)/$(test-location).commit
 	cp transcript $(riscv-torture-dir)/$(test-location).transcript
 
 run-torture-verilator: verilate
diff --git a/README.md b/README.md
index 72ec74dc7..766e0dcfd 100644
--- a/README.md
+++ b/README.md
@@ -81,6 +81,10 @@ $ make simc riscv-test-dir=$RISCV/riscv64-unknown-elf/bin riscv-test=pk target-o
 
 > Be patient! RTL simulation is way slower than Spike. If you think that you ran into problems you can inspect the trace files.
 
+### FPU Support
+
+> There is preliminary support for floating point extensions F and D. At the moment floating point support will only be available in QuestaSim as the FPU is written in VHDL. This is likely to change. The floating point extensions can be enabled by setting `RVF` and `RVD` to `1'b1` in the `include/ariane_pkg.sv` file.
+
 ## FPGA Emulation
 
 Coming.
diff --git a/ci/riscv-asm-tests.list b/ci/riscv-asm-tests.list
index 01e12844b..e03f7c392 100644
--- a/ci/riscv-asm-tests.list
+++ b/ci/riscv-asm-tests.list
@@ -164,4 +164,4 @@ rv64ua-v-amomin_d
 rv64ua-v-amomin_w
 rv64ua-v-amominu_d
 rv64ua-v-amominu_w
-rv64ua-v-lrsc
\ No newline at end of file
+rv64ua-v-lrsc
diff --git a/include/ariane_pkg.sv b/include/ariane_pkg.sv
index b920db551..7db67cb16 100644
--- a/include/ariane_pkg.sv
+++ b/include/ariane_pkg.sv
@@ -24,7 +24,7 @@ package ariane_pkg;
     localparam NR_SB_ENTRIES = 8; // number of scoreboard entries
     localparam TRANS_ID_BITS = $clog2(NR_SB_ENTRIES); // depending on the number of scoreboard entries we need that many bits
                                                       // to uniquely identify the entry in the scoreboard
-    localparam NR_WB_PORTS   = 5;
+    localparam NR_WB_PORTS   = 4;
     localparam ASID_WIDTH    = 1;
     localparam BTB_ENTRIES   = 8;
     localparam BHT_ENTRIES   = 32;
@@ -32,18 +32,64 @@ package ariane_pkg;
     localparam BITS_SATURATION_COUNTER = 2;
     localparam NR_COMMIT_PORTS = 2;
 
-    localparam logic [63:0] ISA_CODE =
-                                     | (1 <<  0)  // A - Atomic extension
-                                     | (1 <<  2)  // C - Compressed extension
-                                     | (1 <<  8)  // I - RV32I/64I/128I base ISA
-                                     | (1 << 12)  // M - Integer Multiply/Divide extension
-                                     | (0 << 13)  // N - User level interrupts supported
-                                     | (1 << 18)  // S - Supervisor mode implemented
-                                     | (1 << 20)  // U - User mode implemented
-                                     | (0 << 23)  // X - Non-standard extensions present
-                                     | (1 << 63); // RV64
     localparam ENABLE_RENAME = 1'b1;
 
+    // Floating-point extensions configuration
+    localparam bit RVF = 1'b0; // Is F extension enabled
+    localparam bit RVD = 1'b0; // Is D extension enabled
+    localparam bit RVA = 1'b1; // Is A extension enabled
+
+    // Transprecision floating-point extensions configuration
+    localparam bit XF16    = 1'b0; // Is half-precision float extension (Xf16) enabled
+    localparam bit XF16ALT = 1'b0; // Is alternative half-precision float extension (Xf16alt) enabled
+    localparam bit XF8     = 1'b0; // Is quarter-precision float extension (Xf8) enabled
+    localparam bit XFVEC   = 1'b0; // Is vectorial float extension (Xfvec) enabled
+
+    // Transprecision float unit
+    localparam logic [30:0] LAT_COMP_FP32    = 'd3;
+    localparam logic [30:0] LAT_COMP_FP64    = 'd4;
+    localparam logic [30:0] LAT_COMP_FP16    = 'd3;
+    localparam logic [30:0] LAT_COMP_FP16ALT = 'd3;
+    localparam logic [30:0] LAT_COMP_FP8     = 'd2;
+    localparam logic [30:0] LAT_DIVSQRT      = 'd2;
+    localparam logic [30:0] LAT_NONCOMP      = 'd1;
+    localparam logic [30:0] LAT_CONV         = 'd2;
+
+    // --------------------------------------
+    // vvvv Don't change these by hand! vvvv
+    localparam bit FP_PRESENT = RVF | RVD | XF16 | XF16ALT | XF8;
+
+    // Length of widest floating-point format
+    localparam FLEN    = RVD     ? 64 : // D ext.
+                         RVF     ? 32 : // F ext.
+                         XF16    ? 16 : // Xf16 ext.
+                         XF16ALT ? 16 : // Xf16alt ext.
+                         XF8     ? 8 :  // Xf8 ext.
+                         0;             // Unused in case of no FP
+
+    localparam bit NSX = XF16 | XF16ALT | XF8 | XFVEC; // Are non-standard extensions present?
+
+    localparam bit RVFVEC     = RVF     & XFVEC & FLEN>32; // FP32 vectors available if vectors and larger fmt enabled
+    localparam bit XF16VEC    = XF16    & XFVEC & FLEN>16; // FP16 vectors available if vectors and larger fmt enabled
+    localparam bit XF16ALTVEC = XF16ALT & XFVEC & FLEN>16; // FP16ALT vectors available if vectors and larger fmt enabled
+    localparam bit XF8VEC     = XF8     & XFVEC & FLEN>8;  // FP8 vectors available if vectors and larger fmt enabled
+    // ^^^^ until here ^^^^
+    // ---------------------
+
+    localparam logic [63:0] ARIANE_MARCHID = 64'd3;
+
+    localparam logic [63:0] ISA_CODE = (RVA <<  0)  // A - Atomic Instructions extension
+                                     | (1   <<  2)  // C - Compressed extension
+                                     | (RVD <<  3)  // D - Double precision floating-point extension
+                                     | (RVF <<  5)  // F - Single precision floating-point extension
+                                     | (1   <<  8)  // I - RV32I/64I/128I base ISA
+                                     | (1   << 12)  // M - Integer Multiply/Divide extension
+                                     | (0   << 13)  // N - User level interrupts supported
+                                     | (1   << 18)  // S - Supervisor mode implemented
+                                     | (1   << 20)  // U - User mode implemented
+                                     | (NSX << 23)  // X - Non-standard extensions present
+                                     | (1   << 63); // RV64
+
     // 32 registers + 1 bit for re-naming = 6
     localparam REG_ADDR_SIZE = 6;
 
@@ -57,9 +103,8 @@ package ariane_pkg;
                                                 dataaddr: dm::DataAddr
                                               };
 
-
     // enables a commit log which matches spikes commit log format for easier trace comparison
-    localparam bit ENABLE_SPIKE_COMMIT_LOG = 1'b0;
+    localparam bit ENABLE_SPIKE_COMMIT_LOG = 1'b1;
 
     // ------------- Dangerouse -------------
     // if set to zero a flush will not invalidate the cache-lines, in a single core environment
@@ -152,7 +197,9 @@ package ariane_pkg;
         ALU,       // 3
         CTRL_FLOW, // 4
         MULT,      // 5
-        CSR        // 6
+        CSR,       // 6
+        FPU,       // 7
+        FPU_VEC    // 8
     } fu_t;
 
     localparam EXC_OFF_RST      = 8'h80;
@@ -199,9 +246,94 @@ package ariane_pkg;
                                // Multiplications
                                MUL, MULH, MULHU, MULHSU, MULW,
                                // Divisions
-                               DIV, DIVU, DIVW, DIVUW, REM, REMU, REMW, REMUW
+                               DIV, DIVU, DIVW, DIVUW, REM, REMU, REMW, REMUW,
+                               // Floating-Point Load and Store Instructions
+                               FLD, FLW, FLH, FLB, FSD, FSW, FSH, FSB,
+                               // Floating-Point Computational Instructions
+                               FADD, FSUB, FMUL, FDIV, FMIN_MAX, FSQRT, FMADD, FMSUB, FNMSUB, FNMADD,
+                               // Floating-Point Conversion and Move Instructions
+                               FCVT_F2I, FCVT_I2F, FCVT_F2F, FSGNJ, FMV_F2X, FMV_X2F,
+                               // Floating-Point Compare Instructions
+                               FCMP,
+                               // Floating-Point Classify Instruction
+                               FCLASS,
+                               // Vectorial Floating-Point Instructions that don't directly map onto the scalar ones
+                               VFMIN, VFMAX, VFSGNJ, VFSGNJN, VFSGNJX, VFEQ, VFNE, VFLT, VFGE, VFLE, VFGT, VFCPKAB_S, VFCPKCD_S, VFCPKAB_D, VFCPKCD_D
                              } fu_op;
 
+    typedef struct packed {
+      fu_op        operator;
+      logic [63:0] operand_a;
+      logic [63:0] operand_b;
+      logic [63:0] imm;
+    } fu_data_t;
+
+    // -------------------------------
+    // Extract Src/Dst FP Reg from Op
+    // -------------------------------
+    function automatic logic is_rs1_fpr (input fu_op op);
+        if (FP_PRESENT) begin // makes function static for non-fp case
+            unique case (op) inside
+                [FMUL:FNMADD],                   // Computational Operations (except ADD/SUB)
+                FCVT_F2I,                        // Float-Int Casts
+                FCVT_F2F,                        // Float-Float Casts
+                FSGNJ,                           // Sign Injections
+                FMV_F2X,                         // FPR-GPR Moves
+                FCMP,                            // Comparisons
+                FCLASS,                          // Classifications
+                [VFMIN:VFCPKCD_D] : return 1'b1; // Additional Vectorial FP ops
+                default           : return 1'b0; // all other ops
+            endcase
+        end else
+            return 1'b0;
+    endfunction;
+
+    function automatic logic is_rs2_fpr (input fu_op op);
+        if (FP_PRESENT) begin // makes function static for non-fp case
+            unique case (op) inside
+                [FSD:FSB],                       // FP Stores
+                [FADD:FMIN_MAX],                 // Computational Operations (no sqrt)
+                [FMADD:FNMADD],                  // Fused Computational Operations
+                FCVT_F2F,                        // Vectorial F2F Conversions require target
+                [FSGNJ:FMV_F2X],                 // Sign Injections and moves mapped to SGNJ
+                FCMP,                            // Comparisons
+                [VFMIN:VFCPKCD_D] : return 1'b1; // Additional Vectorial FP ops
+                default           : return 1'b0; // all other ops
+            endcase
+        end else
+            return 1'b0;
+    endfunction;
+
+    // ternary operations encode the rs3 address in the imm field, also add/sub
+    function automatic logic is_imm_fpr (input fu_op op);
+        if (FP_PRESENT) begin // makes function static for non-fp case
+            unique case (op) inside
+                [FADD:FSUB],                         // ADD/SUB need inputs as Operand B/C
+                [FMADD:FNMADD],                      // Fused Computational Operations
+                [VFCPKAB_S:VFCPKCD_D] : return 1'b1; // Vectorial FP cast and pack ops
+                default               : return 1'b0; // all other ops
+            endcase
+        end else
+            return 1'b0;
+    endfunction;
+
+    function automatic logic is_rd_fpr (input fu_op op);
+        if (FP_PRESENT) begin // makes function static for non-fp case
+            unique case (op) inside
+                [FLD:FLB],                           // FP Loads
+                [FADD:FNMADD],                       // Computational Operations
+                FCVT_I2F,                            // Int-Float Casts
+                FCVT_F2F,                            // Float-Float Casts
+                FSGNJ,                               // Sign Injections
+                FMV_X2F,                             // GPR-FPR Moves
+                [VFMIN:VFSGNJX],                     // Vectorial MIN/MAX and SGNJ
+                [VFCPKAB_S:VFCPKCD_D] : return 1'b1; // Vectorial FP cast and pack ops
+                default               : return 1'b0; // all other ops
+            endcase
+        end else
+            return 1'b0;
+    endfunction;
+
     function automatic logic is_amo (fu_op op);
         case (op) inside
             [AMO_LRW:AMO_MINDU]: begin
@@ -244,7 +376,10 @@ package ariane_pkg;
         logic [REG_ADDR_SIZE-1:0] rs1;           // register source address 1
         logic [REG_ADDR_SIZE-1:0] rs2;           // register source address 2
         logic [REG_ADDR_SIZE-1:0] rd;            // register destination address
-        logic [63:0]              result;        // for unfinished instructions this field also holds the immediate
+        logic [63:0]              result;        // for unfinished instructions this field also holds the immediate,
+                                                 // for unfinished floating-point operations that are partly encoded in rs2, this field also holds rs2
+                                                 // for unfinished floating-point fused operations (FMADD, FMSUB, FNMADD, FNMSUB)
+                                                 // this field holds the address of the third operand from the floating-point register file
         logic                     valid;         // is the result valid
         logic                     use_imm;       // should we use the immediate as operand b?
         logic                     use_zimm;      // use zimm as operand a
@@ -433,7 +568,7 @@ package ariane_pkg;
     // ----------------------
     function automatic logic [1:0] extract_transfer_size(fu_op op);
         case (op)
-            LD, SD,
+            LD, SD, FLD, FSD,
             AMO_LRD,   AMO_SCD,
             AMO_SWAPD, AMO_ADDD,
             AMO_ANDD,  AMO_ORD,
@@ -442,7 +577,7 @@ package ariane_pkg;
             AMO_MINDU: begin
                 return 2'b11;
             end
-            LW, LWU, SW,
+            LW, LWU, SW, FLW, FSW,
             AMO_LRW,   AMO_SCW,
             AMO_SWAPW, AMO_ADDW,
             AMO_ANDW,  AMO_ORW,
@@ -451,8 +586,8 @@ package ariane_pkg;
             AMO_MINWU: begin
                 return 2'b10;
             end
-            LH, LHU, SH: return 2'b01;
-            LB, SB, LBU: return 2'b00;
+            LH, LHU, SH, FLH, FSH: return 2'b01;
+            LB, LBU, SB, FLB, FSB: return 2'b00;
             default:     return 2'b11;
         endcase
     endfunction
diff --git a/include/riscv_pkg.sv b/include/riscv_pkg.sv
index 7a223e444..ba5fdf34b 100644
--- a/include/riscv_pkg.sv
+++ b/include/riscv_pkg.sv
@@ -32,6 +32,13 @@ package riscv;
         XLEN_128 = 2'b11
     } xlen_t;
 
+    typedef enum logic [1:0] {
+        Off     = 2'b00,
+        Initial = 2'b01,
+        Clean   = 2'b10,
+        Dirty   = 2'b11
+    } xs_t;
+
     typedef struct packed {
         logic         sd;     // signal dirty - read-only - hardwired zero
         logic [62:36] wpri4;  // writes preserved reads ignored
@@ -44,8 +51,8 @@ package riscv;
         logic         mxr;    // make executable readable
         logic         sum;    // permit supervisor user memory access
         logic         mprv;   // modify privilege - privilege level for ld/st
-        logic [1:0]   xs;     // extension register - hardwired to zero
-        logic [1:0]   fs;     // extension register - hardwired to zero
+        xs_t          xs;     // extension register - hardwired to zero
+        xs_t          fs;     // floating point extension register
         priv_lvl_t    mpp;    // holds the previous privilege mode up to machine
         logic [1:0]   wpri2;  // writes preserved reads ignored
         logic         spp;    // holds the previous privilege mode up to supervisor
@@ -104,6 +111,37 @@ package riscv;
         logic [6:0]   opcode;
     } rtype_t;
 
+    typedef struct packed {
+        logic [31:27] rs3;
+        logic [26:25] funct2;
+        logic [24:20] rs2;
+        logic [19:15] rs1;
+        logic [14:12] funct3;
+        logic [11:7]  rd;
+        logic [6:0]   opcode;
+    } r4type_t;
+
+    typedef struct packed {
+        logic [31:27] funct5;
+        logic [26:25] fmt;
+        logic [24:20] rs2;
+        logic [19:15] rs1;
+        logic [14:12] rm;
+        logic [11:7]  rd;
+        logic [6:0]   opcode;
+    } rftype_t; // floating-point
+
+    typedef struct packed {
+        logic [31:30] funct2;
+        logic [29:25] vecfltop;
+        logic [24:20] rs2;
+        logic [19:15] rs1;
+        logic [14:14] repl;
+        logic [13:12] vfmt;
+        logic [11:7]  rd;
+        logic [6:0]   opcode;
+    } rvftype_t; // vectorial floating-point
+
     typedef struct packed {
         logic [31:20] imm;
         logic [19:15] rs1;
@@ -142,6 +180,9 @@ package riscv;
     typedef union packed {
         logic [31:0]   instr;
         rtype_t        rtype;
+        r4type_t       r4type;
+        rftype_t       rftype;
+        rvftype_t      rvftype;
         itype_t        itype;
         stype_t        stype;
         utype_t        utype;
@@ -151,27 +192,72 @@ package riscv;
     // --------------------
     // Opcodes
     // --------------------
-    localparam OpcodeSystem    = 7'h73;
-    localparam OpcodeFence     = 7'h0f;
-    localparam OpcodeOp        = 7'h33;
-    localparam OpcodeOp32      = 7'h3B;
-    localparam OpcodeOpimm     = 7'h13;
-    localparam OpcodeOpimm32   = 7'h1B;
-    localparam OpcodeStore     = 7'h23;
-    localparam OpcodeStoreFP   = 7'b01_001_11;
-    localparam OpcodeLoad      = 7'h03;
-    localparam OpcodeLoadFP    = 7'b00_001_11;
-    localparam OpcodeBranch    = 7'h63;
-    localparam OpcodeJalr      = 7'h67;
-    localparam OpcodeJal       = 7'h6f;
-    localparam OpcodeAuipc     = 7'h17;
-    localparam OpcodeLui       = 7'h37;
-    localparam OpcodeAmo       = 7'h2F;
+    // RV32/64G listings:
+    // Quadrant 0
+    localparam OpcodeLoad      = 7'b00_000_11;
+    localparam OpcodeLoadFp    = 7'b00_001_11;
+    localparam OpcodeCustom0   = 7'b00_010_11;
+    localparam OpcodeMiscMem   = 7'b00_011_11;
+    localparam OpcodeOpImm     = 7'b00_100_11;
+    localparam OpcodeAuipc     = 7'b00_101_11;
+    localparam OpcodeOpImm32   = 7'b00_110_11;
+    // Quadrant 1
+    localparam OpcodeStore     = 7'b01_000_11;
+    localparam OpcodeStoreFp   = 7'b01_001_11;
+    localparam OpcodeCustom1   = 7'b01_010_11;
+    localparam OpcodeAmo       = 7'b01_011_11;
+    localparam OpcodeOp        = 7'b01_100_11;
+    localparam OpcodeLui       = 7'b01_101_11;
+    localparam OpcodeOp32      = 7'b01_110_11;
+    // Quadrant 2
+    localparam OpcodeMadd      = 7'b10_000_11;
+    localparam OpcodeMsub      = 7'b10_001_11;
+    localparam OpcodeNmsub     = 7'b10_010_11;
+    localparam OpcodeNmadd     = 7'b10_011_11;
+    localparam OpcodeOpFp      = 7'b10_100_11;
+    localparam OpcodeRsrvd1    = 7'b10_101_11;
+    localparam OpcodeCustom2   = 7'b10_110_11;
+    // Quadrant 3
+    localparam OpcodeBranch    = 7'b11_000_11;
+    localparam OpcodeJalr      = 7'b11_001_11;
+    localparam OpcodeRsrvd2    = 7'b11_010_11;
+    localparam OpcodeJal       = 7'b11_011_11;
+    localparam OpcodeSystem    = 7'b11_100_11;
+    localparam OpcodeRsrvd3    = 7'b11_101_11;
+    localparam OpcodeCustom3   = 7'b11_110_11;
 
-    localparam OpcodeCJ        = 3'b101;
-    localparam OpcodeCBeqz     = 3'b110;
-    localparam OpcodeCBnez     = 3'b111;
+    // RV64C listings:
+    // Quadrant 0
+    localparam OpcodeC0             = 2'b00;
+    localparam OpcodeC0Addi4spn     = 3'b000;
+    localparam OpcodeC0Fld          = 3'b001;
+    localparam OpcodeC0Lw           = 3'b010;
+    localparam OpcodeC0Ld           = 3'b011;
+    localparam OpcodeC0Rsrvd        = 3'b100;
+    localparam OpcodeC0Fsd          = 3'b101;
+    localparam OpcodeC0Sw           = 3'b110;
+    localparam OpcodeC0Sd           = 3'b111;
+    // Quadrant 1
+    localparam OpcodeC1             = 2'b01;
+    localparam OpcodeC1Addi         = 3'b000;
+    localparam OpcodeC1Addiw        = 3'b001;
+    localparam OpcodeC1Li           = 3'b010;
+    localparam OpcodeC1LuiAddi16sp  = 3'b011;
+    localparam OpcodeC1MiscAlu      = 3'b100;
+    localparam OpcodeC1J            = 3'b101;
+    localparam OpcodeC1Beqz         = 3'b110;
+    localparam OpcodeC1Bnez         = 3'b111;
+    // Quadrant 2
+    localparam OpcodeC2             = 2'b10;
+    localparam OpcodeC2Slli         = 3'b000;
+    localparam OpcodeC2Fldsp        = 3'b001;
+    localparam OpcodeC2Lwsp         = 3'b010;
+    localparam OpcodeC2Ldsp         = 3'b011;
     localparam OpcodeC2JalrMvAdd    = 3'b100;
+    localparam OpcodeC2Fsdsp        = 3'b101;
+    localparam OpcodeC2Swsp         = 3'b110;
+    localparam OpcodeC2Sdsp         = 3'b111;
+
     // ----------------------
     // Performance Counters
     // ----------------------
@@ -235,6 +321,11 @@ package riscv;
     // CSRs
     // -----
     typedef enum logic [11:0] {
+        // Floating-Point CSRs
+        CSR_FFLAGS         = 12'h001,
+        CSR_FRM            = 12'h002,
+        CSR_FCSR           = 12'h003,
+        CSR_FTRAN          = 12'h800,
         // Supervisor Mode CSRs
         CSR_SSTATUS        = 12'h100,
         CSR_SIE            = 12'h104,
@@ -321,6 +412,14 @@ package riscv;
         csr_addr_t  csr_decode;
     } csr_t;
 
+    // Floating-Point control and status register (32-bit!)
+    typedef struct packed {
+        logic [31:15] reserved;  // reserved for L extension, return 0 otherwise
+        logic [6:0]   fprec;     // div/sqrt precision control
+        logic [2:0]   frm;       // float rounding mode
+        logic [4:0]   fflags;    // float exception flags
+    } fcsr_t;
+
     // -----
     // Debug
     // -----
@@ -398,13 +497,14 @@ package riscv;
 
     // trace log compatible to spikes commit log feature
     // pragma translate_off
-    function string spikeCommitLog(logic [63:0] pc, priv_lvl_t priv_lvl, logic [31:0] instr, logic [4:0] rd, logic [63:0] result);
+    function string spikeCommitLog(logic [63:0] pc, priv_lvl_t priv_lvl, logic [31:0] instr, logic [4:0] rd, logic [63:0] result, logic rd_fpr);
         string rd_s;
+        automatic string rf_s = rd_fpr ? "f" : "x";
 
-        if (rd < 10) rd_s = $sformatf("x %0d", rd);
-        else rd_s = $sformatf("x%0d", rd);
+        if (rd < 10) rd_s = $sformatf("%s %0d", rf_s, rd);
+        else rd_s = $sformatf("%s%0d", rf_s, rd);
 
-        if (rd != 0) begin
+        if (rd_fpr || rd != 0) begin
             // 0 0x0000000080000118 (0xeecf8f93) x31 0x0000000080004000
             return $sformatf("%d 0x%h (0x%h) %s 0x%h\n", priv_lvl, pc, instr, rd_s, result);
         end else begin
diff --git a/src/alu.sv b/src/alu.sv
index fadd053a2..b9fe8d235 100644
--- a/src/alu.sv
+++ b/src/alu.sv
@@ -19,23 +19,40 @@
 
 import ariane_pkg::*;
 
-module alu
-(
+module alu (
+    input  logic                     clk_i,          // Clock
+    input  logic                     rst_ni,         // Asynchronous reset active low
+    input  logic                     flush_i,
+    input  logic [63:0]              pc_i,
     input  logic [TRANS_ID_BITS-1:0] trans_id_i,
     input  logic                     alu_valid_i,
+    input  logic                     branch_valid_i,
+    input  logic                     csr_valid_i,
     input  fu_op                     operator_i,
     input  logic [63:0]              operand_a_i,
     input  logic [63:0]              operand_b_i,
+    input  logic [63:0]              imm_i,
     output logic [63:0]              result_o,
-    output logic                     alu_branch_res_o,
     output logic                     alu_valid_o,
     output logic                     alu_ready_o,
-    output logic [TRANS_ID_BITS-1:0] alu_trans_id_o
+    output logic [TRANS_ID_BITS-1:0] alu_trans_id_o,
+    output exception_t               alu_exception_o,
+
+    input  logic                     fu_valid_i,
+    input  logic                     is_compressed_instr_i,
+    input  branchpredict_sbe_t       branch_predict_i,
+    output branchpredict_t           resolved_branch_o,
+    output logic                     resolve_branch_o,
+
+    input  logic                     commit_i,
+    // to CSR file
+    output logic  [11:0]             csr_addr_o  // CSR address to commit stage
 );
 
-    // ALU is a single cycle instructions, hence it is always ready
-    assign alu_ready_o    = 1'b1;
-    assign alu_valid_o    = alu_valid_i;
+    logic csr_ready;
+
+    assign alu_ready_o    = csr_ready;
+    assign alu_valid_o    = alu_valid_i | branch_valid_i | csr_valid_i;
     assign alu_trans_id_o = trans_id_i;
 
     logic [63:0] operand_a_rev;
@@ -43,6 +60,8 @@ module alu
     logic [64:0] operand_b_neg;
     logic [65:0] adder_result_ext_o;
     logic        less;  // handles both signed and unsigned forms
+    logic        alu_branch_res;
+    logic [63:0] branch_result, csr_result;
 
     // bit reverse operand_a for left shifts and bit counting
     generate
@@ -89,13 +108,13 @@ module alu
     // get the right branch comparison result
     always_comb begin : branch_resolve
         // set comparison by default
-        alu_branch_res_o      = 1'b1;
+        alu_branch_res      = 1'b1;
         case (operator_i)
-            EQ:       alu_branch_res_o = adder_z_flag;
-            NE:       alu_branch_res_o = ~adder_z_flag;
-            LTS, LTU: alu_branch_res_o = less;
-            GES, GEU: alu_branch_res_o = ~less;
-            default:  alu_branch_res_o = 1'b1;
+            EQ:       alu_branch_res = adder_z_flag;
+            NE:       alu_branch_res = ~adder_z_flag;
+            LTS, LTU: alu_branch_res = less;
+            GES, GEU: alu_branch_res = ~less;
+            default:  alu_branch_res = 1'b1;
         endcase
     end
 
@@ -198,6 +217,48 @@ module alu
 
             default: ; // default case to suppress unique warning
         endcase
+
+        if (branch_valid_i) begin
+            result_o = branch_result;
+        end else if (csr_valid_i) begin
+            result_o = csr_result;
+        end
+
     end
 
+    // ----------------------
+    // Branch Unit
+    // ----------------------
+    branch_unit branch_unit_i (
+        .operator_i,
+        .operand_a_i,
+        .operand_b_i,
+        .imm_i,
+        .pc_i,
+        .is_compressed_instr_i,
+        // any functional unit is valid, check that there is no accidental mis-predict
+        .fu_valid_i,
+        .branch_valid_i,
+        .branch_comp_res_i ( alu_branch_res ),
+        .branch_result_o   ( branch_result ),
+        .branch_predict_i,
+        .resolved_branch_o,
+        .resolve_branch_o,
+        .branch_exception_o ( alu_exception_o )
+    );
+
+    csr_buffer csr_buffer_i (
+        .clk_i,
+        .rst_ni,
+        .flush_i,
+        .csr_valid_i,
+        .operator_i,
+        .operand_a_i,
+        .operand_b_i,
+        .csr_ready_o    ( csr_ready    ),
+        .csr_result_o   ( csr_result   ),
+        .commit_i,
+        .csr_addr_o
+    );
+
 endmodule
diff --git a/src/ariane.sv b/src/ariane.sv
index 0f8829049..bfdf5f4b5 100644
--- a/src/ariane.sv
+++ b/src/ariane.sv
@@ -91,12 +91,8 @@ module ariane #(
     logic [TRANS_ID_BITS-1:0] alu_trans_id_ex_id;
     logic                     alu_valid_ex_id;
     logic [63:0]              alu_result_ex_id;
+    exception_t               alu_exception_ex_id;
     // Branches and Jumps
-    logic                     branch_ready_ex_id;
-    logic [TRANS_ID_BITS-1:0] branch_trans_id_ex_id;
-    logic [63:0]              branch_result_ex_id;
-    exception_t               branch_exception_ex_id;
-    logic                     branch_valid_ex_id;
     logic                     branch_valid_id_ex;
 
     branchpredict_sbe_t       branch_predict_id_ex;
@@ -114,17 +110,23 @@ module ariane #(
     logic [TRANS_ID_BITS-1:0] mult_trans_id_ex_id;
     logic [63:0]              mult_result_ex_id;
     logic                     mult_valid_ex_id;
+    // FPU
+    logic                     fpu_ready_ex_id;
+    logic                     fpu_valid_id_ex;
+    logic [1:0]               fpu_fmt_id_ex;
+    logic [2:0]               fpu_rm_id_ex;
+    logic [TRANS_ID_BITS-1:0] fpu_trans_id_ex_id;
+    logic [63:0]              fpu_result_ex_id;
+    logic                     fpu_valid_ex_id;
+    exception_t               fpu_exception_ex_id;
     // CSR
-    logic                     csr_ready_ex_id;
     logic                     csr_valid_id_ex;
-    logic [TRANS_ID_BITS-1:0] csr_trans_id_ex_id;
-    logic [63:0]              csr_result_ex_id;
-    logic                     csr_valid_ex_id;
     // --------------
     // EX <-> COMMIT
     // --------------
     // CSR Commit
     logic                     csr_commit_commit_ex;
+    logic                     dirty_fp_state;
     // LSU Commit
     logic                     lsu_commit_commit_ex;
     logic                     lsu_commit_ready_ex_commit;
@@ -139,10 +141,15 @@ module ariane #(
     // --------------
     logic [NR_COMMIT_PORTS-1:0][4:0]  waddr_commit_id;
     logic [NR_COMMIT_PORTS-1:0][63:0] wdata_commit_id;
-    logic [NR_COMMIT_PORTS-1:0]       we_commit_id;
+    logic [NR_COMMIT_PORTS-1:0]       we_gpr_commit_id;
+    logic [NR_COMMIT_PORTS-1:0]       we_fpr_commit_id;
     // --------------
     // CSR <-> *
     // --------------
+    logic [4:0]               fflags_csr_commit;
+    riscv::xs_t               fs;
+    logic [2:0]               frm_csr_id_issue_ex;
+    logic [6:0]               fprec_csr_ex;
     logic                     enable_translation_csr_ex;
     logic                     en_ld_st_translation_csr_ex;
     riscv::priv_lvl_t         ld_st_priv_lvl_csr_ex;
@@ -159,6 +166,7 @@ module ariane #(
     logic                     tw_csr_id;
     logic                     tsr_csr_id;
     logic                     dcache_en_csr_nbdcache;
+    logic                     csr_write_fflags_commit_cs;
     logic                     icache_en_csr;
     logic                     debug_mode;
     logic                     single_step_csr_commit;
@@ -252,6 +260,8 @@ module ariane #(
         .issue_instr_ack_i          ( issue_instr_issue_id            ),
 
         .priv_lvl_i                 ( priv_lvl                        ),
+        .fs_i                       ( fs                              ),
+        .frm_i                      ( frm_csr_id_issue_ex             ),
         .debug_mode_i               ( debug_mode                      ),
         .tvm_i                      ( tvm_csr_id                      ),
         .tw_i                       ( tw_csr_id                       ),
@@ -288,7 +298,6 @@ module ariane #(
         .alu_ready_i                ( alu_ready_ex_id                 ),
         .alu_valid_o                ( alu_valid_id_ex                 ),
         // Branches and Jumps
-        .branch_ready_i             ( branch_ready_ex_id              ),
         .branch_valid_o             ( branch_valid_id_ex              ), // branch is valid
         .branch_predict_o           ( branch_predict_id_ex            ), // branch predict to ex
         .resolve_branch_i           ( resolve_branch_ex_id            ), // in order to resolve the branch
@@ -298,20 +307,25 @@ module ariane #(
         // Multiplier
         .mult_ready_i               ( mult_ready_ex_id                ),
         .mult_valid_o               ( mult_valid_id_ex                ),
+        // FPU
+        .fpu_ready_i                ( fpu_ready_ex_id                 ),
+        .fpu_valid_o                ( fpu_valid_id_ex                 ),
+        .fpu_fmt_o                  ( fpu_fmt_id_ex                   ),
+        .fpu_rm_o                   ( fpu_rm_id_ex                    ),
         // CSR
-        .csr_ready_i                ( csr_ready_ex_id                 ),
         .csr_valid_o                ( csr_valid_id_ex                 ),
 
+        // Commit
         .resolved_branch_i          ( resolved_branch                 ),
-        .trans_id_i                 ( {alu_trans_id_ex_id,         lsu_trans_id_ex_id,  branch_trans_id_ex_id,    csr_trans_id_ex_id,         mult_trans_id_ex_id        }),
-        .wbdata_i                   ( {alu_result_ex_id,           lsu_result_ex_id,    branch_result_ex_id,      csr_result_ex_id,           mult_result_ex_id          }),
-        .ex_ex_i                    ( {{$bits(exception_t){1'b0}}, lsu_exception_ex_id, branch_exception_ex_id,   {$bits(exception_t){1'b0}}, {$bits(exception_t){1'b0}} }),
-        .wb_valid_i                 ( {alu_valid_ex_id,            lsu_valid_ex_id,     branch_valid_ex_id,       csr_valid_ex_id,            mult_valid_ex_id           }),
+        .trans_id_i                 ( {alu_trans_id_ex_id,         lsu_trans_id_ex_id,   mult_trans_id_ex_id,        fpu_trans_id_ex_id }),
+        .wbdata_i                   ( {alu_result_ex_id,           lsu_result_ex_id,       mult_result_ex_id,          fpu_result_ex_id }),
+        .ex_ex_i                    ( {alu_exception_ex_id,        lsu_exception_ex_id, {$bits(exception_t){1'b0}}, fpu_exception_ex_id }),
+        .wb_valid_i                 ( {alu_valid_ex_id,            lsu_valid_ex_id,         mult_valid_ex_id,           fpu_valid_ex_id }),
 
         .waddr_i                    ( waddr_commit_id               ),
         .wdata_i                    ( wdata_commit_id               ),
-        .we_i                       ( we_commit_id                  ),
-
+        .we_gpr_i                   ( we_gpr_commit_id              ),
+        .we_fpr_i                   ( we_fpr_commit_id              ),
         .commit_instr_o             ( commit_instr_id_commit        ),
         .commit_ack_i               ( commit_ack                    ),
         .*
@@ -321,6 +335,8 @@ module ariane #(
     // EX
     // ---------
     ex_stage ex_stage_i (
+        .clk_i                  ( clk_i                                  ),
+        .rst_ni                 ( rst_ni                                 ),
         .flush_i                ( flush_ctrl_ex                          ),
         .fu_i                   ( fu_id_ex                               ),
         .operator_i             ( operator_id_ex                         ),
@@ -336,16 +352,16 @@ module ariane #(
         .alu_result_o           ( alu_result_ex_id                       ),
         .alu_trans_id_o         ( alu_trans_id_ex_id                     ),
         .alu_valid_o            ( alu_valid_ex_id                        ),
+        .alu_exception_o        ( alu_exception_ex_id                    ),
         // Branches and Jumps
-        .branch_ready_o         ( branch_ready_ex_id                     ),
-        .branch_valid_o         ( branch_valid_ex_id                     ),
         .branch_valid_i         ( branch_valid_id_ex                     ),
-        .branch_trans_id_o      ( branch_trans_id_ex_id                  ),
-        .branch_result_o        ( branch_result_ex_id                    ),
-        .branch_exception_o     ( branch_exception_ex_id                 ),
         .branch_predict_i       ( branch_predict_id_ex                   ), // branch predict to ex
         .resolved_branch_o      ( resolved_branch                        ),
         .resolve_branch_o       ( resolve_branch_ex_id                   ),
+        // CSR
+        .csr_valid_i            ( csr_valid_id_ex                        ),
+        .csr_addr_o             ( csr_addr_ex_csr                        ),
+        .csr_commit_i           ( csr_commit_commit_ex                   ), // from commit
         // LSU
         .lsu_ready_o            ( lsu_ready_ex_id                        ),
         .lsu_valid_i            ( lsu_valid_id_ex                        ),
@@ -356,17 +372,26 @@ module ariane #(
         .lsu_commit_ready_o     ( lsu_commit_ready_ex_commit             ), // to commit
         .lsu_exception_o        ( lsu_exception_ex_id                    ),
         .no_st_pending_o        ( no_st_pending_ex_commit                ),
+        // MULT
+        .mult_ready_o           ( mult_ready_ex_id                       ),
+        .mult_valid_i           ( mult_valid_id_ex                       ),
+        .mult_trans_id_o        ( mult_trans_id_ex_id                    ),
+        .mult_result_o          ( mult_result_ex_id                      ),
+        .mult_valid_o           ( mult_valid_ex_id                       ),
+        // FPU
+        .fpu_ready_o            ( fpu_ready_ex_id                        ),
+        .fpu_valid_i            ( fpu_valid_id_ex                        ),
+        .fpu_fmt_i              ( fpu_fmt_id_ex                          ),
+        .fpu_rm_i               ( fpu_rm_id_ex                           ),
+        .fpu_frm_i              ( frm_csr_id_issue_ex                    ),
+        .fpu_prec_i             ( fprec_csr_ex                           ),
+        .fpu_trans_id_o         ( fpu_trans_id_ex_id                     ),
+        .fpu_result_o           ( fpu_result_ex_id                       ),
+        .fpu_valid_o            ( fpu_valid_ex_id                        ),
+        .fpu_exception_o        ( fpu_exception_ex_id                    ),
         .amo_valid_commit_i     ( amo_valid_commit                       ),
         .amo_req_o              ( amo_req                                ),
         .amo_resp_i             ( amo_resp                               ),
-        // CSR
-        .csr_ready_o            ( csr_ready_ex_id                        ),
-        .csr_valid_i            ( csr_valid_id_ex                        ),
-        .csr_trans_id_o         ( csr_trans_id_ex_id                     ),
-        .csr_result_o           ( csr_result_ex_id                       ),
-        .csr_valid_o            ( csr_valid_ex_id                        ),
-        .csr_addr_o             ( csr_addr_ex_csr                        ),
-        .csr_commit_i           ( csr_commit_commit_ex                   ), // from commit
         // Performance counters
         .itlb_miss_o            ( itlb_miss_ex_perf                      ),
         .dtlb_miss_o            ( dtlb_miss_ex_perf                      ),
@@ -382,16 +407,9 @@ module ariane #(
         .asid_i                 ( asid_csr_ex                            ), // from CSR
         .icache_areq_i          ( icache_areq_cache_ex                   ),
         .icache_areq_o          ( icache_areq_ex_cache                   ),
-
-        .mult_ready_o           ( mult_ready_ex_id                       ),
-        .mult_valid_i           ( mult_valid_id_ex                       ),
-        .mult_trans_id_o        ( mult_trans_id_ex_id                    ),
-        .mult_result_o          ( mult_result_ex_id                      ),
-        .mult_valid_o           ( mult_valid_ex_id                       ),
         // DCACHE interfaces
         .dcache_req_ports_i     ( dcache_req_ports_cache_ex              ),
-        .dcache_req_ports_o     ( dcache_req_ports_ex_cache              ),
-        .*
+        .dcache_req_ports_o     ( dcache_req_ports_ex_cache              )
     );
 
     // ---------
@@ -403,6 +421,7 @@ module ariane #(
         .halt_i                 ( halt_ctrl                     ),
         .flush_dcache_i         ( dcache_flush_ctrl_cache       ),
         .exception_o            ( ex_commit                     ),
+        .dirty_fp_state_o       ( dirty_fp_state                ),
         .debug_mode_i           ( debug_mode                    ),
         .debug_req_i            ( debug_req                     ),
         .single_step_i          ( single_step_csr_commit        ),
@@ -411,7 +430,8 @@ module ariane #(
         .no_st_pending_i        ( no_st_pending_ex_commit       ),
         .waddr_o                ( waddr_commit_id               ),
         .wdata_o                ( wdata_commit_id               ),
-        .we_o                   ( we_commit_id                  ),
+        .we_gpr_o               ( we_gpr_commit_id              ),
+        .we_fpr_o               ( we_fpr_commit_id              ),
         .commit_lsu_o           ( lsu_commit_commit_ex          ),
         .commit_lsu_ready_i     ( lsu_commit_ready_ex_commit    ),
         .amo_valid_commit_o     ( amo_valid_commit              ),
@@ -421,6 +441,7 @@ module ariane #(
         .csr_op_o               ( csr_op_commit_csr             ),
         .csr_wdata_o            ( csr_wdata_commit_csr          ),
         .csr_rdata_i            ( csr_rdata_csr_commit          ),
+        .csr_write_fflags_o     ( csr_write_fflags_commit_cs    ),
         .csr_exception_i        ( csr_exception_csr_commit      ),
         .fence_i_o              ( fence_i_commit_controller     ),
         .fence_o                ( fence_commit_controller       ),
@@ -441,6 +462,8 @@ module ariane #(
         .commit_ack_i           ( commit_ack                    ),
         .ex_i                   ( ex_commit                     ),
         .csr_op_i               ( csr_op_commit_csr             ),
+        .csr_write_fflags_i     ( csr_write_fflags_commit_cs    ),
+        .dirty_fp_state_i       ( dirty_fp_state                ),
         .csr_addr_i             ( csr_addr_ex_csr               ),
         .csr_wdata_i            ( csr_wdata_commit_csr          ),
         .csr_rdata_o            ( csr_rdata_csr_commit          ),
@@ -451,6 +474,10 @@ module ariane #(
         .set_debug_pc_o         ( set_debug_pc                  ),
         .trap_vector_base_o     ( trap_vector_base_commit_pcgen ),
         .priv_lvl_o             ( priv_lvl                      ),
+        .fs_o                   ( fs                            ),
+        .fflags_o               ( fflags_csr_commit             ),
+        .frm_o                  ( frm_csr_id_issue_ex           ),
+        .fprec_o                ( fprec_csr_ex                  ),
         .ld_st_priv_lvl_o       ( ld_st_priv_lvl_csr_ex         ),
         .en_translation_o       ( enable_translation_csr_ex     ),
         .en_ld_st_translation_o ( en_ld_st_translation_csr_ex   ),
@@ -584,7 +611,8 @@ module ariane #(
     // write-back
     assign tracer_if.waddr             = waddr_commit_id;
     assign tracer_if.wdata             = wdata_commit_id;
-    assign tracer_if.we                = we_commit_id;
+    assign tracer_if.we_gpr            = we_gpr_commit_id;
+    assign tracer_if.we_fpr            = we_fpr_commit_id;
     // commit
     assign tracer_if.commit_instr      = commit_instr_id_commit;
     assign tracer_if.commit_ack        = commit_ack;
diff --git a/src/ariane_regfile.sv b/src/ariane_regfile.sv
index 0203202e8..8e54a8278 100644
--- a/src/ariane_regfile.sv
+++ b/src/ariane_regfile.sv
@@ -23,151 +23,98 @@
 //                 latches and is thus smaller than the flip-flop based RF.
 //
 
-module ariane_regfile #(
-  parameter DATA_WIDTH    = 32
+module ariane_regfile_lol #(
+  parameter int unsigned DATA_WIDTH     = 32,
+  parameter int unsigned NR_READ_PORTS  = 2,
+  parameter int unsigned NR_WRITE_PORTS = 2,
+  parameter bit          ZERO_REG_ZERO  = 0
 )(
-  // Clock and Reset
-  input  logic                   clk,
-  input  logic                   rst_n,
-
-  input  logic                   test_en_i,
-
-  //Read port R1
-  input  logic [4:0]             raddr_a_i,
-  output logic [DATA_WIDTH-1:0]  rdata_a_o,
-
-  //Read port R2
-  input  logic [4:0]             raddr_b_i,
-  output logic [DATA_WIDTH-1:0]  rdata_b_o,
-
-
-  // Write port W1
-  input  logic [4:0]              waddr_a_i,
-  input  logic [DATA_WIDTH-1:0]   wdata_a_i,
-  input  logic                    we_a_i,
-
-  // Write port W2
-  input  logic [4:0]              waddr_b_i,
-  input  logic [DATA_WIDTH-1:0]   wdata_b_i,
-  input  logic                    we_b_i
+  // clock and reset
+  input  logic                                      clk_i,
+  input  logic                                      rst_ni,
+  // disable clock gates for testing
+  input  logic                                      test_en_i,
+  // read port
+  input  logic [NR_READ_PORTS-1:0][4:0]             raddr_i,
+  output logic [NR_READ_PORTS-1:0][DATA_WIDTH-1:0]  rdata_o,
+  // write port
+  input  logic [NR_WRITE_PORTS-1:0][4:0]            waddr_i,
+  input  logic [NR_WRITE_PORTS-1:0][DATA_WIDTH-1:0] wdata_i,
+  input  logic [NR_WRITE_PORTS-1:0]                 we_i
 );
 
-    localparam    ADDR_WIDTH = 5;;
-    localparam    NUM_WORDS  = 2**ADDR_WIDTH;
+    localparam ADDR_WIDTH = 5;
+    localparam NUM_WORDS  = 2**ADDR_WIDTH;
 
-    logic [DATA_WIDTH-1:0]      mem[NUM_WORDS];
+    logic [NUM_WORDS-1:ZERO_REG_ZERO]          mem_clocks;
 
-    logic [NUM_WORDS-1:1]       waddr_onehot_a;
-    logic [NUM_WORDS-1:1]       waddr_onehot_b, waddr_onehot_b_q;
+    logic [DATA_WIDTH-1:0]                     mem[NUM_WORDS];
+    logic [NR_WRITE_PORTS-1:0][NUM_WORDS-1:ZERO_REG_ZERO] waddr_onehot, waddr_onehot_q; // same word range as mem_clocks
+    logic [NR_WRITE_PORTS-1:0][DATA_WIDTH-1:0] wdata_q;
 
-    logic [NUM_WORDS-1:1]       mem_clocks;
-    logic [DATA_WIDTH-1:0]      wdata_a_q;
-    logic [DATA_WIDTH-1:0]      wdata_b_q;
 
-    // Write port W1
-    logic [ADDR_WIDTH-1:0]     raddr_a_int, raddr_b_int, waddr_a_int;
+    // READ : every read port indexes the memory combinationally with its own address
+    for (genvar i = 0; i < NR_READ_PORTS; i++)
+        assign rdata_o[i] = mem[raddr_i[i][ADDR_WIDTH-1:0]];
 
-    assign raddr_a_int = raddr_a_i[ADDR_WIDTH-1:0];
-    assign raddr_b_int = raddr_b_i[ADDR_WIDTH-1:0];
-    assign waddr_a_int = waddr_a_i[ADDR_WIDTH-1:0];
-
-    int unsigned i;
-    int unsigned j;
-    int unsigned k;
-    int unsigned l;
-    genvar x;
-
-    logic clk_int;
-
-    //-----------------------------------------------------------------------------
-    //-- READ : Read address decoder RAD
-    //-----------------------------------------------------------------------------
-    assign rdata_a_o = mem[raddr_a_int];
-    assign rdata_b_o = mem[raddr_b_int];
-
-    //-----------------------------------------------------------------------------
-    // WRITE : SAMPLE INPUT DATA
-    //---------------------------------------------------------------------------
-
-    cluster_clock_gating CG_WE_GLOBAL
-    (
-      .clk_i     ( clk             ),
-      .en_i      ( we_a_i          ),
-      .test_en_i ( test_en_i       ),
-      .clk_o     ( clk_int         )
-    );
-
-    // use clk_int here, since otherwise we don't want to write anything anyway
-    always_ff @(posedge clk_int, negedge rst_n) begin : sample_waddr
-        if (~rst_n) begin
-            wdata_a_q        <= '0;
-            wdata_b_q        <= '0;
-            waddr_onehot_b_q <= '0;
+    always_ff @(posedge clk_i, negedge rst_ni) begin : sample_waddr
+        if (~rst_ni) begin
+            wdata_q <= '0;
         end else begin
-            if (we_a_i)
-                wdata_a_q <= wdata_a_i;
-            if (we_b_i)
-                wdata_b_q <= wdata_b_i;
-
-            waddr_onehot_b_q <= waddr_onehot_b;
+            for (int unsigned i = 0; i < NR_WRITE_PORTS; i++)
+                // per-port write enable gates the data register (enable flip-flop, will most probably infer clock gating)
+                if (we_i[i]) begin
+                    wdata_q[i]     <= wdata_i[i];
+                end
+            waddr_onehot_q <= waddr_onehot; // registered decode, consumed by the latch process below
         end
     end
 
-    //-----------------------------------------------------------------------------
-    //-- WRITE : Write Address Decoder (WAD), combinatorial process
-    //-----------------------------------------------------------------------------
-    always_comb begin : p_WADa
-        for (i = 1; i < NUM_WORDS; i++) begin : p_WordItera
-            if ((we_a_i == 1'b1) && (waddr_a_i == i))
-                waddr_onehot_a[i] = 1'b1;
-            else
-                waddr_onehot_a[i] = 1'b0;
+    // WRITE : Write Address Decoder (WAD), combinatorial; word 0 is only skipped when ZERO_REG_ZERO is set
+    always_comb begin : decode_write_addess
+        for (int unsigned i = 0; i < NR_WRITE_PORTS; i++) begin
+            for (int unsigned j = ZERO_REG_ZERO; j < NUM_WORDS; j++) begin
+                if (we_i[i] && (waddr_i[i] == j))
+                    waddr_onehot[i][j] = 1'b1;
+                else
+                    waddr_onehot[i][j] = 1'b0;
+            end
         end
     end
 
-    always_comb begin : p_WADb
-         for (j = 1; j < NUM_WORDS; j++) begin : p_WordIterb
-            if ((we_b_i == 1'b1) && (waddr_b_i == j))
-                waddr_onehot_b[j] = 1'b1;
-            else
-                waddr_onehot_b[j] = 1'b0;
-        end
+    // WRITE : Clock gating (if integrated clock-gating cells are available)
+    for (genvar x = ZERO_REG_ZERO; x < NUM_WORDS; x++) begin
+
+        logic [NR_WRITE_PORTS-1:0] waddr_ored;
+
+        for (genvar i = 0; i < NR_WRITE_PORTS; i++)
+          assign waddr_ored[i] = waddr_onehot[i][x];
+
+        cluster_clock_gating i_cg (
+            .clk_i     ( clk_i         ),
+            .en_i      ( |waddr_ored   ),
+            .test_en_i ( test_en_i     ),
+            .clk_o     ( mem_clocks[x] )
+        );
     end
 
-    //-----------------------------------------------------------------------------
-    //-- WRITE : Clock gating (if integrated clock-gating cells are available)
-    //-----------------------------------------------------------------------------
-    generate
-       for (x = 1; x < NUM_WORDS; x++)
-         begin : CG_CELL_WORD_ITER
-            cluster_clock_gating CG_Inst
-              (
-               .clk_i     ( clk_int                               ),
-               .en_i      ( waddr_onehot_a[x] | waddr_onehot_b[x] ),
-               .test_en_i ( test_en_i                             ),
-               .clk_o     ( mem_clocks[x]                         )
-               );
-         end
-    endgenerate
-
-    //-----------------------------------------------------------------------------
-    //-- WRITE : Write operation
-    //-----------------------------------------------------------------------------
-    //-- Generate M = WORDS sequential processes, each of which describes one
-    //-- word of the memory. The processes are synchronized with the clocks
-    //-- ClocksxC(i), i = 0, 1, ..., M-1
-    //-- Use active low, i.e. transparent on low latches as storage elements
-    //-- Data is sampled on rising clock edge
+    // Generate M = WORDS sequential processes, each of which describes one
+    // word of the memory. The processes are synchronized with the clocks
+    // ClocksxC(i), i = 0, 1, ..., M-1
+    // Use active low, i.e. transparent on low latches as storage elements
+    // Data is sampled on rising clock edge
 
     // Integer registers
     always_latch begin : latch_wdata
         // Note: The assignment has to be done inside this process or Modelsim complains about it
-        mem[0] = '0;
+        if (ZERO_REG_ZERO)
+            mem[0] = '0;
 
-        for(k = 1; k < NUM_WORDS; k++)
-          begin : w_WordIter
-             if (mem_clocks[k] == 1'b1)
-               mem[k] = waddr_onehot_b_q[k] ? wdata_b_q : wdata_a_q;
-          end
+        for (int unsigned i = 0; i < NR_WRITE_PORTS; i++) begin
+            for (int unsigned k = ZERO_REG_ZERO; k < NUM_WORDS; k++) begin
+                if (mem_clocks[k] && waddr_onehot_q[i][k])
+                    mem[k] = wdata_q[i];
+            end
+        end
     end
 endmodule
diff --git a/src/ariane_regfile_ff.sv b/src/ariane_regfile_ff.sv
index 6514ecb12..a5b9c6954 100644
--- a/src/ariane_regfile_ff.sv
+++ b/src/ariane_regfile_ff.sv
@@ -23,87 +23,63 @@
 //
 
 module ariane_regfile #(
-  parameter DATA_WIDTH    = 32
+  parameter int unsigned DATA_WIDTH     = 32,
+  parameter int unsigned NR_READ_PORTS  = 2,
+  parameter int unsigned NR_WRITE_PORTS = 2,
+  parameter bit          ZERO_REG_ZERO  = 0
 )(
-  // Clock and Reset
-  input  logic                   clk,
-  input  logic                   rst_n,
-
-  input  logic                   test_en_i,
-
-  //Read port R1
-  input  logic [4:0]             raddr_a_i,
-  output logic [DATA_WIDTH-1:0]  rdata_a_o,
-
-  //Read port R2
-  input  logic [4:0]             raddr_b_i,
-  output logic [DATA_WIDTH-1:0]  rdata_b_o,
-
-
-  // Write port W1
-  input  logic [4:0]              waddr_a_i,
-  input  logic [DATA_WIDTH-1:0]   wdata_a_i,
-  input  logic                    we_a_i,
-
-  // Write port W2
-  input  logic [4:0]              waddr_b_i,
-  input  logic [DATA_WIDTH-1:0]   wdata_b_i,
-  input  logic                    we_b_i
+  // clock and reset
+  input  logic                                      clk_i,
+  input  logic                                      rst_ni,
+  // disable clock gates for testing
+  input  logic                                      test_en_i,
+  // read port
+  input  logic [NR_READ_PORTS-1:0][4:0]             raddr_i,
+  output logic [NR_READ_PORTS-1:0][DATA_WIDTH-1:0]  rdata_o,
+  // write port
+  input  logic [NR_WRITE_PORTS-1:0][4:0]            waddr_i,
+  input  logic [NR_WRITE_PORTS-1:0][DATA_WIDTH-1:0] wdata_i,
+  input  logic [NR_WRITE_PORTS-1:0]                 we_i
 );
 
   localparam    ADDR_WIDTH = 5;
   localparam    NUM_WORDS  = 2**ADDR_WIDTH;
 
-  logic [NUM_WORDS-1:0][DATA_WIDTH-1:0] rf_reg;
-  logic [NUM_WORDS-1:0]                 we_a_dec, we_b_dec;
+  logic [NUM_WORDS-1:0][DATA_WIDTH-1:0]     mem;
+  logic [NR_WRITE_PORTS-1:0][NUM_WORDS-1:0] we_dec;
 
-  always_comb begin : we_a_decoder
-    for (int i = 0; i < NUM_WORDS; i++) begin
-      if (waddr_a_i == i)
-        we_a_dec[i] = we_a_i;
-      else
-        we_a_dec[i] = 1'b0;
-    end
-  end
 
-  always_comb begin : we_b_decoder
-    for (int i = 0; i < NUM_WORDS; i++) begin
-      if (waddr_b_i == i)
-        we_b_dec[i] = we_b_i;
-      else
-        we_b_dec[i] = 1'b0;
-    end
-  end
-
-  generate
-    // loop from 1 to NUM_WORDS-1 as R0 is nil
-    for (genvar i = 1; i < NUM_WORDS; i++) begin : rf_gen
-
-      always_ff @(posedge clk, negedge rst_n) begin : register_write_behavioral
-        if (rst_n==1'b0) begin
-          rf_reg[i] <= 'b0;
-        end else begin
-          if (we_a_dec[i])
-            rf_reg[i] <= wdata_a_i;
-
-          if (we_b_dec[i])
-            rf_reg[i] <= wdata_b_i;
+    always_comb begin : we_decoder
+        for (int unsigned j = 0; j < NR_WRITE_PORTS; j++) begin
+            for (int unsigned i = 0; i < NUM_WORDS; i++) begin
+                if (waddr_i[j] == i)
+                    we_dec[j][i] = we_i[j];
+                else
+                    we_dec[j][i] = 1'b0;
+            end
         end
-      end
     end
 
-// R0 is nil
-`ifdef verilator
-    always_ff @(posedge clk, negedge rst_n) begin
-      rf_reg[0] <= '0;
+    // write-enable loop covers all NUM_WORDS entries; R0 is forced to zero only when ZERO_REG_ZERO is set
+    always_ff @(posedge clk_i, negedge rst_ni) begin : register_write_behavioral
+        if (~rst_ni) begin
+            mem <= '{default: '0};
+        end else begin
+            for (int unsigned j = 0; j < NR_WRITE_PORTS; j++) begin
+                for (int unsigned i = 0; i < NUM_WORDS; i++) begin
+                    if (we_dec[j][i]) begin
+                        mem[i] <= wdata_i[j];
+                    end
+                end
+                if (ZERO_REG_ZERO) begin
+                  mem[0] <= '0;
+                end
+            end
+        end
     end
-`else
-    assign rf_reg[0] = '0;
-`endif
 
-  endgenerate
-
-  assign rdata_a_o = rf_reg[raddr_a_i];
-  assign rdata_b_o = rf_reg[raddr_b_i];
+  for (genvar i = 0; i < NR_READ_PORTS; i++) begin
+    assign rdata_o[i] = mem[raddr_i[i]];
+  end
 
 endmodule
diff --git a/src/branch_unit.sv b/src/branch_unit.sv
index 7600ce35d..ff646d36a 100644
--- a/src/branch_unit.sv
+++ b/src/branch_unit.sv
@@ -15,7 +15,6 @@
 import ariane_pkg::*;
 
 module branch_unit (
-    input  logic [TRANS_ID_BITS-1:0]  trans_id_i,
     input  fu_op                      operator_i,             // comparison operation to perform
     input  logic [63:0]               operand_a_i,            // contains content of RS 1
     input  logic [63:0]               operand_b_i,            // contains content of RS 2
@@ -25,10 +24,7 @@ module branch_unit (
     input  logic                      fu_valid_i,             // any functional unit is valid, check that there is no accidental mis-predict
     input  logic                      branch_valid_i,
     input  logic                      branch_comp_res_i,      // branch comparison result from ALU
-    output logic                      branch_ready_o,
-    output logic                      branch_valid_o,
     output logic [63:0]               branch_result_o,
-    output logic [TRANS_ID_BITS-1:0]  branch_trans_id_o,
 
     input  branchpredict_sbe_t        branch_predict_i,       // this is the address we predicted
     output branchpredict_t            resolved_branch_o,      // this is the actual address we are targeting
@@ -38,10 +34,6 @@ module branch_unit (
 );
     logic [63:0] target_address;
     logic [63:0] next_pc;
-    // branches are single cycle at the moment, feed-through the control signals
-    assign branch_trans_id_o = trans_id_i;
-    assign branch_valid_o    = branch_valid_i;
-    assign branch_ready_o    = 1'b1; // we are always ready
 
     // here we handle the various possibilities of mis-predicts
     always_comb begin : mispredict_handler
diff --git a/src/cache_subsystem/miss_handler.sv b/src/cache_subsystem/miss_handler.sv
index b119f68a6..31eed3a14 100644
--- a/src/cache_subsystem/miss_handler.sv
+++ b/src/cache_subsystem/miss_handler.sv
@@ -180,7 +180,7 @@ module miss_handler #(
 
             IDLE: begin
                 // lowest priority are AMOs, wait until everything else is served before going for the AMOs
-                if (amo_req_i.req) begin
+                if (amo_req_i.req && !busy_i) begin
                     // 1. Flush the cache
                     if (!serve_amo_q) begin
                         state_d = FLUSH_REQ_STATUS;
@@ -203,6 +203,8 @@ module miss_handler #(
                     // here comes the refill portion of code
                     if (miss_req_valid[i] && !miss_req_bypass[i]) begin
                         state_d      = MISS;
+                        // we are taking another request so don't take the AMO
+                        serve_amo_d  = 1'b0;
                         // save to MSHR
                         mshr_d.valid = 1'b1;
                         mshr_d.we    = miss_req_we[i];
diff --git a/src/cache_subsystem/std_icache.sv b/src/cache_subsystem/std_icache.sv
index a6dcc2def..5fbb295b5 100644
--- a/src/cache_subsystem/std_icache.sv
+++ b/src/cache_subsystem/std_icache.sv
@@ -50,8 +50,8 @@ module std_icache  #(
     logic                                   flushing_d, flushing_q;
 
     // signals
-    logic [ICACHE_SET_ASSOC-1:0]          req;           // request to data memory 
-    logic [ICACHE_SET_ASSOC-1:0]          vld_req;       // request to valid/tag memory 
+    logic [ICACHE_SET_ASSOC-1:0]          req;           // request to data memory
+    logic [ICACHE_SET_ASSOC-1:0]          vld_req;       // request to valid/tag memory
     logic [(ICACHE_LINE_WIDTH+7)/8-1:0]   data_be;       // byte enable for data memory
     logic [(2**NR_AXI_REFILLS-1):0][7:0]  be;            // byte enable
     logic [$clog2(ICACHE_NUM_WORD)-1:0]   addr;          // this is a cache-line address, to memory array
@@ -109,24 +109,24 @@ module std_icache  #(
             .rdata_o   ( data_rdata[i]      )
         );
     end
-    
+
     // --------------------
     // Tag Comparison and way select
     // --------------------
 
     // cacheline selected by hit
-    logic [ICACHE_SET_ASSOC-1:0][FETCH_WIDTH-1:0] cl_sel;                
-    
+    logic [ICACHE_SET_ASSOC-1:0][FETCH_WIDTH-1:0] cl_sel;
+
     assign idx = vaddr_q[ICACHE_BYTE_OFFSET-1:2];
 
-    generate 
+    generate
         for (genvar i=0;i<ICACHE_SET_ASSOC;i++) begin : g_tag_cmpsel
             assign hit[i] = (tag_rdata[i].tag == tag) ? tag_rdata[i].valid : 1'b0;
             assign cl_sel[i] = (hit[i]) ? data_rdata[i][{idx,5'b0} +: FETCH_WIDTH] : '0;
             assign way_valid[i] = tag_rdata[i].valid;
         end
     endgenerate
-    
+
     // OR reduction of selected cachelines
     always_comb begin : p_reduction
         dreq_o.data = cl_sel[0];
@@ -177,7 +177,7 @@ module std_icache  #(
     assign dreq_o.ex = areq_i.fetch_exception;
 
     assign addr = (state_q==FLUSH) ? cnt_q : vaddr_d[ICACHE_INDEX_WIDTH-1:ICACHE_BYTE_OFFSET];
-        
+
 
     // ------------------
     // Cache Ctrl
@@ -219,7 +219,7 @@ module std_icache  #(
             IDLE: begin
                 dreq_o.ready = 1'b1;
                 vaddr_d      = dreq_i.vaddr;
-                    
+
                 // we are getting a new request
                 if (dreq_i.req) begin
                     // request the content of all arrays
@@ -239,7 +239,7 @@ module std_icache  #(
             // ~> compare the tag
             TAG_CMP, TAG_CMP_SAVED: begin
                 areq_o.fetch_req = 1'b1; // request address translation
-                
+
                 // (speculatively) request the content of all arrays
                 req     = '1;
                 vld_req = '1;
@@ -255,7 +255,7 @@ module std_icache  #(
                     dreq_o.ready = 1'b1;
                     dreq_o.valid = 1'b1;
                     vaddr_d      = dreq_i.vaddr;
-                        
+
                     // we've got another request
                     if (dreq_i.req) begin
                         // save the index and stay in compare mode
@@ -335,7 +335,7 @@ module std_icache  #(
 
                 req     = evict_way_q;
                 vld_req = evict_way_q;
-                
+
                 if (axi.r_valid) begin
                     we = 1'b1;
                     tag_wdata.tag = tag_q;
@@ -380,7 +380,14 @@ module std_icache  #(
         endcase
 
         // those are the states where we need to wait a little longer until we can safely exit
-        if (dreq_i.kill_s2 && !(state_q inside {REFILL, WAIT_AXI_R_RESP, WAIT_KILLED_REFILL, WAIT_KILLED_AXI_R_RESP}) && !dreq_o.ready) begin
+        if (dreq_i.kill_s2 && !(state_q inside {
+                                                    REFILL,
+                                                    WAIT_AXI_R_RESP,
+                                                    WAIT_KILLED_AXI_R_RESP,
+                                                    WAIT_KILLED_REFILL,
+                                                    WAIT_ADDRESS_TRANSLATION,
+                                                    WAIT_ADDRESS_TRANSLATION_KILLED})
+                           && !dreq_o.ready) begin
             state_d = IDLE;
         end
 
@@ -443,14 +450,14 @@ module std_icache  #(
 //pragma translate_off
 `ifndef VERILATOR
 initial begin
-    assert ($bits(axi.aw_addr) == 64) 
+    assert ($bits(axi.aw_addr) == 64)
         else $fatal(1, "[icache] Ariane needs a 64-bit bus");
 end
 
 // assert that cache only hits on one way
 onehot: assert property (
-    @(posedge clk_i) disable iff (~rst_ni) $onehot0(hit)) 
+    @(posedge clk_i) disable iff (~rst_ni) $onehot0(hit))
         else $fatal(1, "[icache] Hit should be one-hot encoded");
 `endif
-//pragma translate_on   
+//pragma translate_on
 endmodule
diff --git a/src/clint/clint.sv b/src/clint/clint.sv
index a53766497..6b58686ed 100644
--- a/src/clint/clint.sv
+++ b/src/clint/clint.sv
@@ -24,7 +24,7 @@ module clint #(
 )(
     input  logic                clk_i,       // Clock
     input  logic                rst_ni,      // Asynchronous reset active low
-
+    input  logic                testmode_i,  // Test mode; inverted to drive the enable of the RTC synchronizer
     AXI_BUS.Slave               slave,
 
     input  logic                rtc_i,       // Real-time clock in (usually 32.768 kHz)
@@ -146,7 +146,7 @@ module clint #(
     // 1. Put the RTC input through a classic two stage edge-triggered synchronizer to filter out any
     //    metastability effects (or at least make them unlikely :-))
     sync_wedge i_sync_edge (
-        .en_i      ( 1'b1           ),
+        .en_i      ( ~testmode_i    ), // enable is de-asserted while testmode_i is high
         .serial_i  ( rtc_i          ),
         .r_edge_o  ( increase_timer ),
         .f_edge_o  (                ), // left open
diff --git a/src/commit_stage.sv b/src/commit_stage.sv
index e91be4e68..957d13a5a 100644
--- a/src/commit_stage.sv
+++ b/src/commit_stage.sv
@@ -22,6 +22,7 @@ module commit_stage #(
     input  logic                                    halt_i,             // request to halt the core
     input  logic                                    flush_dcache_i,     // request to flush dcache -> also flush the pipeline
     output exception_t                              exception_o,        // take exception to controller
+    output logic                                    dirty_fp_state_o,   // mark the F state as dirty
     input  logic                                    debug_mode_i,       // we are in debug mode
     input  logic                                    debug_req_i,        // debug unit is requesting to enter debug mode
     input  logic                                    single_step_i,      // we are in single step debug mode
@@ -31,7 +32,8 @@ module commit_stage #(
     // to register file
     output  logic [NR_COMMIT_PORTS-1:0][4:0]        waddr_o,            // register file write address
     output  logic [NR_COMMIT_PORTS-1:0][63:0]       wdata_o,            // register file write data
-    output  logic [NR_COMMIT_PORTS-1:0]             we_o,               // register file write enable
+    output  logic [NR_COMMIT_PORTS-1:0]             we_gpr_o,           // register file write enable
+    output  logic [NR_COMMIT_PORTS-1:0]             we_fpr_o,           // floating point register enable
     // Atomic memory operations
     input  amo_resp_t                               amo_resp_i,         // result of AMO operation
     // to CSR file and PC Gen (because on certain CSR instructions we'll need to flush the whole pipeline)
@@ -41,6 +43,7 @@ module commit_stage #(
     output logic [63:0]                             csr_wdata_o,        // data to write to CSR
     input  logic [63:0]                             csr_rdata_i,        // data to read from CSR
     input  exception_t                              csr_exception_i,    // exception or interrupt occurred in CSR stage (the same as commit)
+    output logic                                    csr_write_fflags_o, // write the fflags CSR
     // commit signals to ex
     output logic                                    commit_lsu_o,       // commit the pending store
     input  logic                                    commit_lsu_ready_i, // commit buffer of LSU is ready
@@ -53,10 +56,12 @@ module commit_stage #(
     output logic                                    sfence_vma_o        // flush TLBs and pipeline
 );
 
+    // TODO make these parametric with NR_COMMIT_PORTS
     assign waddr_o[0] = commit_instr_i[0].rd[4:0];
     assign waddr_o[1] = commit_instr_i[1].rd[4:0];
 
-    assign pc_o = commit_instr_i[0].pc;
+    assign pc_o       = commit_instr_i[0].pc;
+    assign dirty_fp_state_o = |we_fpr_o;
 
     logic instr_0_is_amo;
     assign instr_0_is_amo = is_amo(commit_instr_i[0].op);
@@ -65,25 +70,27 @@ module commit_stage #(
     // -------------------
     // write register file or commit instruction in LSU or CSR Buffer
     always_comb begin : commit
+
         // default assignments
-        commit_ack_o[0] = 1'b0;
-        commit_ack_o[1] = 1'b0;
+        commit_ack_o[0]    = 1'b0;
+        commit_ack_o[1]    = 1'b0;
 
         amo_valid_commit_o = 1'b0;
 
-        we_o[0]         = 1'b0;
-        we_o[1]         = 1'b0;
-
-        commit_lsu_o    = 1'b0;
-        commit_csr_o    = 1'b0;
+        we_gpr_o[0]        = 1'b0;
+        we_gpr_o[1]        = 1'b0;
+        we_fpr_o           = '{default: 1'b0};
+        commit_lsu_o       = 1'b0;
+        commit_csr_o       = 1'b0;
         // amos will commit on port 0
         wdata_o[0]      = (amo_resp_i.ack) ? amo_resp_i.result : commit_instr_i[0].result;
         wdata_o[1]      = commit_instr_i[1].result;
         csr_op_o        = ADD; // this corresponds to a CSR NOP
-        csr_wdata_o     = 64'b0;
-        fence_i_o       = 1'b0;
-        fence_o         = 1'b0;
-        sfence_vma_o    = 1'b0;
+        csr_wdata_o        = 64'b0;
+        fence_i_o          = 1'b0;
+        fence_o            = 1'b0;
+        sfence_vma_o       = 1'b0;
+        csr_write_fflags_o = 1'b0;
         flush_commit_o  = 1'b0;
 
         // we will not commit the instruction if we took an exception
@@ -92,6 +99,8 @@ module commit_stage #(
         // also check that there is no atomic memory operation committing, right now this is the only operation
         // which will take longer than one cycle to commit
         if (commit_instr_i[0].valid && !halt_i) begin
+            // we have to exclude the AMOs from debug mode as we are not jumping to debug
+            // while committing an AMO
             if (!debug_req_i || debug_mode_i) begin
                 commit_ack_o[0] = 1'b1;
                 // register will be the all zero register.
@@ -101,7 +110,10 @@ module commit_stage #(
                 if (!exception_o.valid) begin
                     // we can definitely write the register file
                     // if the instruction is not committing anything the destination
-                    we_o[0] = 1'b1;
+                    if (is_rd_fpr(commit_instr_i[0].op))
+                        we_fpr_o[0] = 1'b1;
+                    else
+                        we_gpr_o[0] = 1'b1;
 
                     // check whether the instruction we retire was a store
                     // do not commit the instruction if we got an exception since the store buffer will be cleared
@@ -113,6 +125,14 @@ module commit_stage #(
                         else // if the LSU buffer is not ready - do not commit, wait
                             commit_ack_o[0] = 1'b0;
                     end
+                    // ---------
+                    // FPU Flags
+                    // ---------
+                    if (commit_instr_i[0].fu inside {FPU, FPU_VEC}) begin
+                        // write the CSR with potential exception flags from retiring floating point instruction
+                        csr_wdata_o = {59'b0, commit_instr_i[0].ex.cause[4:0]};
+                        csr_write_fflags_o = 1'b1;
+                    end
                 end
 
                 // ---------
@@ -158,13 +178,13 @@ module commit_stage #(
             // ------------------
             // AMO
             // ------------------
-            if (instr_0_is_amo && !commit_instr_i[0].ex.valid) begin
+            if (RVA && instr_0_is_amo && !commit_instr_i[0].ex.valid) begin
                 // AMO finished
                 commit_ack_o[0] = amo_resp_i.ack;
                 // flush the pipeline
                 flush_commit_o = amo_resp_i.ack;
                 amo_valid_commit_o = 1'b1;
-                we_o[0] = amo_resp_i.ack;
+                we_gpr_o[0] = amo_resp_i.ack;
             end
         end
 
@@ -180,11 +200,27 @@ module commit_stage #(
                             && !instr_0_is_amo
                             && !single_step_i) begin
             // only if the first instruction didn't throw an exception and this instruction won't throw an exception
-            // and the operator is of type ALU, LOAD, CTRL_FLOW, MULT
+            // and the functional unit is of type ALU, LOAD, CTRL_FLOW, MULT, FPU or FPU_VEC
             if (!exception_o.valid && !commit_instr_i[1].ex.valid
-                                   && (commit_instr_i[1].fu inside {ALU, LOAD, CTRL_FLOW, MULT})) begin
-                we_o[1] = 1'b1;
+                                   && (commit_instr_i[1].fu inside {ALU, LOAD, CTRL_FLOW, MULT, FPU, FPU_VEC})) begin
+
+                if (is_rd_fpr(commit_instr_i[1].op))
+                    we_fpr_o[1] = 1'b1;
+                else
+                    we_gpr_o[1] = 1'b1;
+
                 commit_ack_o[1] = 1'b1;
+
+                // additionally check if we are retiring an FPU instruction because we need to make sure that we write all
+                // exception flags
+                if (commit_instr_i[1].fu inside {FPU, FPU_VEC}) begin
+                    if (csr_write_fflags_o)
+                        csr_wdata_o = {59'b0, (commit_instr_i[0].ex.cause[4:0] | commit_instr_i[1].ex.cause[4:0])};
+                    else
+                        csr_wdata_o = {59'b0, commit_instr_i[1].ex.cause[4:0]};
+
+                    csr_write_fflags_o = 1'b1;
+                end
             end
         end
     end
diff --git a/src/common_cells b/src/common_cells
index 9278bc769..21a060d2c 160000
--- a/src/common_cells
+++ b/src/common_cells
@@ -1 +1 @@
-Subproject commit 9278bc769f3efd006864a7ef7721f2796ed968e6
+Subproject commit 21a060d2c2c75173312b82cc72db96a2c62e66c5
diff --git a/src/compressed_decoder.sv b/src/compressed_decoder.sv
index f45a21ed0..7298db691 100644
--- a/src/compressed_decoder.sv
+++ b/src/compressed_decoder.sv
@@ -21,10 +21,10 @@ import ariane_pkg::*;
 
 module compressed_decoder
 (
-  input  logic [31:0] instr_i,
-  output logic [31:0] instr_o,
-  output logic        illegal_instr_o,
-  output logic        is_compressed_o
+    input  logic [31:0] instr_i,
+    output logic [31:0] instr_o,
+    output logic        illegal_instr_o,
+    output logic        is_compressed_o
 );
 
     // -------------------
@@ -36,33 +36,46 @@ module compressed_decoder
         is_compressed_o = 1'b1;
         instr_o         = instr_i;
 
+        // I: |    imm[11:0]    | rs1 | funct3 |    rd    | opcode |
+        // S: | imm[11:5] | rs2 | rs1 | funct3 | imm[4:0] | opcode |
         unique case (instr_i[1:0])
             // C0
-            2'b00: begin
+            riscv::OpcodeC0: begin
                 unique case (instr_i[15:13])
-                    3'b000: begin
+                    riscv::OpcodeC0Addi4spn: begin
                         // c.addi4spn -> addi rd', x2, imm
-                        instr_o = {2'b0, instr_i[10:7], instr_i[12:11], instr_i[5], instr_i[6], 2'b00, 5'h02, 3'b000, 2'b01, instr_i[4:2], riscv::OpcodeOpimm};
+                        instr_o = {2'b0, instr_i[10:7], instr_i[12:11], instr_i[5], instr_i[6], 2'b00, 5'h02, 3'b000, 2'b01, instr_i[4:2], riscv::OpcodeOpImm};
                         if (instr_i[12:5] == 8'b0)  illegal_instr_o = 1'b1;
                     end
 
-                    3'b010: begin
+                    riscv::OpcodeC0Fld: begin
+                        // c.fld -> fld rd', imm(rs1')
+                        // CLD: | funct3 | imm[5:3] | rs1' | imm[7:6] | rd' | C0 |
+                        instr_o = {4'b0, instr_i[6:5], instr_i[12:10], 3'b000, 2'b01, instr_i[9:7], 3'b011, 2'b01, instr_i[4:2], riscv::OpcodeLoadFp};
+                    end
+
+                    riscv::OpcodeC0Lw: begin
                         // c.lw -> lw rd', imm(rs1')
                         instr_o = {5'b0, instr_i[5], instr_i[12:10], instr_i[6], 2'b00, 2'b01, instr_i[9:7], 3'b010, 2'b01, instr_i[4:2], riscv::OpcodeLoad};
                     end
 
-                    3'b011: begin
+                    riscv::OpcodeC0Ld: begin
                         // c.ld -> ld rd', imm(rs1')
-                        // | imm[11:0] | rs1 | funct3 | rd | opcode|
+                        // CLD: | funct3 | imm[5:3] | rs1' | imm[7:6] | rd' | C0 |
                         instr_o = {4'b0, instr_i[6:5], instr_i[12:10], 3'b000, 2'b01, instr_i[9:7], 3'b011, 2'b01, instr_i[4:2], riscv::OpcodeLoad};
                     end
 
-                    3'b110: begin
+                    riscv::OpcodeC0Fsd: begin
+                        // c.fsd -> fsd rs2', imm(rs1')
+                        instr_o = {4'b0, instr_i[6:5], instr_i[12], 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b011, instr_i[11:10], 3'b000, riscv::OpcodeStoreFp};
+                    end
+
+                    riscv::OpcodeC0Sw: begin
                         // c.sw -> sw rs2', imm(rs1')
                         instr_o = {5'b0, instr_i[5], instr_i[12], 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b010, instr_i[11:10], instr_i[6], 2'b00, riscv::OpcodeStore};
                     end
 
-                    3'b111: begin
+                    riscv::OpcodeC0Sd: begin
                         // c.sd -> sd rs2', imm(rs1')
                         instr_o = {4'b0, instr_i[6:5], instr_i[12], 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b011, instr_i[11:10], 3'b000, riscv::OpcodeStore};
                     end
@@ -74,137 +87,143 @@ module compressed_decoder
             end
 
             // C1
-            2'b01: begin
-              unique case (instr_i[15:13])
-                  3'b000: begin
-                      // c.addi -> addi rd, rd, nzimm
-                      // c.nop -> addi 0, 0, 0
-                      instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], instr_i[11:7], 3'b0, instr_i[11:7], riscv::OpcodeOpimm};
-                  end
+            riscv::OpcodeC1: begin
+                unique case (instr_i[15:13])
+                    riscv::OpcodeC1Addi: begin
+                        // c.addi -> addi rd, rd, nzimm
+                        // c.nop -> addi 0, 0, 0
+                        instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], instr_i[11:7], 3'b0, instr_i[11:7], riscv::OpcodeOpImm};
+                    end
 
-                  // c.addiw -> addiw rd, rd, nzimm for RV64
-                  3'b001: begin
-                      if (instr_i[11:7] != 5'h0) // only valid if the destination is not r0
-                        instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], instr_i[11:7], 3'b0, instr_i[11:7], riscv::OpcodeOpimm32};
-                      else
-                        illegal_instr_o = 1'b1;
-                  end
+                    // c.addiw -> addiw rd, rd, nzimm for RV64
+                    riscv::OpcodeC1Addiw: begin
+                        if (instr_i[11:7] != 5'h0) // only valid if the destination is not r0
+                            instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], instr_i[11:7], 3'b0, instr_i[11:7], riscv::OpcodeOpImm32};
+                        else
+                            illegal_instr_o = 1'b1;
+                    end
 
-                  riscv::OpcodeCJ: begin
-                      // 101: c.j   -> jal x0, imm
-                      instr_o = {instr_i[12], instr_i[8], instr_i[10:9], instr_i[6], instr_i[7], instr_i[2], instr_i[11], instr_i[5:3], {9 {instr_i[12]}}, 4'b0, ~instr_i[15], riscv::OpcodeJal};
-                  end
+                    riscv::OpcodeC1Li: begin
+                        // c.li -> addi rd, x0, nzimm
+                        instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], 5'b0, 3'b0, instr_i[11:7], riscv::OpcodeOpImm};
+                        if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;
+                    end
 
-                  3'b010: begin
-                      // c.li -> addi rd, x0, nzimm
-                      instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], 5'b0, 3'b0, instr_i[11:7], riscv::OpcodeOpimm};
-                      if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;
-                  end
+                    riscv::OpcodeC1LuiAddi16sp: begin
+                        // c.lui -> lui rd, imm
+                        instr_o = {{15 {instr_i[12]}}, instr_i[6:2], instr_i[11:7], riscv::OpcodeLui};
 
-                  3'b011: begin
-                      // c.lui -> lui rd, imm
-                      instr_o = {{15 {instr_i[12]}}, instr_i[6:2], instr_i[11:7], riscv::OpcodeLui};
+                        if (instr_i[11:7] == 5'h02) begin
+                            // c.addi16sp -> addi x2, x2, nzimm
+                            instr_o = {{3 {instr_i[12]}}, instr_i[4:3], instr_i[5], instr_i[2], instr_i[6], 4'b0, 5'h02, 3'b000, 5'h02, riscv::OpcodeOpImm};
+                        end else if (instr_i[11:7] == 5'b0) begin
+                            illegal_instr_o = 1'b1;
+                        end
 
-                      if (instr_i[11:7] == 5'h02) begin
-                          // c.addi16sp -> addi x2, x2, nzimm
-                          instr_o = {{3 {instr_i[12]}}, instr_i[4:3], instr_i[5], instr_i[2], instr_i[6], 4'b0, 5'h02, 3'b000, 5'h02, riscv::OpcodeOpimm};
-                      end else if (instr_i[11:7] == 5'b0) begin
-                          illegal_instr_o = 1'b1;
-                      end
+                        if ({instr_i[12], instr_i[6:2]} == 6'b0) illegal_instr_o = 1'b1;
+                    end
 
-                      if ({instr_i[12], instr_i[6:2]} == 6'b0) illegal_instr_o = 1'b1;
-                  end
+                    riscv::OpcodeC1MiscAlu: begin
+                        unique case (instr_i[11:10])
+                            2'b00,
+                            2'b01: begin
+                                // 00: c.srli -> srli rd, rd, shamt
+                                // 01: c.srai -> srai rd, rd, shamt
+                                instr_o = {1'b0, instr_i[10], 4'b0, instr_i[12], instr_i[6:2], 2'b01, instr_i[9:7], 3'b101, 2'b01, instr_i[9:7], riscv::OpcodeOpImm};
+                                // shamt field must be non-zero
+                                if ({instr_i[12], instr_i[6:2]} == 6'b0) illegal_instr_o = 1'b1;
+                            end
 
-                  3'b100: begin
-                      unique case (instr_i[11:10])
-                          2'b00,
-                          2'b01: begin
-                              // 00: c.srli -> srli rd, rd, shamt
-                              // 01: c.srai -> srai rd, rd, shamt
-                              instr_o = {1'b0, instr_i[10], 4'b0, instr_i[12], instr_i[6:2], 2'b01, instr_i[9:7], 3'b101, 2'b01, instr_i[9:7], riscv::OpcodeOpimm};
-                              // shamt field must be non-zero
-                              if ({instr_i[12], instr_i[6:2]} == 6'b0) illegal_instr_o = 1'b1;
-                          end
+                            2'b10: begin
+                                // c.andi -> andi rd, rd, imm
+                                instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], 2'b01, instr_i[9:7], 3'b111, 2'b01, instr_i[9:7], riscv::OpcodeOpImm};
+                            end
 
-                          2'b10: begin
-                              // c.andi -> andi rd, rd, imm
-                              instr_o = {{6 {instr_i[12]}}, instr_i[12], instr_i[6:2], 2'b01, instr_i[9:7], 3'b111, 2'b01, instr_i[9:7], riscv::OpcodeOpimm};
-                          end
+                            2'b11: begin
+                                unique case ({instr_i[12], instr_i[6:5]})
+                                    3'b000: begin
+                                        // c.sub -> sub rd', rd', rs2'
+                                        instr_o = {2'b01, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b000, 2'b01, instr_i[9:7], riscv::OpcodeOp};
+                                    end
 
-                          2'b11: begin
-                              unique case ({instr_i[12], instr_i[6:5]})
-                                  3'b000: begin
-                                      // c.sub -> sub rd', rd', rs2'
-                                      instr_o = {2'b01, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b000, 2'b01, instr_i[9:7], riscv::OpcodeOp};
-                                  end
+                                    3'b001: begin
+                                        // c.xor -> xor rd', rd', rs2'
+                                        instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b100, 2'b01, instr_i[9:7], riscv::OpcodeOp};
+                                    end
 
-                                  3'b001: begin
-                                      // c.xor -> xor rd', rd', rs2'
-                                      instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b100, 2'b01, instr_i[9:7], riscv::OpcodeOp};
-                                  end
+                                    3'b010: begin
+                                        // c.or  -> or  rd', rd', rs2'
+                                        instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b110, 2'b01, instr_i[9:7], riscv::OpcodeOp};
+                                    end
 
-                                  3'b010: begin
-                                      // c.or  -> or  rd', rd', rs2'
-                                      instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b110, 2'b01, instr_i[9:7], riscv::OpcodeOp};
-                                  end
+                                    3'b011: begin
+                                        // c.and -> and rd', rd', rs2'
+                                        instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b111, 2'b01, instr_i[9:7], riscv::OpcodeOp};
+                                    end
 
-                                  3'b011: begin
-                                      // c.and -> and rd', rd', rs2'
-                                      instr_o = {7'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b111, 2'b01, instr_i[9:7], riscv::OpcodeOp};
-                                  end
+                                    3'b100: begin
+                                        // c.subw -> subw rd', rd', rs2'
+                                        instr_o = {2'b01, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b000, 2'b01, instr_i[9:7], riscv::OpcodeOp32};
+                                    end
+                                    3'b101: begin
+                                        // c.addw -> addw rd', rd', rs2'
+                                        instr_o = {2'b00, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b000, 2'b01, instr_i[9:7], riscv::OpcodeOp32};
+                                    end
 
-                                  3'b100: begin
-                                      // c.subw -> subw rd', rd', rs2'
-                                      instr_o = {2'b01, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b000, 2'b01, instr_i[9:7], riscv::OpcodeOp32};
-                                  end
-                                  3'b101: begin
-                                      // c.addw -> addw rd', rd', rs2'
-                                      instr_o = {2'b00, 5'b0, 2'b01, instr_i[4:2], 2'b01, instr_i[9:7], 3'b000, 2'b01, instr_i[9:7], riscv::OpcodeOp32};
-                                  end
+                                    3'b110,
+                                    3'b111: begin
+                                        // 110: no compressed instruction decoded here -> illegal
+                                        // 111: no compressed instruction decoded here -> illegal
+                                        illegal_instr_o = 1'b1;
+                                        instr_o = {16'b0, instr_i};
+                                    end
+                                endcase
+                            end
+                        endcase
+                    end
 
-                                  3'b110,
-                                  3'b111: begin
-                                      // 100: c.subw
-                                      // 101: c.addw
-                                      illegal_instr_o = 1'b1;
-                                      instr_o = {16'b0, instr_i[15:0]};
-                                  end
-                              endcase
-                          end
-                      endcase
-                  end
+                    riscv::OpcodeC1J: begin
+                        // 101: c.j   -> jal x0, imm
+                        instr_o = {instr_i[12], instr_i[8], instr_i[10:9], instr_i[6], instr_i[7], instr_i[2], instr_i[11], instr_i[5:3], {9 {instr_i[12]}}, 4'b0, ~instr_i[15], riscv::OpcodeJal};
+                    end
 
-                  riscv::OpcodeCBeqz, riscv::OpcodeCBnez: begin
-                      // 0: c.beqz -> beq rs1', x0, imm
-                      // 1: c.bnez -> bne rs1', x0, imm
-                      instr_o = {{4 {instr_i[12]}}, instr_i[6:5], instr_i[2], 5'b0, 2'b01, instr_i[9:7], 2'b00, instr_i[13], instr_i[11:10], instr_i[4:3], instr_i[12], riscv::OpcodeBranch};
-                  end
-              endcase
+                    riscv::OpcodeC1Beqz, riscv::OpcodeC1Bnez: begin
+                        // 0: c.beqz -> beq rs1', x0, imm
+                        // 1: c.bnez -> bne rs1', x0, imm
+                        instr_o = {{4 {instr_i[12]}}, instr_i[6:5], instr_i[2], 5'b0, 2'b01, instr_i[9:7], 2'b00, instr_i[13], instr_i[11:10], instr_i[4:3], instr_i[12], riscv::OpcodeBranch};
+                    end
+                endcase
             end
 
             // C2
-            2'b10: begin
+            riscv::OpcodeC2: begin
                 unique case (instr_i[15:13])
-                    3'b000: begin
+                    riscv::OpcodeC2Slli: begin
                         // c.slli -> slli rd, rd, shamt
-                        instr_o = {6'b0, instr_i[12], instr_i[6:2], instr_i[11:7], 3'b001, instr_i[11:7], riscv::OpcodeOpimm};
+                        instr_o = {6'b0, instr_i[12], instr_i[6:2], instr_i[11:7], 3'b001, instr_i[11:7], riscv::OpcodeOpImm};
                         if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1; // register not x0
                         if ({instr_i[12], instr_i[6:2]}  == 6'b0)  illegal_instr_o = 1'b1; // shift amount must be non zero
                     end
 
-                    3'b010: begin
+                    riscv::OpcodeC2Fldsp: begin
+                        // c.fldsp -> fld rd, imm(x2)
+                        instr_o = {3'b0, instr_i[4:2], instr_i[12], instr_i[6:5], 3'b000, 5'h02, 3'b011, instr_i[11:7], riscv::OpcodeLoadFp};
+                        if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;
+                    end
+
+                    riscv::OpcodeC2Lwsp: begin
                         // c.lwsp -> lw rd, imm(x2)
                         instr_o = {4'b0, instr_i[3:2], instr_i[12], instr_i[6:4], 2'b00, 5'h02, 3'b010, instr_i[11:7], riscv::OpcodeLoad};
                         if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;
                     end
 
-                    3'b011: begin
+                    riscv::OpcodeC2Ldsp: begin
                         // c.ldsp -> ld rd, imm(x2)
                         instr_o = {3'b0, instr_i[4:2], instr_i[12], instr_i[6:5], 3'b000, 5'h02, 3'b011, instr_i[11:7], riscv::OpcodeLoad};
                         if (instr_i[11:7] == 5'b0)  illegal_instr_o = 1'b1;
                     end
 
-                    3'b100: begin
+                    riscv::OpcodeC2JalrMvAdd: begin
                         if (instr_i[12] == 1'b0) begin
                             // c.mv -> add rd/rs1, x0, rs2
                             instr_o = {7'b0, instr_i[6:2], 5'b0, 3'b0, instr_i[11:7], riscv::OpcodeOp};
@@ -231,12 +250,17 @@ module compressed_decoder
                         end
                     end
 
-                    3'b110: begin
+                    riscv::OpcodeC2Fsdsp: begin
+                        // c.fsdsp -> fsd rs2, imm(x2)
+                        instr_o = {3'b0, instr_i[9:7], instr_i[12], instr_i[6:2], 5'h02, 3'b011, instr_i[11:10], 3'b000, riscv::OpcodeStoreFp};
+                    end
+
+                    riscv::OpcodeC2Swsp: begin
                         // c.swsp -> sw rs2, imm(x2)
                         instr_o = {4'b0, instr_i[8:7], instr_i[12], instr_i[6:2], 5'h02, 3'b010, instr_i[11:9], 2'b00, riscv::OpcodeStore};
                     end
 
-                    3'b111: begin
+                    riscv::OpcodeC2Sdsp: begin
                         // c.sdsp -> sd rs2, imm(x2)
                         instr_o = {3'b0, instr_i[9:7], instr_i[12], instr_i[6:2], 5'h02, 3'b011, instr_i[11:10], 3'b000, riscv::OpcodeStore};
                     end
diff --git a/src/csr_buffer.sv b/src/csr_buffer.sv
index 3afdc1def..9ddac2b65 100644
--- a/src/csr_buffer.sv
+++ b/src/csr_buffer.sv
@@ -23,13 +23,10 @@ module csr_buffer (
     input  fu_op                     operator_i,
     input  logic [63:0]              operand_a_i,
     input  logic [63:0]              operand_b_i,
-    input  logic [TRANS_ID_BITS-1:0] trans_id_i,     // transaction id, needed for WB
 
     output logic                     csr_ready_o,    // FU is ready e.g. not busy
     input  logic                     csr_valid_i,    // Input is valid
-    output logic [TRANS_ID_BITS-1:0] csr_trans_id_o, // ID of scoreboard entry at which to write back
     output logic [63:0]              csr_result_o,
-    output logic                     csr_valid_o,    // transaction id for which the output is the requested one
     input  logic                     commit_i,       // commit the pending CSR OP
 
     // to CSR file
@@ -43,9 +40,6 @@ module csr_buffer (
     } csr_reg_n, csr_reg_q;
 
     // control logic, scoreboard signals
-    assign csr_trans_id_o = trans_id_i;
-    // CSR instructions for this post buffer are single cycle
-    assign csr_valid_o    = csr_valid_i;
     assign csr_result_o   = operand_a_i;
     assign csr_addr_o     = csr_reg_q.csr_address;
 
diff --git a/src/csr_regfile.sv b/src/csr_regfile.sv
index 98e5abfd4..541d7c2eb 100644
--- a/src/csr_regfile.sv
+++ b/src/csr_regfile.sv
@@ -21,7 +21,6 @@ module csr_regfile #(
     input  logic                  clk_i,                      // Clock
     input  logic                  rst_ni,                     // Asynchronous reset active low
     input  logic                  time_irq_i,                 // Timer threw a interrupt
-
     // send a flush request out if a CSR with a side effect has changed (e.g. written)
     output logic                  flush_o,
     output logic                  halt_csr_o,                 // halt requested
@@ -39,6 +38,8 @@ module csr_regfile #(
     input  logic  [11:0]          csr_addr_i,                 // Address of the register to read/write
     input  logic  [63:0]          csr_wdata_i,                // Write data in
     output logic  [63:0]          csr_rdata_o,                // Read data out
+    input  logic                  dirty_fp_state_i,           // Mark the FP state as dirty
+    input  logic                  csr_write_fflags_i,         // Write fflags register e.g.: we are retiring a floating point instruction
     input  logic  [63:0]          pc_i,                       // PC of instruction accessing the CSR
     output exception_t            csr_exception_o,            // attempts to access a CSR without appropriate privilege
                                                               // level or to write  a read-only register also
@@ -48,6 +49,11 @@ module csr_regfile #(
     output logic                  eret_o,                     // Return from exception, set the PC of epc_o
     output logic  [63:0]          trap_vector_base_o,         // Output base of exception vector, correct CSR is output (mtvec, stvec)
     output riscv::priv_lvl_t      priv_lvl_o,                 // Current privilege level the CPU is in
+    // FPU
+    output riscv::xs_t            fs_o,                       // Floating point extension status
+    output logic [4:0]            fflags_o,                   // Floating-Point Accrued Exceptions
+    output logic [2:0]            frm_o,                      // Floating-Point Dynamic Rounding Mode
+    output logic [6:0]            fprec_o,                    // Floating-Point Precision Control
     // MMU
     output logic                  en_translation_o,           // enable VA translation
     output logic                  en_ld_st_translation_o,     // enable VA translation for load and stores
@@ -87,12 +93,14 @@ module csr_regfile #(
     logic  mret;  // return from M-mode exception
     logic  sret;  // return from S-mode exception
     logic  dret;  // return from debug mode
-
+    // CSR write causes us to mark the FPU state as dirty
+    logic  dirty_fp_state_csr;
     riscv::csr_t  csr_addr;
     // ----------------
     // Assignments
     // ----------------
     assign csr_addr = riscv::csr_t'(csr_addr_i);
+    assign fs_o = mstatus_q.fs;
     // ----------------
     // CSR Registers
     // ----------------
@@ -134,6 +142,8 @@ module csr_regfile #(
     logic [63:0] cycle_q,     cycle_d;
     logic [63:0] instret_q,   instret_d;
 
+    riscv::fcsr_t fcsr_q, fcsr_d;
+
     // ----------------
     // CSR Read logic
     // ----------------
@@ -146,6 +156,35 @@ module csr_regfile #(
 
         if (csr_read) begin
             case (csr_addr.address)
+                riscv::CSR_FFLAGS: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        read_access_exception = 1'b1;
+                    end else begin
+                        csr_rdata = {59'b0, fcsr_q.fflags};
+                    end
+                end
+                riscv::CSR_FRM: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        read_access_exception = 1'b1;
+                    end else begin
+                        csr_rdata = {61'b0, fcsr_q.frm};
+                    end
+                end
+                riscv::CSR_FCSR: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        read_access_exception = 1'b1;
+                    end else begin
+                        csr_rdata = {56'b0, fcsr_q.frm, fcsr_q.fflags};
+                    end
+                end
+                // non-standard extension
+                riscv::CSR_FTRAN: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        read_access_exception = 1'b1;
+                    end else begin
+                        csr_rdata = {57'b0, fcsr_q.fprec};
+                    end
+                end
                 // debug registers
                 riscv::CSR_DCSR:               csr_rdata = {32'b0, dcsr_q};
                 riscv::CSR_DPC:                csr_rdata = dpc_q;
@@ -189,7 +228,7 @@ module csr_regfile #(
                 riscv::CSR_PMPCFG0:            csr_rdata = pmpcfg0_q;
                 riscv::CSR_PMPADDR0:           csr_rdata = pmpaddr0_q;
                 riscv::CSR_MVENDORID:          csr_rdata = 64'b0; // not implemented
-                riscv::CSR_MARCHID:            csr_rdata = 64'b0; // PULP, anonymous source (no allocated ID yet)
+                riscv::CSR_MARCHID:            csr_rdata = ARIANE_MARCHID;
                 riscv::CSR_MIMPID:             csr_rdata = 64'b0; // not implemented
                 riscv::CSR_MHARTID:            csr_rdata = {53'b0, cluster_id_i[5:0], 1'b0, core_id_i[3:0]};
                 riscv::CSR_MCYCLE:             csr_rdata = cycle_q;
@@ -227,7 +266,7 @@ module csr_regfile #(
         sapt = satp_q;
         mip = csr_wdata & 64'h33;
         instret = instret_q;
-        // only USIP, SSIP, UTIP, STIP are write-able
+        // only FCSR, USIP, SSIP, UTIP, STIP are write-able
 
         eret_o                  = 1'b0;
         flush_o                 = 1'b0;
@@ -238,6 +277,8 @@ module csr_regfile #(
         perf_we_o               = 1'b0;
         perf_data_o             = 'b0;
 
+        fcsr_d                  = fcsr_q;
+
         priv_lvl_d              = priv_lvl_q;
         debug_mode_d            = debug_mode_q;
         dcsr_d                  = dcsr_q;
@@ -279,10 +320,51 @@ module csr_regfile #(
         instret_d               = instret_q;
 
         en_ld_st_translation_d  = en_ld_st_translation_q;
-
+        dirty_fp_state_csr      = 1'b0;
         // check for correct access rights and that we are writing
         if (csr_we) begin
             case (csr_addr.address)
+                // Floating-Point
+                riscv::CSR_FFLAGS: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        update_access_exception = 1'b1;
+                    end else begin
+                        dirty_fp_state_csr = 1'b1;
+                        fcsr_d.fflags = csr_wdata[4:0];
+                        // this instruction has side-effects
+                        flush_o = 1'b1;
+                    end
+                end
+                riscv::CSR_FRM: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        update_access_exception = 1'b1;
+                    end else begin
+                        dirty_fp_state_csr = 1'b1;
+                        fcsr_d.frm    = csr_wdata[2:0];
+                        // this instruction has side-effects
+                        flush_o = 1'b1;
+                    end
+                end
+                riscv::CSR_FCSR: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        update_access_exception = 1'b1;
+                    end else begin
+                        dirty_fp_state_csr = 1'b1;
+                        fcsr_d[7:0] = csr_wdata[7:0]; // ignore writes to reserved space
+                        // this instruction has side-effects
+                        flush_o = 1'b1;
+                    end
+                end
+                riscv::CSR_FTRAN: begin
+                    if (mstatus_q.fs == riscv::Off) begin
+                        update_access_exception = 1'b1;
+                    end else begin
+                        dirty_fp_state_csr = 1'b1;
+                        fcsr_d.fprec = csr_wdata[6:0]; // ignore writes to reserved space
+                        // this instruction has side-effects
+                        flush_o = 1'b1;
+                    end
+                end
                 // debug CSR
                 riscv::CSR_DCSR: begin
                     dcsr_d = csr_wdata[31:0];
@@ -306,10 +388,13 @@ module csr_regfile #(
                     // also hardwire the registers for sstatus
                     mstatus_d.sxl  = riscv::XLEN_64;
                     mstatus_d.uxl  = riscv::XLEN_64;
-                    // hardwired zero registers
-                    mstatus_d.sd   = 1'b0;
-                    mstatus_d.xs   = 2'b0;
-                    mstatus_d.fs   = 2'b0;
+                    // hardwired extension registers
+                    mstatus_d.sd   = (&mstatus_q.xs) | (&mstatus_q.fs);
+                    mstatus_d.xs   = riscv::Off;
+                    // hardwire to zero if floating point extension is not present
+                    if (!FP_PRESENT) begin
+                        mstatus_d.fs = riscv::Off;
+                    end
                     mstatus_d.upie = 1'b0;
                     mstatus_d.uie  = 1'b0;
                     // not all fields of mstatus can be written
@@ -366,9 +451,11 @@ module csr_regfile #(
                     mstatus_d.sxl  = riscv::XLEN_64;
                     mstatus_d.uxl  = riscv::XLEN_64;
                     // hardwired zero registers
-                    mstatus_d.sd   = 1'b0;
-                    mstatus_d.xs   = 2'b0;
-                    mstatus_d.fs   = 2'b0;
+                    mstatus_d.sd   = (&mstatus_q.xs) | (&mstatus_q.fs); // set when all bits of XS or of FS are set
+                    mstatus_d.xs   = riscv::Off; // XS is unconditionally forced to Off here
+                    if (!FP_PRESENT) begin // no floating point extension: FS is forced to Off
+                        mstatus_d.fs = riscv::Off;
+                    end
                     mstatus_d.upie = 1'b0;
                     mstatus_d.uie  = 1'b0;
                     // this register has side-effects on other registers, flush the pipeline
@@ -426,6 +513,16 @@ module csr_regfile #(
                 default: update_access_exception = 1'b1;
             endcase
         end
+
+        // set mstatus.fs to Dirty on an FP CSR write (dirty_fp_state_csr) or when dirty_fp_state_i is asserted
+        if (FP_PRESENT && (dirty_fp_state_csr || dirty_fp_state_i)) begin
+            mstatus_d.fs = riscv::Dirty;
+        end
+
+        // OR the incoming flag bits into fflags; bits that are already set in fcsr_q are kept
+        if (csr_write_fflags_i)
+            fcsr_d.fflags = csr_wdata_i[4:0] | fcsr_q.fflags;
+
         // ---------------------
         // External Interrupts
         // ---------------------
@@ -541,9 +638,21 @@ module csr_regfile #(
             end
 
             // single step enable and we just retired an instruction
-            if (dcsr_q.step && (|commit_ack_i)) begin
-                // we saved the correct target address during execute
-                dpc_d = commit_instr_i[0].bp.predict_address;
+            if (dcsr_q.step && commit_ack_i[0]) begin
+                // control flow instruction: dpc is the target recorded with the instruction
+                if (commit_instr_i[0].fu == CTRL_FLOW) begin
+                    // we saved the correct target address during execute
+                    dpc_d = commit_instr_i[0].bp.predict_address;
+                // valid exception: dpc is the trap vector base
+                end else if (ex_i.valid) begin
+                    dpc_d = trap_vector_base_o;
+                // return from environment (eret_o): dpc is epc_o
+                end else if (eret_o) begin
+                    dpc_d = epc_o;
+                // otherwise the consecutive PC: +2 for a compressed instruction, +4 for all others
+                end else begin
+                    dpc_d = commit_instr_i[0].pc + (commit_instr_i[0].is_compressed ? 'h2 : 'h4);
+                end
                 debug_mode_d = 1'b1;
                 set_debug_pc_o = 1'b1;
                 dcsr_d.cause = dm::CauseSingleStep;
@@ -807,6 +916,10 @@ module csr_regfile #(
     assign csr_rdata_o      = csr_rdata;
     // in debug mode we execute with privilege level M
     assign priv_lvl_o       = (debug_mode_q) ? riscv::PRIV_LVL_M : priv_lvl_q;
+    // FPU outputs: driven directly from the fields of the registered fcsr (fcsr_q)
+    assign fflags_o         = fcsr_q.fflags;
+    assign frm_o            = fcsr_q.frm;
+    assign fprec_o          = fcsr_q.fprec;
     // MMU outputs
     assign satp_ppn_o       = satp_q.ppn;
     assign asid_o           = satp_q.asid[ASID_WIDTH-1:0];
@@ -829,6 +942,8 @@ module csr_regfile #(
     always_ff @(posedge clk_i or negedge rst_ni) begin
         if (~rst_ni) begin
             priv_lvl_q             <= riscv::PRIV_LVL_M;
+            // floating-point registers: cleared with a width-agnostic literal, like the other CSRs
+            fcsr_q                 <= '0;
             // debug signals
             debug_mode_q           <= 1'b0;
             dcsr_q                 <= '0;
@@ -866,6 +981,8 @@ module csr_regfile #(
             wfi_q                  <= 1'b0;
         end else begin
             priv_lvl_q             <= priv_lvl_d;
+            // floating-point control and status register: take over the next-state value
+            fcsr_q                 <= fcsr_d;
             // debug signals
             debug_mode_q           <= debug_mode_d;
             dcsr_q                 <= dcsr_d;
diff --git a/src/debug/dm_csrs.sv b/src/debug/dm_csrs.sv
index aee8177ab..f3a0aaca3 100644
--- a/src/debug/dm_csrs.sv
+++ b/src/debug/dm_csrs.sv
@@ -438,14 +438,26 @@ module dm_csrs #(
     end
 
     assign dmactive_o = dmcontrol_q.dmactive;
-    // if the PoR is set we want to re-set the other system as well
-    assign ndmreset_o = dmcontrol_q.ndmreset | (~rst_ni);
     assign cmd_o      = command_q;
     assign progbuf_o  = progbuf_q;
     assign data_o     = data_q;
 
     assign resp_queue_pop = dmi_resp_ready_i & ~resp_queue_empty;
 
+    logic ndmreset_n; // active-low reset coming out of i_rstgen_bypass
+
+    // reset the other system on dmcontrol.ndmreset or on PoR; rst_ni alone feeds the test mode reset input
+    rstgen_bypass i_rstgen_bypass (
+        .clk_i ( clk_i ),
+        .rst_ni ( ~(dmcontrol_q.ndmreset | ~rst_ni) ),
+        .rst_test_mode_ni ( rst_ni ),
+        .test_mode_i ( testmode_i ),
+        .rst_no ( ndmreset_n ),
+        .init_no () // keep open
+    );
+
+    assign ndmreset_o = ~ndmreset_n; // ndmreset_o is active high, so invert the active-low reset
+
     // response FIFO
     fifo_v2 #(
         .dtype            ( logic [31:0]         ),
@@ -468,9 +480,19 @@ module dm_csrs #(
     always_ff @(posedge clk_i or negedge rst_ni) begin
         // PoR
         if (~rst_ni) begin
-            dmcontrol_q <= '0;
-            havereset_q <= '1;
+            dmcontrol_q    <= '0;
+            havereset_q    <= '1;
+            // the PoR also clears the abstract command, program buffer, data and system bus registers
+            cmderr_q       <= dm::CmdErrNone;
+            command_q      <= '0;
+            abstractauto_q <= '0;
+            progbuf_q      <= '0;
+            data_q         <= '0;
+            sbcs_q         <= '0;
+            sbaddr_q       <= '0;
+            sbdata_q       <= '0;
         end else begin
+            havereset_q    <= havereset_d;
             // synchronous re-set of debug module, active-low, except for dmactive
             if (!dmcontrol_q.dmactive) begin
                 dmcontrol_q.haltreq          <= '0;
@@ -495,7 +517,6 @@ module dm_csrs #(
                 sbaddr_q                     <= '0;
                 sbdata_q                     <= '0;
             end else begin
-                havereset_q                  <= havereset_d;
                 dmcontrol_q                  <= dmcontrol_d;
                 cmderr_q                     <= cmderr_d;
                 command_q                    <= command_d;
@@ -508,4 +529,4 @@ module dm_csrs #(
             end
         end
     end
-endmodule
+endmodule
\ No newline at end of file
diff --git a/src/debug/dm_mem.sv b/src/debug/dm_mem.sv
index 36d1f63be..3cfb09c2d 100644
--- a/src/debug/dm_mem.sv
+++ b/src/debug/dm_mem.sv
@@ -20,7 +20,7 @@ module dm_mem #(
     parameter int NrHarts     = -1
 )(
     input  logic                             clk_i,       // Clock
-    input  logic                             dmactive_i,  // debug module reset
+    input  logic                             rst_ni,      // asynchronous reset, active low
 
     output logic [NrHarts-1:0]               debug_req_o,
     input  logic [19:0]                      hartsel_i,
@@ -363,8 +363,8 @@ module dm_mem #(
     // the ROM base address
     assign fwd_rom_d = (addr_i[DbgAddressBits-1:0] >= dm::HaltAddress[DbgAddressBits-1:0]) ? 1'b1 : 1'b0;
 
-    always_ff @(posedge clk_i) begin
-        if (~dmactive_i) begin
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (~rst_ni) begin
             fwd_rom_q  <= 1'b0;
             rdata_q    <= '0;
             halted_q   <= 1'b0;
diff --git a/src/debug/dm_sba.sv b/src/debug/dm_sba.sv
index d316982a4..7b46d92cb 100644
--- a/src/debug/dm_sba.sv
+++ b/src/debug/dm_sba.sv
@@ -18,6 +18,7 @@
 
 module dm_sba (
     input  logic          clk_i,       // Clock
+    input  logic          rst_ni,      // asynchronous reset active low
     input  logic          dmactive_i,  // synchronous reset active low
 
     AXI_BUS.Master        axi_master,
@@ -111,7 +112,7 @@ module dm_sba (
             end
         endcase
         // handle error case
-        if (sbaccess_i > 3 && state_d != Idle) begin
+        if (sbaccess_i > 3 && state_q != Idle) begin
             req             = 1'b0;
             state_d         = Idle;
             sberror_valid_o = 1'b1;
@@ -120,35 +121,36 @@ module dm_sba (
         // further error handling should go here ...
     end
 
-    always_ff @(posedge clk_i) begin
-        if (~dmactive_i) begin
+    always_ff @(posedge clk_i or negedge rst_ni) begin
+        if (~rst_ni) begin
             state_q <= Idle;
         end else begin
             state_q <= state_d;
         end
     end
 
+
     axi_adapter #(
-        .DATA_WIDTH            ( 64                       )
+        .DATA_WIDTH            ( 64                        )
     ) i_axi_master (
-        .clk_i                 ( clk_i                    ),
-        .rst_ni                ( dmactive_i               ),
-        .req_i                 ( req                      ),
-        .type_i                ( std_cache_pkg::SINGLE_REQ),
-        .gnt_o                 ( gnt                      ),
-        .gnt_id_o              (                          ),
-        .addr_i                ( address                  ),
-        .we_i                  ( we                       ),
-        .wdata_i               ( sbdata_i                 ),
-        .be_i                  ( be                       ),
-        .size_i                ( sbaccess_i[1:0]          ),
-        .id_i                  ( '0                       ),
-        .valid_o               ( sbdata_valid_o           ),
-        .rdata_o               ( sbdata_o                 ),
-        .id_o                  (                          ),
-        .critical_word_o       (                          ), // not needed here
-        .critical_word_valid_o (                          ), // not needed here
-        .axi                   ( axi_master               )
+        .clk_i                 ( clk_i                     ),
+        .rst_ni                ( rst_ni                    ),
+        .req_i                 ( req                       ),
+        .type_i                ( std_cache_pkg::SINGLE_REQ ),
+        .gnt_o                 ( gnt                       ),
+        .gnt_id_o              (                           ),
+        .addr_i                ( address                   ),
+        .we_i                  ( we                        ),
+        .wdata_i               ( sbdata_i                  ),
+        .be_i                  ( be                        ),
+        .size_i                ( sbaccess_i[1:0]           ),
+        .id_i                  ( '0                        ),
+        .valid_o               ( sbdata_valid_o            ),
+        .rdata_o               ( sbdata_o                  ),
+        .id_o                  (                           ),
+        .critical_word_o       (                           ), // not needed here
+        .critical_word_valid_o (                           ), // not needed here
+        .axi                   ( axi_master                )
     );
 
 
diff --git a/src/debug/dm_top.sv b/src/debug/dm_top.sv
index 5e3431677..303c414af 100644
--- a/src/debug/dm_top.sv
+++ b/src/debug/dm_top.sv
@@ -143,6 +143,7 @@ module dm_top #(
 
     dm_sba i_dm_sba (
         .clk_i                   ( clk_i                 ),
+        .rst_ni                  ( rst_ni                ),
         .dmactive_i              ( dmactive_o            ),
         .axi_master,
         .sbaddress_i             ( sbaddress_csrs_sba    ),
@@ -166,7 +167,7 @@ module dm_top #(
         .NrHarts (NrHarts)
     ) i_dm_mem (
         .clk_i                   ( clk_i                 ),
-        .dmactive_i              ( dmactive_o            ),
+        .rst_ni                  ( rst_ni                ),
         .debug_req_o             ( debug_req_o           ),
         .hartsel_i               ( hartsel               ),
         .haltreq_i               ( haltreq               ),
@@ -197,7 +198,7 @@ module dm_top #(
         .AXI_USER_WIDTH ( AxiUserWidth )
     ) i_axi2mem (
         .clk_i  ( clk_i      ),
-        .rst_ni ( dmactive_o ),
+        .rst_ni ( rst_ni     ),
         .slave  ( axi_slave  ),
         .req_o  ( req        ),
         .we_o   ( we         ),
diff --git a/src/debug/dmi_jtag.sv b/src/debug/dmi_jtag.sv
index 430ccae51..49df7ce5b 100644
--- a/src/debug/dmi_jtag.sv
+++ b/src/debug/dmi_jtag.sv
@@ -19,9 +19,9 @@
 module dmi_jtag (
     input  logic         clk_i,      // DMI Clock
     input  logic         rst_ni,     // Asynchronous reset active low
+    input  logic         testmode_i, // test mode enable, passed on to the JTAG TAP
 
     output logic         dmi_rst_no, // hard reset
-
     output dm::dmi_req_t dmi_req_o,
     output logic         dmi_req_valid_o,
     input  logic         dmi_req_ready_i,
@@ -37,7 +37,7 @@ module dmi_jtag (
     output logic         td_o,     // JTAG test data output pad
     output logic         tdo_oe_o  // Data out output enable
 );
-    assign       dmi_rst_no = 1'b1;
+    assign       dmi_rst_no = rst_ni;
 
     logic        test_logic_reset;
     logic        shift_dr;
@@ -218,6 +218,7 @@ module dmi_jtag (
         .td_i,
         .td_o,
         .tdo_oe_o,
+        .testmode_i         ( testmode_i       ),
         .test_logic_reset_o ( test_logic_reset ),
         .shift_dr_o         ( shift_dr         ),
         .update_dr_o        ( update_dr        ),
diff --git a/src/debug/dmi_jtag_tap.sv b/src/debug/dmi_jtag_tap.sv
index 5d55bacb1..ae4b2fcfb 100644
--- a/src/debug/dmi_jtag_tap.sv
+++ b/src/debug/dmi_jtag_tap.sv
@@ -25,6 +25,7 @@ module dmi_jtag_tap #(
     input  logic        td_i,     // JTAG test data input pad
     output logic        td_o,     // JTAG test data output pad
     output logic        tdo_oe_o, // Data out output enable
+    input  logic        testmode_i, // select input of the DFT clock mux in front of the TDO registers
     output logic        test_logic_reset_o,
     output logic        shift_dr_o,
     output logic        update_dr_o,
@@ -207,8 +208,23 @@ module dmi_jtag_tap #(
 
     end
 
-  // TDO changes state at negative edge of TCK
-    always_ff @(negedge tck_i, negedge trst_ni) begin
+    // DFT: the TDO registers are clocked by tck_n, which is built from explicit clock cells
+    logic tck_n, tck_ni;
+
+    cluster_clock_inverter i_tck_inv (
+        .clk_i ( tck_i  ),
+        .clk_o ( tck_ni )
+    );
+
+    pulp_clock_mux2 i_dft_tck_mux (
+        .clk0_i    ( tck_ni     ),
+        .clk1_i    ( tck_i      ), // bypass the inverted clock for testing
+        .clk_sel_i ( testmode_i ),
+        .clk_o     ( tck_n      )
+    );
+
+    // TDO changes state at negative edge of TCK (the inverted clock is bypassed in test mode)
+    always_ff @(posedge tck_n, negedge trst_ni) begin
         if (~trst_ni) begin
             td_o     <= 1'b0;
             tdo_oe_o <= 1'b0;
diff --git a/src/decoder.sv b/src/decoder.sv
index efb31c5ad..4362d551e 100644
--- a/src/decoder.sv
+++ b/src/decoder.sv
@@ -30,6 +30,8 @@ module decoder (
     // From CSR
     input  riscv::priv_lvl_t   priv_lvl_i,              // current privilege level
     input  logic               debug_mode_i,            // we are in debug mode
+    input  riscv::xs_t         fs_i,                    // floating point extension status
+    input  logic [2:0]         frm_i,                   // floating-point dynamic rounding mode
     input  logic               tvm_i,                   // trap virtual memory
     input  logic               tw_i,                    // timeout wait
     input  logic               tsr_i,                   // trap sret
@@ -41,13 +43,15 @@ module decoder (
     logic ecall;
     // this instruction is a software break-point
     logic ebreak;
+    // this instruction needs floating-point rounding-mode verification
+    logic check_fprm;
     riscv::instruction_t instr;
     assign instr = riscv::instruction_t'(instruction_i);
     // --------------------
     // Immediate select
     // --------------------
     enum logic[3:0] {
-        NOIMM, IIMM, SIMM, SBIMM, UIMM, JIMM
+        NOIMM, IIMM, SIMM, SBIMM, UIMM, JIMM, RS3
     } imm_select;
 
     logic [63:0] imm_i_type;
@@ -63,6 +67,7 @@ module decoder (
         is_control_flow_instr_o     = 1'b0;
         illegal_instr               = 1'b0;
         instruction_o.pc            = pc_i;
+        instruction_o.trans_id      = 5'b0;
         instruction_o.fu            = NONE;
         instruction_o.op            = ADD;
         instruction_o.rs1           = '0;
@@ -75,6 +80,7 @@ module decoder (
         instruction_o.bp            = branch_predict_i;
         ecall                       = 1'b0;
         ebreak                      = 1'b0;
+        check_fprm                  = 1'b0;
 
         if (~ex_i.valid) begin
             case (instr.rtype.opcode)
@@ -208,7 +214,7 @@ module decoder (
                     endcase
                 end
                 // Memory ordering instructions
-                riscv::OpcodeFence: begin
+                riscv::OpcodeMiscMem: begin
                     instruction_o.fu  = CSR;
                     instruction_o.rs1 = '0;
                     instruction_o.rs2 = '0;
@@ -235,35 +241,268 @@ module decoder (
                 // Reg-Reg Operations
                 // --------------------------
                 riscv::OpcodeOp: begin
-                    instruction_o.fu  = (instr.rtype.funct7 == 7'b000_0001) ? MULT : ALU;
-                    instruction_o.rs1[4:0] = instr.rtype.rs1;
-                    instruction_o.rs2[4:0] = instr.rtype.rs2;
-                    instruction_o.rd[4:0]  = instr.rtype.rd;
+                    // --------------------------------------------
+                    // Vectorial Floating-Point Reg-Reg Operations
+                    // --------------------------------------------
+                    if (instr.rvftype.funct2 == 2'b10) begin // Prefix 10 for all Xfvec ops
+                        // only generate decoder if FP extensions are enabled (static)
+                        if (FP_PRESENT && XFVEC && fs_i != riscv::Off) begin
+                            automatic logic allow_replication; // control honoring of replication flag
 
-                    unique case ({instr.rtype.funct7, instr.rtype.funct3})
-                        {7'b000_0000, 3'b000}: instruction_o.op = ADD;   // Add
-                        {7'b010_0000, 3'b000}: instruction_o.op = SUB;   // Sub
-                        {7'b000_0000, 3'b010}: instruction_o.op = SLTS;  // Set Lower Than
-                        {7'b000_0000, 3'b011}: instruction_o.op = SLTU;  // Set Lower Than Unsigned
-                        {7'b000_0000, 3'b100}: instruction_o.op = XORL;  // Xor
-                        {7'b000_0000, 3'b110}: instruction_o.op = ORL;   // Or
-                        {7'b000_0000, 3'b111}: instruction_o.op = ANDL;  // And
-                        {7'b000_0000, 3'b001}: instruction_o.op = SLL;   // Shift Left Logical
-                        {7'b000_0000, 3'b101}: instruction_o.op = SRL;   // Shift Right Logical
-                        {7'b010_0000, 3'b101}: instruction_o.op = SRA;   // Shift Right Arithmetic
-                        // Multiplications
-                        {7'b000_0001, 3'b000}: instruction_o.op = MUL;
-                        {7'b000_0001, 3'b001}: instruction_o.op = MULH;
-                        {7'b000_0001, 3'b010}: instruction_o.op = MULHSU;
-                        {7'b000_0001, 3'b011}: instruction_o.op = MULHU;
-                        {7'b000_0001, 3'b100}: instruction_o.op = DIV;
-                        {7'b000_0001, 3'b101}: instruction_o.op = DIVU;
-                        {7'b000_0001, 3'b110}: instruction_o.op = REM;
-                        {7'b000_0001, 3'b111}: instruction_o.op = REMU;
-                        default: begin
+                            instruction_o.fu       = FPU_VEC; // Same unit, but sets 'vectorial' signal
+                            instruction_o.rs1[4:0] = instr.rvftype.rs1;
+                            instruction_o.rs2[4:0] = instr.rvftype.rs2;
+                            instruction_o.rd[4:0]  = instr.rvftype.rd;
+                            check_fprm             = 1'b1;
+                            allow_replication      = 1'b1;
+                            // decode vectorial FP instruction
+                            unique case (instr.rvftype.vecfltop)
+                                5'b00001 : begin
+                                    instruction_o.op  = FADD; // vfadd.vfmt - Vectorial FP Addition
+                                    instruction_o.rs1 = '0;                // Operand A is set to 0
+                                    instruction_o.rs2 = instr.rvftype.rs1; // Operand B is set to rs1
+                                    imm_select        = IIMM;              // Operand C is set to rs2
+                                end
+                                5'b00010 : begin
+                                    instruction_o.op  = FSUB; // vfsub.vfmt - Vectorial FP Subtraction
+                                    instruction_o.rs1 = '0;                // Operand A is set to 0
+                                    instruction_o.rs2 = instr.rvftype.rs1; // Operand B is set to rs1
+                                    imm_select        = IIMM;              // Operand C is set to rs2
+                                end
+                                5'b00011 : instruction_o.op = FMUL; // vfmul.vfmt - Vectorial FP Multiplication
+                                5'b00100 : instruction_o.op = FDIV; // vfdiv.vfmt - Vectorial FP Division
+                                5'b00101 : begin
+                                    instruction_o.op = VFMIN; // vfmin.vfmt - Vectorial FP Minimum
+                                    check_fprm       = 1'b0;  // rounding mode irrelevant
+                                end
+                                5'b00110 : begin
+                                    instruction_o.op = VFMAX; // vfmax.vfmt - Vectorial FP Maximum
+                                    check_fprm       = 1'b0;  // rounding mode irrelevant
+                                end
+                                5'b00111 : begin
+                                    instruction_o.op  = FSQRT; // vfsqrt.vfmt - Vectorial FP Square Root
+                                    allow_replication = 1'b0;  // only one operand
+                                    if (instr.rvftype.rs2 != 5'b00000) illegal_instr = 1'b1; // rs2 must be 0
+                                end
+                                5'b01000 : begin
+                                    instruction_o.op = FMADD; // vfmac.vfmt - Vectorial FP Multiply-Accumulate
+                                    imm_select       = SIMM;  // rd into result field (upper bits don't matter)
+                                end
+                                5'b01001 : begin
+                                    instruction_o.op = FMSUB; // vfmre.vfmt - Vectorial FP Multiply-Reduce
+                                    imm_select       = SIMM;  // rd into result field (upper bits don't matter)
+                                end
+                                5'b01100 : begin
+                                    unique case (instr.rvftype.rs2) inside // operation encoded in rs2, `inside` for matching ?
+                                        5'b00000 : begin
+                                            instruction_o.rs2 = instr.rvftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit
+                                            if (instr.rvftype.repl)
+                                                instruction_o.op = FMV_F2X; // vfmv.x.vfmt - FPR to GPR Move
+                                            else
+                                                instruction_o.op = FMV_X2F; // vfmv.vfmt.x - GPR to FPR Move
+                                            check_fprm = 1'b0;              // no rounding for moves
+                                        end
+                                        5'b00001 : begin
+                                            instruction_o.op  = FCLASS; // vfclass.vfmt - Vectorial FP Classify
+                                            check_fprm        = 1'b0;   // no rounding for classification
+                                            allow_replication = 1'b0;   // R must not be set
+                                        end
+                                        5'b00010 : instruction_o.op = FCVT_F2I; // vfcvt.x.vfmt - Vectorial FP to Int Conversion
+                                        5'b00011 : instruction_o.op = FCVT_I2F; // vfcvt.vfmt.x - Vectorial Int to FP Conversion
+                                        5'b001?? : begin
+                                            instruction_o.op  = FCVT_F2F; // vfcvt.vfmt.vfmt - Vectorial FP to FP Conversion
+                                            instruction_o.rs2 = instr.rvftype.rd; // set rs2 = rd as target vector for conversion
+                                            imm_select        = IIMM;     // rs2 holds part of the instruction
+                                            // TODO CHECK R bit for valid fmt combinations
+                                            // determine source format
+                                            unique case (instr.rvftype.rs2[21:20])
+                                                // Only process instruction if corresponding extension is active (static)
+                                                2'b00: if (~RVFVEC)     illegal_instr = 1'b1;
+                                                2'b01: if (~XF16ALTVEC) illegal_instr = 1'b1;
+                                                2'b10: if (~XF16VEC)    illegal_instr = 1'b1;
+                                                2'b11: if (~XF8VEC)     illegal_instr = 1'b1;
+                                                default : illegal_instr = 1'b1;
+                                            endcase
+                                        end
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                5'b01101 : begin
+                                    check_fprm = 1'b0;         // no rounding for sign-injection
+                                    instruction_o.op = VFSGNJ; // vfsgnj.vfmt - Vectorial FP Sign Injection
+                                end
+                                5'b01110 : begin
+                                    check_fprm = 1'b0;          // no rounding for sign-injection
+                                    instruction_o.op = VFSGNJN; // vfsgnjn.vfmt - Vectorial FP Negated Sign Injection
+                                end
+                                5'b01111 : begin
+                                    check_fprm = 1'b0;          // no rounding for sign-injection
+                                    instruction_o.op = VFSGNJX; // vfsgnjx.vfmt - Vectorial FP XORed Sign Injection
+                                end
+                                5'b10000 : begin
+                                    check_fprm = 1'b0;          // no rounding for comparisons
+                                    instruction_o.op = VFEQ;    // vfeq.vfmt - Vectorial FP Equality
+                                end
+                                5'b10001 : begin
+                                    check_fprm = 1'b0;          // no rounding for comparisons
+                                    instruction_o.op = VFNE;    // vfne.vfmt - Vectorial FP Non-Equality
+                                end
+                                5'b10010 : begin
+                                    check_fprm = 1'b0;          // no rounding for comparisons
+                                    instruction_o.op = VFLT;    // vflt.vfmt - Vectorial FP Less Than
+                                end
+                                5'b10011 : begin
+                                    check_fprm = 1'b0;          // no rounding for comparisons
+                                    instruction_o.op = VFGE;    // vfge.vfmt - Vectorial FP Greater or Equal
+                                end
+                                5'b10100 : begin
+                                    check_fprm = 1'b0;          // no rounding for comparisons
+                                    instruction_o.op = VFLE;    // vfle.vfmt - Vectorial FP Less or Equal
+                                end
+                                5'b10101 : begin
+                                    check_fprm = 1'b0;          // no rounding for comparisons
+                                    instruction_o.op = VFGT;    // vfgt.vfmt - Vectorial FP Greater Than
+                                end
+                                5'b11000 : begin
+                                    instruction_o.op  = VFCPKAB_S; // vfcpka/b.vfmt.s - Vectorial FP Cast-and-Pack from 2x FP32, lowest 4 entries
+                                    imm_select        = SIMM;      // rd into result field (upper bits don't matter)
+                                    if (~RVF) illegal_instr = 1'b1; // if we don't support RVF, we can't cast from FP32
+                                    // check destination format
+                                    unique case (instr.rvftype.vfmt)
+                                        // Only process instruction if corresponding extension is active and FLEN suffices (static)
+                                        2'b00: begin
+                                            if (~RVFVEC)            illegal_instr = 1'b1; // destination vector not supported
+                                            if (instr.rvftype.repl) illegal_instr = 1'b1; // no entries 2/3 in vector of 2 fp32
+                                        end
+                                        2'b01: begin
+                                            if (~XF16ALTVEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        2'b10: begin
+                                            if (~XF16VEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        2'b11: begin
+                                            if (~XF8VEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                5'b11001 : begin
+                                    instruction_o.op  = VFCPKCD_S; // vfcpkc/d.vfmt.s - Vectorial FP Cast-and-Pack from 2x FP32, second 4 entries
+                                    imm_select        = SIMM;      // rd into result field (upper bits don't matter)
+                                    if (~RVF) illegal_instr = 1'b1; // if we don't support RVF, we can't cast from FP32
+                                    // check destination format
+                                    unique case (instr.rvftype.vfmt)
+                                        // Only process instruction if corresponding extension is active and FLEN suffices (static)
+                                        2'b00: illegal_instr = 1'b1; // no entries 4-7 in vector of 2 FP32
+                                        2'b01: illegal_instr = 1'b1; // no entries 4-7 in vector of 4 FP16ALT
+                                        2'b10: illegal_instr = 1'b1; // no entries 4-7 in vector of 4 FP16
+                                        2'b11: begin
+                                            if (~XF8VEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                5'b11010 : begin
+                                    instruction_o.op  = VFCPKAB_D; // vfcpka/b.vfmt.d - Vectorial FP Cast-and-Pack from 2x FP64, lowest 4 entries
+                                    imm_select        = SIMM;      // rd into result field (upper bits don't matter)
+                                    if (~RVD) illegal_instr = 1'b1; // if we don't support RVD, we can't cast from FP64
+                                    // check destination format
+                                    unique case (instr.rvftype.vfmt)
+                                        // Only process instruction if corresponding extension is active and FLEN suffices (static)
+                                        2'b00: begin
+                                            if (~RVFVEC)            illegal_instr = 1'b1; // destination vector not supported
+                                            if (instr.rvftype.repl) illegal_instr = 1'b1; // no entries 2/3 in vector of 2 fp32
+                                        end
+                                        2'b01: begin
+                                            if (~XF16ALTVEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        2'b10: begin
+                                            if (~XF16VEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        2'b11: begin
+                                            if (~XF8VEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                5'b11011 : begin
+                                    instruction_o.op  = VFCPKCD_D; // vfcpkc/d.vfmt.d - Vectorial FP Cast-and-Pack from 2x FP64, second 4 entries
+                                    imm_select        = SIMM;      // rd into result field (upper bits don't matter)
+                                    if (~RVD) illegal_instr = 1'b1; // if we don't support RVD, we can't cast from FP64
+                                    // check destination format
+                                    unique case (instr.rvftype.vfmt)
+                                        // Only process instruction if corresponding extension is active and FLEN suffices (static)
+                                        2'b00: illegal_instr = 1'b1; // no entries 4-7 in vector of 2 FP32
+                                        2'b01: illegal_instr = 1'b1; // no entries 4-7 in vector of 4 FP16ALT
+                                        2'b10: illegal_instr = 1'b1; // no entries 4-7 in vector of 4 FP16
+                                        2'b11: begin
+                                            if (~XF8VEC) illegal_instr = 1'b1; // destination vector not supported
+                                        end
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                default : illegal_instr = 1'b1;
+                            endcase
+
+                            // check format
+                            unique case (instr.rvftype.vfmt)
+                                // Only process instruction if corresponding extension is active (static)
+                                2'b00: if (~RVFVEC)     illegal_instr = 1'b1;
+                                2'b01: if (~XF16ALTVEC) illegal_instr = 1'b1;
+                                2'b10: if (~XF16VEC)    illegal_instr = 1'b1;
+                                2'b11: if (~XF8VEC)     illegal_instr = 1'b1;
+                                default: illegal_instr = 1'b1;
+                            endcase
+
+                            // check disallowed replication
+                            if (~allow_replication & instr.rvftype.repl) illegal_instr = 1'b1;
+
+                            // check rounding mode
+                            if (check_fprm) begin
+                                unique case (frm_i) inside // actual rounding mode from frm csr
+                                    [3'b000:3'b100]: ; //legal rounding modes
+                                    default : illegal_instr = 1'b1;
+                                endcase
+                            end
+
+                        end else begin // No vectorial FP enabled (static)
                             illegal_instr = 1'b1;
                         end
-                    endcase
+
+                    // ---------------------------
+                    // Integer Reg-Reg Operations
+                    // ---------------------------
+                    end else begin
+                        instruction_o.fu  = (instr.rtype.funct7 == 7'b000_0001) ? MULT : ALU;
+                        instruction_o.rs1 = instr.rtype.rs1;
+                        instruction_o.rs2 = instr.rtype.rs2;
+                        instruction_o.rd  = instr.rtype.rd;
+
+                        unique case ({instr.rtype.funct7, instr.rtype.funct3})
+                            {7'b000_0000, 3'b000}: instruction_o.op = ADD;   // Add
+                            {7'b010_0000, 3'b000}: instruction_o.op = SUB;   // Sub
+                            {7'b000_0000, 3'b010}: instruction_o.op = SLTS;  // Set Lower Than
+                            {7'b000_0000, 3'b011}: instruction_o.op = SLTU;  // Set Lower Than Unsigned
+                            {7'b000_0000, 3'b100}: instruction_o.op = XORL;  // Xor
+                            {7'b000_0000, 3'b110}: instruction_o.op = ORL;   // Or
+                            {7'b000_0000, 3'b111}: instruction_o.op = ANDL;  // And
+                            {7'b000_0000, 3'b001}: instruction_o.op = SLL;   // Shift Left Logical
+                            {7'b000_0000, 3'b101}: instruction_o.op = SRL;   // Shift Right Logical
+                            {7'b010_0000, 3'b101}: instruction_o.op = SRA;   // Shift Right Arithmetic
+                            // Multiplications
+                            {7'b000_0001, 3'b000}: instruction_o.op = MUL;
+                            {7'b000_0001, 3'b001}: instruction_o.op = MULH;
+                            {7'b000_0001, 3'b010}: instruction_o.op = MULHSU;
+                            {7'b000_0001, 3'b011}: instruction_o.op = MULHU;
+                            {7'b000_0001, 3'b100}: instruction_o.op = DIV;
+                            {7'b000_0001, 3'b101}: instruction_o.op = DIVU;
+                            {7'b000_0001, 3'b110}: instruction_o.op = REM;
+                            {7'b000_0001, 3'b111}: instruction_o.op = REMU;
+                            default: begin
+                                illegal_instr = 1'b1;
+                            end
+                        endcase
+                    end
                 end
 
                 // --------------------------
@@ -293,7 +532,7 @@ module decoder (
                 // --------------------------------
                 // Reg-Immediate Operations
                 // --------------------------------
-                riscv::OpcodeOpimm: begin
+                riscv::OpcodeOpImm: begin
                     instruction_o.fu  = ALU;
                     imm_select = IIMM;
                     instruction_o.rs1[4:0] = instr.itype.rs1;
@@ -327,7 +566,7 @@ module decoder (
                 // --------------------------------
                 // 32 bit Reg-Immediate Operations
                 // --------------------------------
-                riscv::OpcodeOpimm32: begin
+                riscv::OpcodeOpImm32: begin
                     instruction_o.fu  = ALU;
                     imm_select = IIMM;
                     instruction_o.rs1[4:0] = instr.itype.rs1;
@@ -390,6 +629,264 @@ module decoder (
                     endcase
                 end
 
+                // --------------------------------
+                // Floating-Point Load/store
+                // --------------------------------
+                riscv::OpcodeStoreFp: begin
+                    if (FP_PRESENT && fs_i != riscv::Off) begin // only generate decoder if FP extensions are enabled (static)
+                        instruction_o.fu  = STORE;
+                        imm_select = SIMM;
+                        instruction_o.rs1        = instr.stype.rs1;
+                        instruction_o.rs2        = instr.stype.rs2;
+                        // determine store size
+                        unique case (instr.stype.funct3)
+                            // Only process instruction if corresponding extension is active (static)
+                            3'b000: if (XF8) instruction_o.op = FSB;
+                                    else illegal_instr = 1'b1;
+                            3'b001: if (XF16 | XF16ALT) instruction_o.op = FSH;
+                                    else illegal_instr = 1'b1;
+                            3'b010: if (RVF) instruction_o.op = FSW;
+                                    else illegal_instr = 1'b1;
+                            3'b011: if (RVD) instruction_o.op = FSD;
+                                    else illegal_instr = 1'b1;
+                            default: illegal_instr = 1'b1;
+                        endcase
+                    end else
+                        illegal_instr = 1'b1;
+                end
+
+                riscv::OpcodeLoadFp: begin
+                    if (FP_PRESENT && fs_i != riscv::Off) begin // only generate decoder if FP extensions are enabled (static)
+                        instruction_o.fu  = LOAD;
+                        imm_select = IIMM;
+                        instruction_o.rs1       = instr.itype.rs1;
+                        instruction_o.rd        = instr.itype.rd;
+                        // determine load size
+                        unique case (instr.itype.funct3)
+                            // Only process instruction if corresponding extension is active (static)
+                            3'b000: if (XF8) instruction_o.op = FLB;
+                                    else illegal_instr = 1'b1;
+                            3'b001: if (XF16 | XF16ALT) instruction_o.op = FLH;
+                                    else illegal_instr = 1'b1;
+                            3'b010: if (RVF) instruction_o.op  = FLW;
+                                    else illegal_instr = 1'b1;
+                            3'b011: if (RVD) instruction_o.op  = FLD;
+                                    else illegal_instr = 1'b1;
+                            default: illegal_instr = 1'b1;
+                        endcase
+                    end else
+                        illegal_instr = 1'b1;
+                end
+
+                // ----------------------------------
+                // Floating-Point Reg-Reg Operations
+                // ----------------------------------
+                riscv::OpcodeMadd,
+                riscv::OpcodeMsub,
+                riscv::OpcodeNmsub,
+                riscv::OpcodeNmadd: begin
+                    if (FP_PRESENT && fs_i != riscv::Off) begin // only generate decoder if FP extensions are enabled (static)
+                        instruction_o.fu  = FPU;
+                        instruction_o.rs1 = instr.r4type.rs1;
+                        instruction_o.rs2 = instr.r4type.rs2;
+                        instruction_o.rd  = instr.r4type.rd;
+                        imm_select        = RS3; // rs3 into result field
+                        check_fprm        = 1'b1;
+                        // select the correct fused operation
+                        unique case (instr.r4type.opcode)
+                            default:      instruction_o.op = FMADD;  // fmadd.fmt - FP Fused multiply-add
+                            riscv::OpcodeMsub:  instruction_o.op = FMSUB;  // fmsub.fmt - FP Fused multiply-subtract
+                            riscv::OpcodeNmsub: instruction_o.op = FNMSUB; // fnmsub.fmt - FP Negated fused multiply-subtract
+                            riscv::OpcodeNmadd: instruction_o.op = FNMADD; // fnmadd.fmt - FP Negated fused multiply-add
+                        endcase
+
+                        // determine fp format
+                        unique case (instr.r4type.funct2)
+                            // Only process instruction if corresponding extension is active (static)
+                            2'b00: if (~RVF)             illegal_instr = 1'b1;
+                            2'b01: if (~RVD)             illegal_instr = 1'b1;
+                            2'b10: if (~XF16 & ~XF16ALT) illegal_instr = 1'b1;
+                            2'b11: if (~XF8)             illegal_instr = 1'b1;
+                            default: illegal_instr = 1'b1;
+                        endcase
+
+                        // check rounding mode
+                        if (check_fprm) begin
+                            unique case (instr.rftype.rm) inside
+                                [3'b000:3'b100]: ; //legal rounding modes
+                                3'b101: begin      // Alternative Half-Precision encoded as fmt=10 and rm=101
+                                    if (~XF16ALT || instr.rftype.fmt != 2'b10)
+                                        illegal_instr = 1'b1;
+                                    unique case (frm_i) inside // actual rounding mode from frm csr
+                                        [3'b000:3'b100]: ; //legal rounding modes
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                3'b111: begin
+                                    // rounding mode from frm csr
+                                    unique case (frm_i) inside
+                                        [3'b000:3'b100]: ; //legal rounding modes
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                default : illegal_instr = 1'b1;
+                            endcase
+                        end
+                    end else begin
+                        illegal_instr = 1'b1;
+                    end
+                end
+
+                riscv::OpcodeOpFp: begin
+                    if (FP_PRESENT && fs_i != riscv::Off) begin // only generate decoder if FP extensions are enabled (static)
+                        instruction_o.fu  = FPU;
+                        instruction_o.rs1 = instr.rftype.rs1;
+                        instruction_o.rs2 = instr.rftype.rs2;
+                        instruction_o.rd  = instr.rftype.rd;
+                        check_fprm        = 1'b1;
+                        // decode FP instruction
+                        unique case (instr.rftype.funct5)
+                            5'b00000: begin
+                                instruction_o.op  = FADD;             // fadd.fmt - FP Addition
+                                instruction_o.rs1 = '0;               // Operand A is set to 0
+                                instruction_o.rs2 = instr.rftype.rs1; // Operand B is set to rs1
+                                imm_select        = IIMM;             // Operand C is set to rs2
+                            end
+                            5'b00001: begin
+                                instruction_o.op  = FSUB;  // fsub.fmt - FP Subtraction
+                                instruction_o.rs1 = '0;               // Operand A is set to 0
+                                instruction_o.rs2 = instr.rftype.rs1; // Operand B is set to rs1
+                                imm_select        = IIMM;             // Operand C is set to rs2
+                            end
+                            5'b00010: instruction_o.op = FMUL;  // fmul.fmt - FP Multiplication
+                            5'b00011: instruction_o.op = FDIV;  // fdiv.fmt - FP Division
+                            5'b01011: begin
+                                instruction_o.op = FSQRT; // fsqrt.fmt - FP Square Root
+                                // rs2 must be zero
+                                if (instr.rftype.rs2 != 5'b00000) illegal_instr = 1'b1;
+                            end
+                            5'b00100: begin
+                                instruction_o.op = FSGNJ; // fsgn{j[n]/jx}.fmt - FP Sign Injection
+                                check_fprm       = 1'b0;  // instruction encoded in rm, do the check here
+                                if (XF16ALT) begin        // FP16ALT instructions encoded in rm separately (static)
+                                    if (!(instr.rftype.rm inside {[3'b000:3'b010], [3'b100:3'b110]}))
+                                        illegal_instr = 1'b1;
+                                end else begin
+                                    if (!(instr.rftype.rm inside {[3'b000:3'b010]}))
+                                        illegal_instr = 1'b1;
+                                end
+                            end
+                            5'b00101: begin
+                                instruction_o.op = FMIN_MAX; // fmin/fmax.fmt - FP Minimum / Maximum
+                                check_fprm       = 1'b0;     // instruction encoded in rm, do the check here
+                                if (XF16ALT) begin           // FP16ALT instructions encoded in rm separately (static)
+                                    if (!(instr.rftype.rm inside {[3'b000:3'b001], [3'b100:3'b101]}))
+                                        illegal_instr = 1'b1;
+                                end else begin
+                                    if (!(instr.rftype.rm inside {[3'b000:3'b001]}))
+                                        illegal_instr = 1'b1;
+                                end
+                            end
+                            5'b01000: begin
+                                instruction_o.op  = FCVT_F2F; // fcvt.fmt.fmt - FP to FP Conversion
+                                instruction_o.rs2 = instr.rvftype.rs1; // tie rs2 to rs1 to be safe (vectors use rs2)
+                                imm_select        = IIMM;     // rs2 holds part of the instruction
+                                if (instr.rftype.rs2[24:23]) illegal_instr = 1'b1; // bits [22:20] used, other bits must be 0
+                                // check source format
+                                unique case (instr.rftype.rs2[22:20])
+                                    // Only process instruction if corresponding extension is active (static)
+                                    3'b000: if (~RVF)     illegal_instr = 1'b1;
+                                    3'b001: if (~RVD)     illegal_instr = 1'b1;
+                                    3'b010: if (~XF16)    illegal_instr = 1'b1;
+                                    3'b110: if (~XF16ALT) illegal_instr = 1'b1;
+                                    3'b011: if (~XF8)     illegal_instr = 1'b1;
+                                    default: illegal_instr = 1'b1;
+                                endcase
+                            end
+                            5'b10100: begin
+                                instruction_o.op = FCMP; // feq/flt/fle.fmt - FP Comparisons
+                                check_fprm       = 1'b0; // instruction encoded in rm, do the check here
+                                if (XF16ALT) begin       // FP16ALT instructions encoded in rm separately (static)
+                                    if (!(instr.rftype.rm inside {[3'b000:3'b010], [3'b100:3'b110]}))
+                                        illegal_instr = 1'b1;
+                                end else begin
+                                    if (!(instr.rftype.rm inside {[3'b000:3'b010]}))
+                                        illegal_instr = 1'b1;
+                                end
+                            end
+                            5'b11000: begin
+                                instruction_o.op = FCVT_F2I; // fcvt.ifmt.fmt - FP to Int Conversion
+                                imm_select       = IIMM;     // rs2 holds part of the instruction
+                                if (instr.rftype.rs2[24:22]) illegal_instr = 1'b1; // bits [21:20] used, other bits must be 0
+                            end
+                            5'b11010: begin
+                                instruction_o.op = FCVT_I2F;  // fcvt.fmt.ifmt - Int to FP Conversion
+                                imm_select       = IIMM;     // rs2 holds part of the instruction
+                                if (instr.rftype.rs2[24:22]) illegal_instr = 1'b1; // bits [21:20] used, other bits must be 0
+                            end
+                            5'b11100: begin
+                                instruction_o.rs2 = instr.rftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit
+                                check_fprm        = 1'b0; // instruction encoded in rm, do the check here
+                                if (instr.rftype.rm == 3'b000 || (XF16ALT && instr.rftype.rm == 3'b100)) // FP16ALT has separate encoding
+                                    instruction_o.op = FMV_F2X;       // fmv.ifmt.fmt - FPR to GPR Move
+                                else if (instr.rftype.rm == 3'b001 || (XF16ALT && instr.rftype.rm == 3'b101)) // FP16ALT has separate encoding
+                                    instruction_o.op = FCLASS; // fclass.fmt - FP Classify
+                                else illegal_instr = 1'b1;
+                                // rs2 must be zero
+                                if (instr.rftype.rs2 != 5'b00000) illegal_instr = 1'b1;
+                            end
+                            5'b11110: begin
+                                instruction_o.op = FMV_X2F;   // fmv.fmt.ifmt - GPR to FPR Move
+                                instruction_o.rs2 = instr.rftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit
+                                check_fprm       = 1'b0; // instruction encoded in rm, do the check here
+                                if (!(instr.rftype.rm == 3'b000 || (XF16ALT && instr.rftype.rm == 3'b100)))
+                                    illegal_instr = 1'b1;
+                                // rs2 must be zero
+                                if (instr.rftype.rs2 != 5'b00000) illegal_instr = 1'b1;
+                            end
+                            default : illegal_instr = 1'b1;
+                        endcase
+
+                        // check format
+                        unique case (instr.rftype.fmt)
+                            // Only process instruction if corresponding extension is active (static)
+                            2'b00: if (~RVF)             illegal_instr = 1'b1;
+                            2'b01: if (~RVD)             illegal_instr = 1'b1;
+                            2'b10: if (~XF16 & ~XF16ALT) illegal_instr = 1'b1;
+                            2'b11: if (~XF8)             illegal_instr = 1'b1;
+                            default: illegal_instr = 1'b1;
+                        endcase
+
+                        // check rounding mode
+                        if (check_fprm) begin
+                            unique case (instr.rftype.rm) inside
+                                [3'b000:3'b100]: ; //legal rounding modes
+                                3'b101: begin      // Alternative Half-Precision encoded as fmt=10 and rm=101
+                                    if (~XF16ALT || instr.rftype.fmt != 2'b10)
+                                        illegal_instr = 1'b1;
+                                    unique case (frm_i) inside // actual rounding mode from frm csr
+                                        [3'b000:3'b100]: ; //legal rounding modes
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                3'b111: begin
+                                    // rounding mode from frm csr
+                                    unique case (frm_i) inside
+                                        [3'b000:3'b100]: ; //legal rounding modes
+                                        default : illegal_instr = 1'b1;
+                                    endcase
+                                end
+                                default : illegal_instr = 1'b1;
+                            endcase
+                        end
+                    end else begin
+                        illegal_instr = 1'b1;
+                    end
+                end
+
+                // ----------------------------------
+                // Atomic Operations
+                // ----------------------------------
                 riscv::OpcodeAmo: begin
                     // we are going to use the load unit for AMOs
                     instruction_o.fu  = STORE;
@@ -398,7 +895,7 @@ module decoder (
                     instruction_o.rd[4:0]  = instr.atype.rd;
                     // TODO(zarubaf): Ordering
                     // words
-                    if (instr.stype.funct3 == 3'h2) begin
+                    if (RVA && instr.stype.funct3 == 3'h2) begin
                         unique case (instr.instr[31:27])
                             5'h0:  instruction_o.op = AMO_ADDW;
                             5'h1:  instruction_o.op = AMO_SWAPW;
@@ -417,7 +914,7 @@ module decoder (
                             default: illegal_instr = 1'b1;
                         endcase
                     // double words
-                    end else if (instr.stype.funct3 == 3'h3) begin
+                    end else if (RVA && instr.stype.funct3 == 3'h3) begin
                         unique case (instr.instr[31:27])
                             5'h0:  instruction_o.op = AMO_ADDD;
                             5'h1:  instruction_o.op = AMO_SWAPD;
@@ -500,6 +997,7 @@ module decoder (
             endcase
         end
     end
+
     // --------------------------------
     // Sign extend immediate
     // --------------------------------
@@ -511,7 +1009,7 @@ module decoder (
         imm_uj_type = uj_imm(instruction_i);
         imm_bi_type = { {59{instruction_i[24]}}, instruction_i[24:20] };
 
-        // NOIMM, IIMM, SIMM, BIMM, UIMM, JIMM
+        // NOIMM, IIMM, SIMM, BIMM, UIMM, JIMM, RS3
         // select immediate
         case (imm_select)
             IIMM: begin
@@ -534,6 +1032,11 @@ module decoder (
                 instruction_o.result = imm_uj_type;
                 instruction_o.use_imm = 1'b1;
             end
+            RS3: begin
+                // result holds address of fp operand rs3
+                instruction_o.result = {59'b0, instr.r4type.rs3};
+                instruction_o.use_imm = 1'b0;
+            end
             default: begin
                 instruction_o.result = 64'b0;
                 instruction_o.use_imm = 1'b0;
diff --git a/src/ex_stage.sv b/src/ex_stage.sv
index ac14f98ca..acb41cc46 100644
--- a/src/ex_stage.sv
+++ b/src/ex_stage.sv
@@ -16,8 +16,8 @@
 import ariane_pkg::*;
 
 module ex_stage #(
-        parameter int          ASID_WIDTH       = 1
-    ) (
+    parameter int          ASID_WIDTH       = 1
+)(
     input  logic                                   clk_i,    // Clock
     input  logic                                   rst_ni,   // Asynchronous reset active low
     input  logic                                   flush_i,
@@ -37,17 +37,16 @@ module ex_stage #(
     output logic                                   alu_valid_o,           // ALU result is valid
     output logic [63:0]                            alu_result_o,
     output logic [TRANS_ID_BITS-1:0]               alu_trans_id_o,        // ID of scoreboard entry at which to write back
+    output exception_t                             alu_exception_o,
     // Branches and Jumps
-    output logic                                   branch_ready_o,
     input  logic                                   branch_valid_i,        // we are using the branch unit
-    output logic                                   branch_valid_o,        // the calculated branch target is valid
-    output logic [63:0]                            branch_result_o,       // branch target address out
-    input  branchpredict_sbe_t                     branch_predict_i,      // branch prediction in
-    output logic [TRANS_ID_BITS-1:0]               branch_trans_id_o,
-    output exception_t                             branch_exception_o,    // branch unit detected an exception
-
+    input  branchpredict_sbe_t                     branch_predict_i,
     output branchpredict_t                         resolved_branch_o,     // the branch engine uses the write back from the ALU
     output logic                                   resolve_branch_o,      // to ID signaling that we resolved the branch
+    // CSR
+    input  logic                                   csr_valid_i,
+    output logic [11:0]                            csr_addr_o,
+    input  logic                                   csr_commit_i,
     // LSU
     output logic                                   lsu_ready_o,           // FU is ready
     input  logic                                   lsu_valid_i,           // Input is valid
@@ -59,20 +58,23 @@ module ex_stage #(
     output exception_t                             lsu_exception_o,
     output logic                                   no_st_pending_o,
     input  logic                                   amo_valid_commit_i,
-    // CSR
-    output logic                                   csr_ready_o,
-    input  logic                                   csr_valid_i,
-    output logic [TRANS_ID_BITS-1:0]               csr_trans_id_o,
-    output logic [63:0]                            csr_result_o,
-    output logic                                   csr_valid_o,
-    output logic [11:0]                            csr_addr_o,
-    input  logic                                   csr_commit_i,
     // MULT
     output logic                                   mult_ready_o,      // FU is ready
     input  logic                                   mult_valid_i,      // Output is valid
     output logic [TRANS_ID_BITS-1:0]               mult_trans_id_o,
     output logic [63:0]                            mult_result_o,
     output logic                                   mult_valid_o,
+    // FPU
+    output logic                                   fpu_ready_o,      // FU is ready
+    input  logic                                   fpu_valid_i,      // Input is valid
+    input  logic [1:0]                             fpu_fmt_i,        // FP format
+    input  logic [2:0]                             fpu_rm_i,         // FP rm
+    input  logic [2:0]                             fpu_frm_i,        // FP frm csr
+    input  logic [6:0]                             fpu_prec_i,       // FP precision control
+    output logic [TRANS_ID_BITS-1:0]               fpu_trans_id_o,
+    output logic [63:0]                            fpu_result_o,
+    output logic                                   fpu_valid_o,
+    output exception_t                             fpu_exception_o,
 
     // Memory Management
     input  logic                                   enable_translation_i,
@@ -104,50 +106,153 @@ module ex_stage #(
     // -----
     // ALU
     // -----
-    alu alu_i (
-        .result_o            ( alu_result_o                 ),
-        .alu_branch_res_o    ( alu_branch_res               ),
-        .*
-    );
+    fu_data_t alu_data;
+    assign alu_data.operator  = (alu_valid_i | branch_valid_i | csr_valid_i) ? operator_i  : ADD;
+    assign alu_data.operand_a = (alu_valid_i | branch_valid_i | csr_valid_i) ? operand_a_i : '0;
+    assign alu_data.operand_b = (alu_valid_i | branch_valid_i | csr_valid_i) ? operand_b_i : '0;
+    assign alu_data.imm       = (alu_valid_i | branch_valid_i | csr_valid_i) ? imm_i : '0;
 
-    // --------------------
-    // Branch Engine
-    // --------------------
-    branch_unit branch_unit_i (
-        .fu_valid_i          ( alu_valid_i || lsu_valid_i || csr_valid_i || mult_valid_i), // any functional unit is valid, check that there is no accidental mis-predict
-        .branch_comp_res_i   ( alu_branch_res ),
-        .*
+    // fixed latency FUs
+    // TODO(zarubaf) Re-name this module and re-factor ALU
+    alu alu_i (
+        .clk_i,
+        .rst_ni,
+        .flush_i,
+        .pc_i,
+        .trans_id_i,
+        .alu_valid_i,
+        .branch_valid_i,
+        .csr_valid_i      ( csr_valid_i        ),
+        .operator_i       ( alu_data.operator  ),
+        .operand_a_i      ( alu_data.operand_a ),
+        .operand_b_i      ( alu_data.operand_b ),
+        .imm_i            ( alu_data.imm       ),
+        .result_o         ( alu_result_o       ),
+        .alu_valid_o,
+        .alu_ready_o,
+        .alu_trans_id_o,
+        .alu_exception_o,
+
+        .fu_valid_i       ( alu_valid_i || lsu_valid_i || csr_valid_i || mult_valid_i || fpu_valid_i ),
+        .is_compressed_instr_i,
+        .branch_predict_i,
+        .resolved_branch_o,
+        .resolve_branch_o,
+
+        .commit_i        ( csr_commit_i ),
+        .csr_addr_o      ( csr_addr_o   )
     );
 
     // ----------------
     // Multiplication
     // ----------------
+    fu_data_t mult_data;
+    assign mult_data.operator  = mult_valid_i ? operator_i  : MUL;
+    assign mult_data.operand_a = mult_valid_i ? operand_a_i : '0;
+    assign mult_data.operand_b = mult_valid_i ? operand_b_i : '0;
+
     mult i_mult (
-        .result_o ( mult_result_o ),
-        .*
+        .clk_i,
+        .rst_ni,
+        .flush_i,
+        .trans_id_i,
+        .mult_valid_i,
+        .operator_i      ( mult_data.operator  ),
+        .operand_a_i     ( mult_data.operand_a ),
+        .operand_b_i     ( mult_data.operand_b ),
+        .result_o        ( mult_result_o       ),
+        .mult_valid_o,
+        .mult_ready_o,
+        .mult_trans_id_o
     );
 
+    // ----------------
+    // FPU
+    // ----------------
+    generate // the FPU wrapper is only instantiated when FP_PRESENT is set
+        if (FP_PRESENT) begin : fpu_gen
+            fu_data_t fpu_data; // FPU inputs: forced to FSGNJ / all-zero operands while fpu_valid_i is low
+            assign fpu_data.operator  = fpu_valid_i ? operator_i  : FSGNJ;
+            assign fpu_data.operand_a = fpu_valid_i ? operand_a_i : '0;
+            assign fpu_data.operand_b = fpu_valid_i ? operand_b_i : '0;
+            assign fpu_data.imm       = fpu_valid_i ? imm_i       : '0;
+
+            fpu_wrap fpu_i (
+                .clk_i,
+                .rst_ni,
+                .flush_i,
+                .trans_id_i,
+                .fu_i,
+                .fpu_valid_i,
+                .fpu_ready_o,
+                .operator_i      ( fpu_data.operator            ),
+                .operand_a_i     ( fpu_data.operand_a[FLEN-1:0] ), // operands are truncated to FLEN bits
+                .operand_b_i     ( fpu_data.operand_b[FLEN-1:0] ),
+                .operand_c_i     ( fpu_data.imm[FLEN-1:0]       ), // the third operand travels in the imm field
+                .fpu_fmt_i,
+                .fpu_rm_i,
+                .fpu_frm_i,
+                .fpu_prec_i,
+                .fpu_trans_id_o,
+                .result_o        ( fpu_result_o ),
+                .fpu_valid_o,
+                .fpu_exception_o
+            );
+        end else begin : no_fpu_gen // no FPU: tie off every FPU output (never ready, never valid)
+            assign fpu_ready_o     = '0;
+            assign fpu_trans_id_o  = '0;
+            assign fpu_result_o    = '0;
+            assign fpu_valid_o     = '0;
+            assign fpu_exception_o = '0;
+        end
+    endgenerate
+
     // ----------------
     // Load-Store Unit
     // ----------------
+    fu_data_t lsu_data;
+    assign lsu_data.operator  = lsu_valid_i ? operator_i  : LD;
+    assign lsu_data.operand_a = lsu_valid_i ? operand_a_i : '0;
+    assign lsu_data.operand_b = lsu_valid_i ? operand_b_i : '0;
+    assign lsu_data.imm       = lsu_valid_i ? imm_i       : '0;
+
     lsu lsu_i (
-        .commit_i           ( lsu_commit_i       ),
-        .commit_ready_o     ( lsu_commit_ready_o ),
-        .dcache_req_ports_i,
-        .dcache_req_ports_o,
-        .amo_req_o,
-        .amo_resp_i,
-        .*
+        .clk_i                                         ,
+        .rst_ni                                        ,
+        .flush_i                                       ,
+        .no_st_pending_o                               ,
+        .fu_i                                          ,
+        .operator_i            (lsu_data.operator     ),
+        .operand_a_i           (lsu_data.operand_a    ),
+        .operand_b_i           (lsu_data.operand_b    ),
+        .imm_i                 (lsu_data.imm          ),
+        .lsu_ready_o                                   ,
+        .lsu_valid_i                                   ,
+        .trans_id_i                                    ,
+        .lsu_trans_id_o                                ,
+        .lsu_result_o                                  ,
+        .lsu_valid_o                                   ,
+        .commit_i              (lsu_commit_i          ),
+        .commit_ready_o        (lsu_commit_ready_o    ),
+        .enable_translation_i                          ,
+        .en_ld_st_translation_i                        ,
+        .icache_areq_i                                 ,
+        .icache_areq_o                                 ,
+        .priv_lvl_i                                    ,
+        .ld_st_priv_lvl_i                              ,
+        .sum_i                                         ,
+        .mxr_i                                         ,
+        .satp_ppn_i                                    ,
+        .asid_i                                        ,
+        .flush_tlb_i                                   ,
+        .itlb_miss_o                                   ,
+        .dtlb_miss_o                                   ,
+        .dcache_req_ports_i                            ,
+        .dcache_req_ports_o                            ,
+        .lsu_exception_o                               ,
+        .amo_valid_commit_i                            ,
+        .amo_req_o                                     ,
+        .amo_resp_i
     );
 
-    // -----
-    // CSR
-    // -----
-    // CSR address buffer
-    csr_buffer csr_buffer_i (
-        .commit_i ( csr_commit_i  ),
-        .*
-    );
-
-
 endmodule
diff --git a/src/fpu b/src/fpu
new file mode 160000
index 000000000..00e257917
--- /dev/null
+++ b/src/fpu
@@ -0,0 +1 @@
+Subproject commit 00e2579173f1412f06d4eb95d6b98d0eb1cd2e94
diff --git a/src/fpu_div_sqrt_mvp b/src/fpu_div_sqrt_mvp
new file mode 160000
index 000000000..3736c4c84
--- /dev/null
+++ b/src/fpu_div_sqrt_mvp
@@ -0,0 +1 @@
+Subproject commit 3736c4c844074bd64c3c505c017181db71b738b4
diff --git a/src/fpu_wrap.sv b/src/fpu_wrap.sv
new file mode 100644
index 000000000..eb2775b2e
--- /dev/null
+++ b/src/fpu_wrap.sv
@@ -0,0 +1,603 @@
+// Copyright 2018 ETH Zurich and University of Bologna.
+// Copyright and related rights are licensed under the Solderpad Hardware
+// License, Version 0.51 (the "License"); you may not use this file except in
+// compliance with the License.  You may obtain a copy of the License at
+// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
+// or agreed to in writing, software, hardware and materials distributed under
+// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+//
+// Author: Stefan Mach, ETH Zurich
+// Date: 12.04.2018
+// Description: Wrapper for the floating-point unit
+
+
+import ariane_pkg::*;
+
+module fpu_wrap ( // translates the core's FP operation encoding to FPnew inputs and adapts the issue handshake
+    input  logic                     clk_i,          // clock
+    input  logic                     rst_ni,         // asynchronous reset, active low
+    input  logic                     flush_i,        // returns the input FSM to READY, also forwarded to the FPU
+    input  logic [TRANS_ID_BITS-1:0] trans_id_i,     // tag sent through the FPU alongside the operation
+    input  fu_t                      fu_i,           // compared against FPU_VEC to detect vectorial operations
+    input  logic                     fpu_valid_i,    // an operation is offered this cycle
+    output logic                     fpu_ready_o,    // wrapper accepts a new operation
+    input  fu_op                     operator_i,     // operation, translated to the FPnew op/op_mod encoding
+    input  logic [FLEN-1:0]          operand_a_i,
+    input  logic [FLEN-1:0]          operand_b_i, // imm will be here unless used as operand
+    input  logic [FLEN-1:0]          operand_c_i, // imm will be here unless used as operand; casts decode its low bits
+    input  logic [1:0]               fpu_fmt_i,      // FP format select, decoded in input_translation
+    input  logic [2:0]               fpu_rm_i,       // rounding mode; also carries op / replication / AH encodings
+    input  logic [2:0]               fpu_frm_i,      // rounding mode used when fpu_rm_i is outside 000..100 or the op is vectorial
+    input  logic [6:0]               fpu_prec_i,     // forwarded unmodified to the FPU; NOTE(review): not captured by the hold register - confirm intended
+    output logic [TRANS_ID_BITS-1:0] fpu_trans_id_o, // tag returned by the FPU together with the result
+    output logic [FLEN-1:0]          result_o,       // FPU result
+    output logic                     fpu_valid_o,    // result is valid
+    output exception_t               fpu_exception_o // .cause holds the zero-extended status flags, .valid is always 0
+);
+
+    //-----------------------------------
+    // FPnew encoding from FPnew package
+    //-----------------------------------
+    localparam OPBITS  =  4;
+    localparam FMTBITS =  3;
+    localparam IFMTBITS = 2;
+
+    integer OP_NUMBITS, FMT_NUMBITS, IFMT_NUMBITS;
+
+    logic [OPBITS-1:0] OP_FMADD;
+    logic [OPBITS-1:0] OP_FNMSUB;
+    logic [OPBITS-1:0] OP_ADD;
+    logic [OPBITS-1:0] OP_MUL;
+    logic [OPBITS-1:0] OP_DIV;
+    logic [OPBITS-1:0] OP_SQRT;
+    logic [OPBITS-1:0] OP_SGNJ;
+    logic [OPBITS-1:0] OP_MINMAX;
+    logic [OPBITS-1:0] OP_CMP;
+    logic [OPBITS-1:0] OP_CLASS;
+    logic [OPBITS-1:0] OP_F2I;
+    logic [OPBITS-1:0] OP_I2F;
+    logic [OPBITS-1:0] OP_F2F;
+    logic [OPBITS-1:0] OP_CPKAB;
+    logic [OPBITS-1:0] OP_CPKCD;
+
+    logic [FMTBITS-1:0] FMT_FP32;
+    logic [FMTBITS-1:0] FMT_FP64;
+    logic [FMTBITS-1:0] FMT_FP16;
+    logic [FMTBITS-1:0] FMT_FP8;
+    logic [FMTBITS-1:0] FMT_FP16ALT;
+    logic [FMTBITS-1:0] FMT_CUST1;
+    logic [FMTBITS-1:0] FMT_CUST2;
+    logic [FMTBITS-1:0] FMT_CUST3;
+
+    logic [IFMTBITS-1:0] IFMT_INT8;
+    logic [IFMTBITS-1:0] IFMT_INT16;
+    logic [IFMTBITS-1:0] IFMT_INT32;
+    logic [IFMTBITS-1:0] IFMT_INT64;
+
+    // bind the constants from the fpnew entity
+    fpnew_pkg_constants i_fpnew_constants (
+        .OP_NUMBITS   ( OP_NUMBITS   ),
+        .OP_FMADD     ( OP_FMADD     ),
+        .OP_FNMSUB    ( OP_FNMSUB    ),
+        .OP_ADD       ( OP_ADD       ),
+        .OP_MUL       ( OP_MUL       ),
+        .OP_DIV       ( OP_DIV       ),
+        .OP_SQRT      ( OP_SQRT      ),
+        .OP_SGNJ      ( OP_SGNJ      ),
+        .OP_MINMAX    ( OP_MINMAX    ),
+        .OP_CMP       ( OP_CMP       ),
+        .OP_CLASS     ( OP_CLASS     ),
+        .OP_F2I       ( OP_F2I       ),
+        .OP_I2F       ( OP_I2F       ),
+        .OP_F2F       ( OP_F2F       ),
+        .OP_CPKAB     ( OP_CPKAB     ),
+        .OP_CPKCD     ( OP_CPKCD     ),
+        .FMT_NUMBITS  ( FMT_NUMBITS  ),
+        .FMT_FP32     ( FMT_FP32     ),
+        .FMT_FP64     ( FMT_FP64     ),
+        .FMT_FP16     ( FMT_FP16     ),
+        .FMT_FP8      ( FMT_FP8      ),
+        .FMT_FP16ALT  ( FMT_FP16ALT  ),
+        .FMT_CUST1    ( FMT_CUST1    ),
+        .FMT_CUST2    ( FMT_CUST2    ),
+        .FMT_CUST3    ( FMT_CUST3    ),
+        .IFMT_NUMBITS ( IFMT_NUMBITS ),
+        .IFMT_INT8    ( IFMT_INT8    ),
+        .IFMT_INT16   ( IFMT_INT16   ),
+        .IFMT_INT32   ( IFMT_INT32   ),
+        .IFMT_INT64   ( IFMT_INT64   )
+    );
+
+    // always_comb begin
+    //     assert (OPBITS >= OP_NUMBITS) else $error("OPBITS is smaller than %0d", OP_NUMBITS);
+    //     assert (FMTBITS >= FMT_NUMBITS) else $error("FMTBITS is smaller than %0d", FMT_NUMBITS);
+    //     assert (IFMTBITS >= IFMT_NUMBITS) else $error("IFMTBITS is smaller than %0d", IFMT_NUMBITS);
+    // end
+
+    //-------------------------------------------------
+    // Inputs to the FPU and protocol inversion buffer
+    //-------------------------------------------------
+    logic [FLEN-1:0]     operand_a_d,  operand_a_q,  operand_a;
+    logic [FLEN-1:0]     operand_b_d,  operand_b_q,  operand_b;
+    logic [FLEN-1:0]     operand_c_d,  operand_c_q,  operand_c;
+    logic [OPBITS-1:0]   fpu_op_d,     fpu_op_q,     fpu_op;
+    logic                fpu_op_mod_d, fpu_op_mod_q, fpu_op_mod;
+    logic [FMTBITS-1:0]  fpu_fmt_d,    fpu_fmt_q,    fpu_fmt;
+    logic [FMTBITS-1:0]  fpu_fmt2_d,   fpu_fmt2_q,   fpu_fmt2;
+    logic [IFMTBITS-1:0] fpu_ifmt_d,   fpu_ifmt_q,   fpu_ifmt;
+    logic [2:0]          fpu_rm_d,     fpu_rm_q,     fpu_rm;
+    logic                fpu_vec_op_d, fpu_vec_op_q, fpu_vec_op;
+
+    logic [TRANS_ID_BITS-1:0] fpu_tag_d, fpu_tag_q, fpu_tag;
+
+    logic fpu_in_ready, fpu_in_valid;
+    logic fpu_out_ready, fpu_out_valid;
+
+    logic [4:0] fpu_status;
+
+    // FSM to handle protocol inversion
+    enum logic {READY, STALL} state_q, state_d;
+    logic hold_inputs;
+    logic use_hold;
+
+    //-----------------------------
+    // Translate inputs
+    //-----------------------------
+
+    always_comb begin : input_translation
+
+        automatic logic vec_replication; // control honoring of replication flag
+        automatic logic replicate_c;     // replicate operand C instead of B (for ADD/SUB)
+        automatic logic check_ah;        // Decide for AH from RM field encoding
+
+        // Default Values
+        operand_a_d         = operand_a_i;
+        operand_b_d         = operand_b_i; // immediates come through this port unless used as operand
+        operand_c_d         = operand_c_i; // immediates come through this port unless used as operand
+        fpu_op_d            = OP_SGNJ; // sign injection by default
+        fpu_op_mod_d        = 1'b0;
+        fpu_fmt_d           = FMT_FP32;
+        fpu_fmt2_d          = FMT_FP32;
+        fpu_ifmt_d          = IFMT_INT32;
+        fpu_rm_d            = fpu_rm_i;
+        fpu_vec_op_d        = fu_i == FPU_VEC;
+        fpu_tag_d           = trans_id_i;
+        vec_replication     = fpu_rm_i[0]; // replication bit is sent via rm field
+        replicate_c         = 1'b0;
+        check_ah            = 1'b0; // whether set scalar AH encoding from MSB of rm_i
+
+        // Scalar Rounding Modes - some ops encode inside RM but use smaller range
+        if (!(fpu_rm_i inside {[3'b000:3'b100]}))
+            fpu_rm_d = fpu_frm_i;
+
+        // Vectorial ops always consult FRM
+        if (fpu_vec_op_d)
+            fpu_rm_d = fpu_frm_i;
+
+        // Formats
+        unique case (fpu_fmt_i)
+            // FP32
+            2'b00 : fpu_fmt_d = FMT_FP32;
+            // FP64 or FP16ALT (vectorial)
+            2'b01 : fpu_fmt_d = fpu_vec_op_d ? FMT_FP16ALT : FMT_FP64;
+            // FP16 or FP16ALT (scalar)
+            2'b10 : begin
+               if (!fpu_vec_op_d && fpu_rm_i==3'b101)
+                   fpu_fmt_d = FMT_FP16ALT;
+               else
+                   fpu_fmt_d = FMT_FP16;
+            end
+            // FP8
+            default : fpu_fmt_d = FMT_FP8;
+        endcase
+
+
+        // Operations (this can modify the rounding mode field and format!)
+        unique case (operator_i)
+            // Addition
+            FADD      : begin
+                fpu_op_d    = OP_ADD;
+                replicate_c = 1'b1; // second operand is in C
+            end
+            // Subtraction is modified ADD
+            FSUB      : begin
+                fpu_op_d     = OP_ADD;
+                fpu_op_mod_d = 1'b1;
+                replicate_c  = 1'b1; // second operand is in C
+            end
+            // Multiplication
+            FMUL      : fpu_op_d = OP_MUL;
+            // Division
+            FDIV      : fpu_op_d = OP_DIV;
+            // Min/Max - OP is encoded in rm (000-001)
+            FMIN_MAX  : begin
+                fpu_op_d = OP_MINMAX;
+                fpu_rm_d = {1'b0, fpu_rm_i[1:0]}; // mask out AH encoding bit
+                check_ah = 1'b1; // AH has RM MSB encoding
+            end
+            // Square Root
+            FSQRT     : fpu_op_d = OP_SQRT;
+            // Fused Multiply Add
+            FMADD     : fpu_op_d = OP_FMADD;
+            // Fused Multiply Subtract is modified FMADD
+            FMSUB     : begin
+                fpu_op_d     = OP_FMADD;
+                fpu_op_mod_d = 1'b1;
+            end
+            // Fused Negated Multiply Subtract
+            FNMSUB    : fpu_op_d = OP_FNMSUB;
+            // Fused Negated Multiply Add is modified FNMSUB
+            FNMADD    : begin
+                fpu_op_d     = OP_FNMSUB;
+                fpu_op_mod_d = 1'b1;
+            end
+            // Float to Int Cast - Op encoded in lowest two imm bits or rm
+            FCVT_F2I  : begin
+                fpu_op_d     = OP_F2I;
+                // Vectorial Ops encoded in R bit
+                if (fpu_vec_op_d) begin
+                    fpu_op_mod_d      = fpu_rm_i[0];
+                    vec_replication = 1'b0; // no replication, R bit used for op
+                    unique case (fpu_fmt_i)
+                        2'b00 : fpu_ifmt_d = IFMT_INT32;
+                        2'b01,
+                        2'b10 : fpu_ifmt_d = IFMT_INT16;
+                        2'b11 : fpu_ifmt_d = IFMT_INT8;
+                    endcase
+                // Scalar casts encoded in imm
+                end else begin
+                    fpu_op_mod_d = operand_c_i[0];
+                    if (operand_c_i[1])
+                        fpu_ifmt_d = IFMT_INT64;
+                    else
+                        fpu_ifmt_d = IFMT_INT32;
+                end
+            end
+            // Int to Float Cast - Op encoded in lowest two imm bits or rm
+            FCVT_I2F  : begin
+                fpu_op_d = OP_I2F;
+                // Vectorial Ops encoded in R bit
+                if (fpu_vec_op_d) begin
+                    fpu_op_mod_d      = fpu_rm_i[0];
+                    vec_replication = 1'b0; // no replication, R bit used for op
+                    unique case (fpu_fmt_i)
+                        2'b00 : fpu_ifmt_d = IFMT_INT32;
+                        2'b01,
+                        2'b10 : fpu_ifmt_d = IFMT_INT16;
+                        2'b11 : fpu_ifmt_d = IFMT_INT8;
+                    endcase
+                // Scalar casts encoded in imm
+                end else begin
+                    fpu_op_mod_d = operand_c_i[0];
+                    if (operand_c_i[1])
+                        fpu_ifmt_d = IFMT_INT64;
+                    else
+                        fpu_ifmt_d = IFMT_INT32;
+                end
+            end
+            // Float to Float Cast - Source format encoded in lowest two/three imm bits
+            FCVT_F2F  : begin
+                fpu_op_d = OP_F2F;
+                // Vectorial ops encoded in lowest two imm bits
+                if (fpu_vec_op_d) begin
+                    vec_replication = 1'b0; // no replication for casts (not needed)
+                    unique case (operand_c_i[1:0])
+                        2'b00: fpu_fmt2_d = FMT_FP32;
+                        2'b01: fpu_fmt2_d = FMT_FP16ALT;
+                        2'b10: fpu_fmt2_d = FMT_FP16;
+                        2'b11: fpu_fmt2_d = FMT_FP8;
+                    endcase
+                // Scalar ops encoded in lowest three imm bits
+                end else begin
+                    unique case (operand_c_i[2:0])
+                        3'b000: fpu_fmt2_d = FMT_FP32;
+                        3'b001: fpu_fmt2_d = FMT_FP64;
+                        3'b010: fpu_fmt2_d = FMT_FP16;
+                        3'b110: fpu_fmt2_d = FMT_FP16ALT;
+                        3'b011: fpu_fmt2_d = FMT_FP8;
+                    endcase
+                end
+            end
+            // Scalar Sign Injection - op encoded in rm (000-010)
+            FSGNJ     : begin
+                fpu_op_d    = OP_SGNJ;
+                fpu_rm_d = {1'b0, fpu_rm_i[1:0]}; // mask out AH encoding bit
+                check_ah = 1'b1; // AH has RM MSB encoding
+            end
+            // Move from FPR to GPR - mapped to SGNJ-passthrough since no recoding
+            FMV_F2X   : begin
+                fpu_op_d          = OP_SGNJ;
+                fpu_rm_d          = 3'b011; // passthrough without checking nan-box
+                fpu_op_mod_d      = 1'b1; // no NaN-Boxing
+                check_ah          = 1'b1; // AH has RM MSB encoding
+                vec_replication   = 1'b0; // no replication, we set second operand
+            end
+            // Move from GPR to FPR - mapped to NOP since no recoding
+            FMV_X2F   : begin
+                fpu_op_d          = OP_SGNJ;
+                fpu_rm_d          = 3'b011; // passthrough without checking nan-box
+                check_ah          = 1'b1; // AH has RM MSB encoding
+                vec_replication   = 1'b0; // no replication, we set second operand
+            end
+            // Scalar Comparisons - op encoded in rm (000-010)
+            FCMP      : begin
+                fpu_op_d = OP_CMP;
+                fpu_rm_d = {1'b0, fpu_rm_i[1:0]}; // mask out AH encoding bit
+                check_ah = 1'b1; // AH has RM MSB encoding
+            end
+            // Classification
+            FCLASS    : begin
+                fpu_op_d = OP_CLASS;
+                fpu_rm_d = {1'b0, fpu_rm_i[1:0]}; // mask out AH encoding bit - CLASS doesn't care anyways
+                check_ah = 1'b1; // AH has RM MSB encoding
+            end
+            // Vectorial Minimum - set up scalar encoding in rm
+            VFMIN     : begin
+                fpu_op_d = OP_MINMAX;
+                fpu_rm_d = 3'b000; // min
+            end
+            // Vectorial Maximum - set up scalar encoding in rm
+            VFMAX     : begin
+                fpu_op_d = OP_MINMAX;
+                fpu_rm_d = 3'b001; // max
+            end
+            // Vectorial Sign Injection - set up scalar encoding in rm
+            VFSGNJ    : begin
+                fpu_op_d = OP_SGNJ;
+                fpu_rm_d = 3'b000; // sgnj
+            end
+            // Vectorial Negated Sign Injection - set up scalar encoding in rm
+            VFSGNJN   : begin
+                fpu_op_d = OP_SGNJ;
+                fpu_rm_d = 3'b001; // sgnjn
+            end
+            // Vectorial Xored Sign Injection - set up scalar encoding in rm
+            VFSGNJX   : begin
+                fpu_op_d = OP_SGNJ;
+                fpu_rm_d = 3'b010; // sgnjx
+            end
+            // Vectorial Equals - set up scalar encoding in rm
+            VFEQ      : begin
+                fpu_op_d = OP_CMP;
+                fpu_rm_d = 3'b010; // eq
+            end
+            // Vectorial Not Equals - set up scalar encoding in rm
+            VFNE      : begin
+                fpu_op_d     = OP_CMP;
+                fpu_op_mod_d = 1'b1;   // invert output
+                fpu_rm_d     = 3'b010; // eq
+                end
+            // Vectorial Less Than - set up scalar encoding in rm
+            VFLT      : begin
+                fpu_op_d = OP_CMP;
+                fpu_rm_d = 3'b001; // lt
+            end
+            // Vectorial Greater or Equal - set up scalar encoding in rm
+            VFGE      : begin
+                fpu_op_d     = OP_CMP;
+                fpu_op_mod_d = 1'b1;   // invert output
+                fpu_rm_d     = 3'b001; // lt
+            end
+            // Vectorial Less or Equal - set up scalar encoding in rm
+            VFLE      : begin
+                fpu_op_d = OP_CMP;
+                fpu_rm_d = 3'b000; // le
+            end
+            // Vectorial Greater Than - set up scalar encoding in rm
+            VFGT      : begin
+                fpu_op_d     = OP_CMP;
+                fpu_op_mod_d = 1'b1;   // invert output
+                fpu_rm_d     = 3'b000; // le
+            end
+            // Vectorial Convert-and-Pack from FP32, lower 4 entries
+            VFCPKAB_S : begin
+                fpu_op_d        = OP_CPKAB;
+                fpu_op_mod_d    = fpu_rm_i[0]; // A/B selection from R bit
+                vec_replication = 1'b0;        // no replication, R bit used for op
+                fpu_fmt2_d      = FMT_FP32;    // Cast from FP32
+            end
+            // Vectorial Convert-and-Pack from FP32, upper 4 entries
+            VFCPKCD_S : begin
+                fpu_op_d        = OP_CPKCD;
+                fpu_op_mod_d    = fpu_rm_i[0]; // C/D selection from R bit
+                vec_replication = 1'b0;        // no replication, R bit used for op
+                fpu_fmt2_d      = FMT_FP32;    // Cast from FP32 (source format must match the _S op)
+            end
+            // Vectorial Convert-and-Pack from FP64, lower 4 entries (distinct _D op, not a second _S arm)
+            VFCPKAB_D : begin
+                fpu_op_d        = OP_CPKAB;
+                fpu_op_mod_d    = fpu_rm_i[0]; // A/B selection from R bit
+                vec_replication = 1'b0;        // no replication, R bit used for op
+                fpu_fmt2_d      = FMT_FP64;    // Cast from FP64
+            end
+            // Vectorial Convert-and-Pack from FP64, upper 4 entries (distinct _D op, not a second _S arm)
+            VFCPKCD_D : begin
+                fpu_op_d        = OP_CPKCD;
+                fpu_op_mod_d    = fpu_rm_i[0]; // C/D selection from R bit
+                vec_replication = 1'b0;        // no replication, R bit used for op
+                fpu_fmt2_d      = FMT_FP64;    // Cast from FP64
+            end
+
+            // No changes per default
+            default : ; //nothing
+        endcase
+
+        // Scalar AH encoding fixing
+        if (!fpu_vec_op_d && check_ah)
+            if (fpu_rm_i[2])
+                fpu_fmt_d = FMT_FP16ALT;
+
+        // Replication
+        if (fpu_vec_op_d && vec_replication) begin
+            if (replicate_c) begin
+                unique case (fpu_fmt_d)
+                    FMT_FP32    : operand_c_d = RVD ? {2{operand_c_i[31:0]}} : operand_c_i;
+                    FMT_FP16,
+                    FMT_FP16ALT : operand_c_d = RVD ? {4{operand_c_i[15:0]}} : {2{operand_c_i[15:0]}};
+                    FMT_FP8     : operand_c_d = RVD ? {8{operand_c_i[7:0]}}  : {4{operand_c_i[7:0]}};
+                endcase // fpu_fmt_d
+            end else begin
+                unique case (fpu_fmt_d)
+                    FMT_FP32    : operand_b_d = RVD ? {2{operand_b_i[31:0]}} : operand_b_i;
+                    FMT_FP16,
+                    FMT_FP16ALT : operand_b_d = RVD ? {4{operand_b_i[15:0]}} : {2{operand_b_i[15:0]}};
+                    FMT_FP8     : operand_b_d = RVD ? {8{operand_b_i[7:0]}}  : {4{operand_b_i[7:0]}};
+                endcase // fpu_fmt_d
+            end
+        end
+    end
+
+
+    //---------------------------------------------------------
+    // Upstream protocol inversion: InValid depends on InReady
+    //---------------------------------------------------------
+
+    always_comb begin : p_inputFSM // two-state handshake adapter between fpu_valid_i/fpu_ready_o and the FPU's InValid/InReady
+        // Default Values (overridden per state below)
+        fpu_ready_o  = 1'b0;
+        fpu_in_valid = 1'b0;
+        hold_inputs = 1'b0;    // hold register disabled
+        use_hold    = 1'b0;    // inputs go directly to unit
+        state_d     = state_q; // stay in the same state
+
+        // FSM
+        unique case (state_q)
+            // Default state, ready for instructions
+            READY : begin
+                fpu_ready_o  = 1'b1;        // Act as if FPU ready
+                fpu_in_valid = fpu_valid_i; // Forward input valid to FPU
+                // There is a transaction but the FPU can't handle it: park it in the hold register
+                if (fpu_valid_i & ~fpu_in_ready) begin
+                    fpu_ready_o = 1'b0;  // No token given to Issue
+                    hold_inputs = 1'b1;  // save inputs to the holding register
+                    state_d     = STALL; // stall future incoming requests
+                end
+            end
+            // We're stalling the upstream (ready=0) and replaying the held transaction
+            STALL : begin
+                fpu_in_valid = 1'b1; // we have data for the FPU
+                use_hold     = 1'b1; // the data comes from the hold reg
+                // Wait until it's consumed
+                if (fpu_in_ready) begin
+                    fpu_ready_o = 1'b1;  // Give a token to issue
+                    state_d     = READY; // accept future requests
+                end
+            end
+            // Default: emit default values
+            default : ;
+        endcase
+
+        // Flushing overrides the next state only; the outputs computed above are left unchanged this cycle
+        if (flush_i) begin
+            state_d      = READY; // a held transaction is dropped, since use_hold is only set in STALL
+        end
+
+    end
+
+    // Buffer register and FSM state holding
+    always_ff @(posedge clk_i or negedge rst_ni) begin : fp_hold_reg // FSM state plus one-entry buffer of translated FPU inputs
+        if(~rst_ni) begin // asynchronous active-low reset: FSM to READY, buffer cleared
+            state_q       <= READY;
+            operand_a_q   <= '0;
+            operand_b_q   <= '0;
+            operand_c_q   <= '0;
+            fpu_op_q      <= '0;
+            fpu_op_mod_q  <= '0;
+            fpu_fmt_q     <= '0;
+            fpu_fmt2_q    <= '0;
+            fpu_ifmt_q    <= '0;
+            fpu_rm_q      <= '0;
+            fpu_vec_op_q  <= '0;
+            fpu_tag_q     <= '0;
+        end else begin
+            state_q       <= state_d; // the state advances every cycle
+            // Hold register is only loaded when the FSM asserts hold_inputs, otherwise it keeps its value
+            if (hold_inputs) begin
+                operand_a_q   <= operand_a_d;
+                operand_b_q   <= operand_b_d;
+                operand_c_q   <= operand_c_d;
+                fpu_op_q      <= fpu_op_d;
+                fpu_op_mod_q  <= fpu_op_mod_d;
+                fpu_fmt_q     <= fpu_fmt_d;
+                fpu_fmt2_q    <= fpu_fmt2_d;
+                fpu_ifmt_q    <= fpu_ifmt_d;
+                fpu_rm_q      <= fpu_rm_d;
+                fpu_vec_op_q  <= fpu_vec_op_d;
+                fpu_tag_q     <= fpu_tag_d;
+            end
+        end
+    end
+
+    // Select FPU input data: the hold register while use_hold is set (STALL state), else the freshly translated inputs
+    assign operand_a  = use_hold ? operand_a_q  : operand_a_d;
+    assign operand_b  = use_hold ? operand_b_q  : operand_b_d;
+    assign operand_c  = use_hold ? operand_c_q  : operand_c_d;
+    assign fpu_op     = use_hold ? fpu_op_q     : fpu_op_d;
+    assign fpu_op_mod = use_hold ? fpu_op_mod_q : fpu_op_mod_d;
+    assign fpu_fmt    = use_hold ? fpu_fmt_q    : fpu_fmt_d;
+    assign fpu_fmt2   = use_hold ? fpu_fmt2_q   : fpu_fmt2_d;
+    assign fpu_ifmt   = use_hold ? fpu_ifmt_q   : fpu_ifmt_d;
+    assign fpu_rm     = use_hold ? fpu_rm_q     : fpu_rm_d;
+    assign fpu_vec_op = use_hold ? fpu_vec_op_q : fpu_vec_op_d;
+    assign fpu_tag    = use_hold ? fpu_tag_q    : fpu_tag_d;
+
+    //---------------
+    // FPU instance
+    //---------------
+    fpnew_top #(
+        .WIDTH                ( FLEN          ),
+        .TAG_WIDTH            ( TRANS_ID_BITS ), // the transaction ID is carried through the FPU as its tag
+        .RV64                 ( 1'b1          ),
+        .RVF                  ( RVF           ),
+        .RVD                  ( RVD           ),
+        .Xf16                 ( XF16          ),
+        .Xf16alt              ( XF16ALT       ),
+        .Xf8                  ( XF8           ),
+        .Xfvec                ( XFVEC         ),
+        // NOTE(review): the LAT_* names are not declared in this module, so they presumably already come from ariane_pkg - confirm
+        .LATENCY_COMP_F       ( LAT_COMP_FP32    ),
+        .LATENCY_COMP_D       ( LAT_COMP_FP64    ),
+        .LATENCY_COMP_Xf16    ( LAT_COMP_FP16    ),
+        .LATENCY_COMP_Xf16alt ( LAT_COMP_FP16ALT ),
+        .LATENCY_COMP_Xf8     ( LAT_COMP_FP8     ),
+        .LATENCY_DIVSQRT      ( LAT_DIVSQRT      ),
+        .LATENCY_NONCOMP      ( LAT_NONCOMP      ),
+        .LATENCY_CONV         ( LAT_CONV         )
+    ) fpnew_top_i (
+        .Clk_CI         ( clk_i          ),
+        .Reset_RBI      ( rst_ni         ),
+        .A_DI           ( operand_a      ), // operands and controls come from the hold/bypass muxes above
+        .B_DI           ( operand_b      ),
+        .C_DI           ( operand_c      ),
+        .RoundMode_SI   ( fpu_rm         ),
+        .Op_SI          ( fpu_op         ),
+        .OpMod_SI       ( fpu_op_mod     ),
+        .VectorialOp_SI ( fpu_vec_op     ),
+        .FpFmt_SI       ( fpu_fmt        ),
+        .FpFmt2_SI      ( fpu_fmt2       ),
+        .IntFmt_SI      ( fpu_ifmt       ),
+        .Tag_DI         ( fpu_tag        ),
+        .PrecCtl_SI     ( fpu_prec_i     ), // taken straight from the port, bypassing the hold register
+        .InValid_SI     ( fpu_in_valid   ),
+        .InReady_SO     ( fpu_in_ready   ),
+        .Flush_SI       ( flush_i        ),
+        .Z_DO           ( result_o       ),
+        .Status_DO      ( fpu_status     ),
+        .Tag_DO         ( fpu_trans_id_o ),
+        .OutValid_SO    ( fpu_out_valid  ),
+        .OutReady_SI    ( fpu_out_ready  )
+    );
+
+    // Pack the 5 status flags (zero-extended) into the exception cause; valid is always 0 and tval is not driven here (ignored in wb)
+    assign fpu_exception_o.cause = {59'h0, fpu_status};
+    assign fpu_exception_o.valid = 1'b0;
+
+    // Downstream write port is dedicated to FPU and always ready
+    assign fpu_out_ready = 1'b1;
+
+    // Downstream valid from unit
+    assign fpu_valid_o = fpu_out_valid;
+
+endmodule
diff --git a/src/frontend/instr_scan.sv b/src/frontend/instr_scan.sv
index 766f911cd..06fa3b928 100644
--- a/src/frontend/instr_scan.sv
+++ b/src/frontend/instr_scan.sv
@@ -42,16 +42,16 @@ module instr_scan (
     assign rvi_jalr_o   = (instr_i[6:0] == riscv::OpcodeJalr)   ? 1'b1 : 1'b0;
     assign rvi_jump_o   = (instr_i[6:0] == riscv::OpcodeJal)    ? 1'b1 : 1'b0;
     // opcode JAL
-    assign rvc_jump_o   = (instr_i[15:13] == riscv::OpcodeCJ) & is_rvc_o & (instr_i[1:0] == 2'b01);
+    assign rvc_jump_o   = (instr_i[15:13] == riscv::OpcodeC1J) & is_rvc_o & (instr_i[1:0] == riscv::OpcodeC1);
     // always links to register 0
     assign rvc_jr_o     = (instr_i[15:13] == riscv::OpcodeC2JalrMvAdd)
                         & ~instr_i[12]
                         & (instr_i[6:2] == 5'b00000)
-                        & (instr_i[1:0] == 2'b10)
+                        & (instr_i[1:0] == riscv::OpcodeC2)
+                        & is_rvc_o;
+    assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeC1Beqz) | (instr_i[15:13] == riscv::OpcodeC1Bnez))
+                        & (instr_i[1:0] == riscv::OpcodeC1)
                         & is_rvc_o;
-    assign rvc_branch_o = ((instr_i[15:13] == riscv::OpcodeCBeqz) | (instr_i[15:13] == riscv::OpcodeCBnez))
-                        & (instr_i[1:0] == 2'b01)
-                        & is_rvc_o ;
     // check that rs1 is x1 or x5
     assign rvc_return_o = ~instr_i[11] & ~instr_i[10] & ~instr_i[8] & instr_i[7] & rvc_jr_o ;
     // always links to register 1 e.g.: it is a jump
diff --git a/src/id_stage.sv b/src/id_stage.sv
index 7c5d525b9..46b248c60 100644
--- a/src/id_stage.sv
+++ b/src/id_stage.sv
@@ -32,6 +32,9 @@ module id_stage (
     input  logic                  issue_instr_ack_i,   // issue stage acknowledged sampling of instructions
     // from CSR file
     input  riscv::priv_lvl_t      priv_lvl_i,          // current privilege level
+    input  riscv::xs_t            fs_i,                // floating point extension status
+    input  logic [2:0]            frm_i,               // floating-point dynamic rounding mode
+
     input  logic                  debug_mode_i,        // we are in debug mode
     input  logic                  tvm_i,
     input  logic                  tw_i,
@@ -39,9 +42,9 @@ module id_stage (
 );
     // register stage
     struct packed {
-        logic            valid;
+        logic              valid;
         scoreboard_entry_t sbe;
-        logic            is_ctrl_flow;
+        logic              is_ctrl_flow;
 
     } issue_n, issue_q;
 
@@ -90,6 +93,8 @@ module id_stage (
         .ex_i                    ( fetch_entry.ex              ),
         .instruction_o           ( decoded_instruction         ),
         .is_control_flow_instr_o ( is_control_flow_instr       ),
+        .fs_i,
+        .frm_i,
         .*
     );
 
diff --git a/src/issue_read_operands.sv b/src/issue_read_operands.sv
index e88de864c..492e2b305 100644
--- a/src/issue_read_operands.sv
+++ b/src/issue_read_operands.sv
@@ -17,7 +17,7 @@ import ariane_pkg::*;
 
 module issue_read_operands #(
     parameter int unsigned NR_COMMIT_PORTS = 2
-    )(
+)(
     input  logic                                   clk_i,    // Clock
     input  logic                                   rst_ni,   // Asynchronous reset active low
     // flush
@@ -33,8 +33,12 @@ module issue_read_operands #(
     output logic [REG_ADDR_SIZE-1:0]               rs2_o,
     input  logic [63:0]                            rs2_i,
     input  logic                                   rs2_valid_i,
+    output logic [REG_ADDR_SIZE-1:0]               rs3_o,
+    input  logic [FLEN-1:0]                        rs3_i,
+    input  logic                                   rs3_valid_i,
     // get clobber input
-    input  fu_t [2**REG_ADDR_SIZE:0]               rd_clobber_i,
+    input  fu_t [2**REG_ADDR_SIZE:0]               rd_clobber_gpr_i,
+    input  fu_t [2**REG_ADDR_SIZE:0]               rd_clobber_fpr_i,
     // To FU, just single issue for now
     output fu_t                                    fu_o,
     output fu_op                                   operator_o,
@@ -48,22 +52,26 @@ module issue_read_operands #(
     input  logic                                   alu_ready_i,      // FU is ready
     output logic                                   alu_valid_o,      // Output is valid
     // Branches and Jumps
-    input  logic                                   branch_ready_i,
     output logic                                   branch_valid_o,   // this is a valid branch instruction
     output branchpredict_sbe_t                     branch_predict_o,
     // LSU
     input  logic                                   lsu_ready_i,      // FU is ready
     output logic                                   lsu_valid_o,      // Output is valid
     // MULT
-    input  logic                                   mult_ready_i,      // FU is ready
-    output logic                                   mult_valid_o,      // Output is valid
+    input  logic                                   mult_ready_i,     // FU is ready
+    output logic                                   mult_valid_o,     // Output is valid
+    // FPU
+    input  logic                                   fpu_ready_i,      // FU is ready
+    output logic                                   fpu_valid_o,      // Output is valid
+    output logic [1:0]                             fpu_fmt_o,        // FP fmt field from instr.
+    output logic [2:0]                             fpu_rm_o,         // FP rm field from instr.
     // CSR
-    input  logic                                   csr_ready_i,      // FU is ready
     output logic                                   csr_valid_o,      // Output is valid
     // commit port
     input  logic [NR_COMMIT_PORTS-1:0][4:0]        waddr_i,
     input  logic [NR_COMMIT_PORTS-1:0][63:0]       wdata_i,
-    input  logic [NR_COMMIT_PORTS-1:0]             we_i
+    input  logic [NR_COMMIT_PORTS-1:0]             we_gpr_i,
+    input  logic [NR_COMMIT_PORTS-1:0]             we_fpr_i
     // committing instruction instruction
     // from scoreboard
     // input  scoreboard_entry     commit_instr_i,
@@ -72,24 +80,33 @@ module issue_read_operands #(
     logic stall;   // stall signal, we do not want to fetch any more entries
     logic fu_busy; // functional unit is busy
     logic [63:0] operand_a_regfile, operand_b_regfile;  // operands coming from regfile
+    logic [FLEN-1:0] operand_c_regfile; // third operand only from fp regfile
 
     // output flipflop (ID <-> EX)
     logic [63:0] operand_a_n, operand_a_q,
                  operand_b_n, operand_b_q,
                  imm_n, imm_q;
 
-    logic alu_valid_n,    alu_valid_q;
-    logic mult_valid_n,   mult_valid_q;
-    logic lsu_valid_n,    lsu_valid_q;
-    logic csr_valid_n,    csr_valid_q;
-    logic branch_valid_n, branch_valid_q;
+    logic       alu_valid_n,    alu_valid_q;
+    logic       mult_valid_n,   mult_valid_q;
+    logic       fpu_valid_n,    fpu_valid_q;
+    logic [1:0] fpu_fmt_n,      fpu_fmt_q;
+    logic [2:0] fpu_rm_n,       fpu_rm_q;
+    logic       lsu_valid_n,    lsu_valid_q;
+    logic       csr_valid_n,    csr_valid_q;
+    logic       branch_valid_n, branch_valid_q;
 
     logic [TRANS_ID_BITS-1:0] trans_id_n, trans_id_q;
     fu_op operator_n, operator_q; // operation to perform
     fu_t  fu_n,       fu_q; // functional unit to use
 
     // forwarding signals
-    logic forward_rs1, forward_rs2;
+    logic forward_rs1, forward_rs2, forward_rs3;
+
+    // original instruction stored in tval
+    riscv::instruction_t orig_instr;
+    assign orig_instr = riscv::instruction_t'(issue_instr_i.ex.tval[31:0]);
+
     // ID <-> EX registers
     assign operand_a_o    = operand_a_q;
     assign operand_b_o    = operand_b_q;
@@ -100,11 +117,174 @@ module issue_read_operands #(
     assign lsu_valid_o    = lsu_valid_q;
     assign csr_valid_o    = csr_valid_q;
     assign mult_valid_o   = mult_valid_q;
+    assign fpu_valid_o    = fpu_valid_q;
+    assign fpu_fmt_o      = fpu_fmt_q;
+    assign fpu_rm_o       = fpu_rm_q;
     assign trans_id_o     = trans_id_q;
     assign imm_o          = imm_q;
     // ---------------
     // Issue Stage
     // ---------------
+
+    // Derive fu_busy for the instruction at the issue port: it is the inverted ready
+    // signal of whichever functional unit issue_instr_i.fu selects
+    always_comb begin : unit_busy
+        unique case (issue_instr_i.fu)
+            NONE: // no functional unit required -> never busy
+                fu_busy = 1'b0;
+            ALU, CTRL_FLOW, CSR: // all three are gated by the ALU ready signal
+                fu_busy = ~alu_ready_i;
+            MULT:
+                fu_busy = ~mult_ready_i;
+            FPU, FPU_VEC: // scalar and vectorial FP ops share one ready signal
+                fu_busy = ~fpu_ready_i;
+            LOAD, STORE: // both memory directions wait on the LSU
+                fu_busy = ~lsu_ready_i;
+            default: // any other fu value is treated as not busy
+                fu_busy = 1'b0;
+        endcase
+    end
+
+    // ---------------
+    // Register stage
+    // ---------------
+    // check that all source operands (rs1, rs2 and the FP-only rs3) are available, otherwise stall
+    // raise the matching forward_rs* flag when a clobbered operand can be taken from the scoreboard
+    always_comb begin : operands_available
+        stall = 1'b0;
+        // operand forwarding signals
+        forward_rs1 = 1'b0;
+        forward_rs2 = 1'b0;
+        forward_rs3 = 1'b0; // FPR only
+        // poll the scoreboard for those values
+        rs1_o = issue_instr_i.rs1;
+        rs2_o = issue_instr_i.rs2;
+        rs3_o = issue_instr_i.result[REG_ADDR_SIZE-1:0]; // rs3 is encoded in imm field
+
+        // 0. check that we are not using the zimm type in RS1
+        //    as this is an immediate we do not have to wait on anything here
+        // 1. check if the source registers are clobbered --> check appropriate clobber list (gpr/fpr)
+        // 2. poll the scoreboard
+        if (~issue_instr_i.use_zimm && (is_rs1_fpr(issue_instr_i.op) ? rd_clobber_fpr_i[issue_instr_i.rs1] != NONE
+                                                                     : rd_clobber_gpr_i[issue_instr_i.rs1] != NONE)) begin
+            // check if the clobbering instruction is not a CSR instruction, CSR instructions can only
+            // be fetched through the register file since they can't be forwarded
+            // if the operand is available, forward it. The CSR check is skipped for FPR operands
+            if (rs1_valid_i && (is_rs1_fpr(issue_instr_i.op) ? 1'b1 : rd_clobber_gpr_i[issue_instr_i.rs1] != CSR))
+                forward_rs1 = 1'b1;
+            else // the operand is not available -> stall
+                stall = 1'b1;
+        end
+
+        if (is_rs2_fpr(issue_instr_i.op) ? rd_clobber_fpr_i[issue_instr_i.rs2] != NONE
+                                         : rd_clobber_gpr_i[issue_instr_i.rs2] != NONE) begin
+            // if the operand is available, forward it. The CSR check is skipped for FPR operands
+            if (rs2_valid_i && (is_rs2_fpr(issue_instr_i.op) ? 1'b1 : rd_clobber_gpr_i[issue_instr_i.rs2] != CSR))
+                forward_rs2 = 1'b1;
+            else // the operand is not available -> stall
+                stall = 1'b1;
+        end
+
+        if (is_imm_fpr(issue_instr_i.op) && rd_clobber_fpr_i[issue_instr_i.result[REG_ADDR_SIZE-1:0]] != NONE) begin
+            // rs3 only exists as an FPR operand: forward it if available, no CSR check is done here
+            if (rs3_valid_i)
+                forward_rs3 = 1'b1;
+            else // the operand is not available -> stall
+                stall = 1'b1;
+        end
+    end
+
+    // Forwarding/Output MUX: selects the next-cycle operand A/B, immediate, trans id, FU and operator
+    always_comb begin : forwarding_operand_select
+        // default is regfiles (gpr or fpr)
+        operand_a_n = operand_a_regfile;
+        operand_b_n = operand_b_regfile;
+        // immediates are the third operands in the store case
+        // for FP operations, the imm field can also be the third operand from the regfile
+        imm_n      = is_imm_fpr(issue_instr_i.op) ? operand_c_regfile : issue_instr_i.result;
+        trans_id_n = issue_instr_i.trans_id;
+        fu_n       = issue_instr_i.fu;
+        operator_n = issue_instr_i.op;
+        // forwarded values override the regfile defaults
+        if (forward_rs1) begin
+            operand_a_n  = rs1_i;
+        end
+
+        if (forward_rs2) begin
+            operand_b_n  = rs2_i;
+        end
+
+        if (forward_rs3) begin
+            imm_n  = rs3_i;
+        end
+
+        // use the PC as operand a (overrides regfile value and forwarding)
+        if (issue_instr_i.use_pc) begin
+            operand_a_n = issue_instr_i.pc;
+        end
+
+        // use the zimm as operand a (overrides everything assigned above)
+        if (issue_instr_i.use_zimm) begin
+            // zero extend the 5-bit zimm to the full 64-bit operand: 59 + 5 = 64 bits
+            operand_a_n = {59'b0, issue_instr_i.rs1[4:0]};
+        end
+        // or is it an immediate (including PC), this is not the case for a store and control flow instructions
+        // also make sure operand B is not already used as an FP operand
+        if (issue_instr_i.use_imm && (issue_instr_i.fu != STORE) && (issue_instr_i.fu != CTRL_FLOW) && !is_rs2_fpr(issue_instr_i.op)) begin
+            operand_b_n = issue_instr_i.result;
+        end
+    end
+
+    // FU select: compute the *_valid_n / fpu_fmt_n / fpu_rm_n values that are registered for the next cycle
+    always_comb begin : unit_valid
+        alu_valid_n    = 1'b0;
+        lsu_valid_n    = 1'b0;
+        mult_valid_n   = 1'b0;
+        fpu_valid_n    = 1'b0;
+        fpu_fmt_n      = 2'b0;
+        fpu_rm_n       = 3'b0;
+        csr_valid_n    = 1'b0;
+        branch_valid_n = 1'b0;
+        // Exception pass through:
+        // an instruction that already carries an exception raises no valid signal,
+        // a valid is only raised for a valid instruction that is acknowledged (issue_ack_o)
+        if (~issue_instr_i.ex.valid && issue_instr_valid_i && issue_ack_o) begin
+            case (issue_instr_i.fu)
+                ALU:
+                    alu_valid_n    = 1'b1;
+                CTRL_FLOW:
+                    branch_valid_n = 1'b1;
+                MULT:
+                    mult_valid_n   = 1'b1;
+                FPU : begin
+                    fpu_valid_n    = 1'b1;
+                    fpu_fmt_n      = orig_instr.rftype.fmt; // fmt bits from instruction
+                    fpu_rm_n       = orig_instr.rftype.rm;  // rm bits from instruction
+                end
+                FPU_VEC : begin
+                    fpu_valid_n    = 1'b1;
+                    fpu_fmt_n      = orig_instr.rvftype.vfmt;         // vfmt bits from instruction
+                    fpu_rm_n       = {2'b0, orig_instr.rvftype.repl}; // repl bit, zero-extended into the rm field
+                end
+                LOAD, STORE:
+                    lsu_valid_n    = 1'b1;
+                CSR:
+                    csr_valid_n    = 1'b1;
+                default:; // NONE or any other fu value: no valid signal is raised
+            endcase
+        end
+        // if we got a flush request, de-assert the valid flags, otherwise we will start this
+        // functional unit with the wrong inputs (fpu_fmt_n / fpu_rm_n are left as computed above)
+        if (flush_i) begin
+            alu_valid_n    = 1'b0;
+            lsu_valid_n    = 1'b0;
+            mult_valid_n   = 1'b0;
+            fpu_valid_n    = 1'b0;
+            csr_valid_n    = 1'b0;
+            branch_valid_n = 1'b0;
+        end
+    end
+
     // We can issue an instruction if we do not detect that any other instruction is writing the same
     // destination register.
     // We also need to check if there is an unresolved branch in the scoreboard.
@@ -120,13 +300,15 @@ module issue_read_operands #(
                 // WAW - Write After Write Dependency Check
                 // -----------------------------------------
                 // no other instruction has the same destination register -> issue the instruction
-                if (rd_clobber_i[issue_instr_i.rd] == NONE) begin
+                if (is_rd_fpr(issue_instr_i.op) ? (rd_clobber_fpr_i[issue_instr_i.rd] == NONE)
+                                                : (rd_clobber_gpr_i[issue_instr_i.rd] == NONE)) begin
                     issue_ack_o = 1'b1;
                 end
                 // or check that the target destination register will be written in this cycle by the
                 // commit stage
                 for (int unsigned i = 0; i < NR_COMMIT_PORTS; i++)
-                    if (we_i[i] && waddr_i[i] == issue_instr_i.rd) begin
+                    if (is_rd_fpr(issue_instr_i.op) ? (we_fpr_i[i] && waddr_i[i] == issue_instr_i.rd)
+                                                    : (we_gpr_i[i] && waddr_i[i] == issue_instr_i.rd)) begin
                         issue_ack_o = 1'b1;
                     end
             end
@@ -145,159 +327,73 @@ module issue_read_operands #(
         end
     end
 
-    // select the right busy signal
-    // this obviously depends on the functional unit we need
-    always_comb begin : unit_busy
-        unique case (issue_instr_i.fu)
-            NONE:
-                fu_busy = 1'b0;
-            ALU:
-                fu_busy = ~alu_ready_i;
-            CTRL_FLOW:
-                fu_busy = ~branch_ready_i;
-            MULT:
-                fu_busy = ~mult_ready_i;
-            LOAD, STORE:
-                fu_busy = ~lsu_ready_i;
-            CSR:
-                fu_busy = ~csr_ready_i;
-            default:
-                fu_busy = 1'b0;
-        endcase
-    end
-
-    // ---------------
-    // Register stage
-    // ---------------
-    // check that all operands are available, otherwise stall
-    // forward corresponding register
-    always_comb begin : operands_available
-        stall = 1'b0;
-        // operand forwarding signals
-        forward_rs1 = 1'b0;
-        forward_rs2 = 1'b0;
-        // poll the scoreboard for those values
-        rs1_o = issue_instr_i.rs1;
-        rs2_o = issue_instr_i.rs2;
-        // 0. check that we are not using the zimm type in RS1
-        //    as this is an immediate we do not have to wait on anything here
-        // 1. check if the source registers are clobberd
-        // 2. poll the scoreboard
-        if (~issue_instr_i.use_zimm && rd_clobber_i[issue_instr_i.rs1] != NONE) begin
-            // check if the clobbering instruction is not a CSR instruction, CSR instructions can only
-            // be fetched through the register file since they can't be forwarded
-            // the operand is available, forward it
-            if (rs1_valid_i && rd_clobber_i[issue_instr_i.rs1] != CSR)
-                forward_rs1 = 1'b1;
-            else // the operand is not available -> stall
-                stall = 1'b1;
-
-        end
-
-        if (rd_clobber_i[issue_instr_i.rs2] != NONE) begin
-            // the operand is available, forward it
-            if (rs2_valid_i && rd_clobber_i[issue_instr_i.rs2] != CSR)
-                forward_rs2 = 1'b1;
-            else // the operand is not available -> stall
-                stall = 1'b1;
-        end
-    end
-    // Forwarding/Output MUX
-    always_comb begin : forwarding_operand_select
-        // default is regfile
-        operand_a_n = operand_a_regfile;
-        operand_b_n = operand_b_regfile;
-        // immediates are the third operands in the store case
-        imm_n      = issue_instr_i.result;
-        trans_id_n = issue_instr_i.trans_id;
-        fu_n       = issue_instr_i.fu;
-        operator_n = issue_instr_i.op;
-        // or should we forward
-        if (forward_rs1) begin
-            operand_a_n  = rs1_i;
-        end
-
-        if (forward_rs2) begin
-            operand_b_n  = rs2_i;
-        end
-
-        // use the PC as operand a
-        if (issue_instr_i.use_pc) begin
-            operand_a_n = issue_instr_i.pc;
-        end
-
-        // use the zimm as operand a
-        if (issue_instr_i.use_zimm) begin
-            // zero extend operand a
-            operand_a_n = {52'b0, issue_instr_i.rs1[4:0]};
-        end
-        // or is it an immediate (including PC), this is not the case for a store and control flow instructions
-        if (issue_instr_i.use_imm && (issue_instr_i.fu != STORE) && (issue_instr_i.fu != CTRL_FLOW)) begin
-            operand_b_n = issue_instr_i.result;
-        end
-    end
-    // FU select, assert the correct valid out signal (in the next cycle)
-    always_comb begin : unit_valid
-        alu_valid_n    = 1'b0;
-        lsu_valid_n    = 1'b0;
-        mult_valid_n   = 1'b0;
-        csr_valid_n    = 1'b0;
-        branch_valid_n = 1'b0;
-        // Exception pass through:
-        // If an exception has occurred simply pass it through
-        // we do not want to issue this instruction
-        if (~issue_instr_i.ex.valid && issue_instr_valid_i && issue_ack_o) begin
-            case (issue_instr_i.fu)
-                ALU:
-                    alu_valid_n    = 1'b1;
-                CTRL_FLOW:
-                    branch_valid_n = 1'b1;
-                MULT:
-                    mult_valid_n   = 1'b1;
-                LOAD, STORE:
-                    lsu_valid_n    = 1'b1;
-                CSR:
-                    csr_valid_n    = 1'b1;
-                default:;
-            endcase
-        end
-        // if we got a flush request, de-assert the valid flag, otherwise we will start this
-        // functional unit with the wrong inputs
-        if (flush_i) begin
-            alu_valid_n    = 1'b0;
-            lsu_valid_n    = 1'b0;
-            mult_valid_n   = 1'b0;
-            csr_valid_n    = 1'b0;
-            branch_valid_n = 1'b0;
-        end
-    end
-
     // ----------------------
     // Integer Register File
     // ----------------------
+    logic [1:0][63:0] rdata;      // GPR read data: index 0 = rs1, index 1 = rs2
+    logic [1:0][4:0]  raddr_pack; // GPR read addresses, same ordering as rdata
+
+    // pack signals for the regfile write ports (one entry per commit port)
+    logic [NR_COMMIT_PORTS-1:0][4:0]  waddr_pack;
+    logic [NR_COMMIT_PORTS-1:0][63:0] wdata_pack;
+    logic [NR_COMMIT_PORTS-1:0]       we_pack;
+    assign raddr_pack = {issue_instr_i.rs2[4:0], issue_instr_i.rs1[4:0]};
+    assign waddr_pack = {waddr_i[1],  waddr_i[0]};  // NOTE: indices 1 and 0 are hard-coded, i.e. two commit ports
+    assign wdata_pack = {wdata_i[1],  wdata_i[0]};
+    assign we_pack    = {we_gpr_i[1], we_gpr_i[0]}; // only GPR write enables reach this regfile
+
     ariane_regfile #(
-        .DATA_WIDTH     ( 64                     )
-    ) regfile_i (
-        // Clock and Reset
-        .clk            ( clk_i                  ),
-        .rst_n          ( rst_ni                 ),
-        .test_en_i      ( 1'b0                   ),
-
-        .raddr_a_i      ( issue_instr_i.rs1[4:0] ),
-        .rdata_a_o      ( operand_a_regfile      ),
-
-        .raddr_b_i      ( issue_instr_i.rs2[4:0] ),
-        .rdata_b_o      ( operand_b_regfile      ),
-
-        .waddr_a_i      ( waddr_i[0]             ),
-        .wdata_a_i      ( wdata_i[0]             ),
-        .we_a_i         ( we_i[0]                ),
-
-        .waddr_b_i      ( waddr_i[1]             ),
-        .wdata_b_i      ( wdata_i[1]             ),
-        .we_b_i         ( we_i[1]                )
+        .DATA_WIDTH     ( 64              ),
+        .NR_READ_PORTS  ( 2               ),
+        .NR_WRITE_PORTS ( NR_COMMIT_PORTS ),
+        .ZERO_REG_ZERO  ( 1               )
+    ) i_ariane_regfile (
+        .test_en_i ( 1'b0       ),
+        .raddr_i   ( raddr_pack ),
+        .rdata_o   ( rdata      ),
+        .waddr_i   ( waddr_pack ),
+        .wdata_i   ( wdata_pack ),
+        .we_i      ( we_pack    ),
+        .*
     );
 
+    // -----------------------------
+    // Floating-Point Register File
+    // -----------------------------
+    logic [2:0][FLEN-1:0] fprdata; // FPR read data: index 0 = rs1, 1 = rs2, 2 = rs3
+
+    // pack signals; FP write data is FLEN bits wide per commit port to match the regfile DATA_WIDTH
+    logic [2:0][4:0]  fp_raddr_pack;
+    logic [NR_COMMIT_PORTS-1:0][FLEN-1:0] fp_wdata_pack;
+
+    generate
+        if (FP_PRESENT) begin : float_regfile_gen
+            assign fp_raddr_pack = {issue_instr_i.result[4:0], issue_instr_i.rs2[4:0], issue_instr_i.rs1[4:0]};
+            assign fp_wdata_pack = {wdata_i[1][FLEN-1:0], wdata_i[0][FLEN-1:0]};
+            // write data must be the FLEN-wide pack: the 64-bit wdata_pack mismatches the port when FLEN < 64
+            ariane_regfile #(
+                .DATA_WIDTH     ( FLEN            ),
+                .NR_READ_PORTS  ( 3               ),
+                .NR_WRITE_PORTS ( NR_COMMIT_PORTS ),
+                .ZERO_REG_ZERO  ( 0               )
+            ) i_ariane_fp_regfile (
+                .test_en_i ( 1'b0          ),
+                .raddr_i   ( fp_raddr_pack ),
+                .rdata_o   ( fprdata       ),
+                .waddr_i   ( waddr_pack    ),
+                .wdata_i   ( fp_wdata_pack ),
+                .we_i      ( we_fpr_i      ),
+                .*
+            );
+        end else begin : no_fpr_gen
+            assign fprdata = '{default: '0}; // no FP regfile: all FP read data is tied to zero
+        end
+    endgenerate
+
+    assign operand_a_regfile = is_rs1_fpr(issue_instr_i.op) ? fprdata[0] : rdata[0]; // rs1 from FPR or GPR
+    assign operand_b_regfile = is_rs2_fpr(issue_instr_i.op) ? fprdata[1] : rdata[1]; // rs2 from FPR or GPR
+    assign operand_c_regfile = fprdata[2]; // third operand is only ever read from the FP regfile
+
     // ----------------------
     // Registers (ID <-> EX)
     // ----------------------
@@ -309,6 +405,9 @@ module issue_read_operands #(
             alu_valid_q           <= 1'b0;
             branch_valid_q        <= 1'b0;
             mult_valid_q          <= 1'b0;
+            fpu_valid_q           <= 1'b0;
+            fpu_fmt_q             <= 2'b0;
+            fpu_rm_q              <= 3'b0;
             lsu_valid_q           <= 1'b0;
             csr_valid_q           <= 1'b0;
             fu_q                  <= NONE;
@@ -324,6 +423,9 @@ module issue_read_operands #(
             alu_valid_q           <= alu_valid_n;
             branch_valid_q        <= branch_valid_n;
             mult_valid_q          <= mult_valid_n;
+            fpu_valid_q           <= fpu_valid_n;
+            fpu_fmt_q             <= fpu_fmt_n;
+            fpu_rm_q              <= fpu_rm_n;
             lsu_valid_q           <= lsu_valid_n;
             csr_valid_q           <= csr_valid_n;
             fu_q                  <= fu_n;
diff --git a/src/issue_stage.sv b/src/issue_stage.sv
index 793aedaa5..1cc08afc4 100644
--- a/src/issue_stage.sv
+++ b/src/issue_stage.sv
@@ -16,9 +16,9 @@
 import ariane_pkg::*;
 
 module issue_stage #(
-        parameter int unsigned NR_ENTRIES = 8,
-        parameter int unsigned NR_WB_PORTS = 4,
-        parameter int unsigned NR_COMMIT_PORTS = 2
+    parameter int unsigned NR_ENTRIES = 8,
+    parameter int unsigned NR_WB_PORTS = 4,
+    parameter int unsigned NR_COMMIT_PORTS = 2
 )(
     input  logic                                     clk_i,     // Clock
     input  logic                                     rst_ni,    // Asynchronous reset active low
@@ -48,14 +48,17 @@ module issue_stage #(
     input  logic                                     lsu_ready_i,
     output logic                                     lsu_valid_o,
     // branch prediction
-    input  logic                                     branch_ready_i,
-    output logic                                     branch_valid_o, // use branch prediction unit
+    output logic                                     branch_valid_o,   // use branch prediction unit
     output branchpredict_sbe_t                       branch_predict_o,
 
     input  logic                                     mult_ready_i,
     output logic                                     mult_valid_o,    // Branch predict Out
 
-    input  logic                                     csr_ready_i,
+    input  logic                                     fpu_ready_i,
+    output logic                                     fpu_valid_o,
+    output logic [1:0]                               fpu_fmt_o,        // FP fmt field from instr.
+    output logic [2:0]                               fpu_rm_o,         // FP rm field from instr.
+
     output logic                                     csr_valid_o,
 
     // write back port
@@ -68,7 +71,8 @@ module issue_stage #(
     // commit port
     input  logic [NR_COMMIT_PORTS-1:0][4:0]          waddr_i,
     input  logic [NR_COMMIT_PORTS-1:0][63:0]         wdata_i,
-    input  logic [NR_COMMIT_PORTS-1:0]               we_i,
+    input  logic [NR_COMMIT_PORTS-1:0]               we_gpr_i,
+    input  logic [NR_COMMIT_PORTS-1:0]               we_fpr_i,
 
     output scoreboard_entry_t [NR_COMMIT_PORTS-1:0]  commit_instr_o,
     input  logic              [NR_COMMIT_PORTS-1:0]  commit_ack_i
@@ -76,7 +80,8 @@ module issue_stage #(
     // ---------------------------------------------------
     // Scoreboard (SB) <-> Issue and Read Operands (IRO)
     // ---------------------------------------------------
-    fu_t  [2**REG_ADDR_SIZE:0] rd_clobber_sb_iro;
+    fu_t  [2**REG_ADDR_SIZE:0] rd_clobber_gpr_sb_iro;
+    fu_t  [2**REG_ADDR_SIZE:0] rd_clobber_fpr_sb_iro;
 
     logic [REG_ADDR_SIZE-1:0]  rs1_iro_sb;
     logic [63:0]               rs1_sb_iro;
@@ -86,6 +91,10 @@ module issue_stage #(
     logic [63:0]               rs2_sb_iro;
     logic                      rs2_valid_iro_sb;
 
+    logic [REG_ADDR_SIZE-1:0]  rs3_iro_sb;
+    logic [FLEN-1:0]           rs3_sb_iro;
+    logic                      rs3_valid_iro_sb;
+
     scoreboard_entry_t         issue_instr_rename_sb;
     logic                      issue_instr_valid_rename_sb;
     logic                      issue_ack_sb_rename;
@@ -117,35 +126,31 @@ module issue_stage #(
         .NR_ENTRIES (NR_ENTRIES ),
         .NR_WB_PORTS(NR_WB_PORTS)
     ) i_scoreboard (
-        .clk_i                  ( clk_i                       ),
-        .rst_ni                 ( rst_ni                      ),
-        .flush_unissued_instr_i ( flush_unissued_instr_i      ),
-        .flush_i                ( flush_i                     ),
-        .unresolved_branch_i    ( 1'b0                        ),
+        .unresolved_branch_i   ( 1'b0                                      ),
+        .rd_clobber_gpr_o      ( rd_clobber_gpr_sb_iro                     ),
+        .rd_clobber_fpr_o      ( rd_clobber_fpr_sb_iro                     ),
+        .rs1_i                 ( rs1_iro_sb                                ),
+        .rs1_o                 ( rs1_sb_iro                                ),
+        .rs1_valid_o           ( rs1_valid_sb_iro                          ),
+        .rs2_i                 ( rs2_iro_sb                                ),
+        .rs2_o                 ( rs2_sb_iro                                ),
+        .rs2_valid_o           ( rs2_valid_iro_sb                          ),
+        .rs3_i                 ( rs3_iro_sb                                ),
+        .rs3_o                 ( rs3_sb_iro                                ),
+        .rs3_valid_o           ( rs3_valid_iro_sb                          ),
 
-        .rd_clobber_o           ( rd_clobber_sb_iro           ),
-        .rs1_i                  ( rs1_iro_sb                  ),
-        .rs1_o                  ( rs1_sb_iro                  ),
-        .rs1_valid_o            ( rs1_valid_sb_iro            ),
-        .rs2_i                  ( rs2_iro_sb                  ),
-        .rs2_o                  ( rs2_sb_iro                  ),
-        .rs2_valid_o            ( rs2_valid_iro_sb            ),
+        .decoded_instr_i       ( issue_instr_rename_sb                     ),
+        .decoded_instr_valid_i ( issue_instr_valid_rename_sb               ),
+        .decoded_instr_ack_o   ( issue_ack_sb_rename                       ),
+        .issue_instr_o         ( issue_instr_sb_iro                        ),
+        .issue_instr_valid_o   ( issue_instr_valid_sb_iro                  ),
+        .issue_ack_i           ( issue_ack_iro_sb                          ),
 
-        .commit_instr_o         ( commit_instr_o              ),
-        .commit_ack_i           ( commit_ack_i                ),
-
-        .decoded_instr_i        ( issue_instr_rename_sb       ),
-        .decoded_instr_valid_i  ( issue_instr_valid_rename_sb ),
-        .decoded_instr_ack_o    ( issue_ack_sb_rename         ),
-
-        .issue_instr_o          ( issue_instr_sb_iro          ),
-        .issue_instr_valid_o    ( issue_instr_valid_sb_iro    ),
-        .issue_ack_i            ( issue_ack_iro_sb            ),
-        .resolved_branch_i      ( resolved_branch_i           ),
-        .trans_id_i             ( trans_id_i                  ),
-        .wbdata_i               ( wbdata_i                    ),
-        .ex_i                   ( ex_ex_i                     ),
-        .wb_valid_i             ( wb_valid_i                  )
+        .resolved_branch_i     ( resolved_branch_i                         ),
+        .trans_id_i            ( trans_id_i                                ),
+        .wbdata_i              ( wbdata_i                                  ),
+        .ex_i                  ( ex_ex_i                                   ),
+        .*
     );
 
     // ---------------------------------------------------------
@@ -162,7 +167,11 @@ module issue_stage #(
         .rs2_o               ( rs2_iro_sb                      ),
         .rs2_i               ( rs2_sb_iro                      ),
         .rs2_valid_i         ( rs2_valid_iro_sb                ),
-        .rd_clobber_i        ( rd_clobber_sb_iro               ),
+        .rs3_o               ( rs3_iro_sb                      ),
+        .rs3_i               ( rs3_sb_iro                      ),
+        .rs3_valid_i         ( rs3_valid_iro_sb                ),
+        .rd_clobber_gpr_i    ( rd_clobber_gpr_sb_iro           ),
+        .rd_clobber_fpr_i    ( rd_clobber_fpr_sb_iro           ),
         .*
     );
 
diff --git a/src/load_unit.sv b/src/load_unit.sv
index 5d299d279..39437fe68 100644
--- a/src/load_unit.sv
+++ b/src/load_unit.sv
@@ -299,17 +299,10 @@ module load_unit (
 
     // prepare these signals for faster selection in the next cycle
     assign signed_d  = load_data_d.operator inside {LW, LH, LB};
-    assign fp_sign_d = 1'b0;
-    assign idx_d     = (load_data_d.operator inside {LW}) ? load_data_d.address_offset + 3 :
-                       (load_data_d.operator inside {LH}) ? load_data_d.address_offset + 1 :
-                                                            load_data_d.address_offset;
-
-    // use this with FP support:
-    // assign signed_d  = load_data_d.operator inside {LW, LH, LB};
-    // assign fp_sign_d = load_data_d.operator inside {FLW, FLH, FLB};
-    // assign idx_d     = (load_data_d.operator inside {LW, FLW}) ? load_data_d.address_offset + 3 :
-    //                    (load_data_d.operator inside {LH, FLH}) ? load_data_d.address_offset + 1 :
-    //                                                              load_data_d.address_offset;
+    assign fp_sign_d = load_data_d.operator inside {FLW, FLH, FLB};
+    assign idx_d     = (load_data_d.operator inside {LW, FLW}) ? load_data_d.address_offset + 3 :
+                       (load_data_d.operator inside {LH, FLH}) ? load_data_d.address_offset + 1 :
+                                                                 load_data_d.address_offset;
 
 
     assign sign_bits = { req_port_i.data_rdata[63],
@@ -328,25 +321,13 @@ module load_unit (
     // result mux
     always_comb begin
         unique case (load_data_q.operator)
-            LW, LWU: begin
-                result_o = {{32{sign_bit}}, shifted_data[31:0]};
-            end
-            LH, LHU:    result_o = {{48{sign_bit}}, shifted_data[15:0]};
-            LB, LBU:    result_o = {{56{sign_bit}}, shifted_data[7:0]};
+            LW, LWU, FLW:    result_o = {{32{sign_bit}}, shifted_data[31:0]};
+            LH, LHU, FLH:    result_o = {{48{sign_bit}}, shifted_data[15:0]};
+            LB, LBU, FLB:    result_o = {{56{sign_bit}}, shifted_data[7:0]};
             default:    result_o = shifted_data;
         endcase
     end
 
-    // use this with FP support:
-    // always_comb begin
-    //     unique case (load_data_q.operator)
-    //         LW, LWU, FLW:    result_o = {{32{sign_bit}}, shifted_data[31:0]};
-    //         LH, LHU, FLH:    result_o = {{48{sign_bit}}, shifted_data[15:0]};
-    //         LB, LBU, FLB:    result_o = {{56{sign_bit}}, shifted_data[7:0]};
-    //         default:    result_o = shifted_data;
-    //     endcase
-    // end
-
     always_ff @(posedge clk_i or negedge rst_ni) begin : p_regs
         if (~rst_ni) begin
             idx_q     <= 0;
diff --git a/src/lsu.sv b/src/lsu.sv
index ef2d8fbfe..1f668ff87 100644
--- a/src/lsu.sv
+++ b/src/lsu.sv
@@ -280,7 +280,7 @@ module lsu #(
         if (lsu_ctrl.valid) begin
             case (lsu_ctrl.operator)
                 // double word
-                LD, SD,
+                LD, SD, FLD, FSD,
                 AMO_LRD, AMO_SCD,
                 AMO_SWAPD, AMO_ADDD, AMO_ANDD, AMO_ORD,
                 AMO_XORD, AMO_MAXD, AMO_MAXDU, AMO_MIND,
@@ -290,7 +290,7 @@ module lsu #(
                     end
                 end
                 // word
-                LW, LWU, SW,
+                LW, LWU, SW, FLW, FSW,
                 AMO_LRW, AMO_SCW,
                 AMO_SWAPW, AMO_ADDW, AMO_ANDW, AMO_ORW,
                 AMO_XORW, AMO_MAXW, AMO_MAXWU, AMO_MINW,
@@ -300,7 +300,7 @@ module lsu #(
                     end
                 end
                 // half word
-                LH, LHU, SH: begin
+                LH, LHU, SH, FLH, FSH: begin
                     if (lsu_ctrl.vaddr[0] != 1'b0) begin
                         data_misaligned = 1'b1;
                     end
@@ -366,6 +366,7 @@ module lsu #(
         .ready_o            ( lsu_ready_o ),
         .*
     );
+
 endmodule
 
 // ------------------
diff --git a/src/lsu_arbiter.sv b/src/lsu_arbiter.sv
index 0f4c73df5..7dafa1466 100644
--- a/src/lsu_arbiter.sv
+++ b/src/lsu_arbiter.sv
@@ -40,9 +40,10 @@ module lsu_arbiter (
     // RR fashion. FIFOs need to be 2 deep in order to unconditionally accept loads and stores since we can
     // have a maximum of 2 outstanding loads.
     // if there are valid elements in the fifos, the unit posts the result on its output ports and expects it
-    // to be consumed unconditionally 
+    // to be consumed unconditionally
 
-    localparam int DEPTH = 2;
+    // Important: this needs to be greater than 2 to unconditionally accept incoming requests
+    localparam int DEPTH = 4;
 
     typedef struct packed {
         logic [TRANS_ID_BITS-1:0] trans_id;
@@ -64,9 +65,9 @@ module lsu_arbiter (
     assign ld_in.result   = ld_result_i;
     assign ld_in.ex       = ld_ex_i;
 
-    assign trans_id_o     = (idx) ? st_out.trans_id : ld_out.trans_id; 
-    assign result_o       = (idx) ? st_out.result   : ld_out.result;   
-    assign ex_o           = (idx) ? st_out.ex       : ld_out.ex;      
+    assign trans_id_o     = (idx) ? st_out.trans_id : ld_out.trans_id;
+    assign result_o       = (idx) ? st_out.result   : ld_out.result;
+    assign ex_o           = (idx) ? st_out.ex       : ld_out.ex;
 
     // round robin with "lookahead" for 2 requesters
     rrarbiter #(
@@ -85,7 +86,7 @@ module lsu_arbiter (
     fifo_v2 #(
         .dtype       (  fifo_t     ),
         .DEPTH       (  DEPTH      )
-    ) i_ld_fifo (    
+    ) i_ld_fifo (
         .clk_i       (  clk_i      ),
         .rst_ni      (  rst_ni     ),
         .flush_i     (  flush_i    ),
@@ -98,12 +99,12 @@ module lsu_arbiter (
         .push_i      (  ld_valid_i ),
         .data_o      (  ld_out     ),
         .pop_i       (  ld_ren     )
-    );  
+    );
 
     fifo_v2 #(
         .dtype       (  fifo_t     ),
         .DEPTH       (  DEPTH      )
-    ) i_st_fifo (    
+    ) i_st_fifo (
         .clk_i       (  clk_i      ),
         .rst_ni      (  rst_ni     ),
         .flush_i     (  flush_i    ),
@@ -116,7 +117,7 @@ module lsu_arbiter (
         .push_i      (  st_valid_i ),
         .data_o      (  st_out     ),
         .pop_i       (  st_ren     )
-    ); 
+    );
 
 
 `ifndef SYNTHESIS
diff --git a/src/re_name.sv b/src/re_name.sv
index 93def90f9..4b2e22509 100644
--- a/src/re_name.sv
+++ b/src/re_name.sv
@@ -41,34 +41,48 @@ module re_name (
 
     // keep track of re-naming data structures
     logic [31:0] re_name_table_gpr_n, re_name_table_gpr_q;
+    logic [31:0] re_name_table_fpr_n, re_name_table_fpr_q;
 
     // -------------------
     // Re-naming
     // -------------------
     always_comb begin
         // MSB of the renamed source register addresses
-        logic name_bit_rs1, name_bit_rs2, name_bit_rd;
+        logic name_bit_rs1, name_bit_rs2, name_bit_rs3, name_bit_rd;
 
         // default assignments
         re_name_table_gpr_n = re_name_table_gpr_q;
+        re_name_table_fpr_n = re_name_table_fpr_q;
         issue_instr_o       = issue_instr_i;
 
         if (issue_ack_i && !flush_unissied_instr_i) begin
             // if we acknowledge the instruction tic the corresponding destination register
-            re_name_table_gpr_n[issue_instr_i.rd] = re_name_table_gpr_q[issue_instr_i.rd] ^ 1'b1;
+            if (is_rd_fpr(issue_instr_i.op))
+                re_name_table_fpr_n[issue_instr_i.rd] = re_name_table_fpr_q[issue_instr_i.rd] ^ 1'b1;
+            else
+                re_name_table_gpr_n[issue_instr_i.rd] = re_name_table_gpr_q[issue_instr_i.rd] ^ 1'b1;
         end
 
         // select name bit according to the register file used for source operands
-        name_bit_rs1 = re_name_table_gpr_q[issue_instr_i.rs1];
-        name_bit_rs2 = re_name_table_gpr_q[issue_instr_i.rs2];
+        name_bit_rs1 = is_rs1_fpr(issue_instr_i.op) ? re_name_table_fpr_q[issue_instr_i.rs1]
+                                                    : re_name_table_gpr_q[issue_instr_i.rs1];
+        name_bit_rs2 = is_rs2_fpr(issue_instr_i.op) ? re_name_table_fpr_q[issue_instr_i.rs2]
+                                                    : re_name_table_gpr_q[issue_instr_i.rs2];
+        // rs3 is only used in certain FP operations and held like an immediate
+        name_bit_rs3 = re_name_table_fpr_q[issue_instr_i.result[4:0]]; // make sure only the addr bits are read
 
         // select name bit according to the state it will have after renaming
-        name_bit_rd = re_name_table_gpr_q[issue_instr_i.rd] ^ (issue_instr_i.rd != '0); // don't rename x0
+        name_bit_rd = is_rd_fpr(issue_instr_i.op) ? re_name_table_fpr_q[issue_instr_i.rd] ^ 1'b1
+                                                  : re_name_table_gpr_q[issue_instr_i.rd] ^ (issue_instr_i.rd != '0); // don't rename x0
 
         // re-name the source registers
         issue_instr_o.rs1 = { ENABLE_RENAME & name_bit_rs1, issue_instr_i.rs1[4:0] };
         issue_instr_o.rs2 = { ENABLE_RENAME & name_bit_rs2, issue_instr_i.rs2[4:0] };
 
+        // re-name the third operand in imm if it's actually an operand
+        if (is_imm_fpr(issue_instr_i.op))
+            issue_instr_o.result = { ENABLE_RENAME & name_bit_rs3, issue_instr_i.result[4:0]};
+
         // re-name the destination register
         issue_instr_o.rd = { ENABLE_RENAME & name_bit_rd, issue_instr_i.rd[4:0] };
 
@@ -78,6 +92,7 @@ module re_name (
         // Handle flushes
         if (flush_i) begin
             re_name_table_gpr_n = '0;
+            re_name_table_fpr_n = '0;
         end
 
     end
@@ -88,8 +103,10 @@ module re_name (
     always_ff @(posedge clk_i or negedge rst_ni) begin
         if (~rst_ni) begin
             re_name_table_gpr_q <= '0;
+            re_name_table_fpr_q <= '0;
         end else begin
             re_name_table_gpr_q <= re_name_table_gpr_n;
+            re_name_table_fpr_q <= re_name_table_fpr_n;
         end
     end
 endmodule
diff --git a/src/scoreboard.sv b/src/scoreboard.sv
index 5ed3e587e..990c59858 100644
--- a/src/scoreboard.sv
+++ b/src/scoreboard.sv
@@ -25,7 +25,8 @@ module scoreboard #(
     input  logic                                      flush_i,  // flush whole scoreboard
     input  logic                                      unresolved_branch_i, // we have an unresolved branch
     // list of clobbered registers to issue stage
-    output fu_t [2**REG_ADDR_SIZE:0]                  rd_clobber_o,
+    output fu_t [2**REG_ADDR_SIZE:0]                  rd_clobber_gpr_o,
+    output fu_t [2**REG_ADDR_SIZE:0]                  rd_clobber_fpr_o,
 
     // regfile like interface to operand read stage
     input  logic [REG_ADDR_SIZE-1:0]                  rs1_i,
@@ -36,12 +37,16 @@ module scoreboard #(
     output logic [63:0]                               rs2_o,
     output logic                                      rs2_valid_o,
 
+    input  logic [REG_ADDR_SIZE-1:0]                  rs3_i,
+    output logic [FLEN-1:0]                           rs3_o,
+    output logic                                      rs3_valid_o,
+
     // advertise instruction to commit stage, if commit_ack_i is asserted advance the commit pointer
     output scoreboard_entry_t [NR_COMMIT_PORTS-1:0]   commit_instr_o,
     input  logic              [NR_COMMIT_PORTS-1:0]   commit_ack_i,
 
-    // instruction to put on top of scoreboard e.g.   : top pointer
-    // we can always put this instruction to the to   p unless we signal with asserted full_o
+    // instruction to put on top of scoreboard e.g.: top pointer
+    // we can always put this instruction to the top unless we signal with asserted full_o
     input  scoreboard_entry_t                         decoded_instr_i,
     input  logic                                      decoded_instr_valid_i,
     output logic                                      decoded_instr_ack_o,
@@ -66,9 +71,9 @@ module scoreboard #(
         scoreboard_entry_t sbe;    // this is the score board entry we will send to ex
     } mem_q [NR_ENTRIES-1:0], mem_n [NR_ENTRIES-1:0];
 
-    logic [$clog2(NR_ENTRIES)-1:0] issue_cnt_n,      issue_cnt_q;
-    logic [$clog2(NR_ENTRIES)-1:0] issue_pointer_n,  issue_pointer_q;
-    logic [$clog2(NR_ENTRIES)-1:0] commit_pointer_n, commit_pointer_q;
+    logic [BITS_ENTRIES-1:0] issue_cnt_n,      issue_cnt_q;
+    logic [BITS_ENTRIES-1:0] issue_pointer_n,  issue_pointer_q;
+    logic [BITS_ENTRIES-1:0] commit_pointer_n, commit_pointer_q;
     logic                          issue_full;
 
     // the issue queue is full don't issue any new instructions
@@ -76,7 +81,7 @@ module scoreboard #(
 
     // output commit instruction directly
     always_comb begin : commit_ports
-        for (logic [$clog2(NR_ENTRIES)-1:0] i = 0; i < NR_COMMIT_PORTS; i++)
+        for (logic [BITS_ENTRIES-1:0] i = 0; i < NR_COMMIT_PORTS; i++)
             commit_instr_o[i] = mem_q[commit_pointer_q + i].sbe;
     end
 
@@ -94,8 +99,8 @@ module scoreboard #(
     // maintain a FIFO with issued instructions
     // keep track of all issued instructions
     always_comb begin : issue_fifo
-        automatic logic [$clog2(NR_ENTRIES)-1:0] issue_cnt;
-        automatic logic [$clog2(NR_ENTRIES)-1:0] commit_pointer;
+        automatic logic [BITS_ENTRIES-1:0] issue_cnt;
+        automatic logic [BITS_ENTRIES-1:0] commit_pointer;
 
         commit_pointer = commit_pointer_q;
         issue_cnt = issue_cnt_q;
@@ -124,13 +129,13 @@ module scoreboard #(
                 mem_n[trans_id_i[i]].sbe.valid  = 1'b1;
                 mem_n[trans_id_i[i]].sbe.result = wbdata_i[i];
                 // save the target address of a branch (needed for debug in commit stage)
-                if (resolved_branch_i.valid) begin
-                    mem_n[trans_id_i[i]].sbe.bp.predict_address = resolved_branch_i.target_address;
-                end
+                mem_n[trans_id_i[i]].sbe.bp.predict_address = resolved_branch_i.target_address;
                 // write the exception back if it is valid
-                if (ex_i[i].valid) begin
+                if (ex_i[i].valid)
                     mem_n[trans_id_i[i]].sbe.ex = ex_i[i];
-                end
+                // write the fflags back from the FPU (exception valid is never set), leave tval intact
+                else if (mem_n[trans_id_i[i]].sbe.fu inside {FPU, FPU_VEC})
+                    mem_n[trans_id_i[i]].sbe.ex.cause = ex_i[i].cause;
             end
         end
 
@@ -138,7 +143,7 @@ module scoreboard #(
         // Commit Port
         // ------------
         // we've got an acknowledge from commit
-        for (logic [$clog2(NR_ENTRIES)-1:0] i = 0; i < NR_COMMIT_PORTS; i++) begin
+        for (logic [BITS_ENTRIES-1:0] i = 0; i < NR_COMMIT_PORTS; i++) begin
             if (commit_ack_i[i]) begin
                 // decrease the issue counter
                 issue_cnt--;
@@ -149,6 +154,7 @@ module scoreboard #(
                 commit_pointer++;
             end
         end
+
         // ------
         // Flush
         // ------
@@ -164,6 +170,7 @@ module scoreboard #(
                 commit_pointer        = '0;
             end
         end
+
         // update issue counter
         issue_cnt_n      = issue_cnt;
         // update commit potiner
@@ -175,16 +182,20 @@ module scoreboard #(
     // -------------------
     // rd_clobber output: output currently clobbered destination registers
     always_comb begin : clobber_output
-        rd_clobber_o = '{default: NONE};
+        rd_clobber_gpr_o = '{default: NONE};
+        rd_clobber_fpr_o = '{default: NONE};
         // check for all valid entries and set the clobber register accordingly
         for (int unsigned i = 0; i < NR_ENTRIES; i++) begin
             if (mem_q[i].issued) begin
                 // output the functional unit which is going to clobber this register
-                rd_clobber_o[mem_q[i].sbe.rd] = mem_q[i].sbe.fu;
+                if (is_rd_fpr(mem_q[i].sbe.op))
+                    rd_clobber_fpr_o[mem_q[i].sbe.rd] = mem_q[i].sbe.fu;
+                else
+                    rd_clobber_gpr_o[mem_q[i].sbe.rd] = mem_q[i].sbe.fu;
             end
         end
-        // the zero register is always free
-        rd_clobber_o[0] = NONE;
+        // the gpr zero register is always free
+        rd_clobber_gpr_o[0] = NONE;
     end
 
     // ----------------------------------
@@ -194,20 +205,26 @@ module scoreboard #(
     always_comb begin : read_operands
         rs1_o       = 64'b0;
         rs2_o       = 64'b0;
+        rs3_o       = '0;
         rs1_valid_o = 1'b0;
         rs2_valid_o = 1'b0;
+        rs3_valid_o = 1'b0;
 
         for (int unsigned i = 0; i < NR_ENTRIES; i++) begin
             // only consider this entry if it is valid
             if (mem_q[i].issued) begin
                 // look at the appropriate fields and look whether there was an
-                // instruction that wrote the rd field before, first for RS1 and then for RS2
-                if (mem_q[i].sbe.rd == rs1_i) begin
+                // instruction that wrote the rd field before, first for RS1 and then for RS2, then for RS3
+                // we check the type of the stored result register file against issued register file
+                if ((mem_q[i].sbe.rd == rs1_i) && (is_rd_fpr(mem_q[i].sbe.op) == is_rs1_fpr(issue_instr_o.op))) begin
                     rs1_o       = mem_q[i].sbe.result;
                     rs1_valid_o = mem_q[i].sbe.valid;
-                end else if (mem_q[i].sbe.rd == rs2_i) begin
+                end else if ((mem_q[i].sbe.rd == rs2_i) && (is_rd_fpr(mem_q[i].sbe.op) == is_rs2_fpr(issue_instr_o.op))) begin
                     rs2_o       = mem_q[i].sbe.result;
                     rs2_valid_o = mem_q[i].sbe.valid;
+                end else if ((mem_q[i].sbe.rd == rs3_i) && (is_rd_fpr(mem_q[i].sbe.op) == is_imm_fpr(issue_instr_o.op))) begin
+                    rs3_o       = mem_q[i].sbe.result;
+                    rs3_valid_o = mem_q[i].sbe.valid;
                 end
             end
         end
@@ -218,22 +235,30 @@ module scoreboard #(
         // provide a direct combinational path from WB a.k.a forwarding
         // make sure that we are not forwarding a result that got an exception
         for (int unsigned j = 0; j < NR_WB_PORTS; j++) begin
-            if (mem_q[trans_id_i[j]].sbe.rd == rs1_i && wb_valid_i[j] && ~ex_i[j].valid) begin
+            if (mem_q[trans_id_i[j]].sbe.rd == rs1_i && wb_valid_i[j] && ~ex_i[j].valid
+               && (is_rd_fpr(mem_q[trans_id_i[j]].sbe.op) == is_rs1_fpr(issue_instr_o.op))) begin
                 rs1_o = wbdata_i[j];
                 rs1_valid_o = wb_valid_i[j];
                 break;
             end
-            if (mem_q[trans_id_i[j]].sbe.rd == rs2_i && wb_valid_i[j] && ~ex_i[j].valid) begin
+            if (mem_q[trans_id_i[j]].sbe.rd == rs2_i && wb_valid_i[j] && ~ex_i[j].valid
+               && (is_rd_fpr(mem_q[trans_id_i[j]].sbe.op) == is_rs2_fpr(issue_instr_o.op))) begin
                 rs2_o = wbdata_i[j];
                 rs2_valid_o = wb_valid_i[j];
                 break;
             end
+            if (mem_q[trans_id_i[j]].sbe.rd == rs3_i && wb_valid_i[j] && ~ex_i[j].valid
+               && (is_rd_fpr(mem_q[trans_id_i[j]].sbe.op) == is_imm_fpr(issue_instr_o.op))) begin
+                rs3_o = wbdata_i[j];
+                rs3_valid_o = wb_valid_i[j];
+                break;
+            end
         end
 
         // make sure we didn't read the zero register
-        if (rs1_i == '0)
+        if (rs1_i == '0 && ~is_rs1_fpr(issue_instr_o.op)) // only GPR reg0 is 0
             rs1_valid_o = 1'b0;
-        if (rs2_i == '0)
+        if (rs2_i == '0 && ~is_rs2_fpr(issue_instr_o.op)) // only GPR reg0 is 0
             rs2_valid_o = 1'b0;
     end
 
@@ -254,12 +279,12 @@ module scoreboard #(
     `ifndef SYNTHESIS
     `ifndef verilator
     initial begin
-        assert (NR_ENTRIES == 2**$clog2(NR_ENTRIES)) else $fatal("Scoreboard size needs to be a power of two.");
+        assert (NR_ENTRIES == 2**BITS_ENTRIES) else $fatal("Scoreboard size needs to be a power of two.");
     end
 
     // assert that zero is never set
     assert property (
-        @(posedge clk_i) rst_ni |-> (rd_clobber_o[0] == NONE))
+        @(posedge clk_i) rst_ni |-> (rd_clobber_gpr_o[0] == NONE))
         else $error ("RD 0 should not bet set");
     // assert that we never acknowledge a commit if the instruction is not valid
     assert property (
diff --git a/src/tech_cells_generic b/src/tech_cells_generic
new file mode 160000
index 000000000..ffe7818dc
--- /dev/null
+++ b/src/tech_cells_generic
@@ -0,0 +1 @@
+Subproject commit ffe7818dc24eba29cf3634d404d1b3b85034272b
diff --git a/src/util/cluster_clock_gating.sv b/src/util/cluster_clock_gating.sv
deleted file mode 100644
index f2b10b29f..000000000
--- a/src/util/cluster_clock_gating.sv
+++ /dev/null
@@ -1,40 +0,0 @@
-// Copyright 2018 ETH Zurich and University of Bologna.
-// Copyright and related rights are licensed under the Solderpad Hardware
-// License, Version 0.51 (the "License"); you may not use this file except in
-// compliance with the License.  You may obtain a copy of the License at
-// http://solderpad.org/licenses/SHL-0.51. Unless required by applicable law
-// or agreed to in writing, software, hardware and materials distributed under
-// this License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-//
-// Behavioural GLock Gating
-// File:   cluster_clock_gating.sv
-// Author: ?
-// Date:   ?
-
-module cluster_clock_gating (
-    input  logic clk_i,
-    input  logic en_i,
-    input  logic test_en_i,
-    output logic clk_o
- );
-
-`ifdef PULP_FPGA_EMUL
-  // no clock gates in FPGA flow
-  assign clk_o = clk_i;
-`elsif verilator
-  assign clk_o = clk_i;
-`else
-  logic clk_en;
-
-  always_latch
-  begin
-     if (clk_i == 1'b0)
-       clk_en <= en_i | test_en_i;
-  end
-
-  assign clk_o = clk_i & clk_en;
-`endif
-
-endmodule
diff --git a/src/util/find_first_one.sv b/src/util/find_first_one.sv
new file mode 100644
index 000000000..53653f20d
--- /dev/null
+++ b/src/util/find_first_one.sv
@@ -0,0 +1,85 @@
+// Copyright (c) 2018 ETH Zurich, University of Bologna
+// All rights reserved.
+//
+// This code is under development and not yet released to the public.
+// Until it is released, the code is under the copyright of ETH Zurich and
+// the University of Bologna, and may contain confidential and/or unpublished
+// work. Any reuse/redistribution is strictly forbidden without written
+// permission from ETH Zurich.
+//
+// Bug fixes and contributions will eventually be released under the
+// SolderPad open hardware license in the context of the PULP platform
+// (http://www.pulp-platform.org), under the copyright of ETH Zurich and the
+// University of Bologna.
+
+
+/// A leading-one finder / leading zero counter.
+/// Set FLIP to 0 for find_first_one => first_one_o is the index of the first one (from the LSB)
+/// Set FLIP to 1 for leading zero counter => first_one_o is the number of leading zeroes (from the MSB)
+module find_first_one #(
+    /// The width of the input vector (the default of -1 is rejected by the simulation-only assertion below).
+    parameter int WIDTH = -1,
+    parameter int FLIP = 0 // non-zero: the search runs on a bit-reversed copy of in_i
+)(
+    input  logic [WIDTH-1:0]         in_i,
+    output logic [$clog2(WIDTH)-1:0] first_one_o, // lowest set index of the (optionally reversed) input
+    output logic                     no_ones_o    // inverted OR of the tree root, see the final assigns
+);
+
+    localparam int NUM_LEVELS = $clog2(WIDTH); // number of levels in the binary selection tree
+
+    // pragma translate_off
+    initial begin
+        assert(WIDTH >= 0); // NOTE(review): also passes for WIDTH == 0 - confirm that > 0 was not intended
+    end
+    // pragma translate_on
+
+    logic [WIDTH-1:0][NUM_LEVELS-1:0]          index_lut;   // index_lut[j] is driven with the constant j
+    logic [2**NUM_LEVELS-1:0]                  sel_nodes;   // per tree node: OR of the input bits below it
+    logic [2**NUM_LEVELS-1:0][NUM_LEVELS-1:0]  index_nodes; // per tree node: index chosen among the bits below it
+
+    logic [WIDTH-1:0] in_tmp; // in_i, bit-reversed when FLIP is set
+
+    for (genvar i = 0; i < WIDTH; i++) begin
+        assign in_tmp[i] = FLIP ? in_i[WIDTH-1-i] : in_i[i];
+    end
+
+    for (genvar j = 0; j < WIDTH; j++) begin
+        assign index_lut[j] = j;
+    end
+
+    // nodes are numbered heap-style: node 0 is the root, level L occupies nodes 2**L-1 .. 2**(L+1)-2
+    for (genvar level = 0; level < NUM_LEVELS; level++) begin
+
+        if (level < NUM_LEVELS-1) begin // inner level: merge the two child nodes of the next level
+            for (genvar l = 0; l < 2**level; l++) begin
+                assign sel_nodes[2**level-1+l]   = sel_nodes[2**(level+1)-1+l*2] | sel_nodes[2**(level+1)-1+l*2+1];
+                // the first child wins whenever it saw a one, so the lower index is preferred
+                assign index_nodes[2**level-1+l] = (sel_nodes[2**(level+1)-1+l*2] == 1'b1) ?
+                    index_nodes[2**(level+1)-1+l*2] : index_nodes[2**(level+1)-1+l*2+1];
+            end
+        end
+
+        if (level == NUM_LEVELS-1) begin // leaf level: each node looks at input bits k*2 and k*2+1
+            for (genvar k = 0; k < 2**level; k++) begin
+                // if two successive indices are still in the vector...
+                if (k * 2 < WIDTH-1) begin
+                    assign sel_nodes[2**level-1+k]   = in_tmp[k*2] | in_tmp[k*2+1];
+                    assign index_nodes[2**level-1+k] = (in_tmp[k*2] == 1'b1) ? index_lut[k*2] : index_lut[k*2+1];
+                end
+                // if only the first index is still in the vector...
+                if (k * 2 == WIDTH-1) begin
+                    assign sel_nodes[2**level-1+k]   = in_tmp[k*2];
+                    assign index_nodes[2**level-1+k] = index_lut[k*2];
+                end
+                // if index is out of range: tie the node off so it can never be selected
+                if (k * 2 > WIDTH-1) begin
+                    assign sel_nodes[2**level-1+k]   = 1'b0;
+                    assign index_nodes[2**level-1+k] = '0;
+                end
+            end
+        end
+    end
+
+    // with NUM_LEVELS == 0 no tree is generated and the outputs are tied to constants
+    assign first_one_o = NUM_LEVELS > 0 ? index_nodes[0] : '0;
+    assign no_ones_o   = NUM_LEVELS > 0 ? ~sel_nodes[0]  : '1;
+
+endmodule
diff --git a/src/util/instruction_trace_item.svh b/src/util/instruction_trace_item.svh
index 144f275dd..d976179c6 100644
--- a/src/util/instruction_trace_item.svh
+++ b/src/util/instruction_trace_item.svh
@@ -19,9 +19,12 @@ class instruction_trace_item;
     scoreboard_entry_t sbe;
     logic [31:0]       pc;
     logic [31:0]       instr;
-    logic [63:0]       reg_file [32];
+    logic [63:0]       gp_reg_file [32];
+    logic [63:0]       fp_reg_file [32];
     logic [4:0]        read_regs [$];
+    logic              read_fpr [$];
     logic [4:0]        result_regs [$];
+    logic              result_fpr [$];
     logic [63:0]       imm;
     logic [63:0]       result;
     logic [63:0]       paddr;
@@ -31,14 +34,15 @@ class instruction_trace_item;
     logic [4:0] rs1, rs2, rs3, rd;
 
     // constructor creating a new instruction trace item, e.g.: a single instruction with all relevant information
-    function new (time simtime, longint unsigned cycle, scoreboard_entry_t sbe, logic [31:0] instr, logic [63:0] reg_file [32],
-                  logic [63:0] result, logic [63:0] paddr, riscv::priv_lvl_t priv_lvl, logic debug_mode, branchpredict_t bp);
+    function new (time simtime, longint unsigned cycle, scoreboard_entry_t sbe, logic [31:0] instr, logic [63:0] gp_reg_file [32],
+                logic [63:0] fp_reg_file [32], logic [63:0] result, logic [63:0] paddr, riscv::priv_lvl_t priv_lvl, logic debug_mode, branchpredict_t bp);
         this.simtime  = simtime;
         this.cycle    = cycle;
         this.pc       = sbe.pc;
         this.sbe      = sbe;
         this.instr    = instr;
-        this.reg_file = reg_file;
+        this.gp_reg_file = gp_reg_file;
+        this.fp_reg_file = fp_reg_file;
         this.result   = result;
         this.paddr    = paddr;
         this.bp       = bp;
@@ -48,7 +52,8 @@ class instruction_trace_item;
         this.rs3      = instr[31:27];
         this.rd       = sbe.rd[4:0];
     endfunction
-    // convert register address to ABI compatible form
+
+    // convert gp register address to ABI compatible form
     function string regAddrToStr(logic [5:0] addr);
         case (addr[4:0])
             0: return "x0";
@@ -63,9 +68,64 @@ class instruction_trace_item;
             default: return $sformatf("s%0d", (addr[4:0] - 16));
         endcase
     endfunction
+    // convert fp register address to ABI compatible form; only addr[4:0] is decoded (as in regAddrToStr), so addr[5] is ignored
+    function string fpRegAddrToStr(logic [5:0] addr);
+        case (addr[4:0]) inside
+            [0:7]   : return $sformatf("ft%0d", addr[4:0]);        // f0-f7   -> ft0-ft7
+            [8:9]   : return $sformatf("fs%0d", (addr[4:0] - 8));  // f8-f9   -> fs0-fs1
+            [10:17] : return $sformatf("fa%0d", (addr[4:0] - 10)); // f10-f17 -> fa0-fa7
+            [18:27] : return $sformatf("fs%0d", (addr[4:0] - 16)); // f18-f27 -> fs2-fs11
+            [28:31] : return $sformatf("ft%0d", (addr[4:0] - 20)); // f28-f31 -> ft8-ft11
+        endcase
+    endfunction
+
+    function string fpFmtToStr(logic [1:0] fmt); // translate a 2-bit fmt code into a one-letter string
+        case (fmt)
+            2'b00 : return "s";
+            2'b01 : return "d";
+            2'b10 : return "h";
+            2'b11 : return "b";
+            default : return "XX"; // all four 2-state codes are listed above, so only X/Z bits in fmt land here
+        endcase
+    endfunction
+
+    function string fmvFpFmtToStr(logic [1:0] fmt); // same table as fpFmtToStr except that 2'b00 yields "w"
+        case (fmt)
+            2'b00 : return "w";
+            2'b01 : return "d";
+            2'b10 : return "h";
+            2'b11 : return "b";
+            default : return "XX"; // all four 2-state codes are listed above, so only X/Z bits in fmt land here
+        endcase
+    endfunction
+
+    function string intFmtToStr(logic [1:0] ifmt); // translate a 2-bit ifmt code into "w", "wu", "l" or "lu"
+        case (ifmt)
+            2'b00 : return "w";
+            2'b01 : return "wu";
+            2'b10 : return "l";
+            2'b11 : return "lu";
+            default : return "XX"; // all four 2-state codes are listed above, so only X/Z bits in ifmt land here
+        endcase
+    endfunction
+
+    function string fpRmToStr(logic [2:0] rm); // translate a 3-bit rm code into a three-letter string
+        case (rm)
+            3'b000 : return "rne";
+            3'b001 : return "rtz";
+            3'b010 : return "rdn";
+            3'b011 : return "rup";
+            3'b100 : return "rmm";
+            3'b111 : return "dyn"; // NOTE(review): open question from the author - confirm "dyn" matches the rv binutils spelling
+            default: return "INVALID"; // 3'b101, 3'b110 and any value containing X/Z bits
+        endcase
+    endfunction
 
     function string csrAddrToStr(logic [11:0] addr);
         case (addr)
+            riscv::CSR_FFLAGS:     return "fflags";
+            riscv::CSR_FRM:        return "frm";
+            riscv::CSR_FCSR:       return "fcsr";
             riscv::CSR_SSTATUS:    return "sstatus";
             riscv::CSR_SIE:        return "sie";
             riscv::CSR_STVEC:      return "stvec";
@@ -120,7 +180,7 @@ class instruction_trace_item;
     function string printInstr();
         string s;
 
-        casex (instr)
+        case (instr) inside
              // Aliases
             32'h00_00_00_13:           s = this.printMnemonic("nop");
             // Regular opcodes
@@ -174,6 +234,33 @@ class instruction_trace_item;
             INSTR_SRLW:                s = this.printRInstr("srlw");
             INSTR_SRAW:                s = this.printRInstr("sraw");
             INSTR_MULW:                s = this.printMulInstr(1'b1);
+            // FP
+            INSTR_FMADD:               s = this.printR4Instr("fmadd");
+            INSTR_FMSUB:               s = this.printR4Instr("fmsub");
+            INSTR_FNSMSUB:             s = this.printR4Instr("fnmsub");
+            INSTR_FNMADD:              s = this.printR4Instr("fnmadd");
+
+            INSTR_FADD:                s = this.printRFBCInstr("fadd", 1'b1);
+            INSTR_FSUB:                s = this.printRFBCInstr("fsub", 1'b1);
+            INSTR_FMUL:                s = this.printRFInstr("fmul", 1'b1);
+            INSTR_FDIV:                s = this.printRFInstr("fdiv", 1'b1);
+            INSTR_FSQRT:               s = this.printRFInstr1Op("fsqrt", 1'b1);
+            INSTR_FSGNJ:               s = this.printRFInstr("fsgnj", 1'b0);
+            INSTR_FSGNJN:              s = this.printRFInstr("fsgnjn", 1'b0);
+            INSTR_FSGNJX:              s = this.printRFInstr("fsgnjx", 1'b0);
+            INSTR_FMIN:                s = this.printRFInstr("fmin", 1'b0);
+            INSTR_FMAX:                s = this.printRFInstr("fmax", 1'b0);
+            INSTR_FLE:                 s = this.printRFInstr("fle", 1'b0);
+            INSTR_FLT:                 s = this.printRFInstr("flt", 1'b0);
+            INSTR_FEQ:                 s = this.printRFInstr("feq", 1'b0);
+
+            INSTR_FCLASS:              s = this.printRFInstr1Op("fclass", 1'b0);
+
+            INSTR_FCVT_F2F,
+            INSTR_FMV_F2X,
+            INSTR_FMV_X2F,
+            INSTR_FCVT_F2I,
+            INSTR_FCVT_I2F:            s = this.printFpSpecialInstr(); // these are a mess to do nicely
             // FENCE
             INSTR_FENCE:               s = this.printMnemonic("fence");
             INSTR_FENCEI:              s = this.printMnemonic("fence.i");
@@ -201,14 +288,16 @@ class instruction_trace_item;
             INSTR_WFI:                 s = this.printMnemonic("wfi");
             INSTR_SFENCE:              s = this.printMnemonic("sfence.vma");
             // loads and stores
-            INSTR_LOAD:                s = this.printLoadInstr();
-            INSTR_STORE:               s = this.printStoreInstr();
+            INSTR_LOAD,
+            INSTR_LOAD_FP:             s = this.printLoadInstr();
+            INSTR_STORE,
+            INSTR_STORE_FP:            s = this.printStoreInstr();
             INSTR_AMO:                 s = this.printAMOInstr();
             default:                   s = this.printMnemonic("INVALID");
         endcase
 
 
-        s = $sformatf("%10t %10d %s %h %h %h %-36s", simtime,
+        s = $sformatf("%8dns %8d %s %h %h %h %-36s", simtime,
                                              cycle,
                                              priv_lvl,
                                              sbe.pc,
@@ -223,23 +312,29 @@ class instruction_trace_item;
         //                                      s);
 
         foreach (result_regs[i]) begin
-            if (result_regs[i] != 0)
+            if (result_fpr[i])
+                s = $sformatf("%s %-4s:%16x", s, fpRegAddrToStr(result_regs[i]), this.result);
+            else if (result_regs[i] != 0)
                 s = $sformatf("%s %-4s:%16x", s, regAddrToStr(result_regs[i]), this.result);
         end
 
         foreach (read_regs[i]) begin
-            if (read_regs[i] != 0)
-                s = $sformatf("%s %-4s:%16x", s, regAddrToStr(read_regs[i]), reg_file[read_regs[i]]);
+            if (read_fpr[i])
+                s = $sformatf("%s %-4s:%16x", s, fpRegAddrToStr(read_regs[i]), fp_reg_file[read_regs[i]]);
+            else if (read_regs[i] != 0)
+                s = $sformatf("%s %-4s:%16x", s, regAddrToStr(read_regs[i]), gp_reg_file[read_regs[i]]);
         end
 
-        casex (instr)
+        case (instr) inside
             // check of the instrction was a load or store
-            INSTR_STORE: begin
-                logic [63:0] vaddress = reg_file[read_regs[1]] + this.imm;
+            INSTR_STORE,
+            INSTR_STORE_FP: begin
+                logic [63:0] vaddress = gp_reg_file[read_regs[1]] + this.imm;
                 s = $sformatf("%s VA: %x PA: %x", s, vaddress, this.paddr);
             end
-            INSTR_LOAD: begin
-                logic [63:0] vaddress = reg_file[read_regs[0]] + this.imm;
+            INSTR_LOAD,
+            INSTR_LOAD_FP: begin
+                logic [63:0] vaddress = gp_reg_file[read_regs[0]] + this.imm;
                 s = $sformatf("%s VA: %x PA: %x", s, vaddress, this.paddr);
             end
         endcase
@@ -261,48 +356,131 @@ class instruction_trace_item;
 
     function string printRInstr(input string mnemonic);
 
-        result_regs.push_back(sbe.rd);
-        read_regs.push_back(sbe.rs1);
-        read_regs.push_back(sbe.rs2);
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b0);
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
+        read_regs.push_back(rs2);
+        read_fpr.push_back(1'b0);
 
-        return $sformatf("%-16s %s, %s, %s", mnemonic, regAddrToStr(sbe.rd), regAddrToStr(sbe.rs1), regAddrToStr(sbe.rs2));
+        return $sformatf("%-12s %4s, %s, %s", mnemonic, regAddrToStr(rd), regAddrToStr(rs1), regAddrToStr(rs2));
     endfunction // printRInstr
 
+    function string printRFBCInstr(input string mnemonic, input bit use_rnd);
+        // Formats "<mnemonic>.<fmt> rd, src, src[, rm]" where the two sources are rs2 and the register index held in sbe.result[4:0] (used for fadd/fsub).
+        result_regs.push_back(rd);
+        result_fpr.push_back(is_rd_fpr(sbe.op));
+        read_regs.push_back(rs2);
+        read_fpr.push_back(is_rs2_fpr(sbe.op));
+        read_regs.push_back(sbe.result[4:0]);
+        read_fpr.push_back(is_imm_fpr(sbe.op));
+        // the rounding mode is appended only when use_rnd is set and the rm field is not 3'b111 (printed as "dyn" by fpRmToStr)
+        if (use_rnd && instr[14:12]!=3'b111)
+            return $sformatf("%-12s %4s, %s, %s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), is_rd_fpr(sbe.op)?fpRegAddrToStr(rd):regAddrToStr(rd), is_rs2_fpr(sbe.op)?fpRegAddrToStr(rs2):regAddrToStr(rs2), is_imm_fpr(sbe.op)?fpRegAddrToStr(sbe.result[4:0]):regAddrToStr(sbe.result[4:0]), fpRmToStr(instr[14:12]));
+        else
+            return $sformatf("%-12s %4s, %s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), is_rd_fpr(sbe.op)?fpRegAddrToStr(rd):regAddrToStr(rd), is_rs2_fpr(sbe.op)?fpRegAddrToStr(rs2):regAddrToStr(rs2), is_imm_fpr(sbe.op)?fpRegAddrToStr(sbe.result[4:0]):regAddrToStr(sbe.result[4:0]));
+    endfunction // printRFBCInstr
+
+    function string printRFInstr(input string mnemonic, input bit use_rnd);
+        // Formats "<mnemonic>.<fmt> rd, rs1, rs2[, rm]"; each register is named as FP or integer according to is_rd_fpr/is_rs1_fpr/is_rs2_fpr(sbe.op).
+        result_regs.push_back(rd);
+        result_fpr.push_back(is_rd_fpr(sbe.op));
+        read_regs.push_back(rs1);
+        read_fpr.push_back(is_rs1_fpr(sbe.op));
+        read_regs.push_back(rs2);
+        read_fpr.push_back(is_rs2_fpr(sbe.op));
+        // the rounding mode is appended only when use_rnd is set and the rm field is not 3'b111 (printed as "dyn" by fpRmToStr)
+        if (use_rnd && instr[14:12]!=3'b111)
+            return $sformatf("%-12s %4s, %s, %s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), is_rd_fpr(sbe.op)?fpRegAddrToStr(rd):regAddrToStr(rd), is_rs1_fpr(sbe.op)?fpRegAddrToStr(rs1):regAddrToStr(rs1), is_rs2_fpr(sbe.op)?fpRegAddrToStr(rs2):regAddrToStr(rs2), fpRmToStr(instr[14:12]));
+        else
+            return $sformatf("%-12s %4s, %s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), is_rd_fpr(sbe.op)?fpRegAddrToStr(rd):regAddrToStr(rd), is_rs1_fpr(sbe.op)?fpRegAddrToStr(rs1):regAddrToStr(rs1), is_rs2_fpr(sbe.op)?fpRegAddrToStr(rs2):regAddrToStr(rs2));
+    endfunction // printRFInstr
+
+    function string printRFInstr1Op(input string mnemonic, input bit use_rnd);
+        // Formats a single-source FP instruction as "<mnemonic>.<fmt> rd, rs1[, rm]" and records rd and rs1 for the operand dump.
+        result_regs.push_back(rd);
+        result_fpr.push_back(is_rd_fpr(sbe.op));
+        read_regs.push_back(rs1);
+        read_fpr.push_back(is_rs1_fpr(sbe.op));
+        // the rounding mode is appended only when use_rnd is set and the rm field is not 3'b111 (printed as "dyn" by fpRmToStr)
+        if (use_rnd && instr[14:12]!=3'b111)
+            return $sformatf("%-12s %4s, %s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), is_rd_fpr(sbe.op)?fpRegAddrToStr(rd):regAddrToStr(rd), is_rs1_fpr(sbe.op)?fpRegAddrToStr(rs1):regAddrToStr(rs1), fpRmToStr(instr[14:12]));
+        else
+            return $sformatf("%-12s %4s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), is_rd_fpr(sbe.op)?fpRegAddrToStr(rd):regAddrToStr(rd), is_rs1_fpr(sbe.op)?fpRegAddrToStr(rs1):regAddrToStr(rs1));
+    endfunction // printRFInstr1Op
+
+    function string printR4Instr(input string mnemonic);
+        // Formats a three-source FP instruction as "<mnemonic>.<fmt> rd, rs1, rs2, rs3, rm"; all four registers are recorded as FP registers.
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b1);
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b1);
+        read_regs.push_back(rs2);
+        read_fpr.push_back(1'b1);
+        read_regs.push_back(rs3); // NOTE(review): the text below names this operand from instr[31:27]; presumably rs3 is the same field - confirm
+        read_fpr.push_back(1'b1);
+        // unlike the other FP printers, the rounding mode is always printed here, including 3'b111 ("dyn")
+        return $sformatf("%-12s %4s, %s, %s, %s, %s", $sformatf("%s.%s",mnemonic, fpFmtToStr(instr[26:25])), fpRegAddrToStr(rd), fpRegAddrToStr(rs1), fpRegAddrToStr(rs2), fpRegAddrToStr(instr[31:27]), fpRmToStr(instr[14:12]));
+    endfunction // printR4Instr
+
+    function string printFpSpecialInstr();
+        // Formats FP conversions and moves (fcvt.*, fmv.*); the text layout is selected by the decoded operation in sbe.op, rd and rs1 are recorded for the operand dump.
+        result_regs.push_back(rd);
+        result_fpr.push_back(is_rd_fpr(sbe.op));
+        read_regs.push_back(rs1);
+        read_fpr.push_back(is_rs1_fpr(sbe.op));
+        case (sbe.op)
+            FCVT_F2F : return $sformatf("%-12s %4s, %s, %s", $sformatf("fcvt.%s.%s", fpFmtToStr(instr[26:25]), fpFmtToStr(instr[21:20])), fpRegAddrToStr(rd), fpRegAddrToStr(rs1), fpRmToStr(instr[14:12]));
+            FCVT_F2I : return $sformatf("%-12s %4s, %s, %s", $sformatf("fcvt.%s.%s", intFmtToStr(instr[21:20]), fpFmtToStr(instr[26:25])), regAddrToStr(rd), fpRegAddrToStr(rs1), fpRmToStr(instr[14:12]));
+            FCVT_I2F : return $sformatf("%-12s %4s, %s, %s", $sformatf("fcvt.%s.%s", fpFmtToStr(instr[26:25]), intFmtToStr(instr[21:20])), fpRegAddrToStr(rd), regAddrToStr(rs1), fpRmToStr(instr[14:12]));
+            FMV_F2X  : return $sformatf("%-12s %4s, %s", $sformatf("fmv.x.%s", fmvFpFmtToStr(instr[26:25])), regAddrToStr(rd), fpRegAddrToStr(rs1));
+            FMV_X2F  : return $sformatf("%-12s %4s, %s", $sformatf("fmv.%s.x", fmvFpFmtToStr(instr[26:25])), fpRegAddrToStr(rd), regAddrToStr(rs1));
+            default  : return printMnemonic("INVALID"); // sbe.op matches none of the arms above: report it instead of falling off the end with an empty string
+        endcase
+    endfunction
+
     function string printIInstr(input string mnemonic);
 
-        result_regs.push_back(sbe.rd);
-        read_regs.push_back(sbe.rs1);
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b0);
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
 
-        if (sbe.rs1 == 0)
-            return $sformatf("%-16s %s, %0d", mnemonic, regAddrToStr(sbe.rd), $signed(sbe.result));
+        if (rs1 == 0)
+            return $sformatf("%-12s %4s, %0d", mnemonic, regAddrToStr(rd), $signed(sbe.result));
 
-        return $sformatf("%-16s %s, %s, %0d", mnemonic, regAddrToStr(sbe.rd), regAddrToStr(sbe.rs1), $signed(sbe.result));
+        return $sformatf("%-12s %4s, %s, %0d", mnemonic, regAddrToStr(rd), regAddrToStr(rs1), $signed(sbe.result));
     endfunction // printIInstr
 
     function string printIuInstr(input string mnemonic);
 
-        result_regs.push_back(sbe.rd);
-        read_regs.push_back(sbe.rs1);
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b0);
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
 
-        return $sformatf("%-16s %s, %s, 0x%0x", mnemonic, regAddrToStr(sbe.rd), regAddrToStr(sbe.rs1), sbe.result);
+        return $sformatf("%-12s %4s, %s, 0x%0x", mnemonic, regAddrToStr(rd), regAddrToStr(rs1), sbe.result);
     endfunction // printIuInstr
 
     function string printSBInstr(input string mnemonic);
 
-        read_regs.push_back(sbe.rs1);
-        read_regs.push_back(sbe.rs2);
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
+        read_regs.push_back(rs2);
+        read_fpr.push_back(1'b0);
 
-        if (sbe.rs2 == 0)
-            return $sformatf("%-16s %s, pc + %0d", mnemonic, regAddrToStr(sbe.rs1), $signed(sbe.result));
+        if (rs2 == 0)
+            return $sformatf("%-12s %4s, pc + %0d", mnemonic, regAddrToStr(rs1), $signed(sbe.result));
         else
-            return $sformatf("%-16s %s, %s, pc + %0d", mnemonic, regAddrToStr(sbe.rs1), regAddrToStr(sbe.rs2), $signed(sbe.result));
+            return $sformatf("%-12s %4s, %s, pc + %0d", mnemonic, regAddrToStr(rs1), regAddrToStr(rs2), $signed(sbe.result));
     endfunction // printIuInstr
 
     function string printUInstr(input string mnemonic);
 
-        result_regs.push_back(sbe.rd);
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b0);
 
-        return $sformatf("%-16s %s, 0x%0h", mnemonic, regAddrToStr(sbe.rd), sbe.result[31:12]);
+        return $sformatf("%-12s %4s, 0x%0h", mnemonic, regAddrToStr(rd), sbe.result[31:12]);
     endfunction // printUInstr
 
     function string printJump();
@@ -329,58 +507,70 @@ class instruction_trace_item;
 
     function string printUJInstr(input string mnemonic);
 
-        result_regs.push_back(sbe.rd);
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b0);
         // jump instruction
-        if (sbe.rd == 0)
-            return $sformatf("%-16s pc + %0d", mnemonic, $signed(sbe.result));
+        if (rd == 0)
+            return $sformatf("%-12s   pc + %0d", mnemonic, $signed(sbe.result));
         else
-            return $sformatf("%-16s %s, pc + %0d", mnemonic, regAddrToStr(sbe.rd), $signed(sbe.result));
+            return $sformatf("%-12s %4s, pc + %0d", mnemonic, regAddrToStr(rd), $signed(sbe.result));
     endfunction // printUJInstr
 
     function string printCSRInstr(input string mnemonic);
 
-        result_regs.push_back(sbe.rd);
+        result_regs.push_back(rd);
+        result_fpr.push_back(1'b0);
         if (instr[14] == 0) begin
-        read_regs.push_back(sbe.rs1);
-            if (sbe.rd != 0 && sbe.rs1 != 0) begin
-                  return $sformatf("%-16s %s, %s, %s", mnemonic, regAddrToStr(sbe.rd), regAddrToStr(sbe.rs1), csrAddrToStr(sbe.result[11:0]));
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
+            if (rd != 0 && rs1 != 0) begin
+                  return $sformatf("%-12s %4s, %s, %s", mnemonic, regAddrToStr(rd), regAddrToStr(rs1), csrAddrToStr(sbe.result[11:0]));
             // don't display instructions which write to zero
-            end else if (sbe.rd == 0) begin
-                  return $sformatf("%-16s %s, %s", mnemonic, regAddrToStr(sbe.rs1), csrAddrToStr(sbe.result[11:0]));
-            end else if (sbe.rs1 == 0) begin
-                return $sformatf("%-16s %s, %s", mnemonic, regAddrToStr(sbe.rd), csrAddrToStr(sbe.result[11:0]));
+            end else if (rd == 0) begin
+                  return $sformatf("%-12s %4s, %s", mnemonic, regAddrToStr(rs1), csrAddrToStr(sbe.result[11:0]));
+            end else if (rs1 == 0) begin
+                return $sformatf("%-12s %4s, %s", mnemonic, regAddrToStr(rd), csrAddrToStr(sbe.result[11:0]));
             end
         end else begin
-            if (sbe.rd != 0 && sbe.rs1 != 0) begin
-                  return $sformatf("%-16s %s, %d, %s", mnemonic, regAddrToStr(sbe.rd), $unsigned(sbe.rs1), csrAddrToStr(sbe.result[11:0]));
+            if (rd != 0 && rs1 != 0) begin
+                  return $sformatf("%-12s %4s, %d, %s", mnemonic, regAddrToStr(rd), $unsigned(rs1), csrAddrToStr(sbe.result[11:0]));
             // don't display instructions which write to zero
-            end else if (sbe.rd == 0) begin
-                  return $sformatf("%-16s %d, %s", mnemonic, $unsigned(sbe.rs1), csrAddrToStr(sbe.result[11:0]));
-            end else if (sbe.rs1 == 0) begin
-                return $sformatf("%-16s %s, %s", mnemonic, regAddrToStr(sbe.rd), csrAddrToStr(sbe.result[11:0]));
+            end else if (rd == 0) begin
+                  return $sformatf("%-14s %2d, %s", mnemonic, $unsigned(rs1), csrAddrToStr(sbe.result[11:0]));
+            end else if (rs1 == 0) begin
+                return $sformatf("%-12s %4s, %s", mnemonic, regAddrToStr(rd), csrAddrToStr(sbe.result[11:0]));
             end
         end
     endfunction // printCSRInstr
 
     function string printLoadInstr();
       string mnemonic;
-      case (instr[14:12])
-        3'b000: mnemonic = "lb";
-        3'b001: mnemonic = "lh";
-        3'b010: mnemonic = "lw";
-        3'b100: mnemonic = "lbu";
-        3'b101: mnemonic = "lhu";
-        3'b110: mnemonic = "lwu";
-        3'b011: mnemonic = "ld";
-        default: return printMnemonic("INVALID");
-      endcase
 
-      result_regs.push_back(sbe.rd);
-      read_regs.push_back(sbe.rs1);
-      // save the immediate for calculating the virtual address
-      this.imm = sbe.result;
+        case (instr[14:12])
+          3'b000: mnemonic = "lb";
+          3'b001: mnemonic = "lh";
+          3'b010: mnemonic = "lw";
+          3'b100: mnemonic = "lbu";
+          3'b101: mnemonic = "lhu";
+          3'b110: mnemonic = "lwu";
+          3'b011: mnemonic = "ld";
+          default: return printMnemonic("INVALID");
+        endcase
 
-      return $sformatf("%-16s %s, %0d(%s)", mnemonic, regAddrToStr(sbe.rd), $signed(sbe.result), regAddrToStr(sbe.rs1));
+        if (instr[6:0] == riscv::OpcodeLoadFp)
+            mnemonic = $sformatf("f%s",mnemonic);
+
+        result_regs.push_back(rd);
+        result_fpr.push_back(is_rd_fpr(sbe.op));
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
+        // save the immediate for calculating the virtual address
+        this.imm = sbe.result;
+
+        if (instr[6:0] == riscv::OpcodeLoadFp)
+            return $sformatf("%-12s %4s, %0d(%s)", mnemonic, fpRegAddrToStr(rd), $signed(sbe.result), regAddrToStr(rs1));
+        else
+            return $sformatf("%-12s %4s, %0d(%s)", mnemonic, regAddrToStr(rd), $signed(sbe.result), regAddrToStr(rs1));
     endfunction
 
     function string printStoreInstr();
@@ -393,12 +583,20 @@ class instruction_trace_item;
           default: return printMnemonic("INVALID");
         endcase
 
-        read_regs.push_back(sbe.rs2);
-        read_regs.push_back(sbe.rs1);
+        if (instr[6:0] == riscv::OpcodeStoreFp)
+            mnemonic = $sformatf("f%s",mnemonic);
+
+        read_regs.push_back(rs2);
+        read_fpr.push_back(is_rs2_fpr(sbe.op));
+        read_regs.push_back(rs1);
+        read_fpr.push_back(1'b0);
         // save the immediate for calculating the virtual address
         this.imm = sbe.result;
 
-        return $sformatf("%-16s %s, %0d(%s)", mnemonic, regAddrToStr(sbe.rs2), $signed(sbe.result), regAddrToStr(sbe.rs1));
+        if (instr[6:0] == riscv::OpcodeStoreFp)
+            return $sformatf("%-12s %4s, %0d(%s)", mnemonic, fpRegAddrToStr(rs2), $signed(sbe.result), regAddrToStr(rs1));
+        else
+            return $sformatf("%-12s %4s, %0d(%s)", mnemonic, regAddrToStr(rs2), $signed(sbe.result), regAddrToStr(rs1));
     endfunction // printSInstr
 
     function string printAMOInstr();
diff --git a/src/util/instruction_tracer.svh b/src/util/instruction_tracer.svh
index 2b96043c4..87f5d730a 100644
--- a/src/util/instruction_tracer.svh
+++ b/src/util/instruction_tracer.svh
@@ -25,8 +25,9 @@ class instruction_tracer;
     scoreboard_entry_t issue_sbe;
     // store resolved branches, get (mis-)predictions
     branchpredict_t bp [$];
-    // shadow copy of the register file
-    logic [63:0] reg_file [32];
+    // shadow copy of the register files
+    logic [63:0] gp_reg_file [32];
+    logic [63:0] fp_reg_file [32];
     // 64 bit clock tick count
     longint unsigned clk_ticks;
     int f, commit_log;
@@ -60,7 +61,7 @@ class instruction_tracer;
         logic [31:0] decode_instruction, issue_instruction, issue_commit_instruction;
         scoreboard_entry_t commit_instruction;
         // initialize register 0
-        reg_file [0] = 0;
+        gp_reg_file [0] = 0;
 
         forever begin
             automatic branchpredict_t bp_instruction = '0;
@@ -125,10 +126,12 @@ class instruction_tracer;
                     // the scoreboards issue entry still contains the immediate value as a result
                     // check if the write back is valid, if not we need to source the result from the register file
                     // as the most recent version of this register will be there.
-                    if (tracer_if.pck.we[i]) begin
+                    if (tracer_if.pck.we_gpr[i] || tracer_if.pck.we_fpr[i]) begin
                         printInstr(issue_sbe, issue_commit_instruction, tracer_if.pck.wdata[i], address_mapping, tracer_if.pck.priv_lvl, tracer_if.pck.debug_mode, bp_instruction);
+                    end else if (is_rd_fpr(commit_instruction.op)) begin
+                        printInstr(issue_sbe, issue_commit_instruction, fp_reg_file[commit_instruction.rd], address_mapping, tracer_if.pck.priv_lvl, tracer_if.pck.debug_mode, bp_instruction);
                     end else begin
-                        printInstr(issue_sbe, issue_commit_instruction, reg_file[commit_instruction.rd], address_mapping, tracer_if.pck.priv_lvl, tracer_if.pck.debug_mode, bp_instruction);
+                        printInstr(issue_sbe, issue_commit_instruction, gp_reg_file[commit_instruction.rd], address_mapping, tracer_if.pck.priv_lvl, tracer_if.pck.debug_mode, bp_instruction);
                     end
                 end
             end
@@ -142,13 +145,14 @@ class instruction_tracer;
             // ----------------------
             // Commit Registers
             // ----------------------
-            // update shadow reg file here
+            // update shadow reg files here
             for (int i = 0; i < 2; i++) begin
-                if (tracer_if.pck.we[i] && tracer_if.pck.waddr[i] != 5'b0) begin
-                    reg_file[tracer_if.pck.waddr[i]] = tracer_if.pck.wdata[i];
+                if (tracer_if.pck.we_gpr[i] && tracer_if.pck.waddr[i] != 5'b0) begin
+                    gp_reg_file[tracer_if.pck.waddr[i]] = tracer_if.pck.wdata[i];
+                end else if (tracer_if.pck.we_fpr[i]) begin
+                    fp_reg_file[tracer_if.pck.waddr[i]] = tracer_if.pck.wdata[i];
                 end
             end
-
             // --------------
             // Flush Signals
             // --------------
@@ -182,11 +186,11 @@ class instruction_tracer;
     endfunction
 
     function void printInstr(scoreboard_entry_t sbe, logic [31:0] instr, logic [63:0] result, logic [63:0] paddr, riscv::priv_lvl_t priv_lvl, logic debug_mode, branchpredict_t bp);
-        instruction_trace_item iti = new ($time, clk_ticks, sbe, instr, this.reg_file, result, paddr, priv_lvl, debug_mode, bp);
+        instruction_trace_item iti = new ($time, clk_ticks, sbe, instr, this.gp_reg_file, this.fp_reg_file, result, paddr, priv_lvl, debug_mode, bp);
         // print instruction to console
         string print_instr = iti.printInstr();
         if (ENABLE_SPIKE_COMMIT_LOG && !debug_mode) begin
-            $fwrite(this.commit_log, riscv::spikeCommitLog(sbe.pc, priv_lvl, instr, sbe.rd, result));
+            $fwrite(this.commit_log, riscv::spikeCommitLog(sbe.pc, priv_lvl, instr, sbe.rd, result, is_rd_fpr(sbe.op)));
         end
         uvm_report_info( "Tracer",  print_instr, UVM_HIGH);
         $fwrite(this.f, {print_instr, "\n"});
diff --git a/src/util/instruction_tracer_defines.svh b/src/util/instruction_tracer_defines.svh
index 4b9756e45..e0f961819 100644
--- a/src/util/instruction_tracer_defines.svh
+++ b/src/util/instruction_tracer_defines.svh
@@ -28,23 +28,23 @@ parameter INSTR_BGE      =  { 7'b?, 5'b?, 5'b?, 3'b101, 5'b?, riscv::OpcodeBranc
 parameter INSTR_BLTU     =  { 7'b?, 5'b?, 5'b?, 3'b110, 5'b?, riscv::OpcodeBranch };
 parameter INSTR_BGEU     =  { 7'b?, 5'b?, 5'b?, 3'b111, 5'b?, riscv::OpcodeBranch };
 
-// OPIMM
-parameter INSTR_LI       =  { 12'b?, 5'b0, 3'b000, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_ADDI     =  { 17'b?, 3'b000, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_SLTI     =  { 17'b?, 3'b010, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_SLTIU    =  { 17'b?, 3'b011, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_XORI     =  { 17'b?, 3'b100, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_ORI      =  { 17'b?, 3'b110, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_ANDI     =  { 17'b?, 3'b111, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_SLLI     =  { 6'b000000, 11'b?, 3'b001, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_SRLI     =  { 6'b000000, 11'b?, 3'b101, 5'b?, riscv::OpcodeOpimm };
-parameter INSTR_SRAI     =  { 6'b010000, 11'b?, 3'b101, 5'b?, riscv::OpcodeOpimm };
+// OP-IMM
+parameter INSTR_LI       =  { 12'b?, 5'b0, 3'b000, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_ADDI     =  { 17'b?, 3'b000, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_SLTI     =  { 17'b?, 3'b010, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_SLTIU    =  { 17'b?, 3'b011, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_XORI     =  { 17'b?, 3'b100, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_ORI      =  { 17'b?, 3'b110, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_ANDI     =  { 17'b?, 3'b111, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_SLLI     =  { 6'b000000, 11'b?, 3'b001, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_SRLI     =  { 6'b000000, 11'b?, 3'b101, 5'b?, riscv::OpcodeOpImm };
+parameter INSTR_SRAI     =  { 6'b010000, 11'b?, 3'b101, 5'b?, riscv::OpcodeOpImm };
 
-// OPIMM32
-parameter INSTR_ADDIW    =  { 17'b?, 3'b000, 5'b?, riscv::OpcodeOpimm32 };
-parameter INSTR_SLLIW    =  { 7'b0000000, 10'b?, 3'b001, 5'b?, riscv::OpcodeOpimm32 };
-parameter INSTR_SRLIW    =  { 7'b0000000, 10'b?, 3'b101, 5'b?, riscv::OpcodeOpimm32 };
-parameter INSTR_SRAIW    =  { 7'b0100000, 10'b?, 3'b101, 5'b?, riscv::OpcodeOpimm32 };
+// OP-IMM-32
+parameter INSTR_ADDIW    =  { 17'b?, 3'b000, 5'b?, riscv::OpcodeOpImm32 };
+parameter INSTR_SLLIW    =  { 7'b0000000, 10'b?, 3'b001, 5'b?, riscv::OpcodeOpImm32 };
+parameter INSTR_SRLIW    =  { 7'b0000000, 10'b?, 3'b101, 5'b?, riscv::OpcodeOpImm32 };
+parameter INSTR_SRAIW    =  { 7'b0100000, 10'b?, 3'b101, 5'b?, riscv::OpcodeOpImm32 };
 
 // OP
 parameter INSTR_ADD      =  { 7'b0000000, 10'b?, 3'b000, 5'b?, riscv::OpcodeOp };
@@ -67,9 +67,10 @@ parameter INSTR_SRLW     =  { 7'b0000000, 10'b?, 3'b101, 5'b?, riscv::OpcodeOp32
 parameter INSTR_SRAW     =  { 7'b0100000, 10'b?, 3'b101, 5'b?, riscv::OpcodeOp32 };
 parameter INSTR_MULW     =  { 7'b0000001, 10'b?, 3'b???, 5'b?, riscv::OpcodeOp32 };
 
-// FENCE
-parameter INSTR_FENCE    =  { 4'b0, 8'b?, 13'b0, riscv::OpcodeFence };
-parameter INSTR_FENCEI   =  { 17'b0, 3'b001, 5'b0, riscv::OpcodeFence };
+// MISC-MEM
+parameter INSTR_FENCE    =  { 4'b0, 8'b?, 13'b0, riscv::OpcodeMiscMem };
+parameter INSTR_FENCEI   =  { 17'b0, 3'b001, 5'b0, riscv::OpcodeMiscMem };
+
 // SYSTEM
 parameter INSTR_CSRW     =  { 12'b?, 5'b?, 3'b001, 5'b0, riscv::OpcodeSystem };
 parameter INSTR_CSRRW    =  { 12'b?, 5'b?, 3'b001, 5'b?, riscv::OpcodeSystem };
@@ -101,9 +102,38 @@ parameter INSTR_DIVU     =  { 7'b0000001, 10'b?, 3'b101, 5'b?, riscv::OpcodeOp }
 parameter INSTR_REM      =  { 7'b0000001, 10'b?, 3'b110, 5'b?, riscv::OpcodeOp };
 parameter INSTR_REMU     =  { 7'b0000001, 10'b?, 3'b111, 5'b?, riscv::OpcodeOp };
 
+// RVFD
+parameter INSTR_FMADD    =  { 5'b?, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeMadd};
+parameter INSTR_FMSUB    =  { 5'b?, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeMsub};
+parameter INSTR_FNSMSUB  =  { 5'b?, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeNmsub};
+parameter INSTR_FNMADD   =  { 5'b?, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeNmadd};
+
+parameter INSTR_FADD     =  { 5'b00000, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FSUB     =  { 5'b00001, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FMUL     =  { 5'b00010, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FDIV     =  { 5'b00011, 2'b?, 5'b?, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FSQRT    =  { 5'b01011, 2'b?, 5'b0, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FSGNJ    =  { 5'b00100, 2'b?, 5'b?, 5'b?, 3'b000, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FSGNJN   =  { 5'b00100, 2'b?, 5'b?, 5'b?, 3'b001, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FSGNJX   =  { 5'b00100, 2'b?, 5'b?, 5'b?, 3'b010, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FMIN     =  { 5'b00101, 2'b?, 5'b?, 5'b?, 3'b000, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FMAX     =  { 5'b00101, 2'b?, 5'b?, 5'b?, 3'b001, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FLE      =  { 5'b10100, 2'b?, 5'b?, 5'b?, 3'b000, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FLT      =  { 5'b10100, 2'b?, 5'b?, 5'b?, 3'b001, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FEQ      =  { 5'b10100, 2'b?, 5'b?, 5'b?, 3'b010, 5'b?, riscv::OpcodeOpFp};
+
+parameter INSTR_FCVT_F2F =  { 5'b01000, 2'b?, 5'b000??, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FMV_F2X  =  { 5'b11100, 2'b?, 5'b0, 5'b?, 3'b000, 5'b?,   riscv::OpcodeOpFp};
+parameter INSTR_FCLASS   =  { 5'b11100, 2'b?, 5'b0, 5'b?, 3'b001, 5'b?,   riscv::OpcodeOpFp};
+parameter INSTR_FMV_X2F  =  { 5'b11110, 2'b?, 5'b0, 5'b?, 3'b000, 5'b?,   riscv::OpcodeOpFp};
+parameter INSTR_FCVT_F2I =  { 5'b11000, 2'b?, 5'b000??, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+parameter INSTR_FCVT_I2F =  { 5'b11010, 2'b?, 5'b000??, 5'b?, 3'b?, 5'b?, riscv::OpcodeOpFp};
+
 // A
 parameter INSTR_AMO      =  {25'b?, riscv::OpcodeAmo };
 
 // Load/Stores
-parameter INSTR_LOAD     =  {25'b?, riscv::OpcodeLoad };
-parameter INSTR_STORE    =  {25'b?, riscv::OpcodeStore };
+parameter INSTR_LOAD     =  {25'b?, riscv::OpcodeLoad};
+parameter INSTR_LOAD_FP  =  {25'b?, riscv::OpcodeLoadFp};
+parameter INSTR_STORE    =  {25'b?, riscv::OpcodeStore};
+parameter INSTR_STORE_FP =  {25'b?, riscv::OpcodeStoreFp};
diff --git a/src/util/instruction_tracer_if.sv b/src/util/instruction_tracer_if.sv
index 86e8007e2..fe19684ea 100644
--- a/src/util/instruction_tracer_if.sv
+++ b/src/util/instruction_tracer_if.sv
@@ -32,7 +32,8 @@ interface instruction_tracer_if (
     // WB stage
     logic [1:0][4:0]  waddr;
     logic [1:0][63:0] wdata;
-    logic [1:0]       we;
+    logic [1:0]       we_gpr;
+    logic [1:0]       we_fpr;
     // commit stage
     scoreboard_entry_t [1:0] commit_instr; // commit instruction
     logic              [1:0] commit_ack;
@@ -56,7 +57,7 @@ interface instruction_tracer_if (
     clocking pck @(posedge clk);
         input rstn, flush_unissued, flush, instruction, fetch_valid, fetch_ack, issue_ack, issue_sbe, waddr,
               st_valid, st_paddr, ld_valid, ld_kill, ld_paddr, resolve_branch,
-              wdata, we, commit_instr, commit_ack, exception, priv_lvl, debug_mode;
+              wdata, we_gpr, we_fpr, commit_instr, commit_ack, exception, priv_lvl, debug_mode;
     endclocking
     `endif
 
diff --git a/tb/ariane_testharness.sv b/tb/ariane_testharness.sv
index 3b451ca10..66103be8b 100644
--- a/tb/ariane_testharness.sv
+++ b/tb/ariane_testharness.sv
@@ -131,6 +131,7 @@ module ariane_testharness #(
     dmi_jtag i_dmi_jtag (
         .clk_i            ( clk_i           ),
         .rst_ni           ( rst_ni          ),
+        .testmode_i       ( test_en         ),
         .dmi_req_o        ( jtag_dmi_req    ),
         .dmi_req_valid_o  ( jtag_req_valid  ),
         .dmi_req_ready_i  ( debug_req_ready ),
@@ -300,6 +301,7 @@ module ariane_testharness #(
     ) i_clint (
         .clk_i       ( clk_i     ),
         .rst_ni      ( rst_ni    ),
+        .testmode_i  ( test_en   ),
         .slave       ( master[1] ),
         .rtc_i       ( rtc_i     ),
         .timer_irq_o ( timer_irq ),
diff --git a/tb/wave/wave_core.do b/tb/wave/wave_core.do
index 8e74fc121..53caa9d0c 100644
--- a/tb/wave/wave_core.do
+++ b/tb/wave/wave_core.do
@@ -5,8 +5,8 @@ add wave -noupdate -group frontend -group icache /ariane_tb/dut/i_ariane/i_std_c
 add wave -noupdate -group frontend -group ras /ariane_tb/dut/i_ariane/i_frontend/i_ras/*
 add wave -noupdate -group frontend -group btb /ariane_tb/dut/i_ariane/i_frontend/i_btb/*
 add wave -noupdate -group frontend -group bht /ariane_tb/dut/i_ariane/i_frontend/i_bht/*
-add wave -noupdate -group frontend -group instr_scan /ariane_tb/dut/i_ariane/i_frontend/*/i_instr_scan/*
-add wave -noupdate -group frontend -group fetch_fifo /ariane_tb/dut/i_ariane/i_frontend/i_fetch_fifo/*
+# add wave -noupdate -group frontend -group instr_scan /ariane_tb/dut/i_ariane/i_frontend/*/i_instr_scan/*
+# add wave -noupdate -group frontend -group fetch_fifo /ariane_tb/dut/i_ariane/i_frontend/i_fetch_fifo/*
 
 add wave -noupdate -group id_stage -group decoder /ariane_tb/dut/i_ariane/id_stage_i/decoder_i/*
 add wave -noupdate -group id_stage -group compressed_decoder /ariane_tb/dut/i_ariane/id_stage_i/compressed_decoder_i/*
@@ -22,6 +22,8 @@ add wave -noupdate -group ex_stage -group alu /ariane_tb/dut/i_ariane/ex_stage_i
 add wave -noupdate -group ex_stage -group mult /ariane_tb/dut/i_ariane/ex_stage_i/i_mult/*
 add wave -noupdate -group ex_stage -group mult -group mul /ariane_tb/dut/i_ariane/ex_stage_i/i_mult/i_mul/*
 add wave -noupdate -group ex_stage -group mult -group div /ariane_tb/dut/i_ariane/ex_stage_i/i_mult/i_div/*
+add wave -noupdate -group ex_stage -group fpu /ariane_tb/dut/i_ariane/ex_stage_i/fpu_gen/fpu_i/*
+add wave -noupdate -group ex_stage -group fpu -group fpnew /ariane_tb/dut/i_ariane/ex_stage_i/fpu_gen/fpu_i/fpnew_top_i/i_fpnew/*
 
 add wave -noupdate -group ex_stage -group lsu /ariane_tb/dut/i_ariane/ex_stage_i/lsu_i/*
 add wave -noupdate -group ex_stage -group lsu  -group lsu_bypass /ariane_tb/dut/i_ariane/ex_stage_i/lsu_i/lsu_bypass_i/*