diff --git a/README.md b/README.md index e839f24bf..691a46a04 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,9 @@ CVA6 is a 6-stage, single-issue, in-order CPU which implements the 64-bit RISC-V It has a configurable size, separate TLBs, a hardware PTW and branch-prediction (branch target buffer and branch history table). The primary design goal was on reducing critical path length. +A performance model of CVA6 is available in the `perf-model/` folder of this repository. +It can be used to investigate performance-related micro-architecture changes. + diff --git a/perf-model/README.md b/perf-model/README.md new file mode 100644 index 000000000..77ba8575a --- /dev/null +++ b/perf-model/README.md @@ -0,0 +1,79 @@ +# CVA6 cycle-accurate performance model + +This repository contains a cycle-accurate performance model of the CVA6 control path. + +It was developed to explore microarchitecture changes in CVA6 before implementing them. + +To cite this model, please head to the end of this document. + + +## Getting started + +### Adapt RVFI trace generation + +The regular expression expects the cycle number to be in the RVFI trace. +The value is not used by the model but it is used to compare the model and CVA6. + +To emit the cycle number in the RVFI trace, modify `corev_apu/tb/rvfi_tracer.sv` in the CVA6 repository as below. + +```diff +- $fwrite(f, "core 0: 0x%h (0x%h) DASM(%h)\n", +- pc64, rvfi_i[i].insn, rvfi_i[i].insn); ++ $fwrite(f, "core 0: 0x%h (0x%h) @%d DASM(%h)\n", ++ pc64, rvfi_i[i].insn, cycles, rvfi_i[i].insn); +``` + + +### Generate an RVFI trace + +To generate an RVFI trace, follow the instructions in the CVA6 repository to run a simulation. +The RVFI trace will be in `verif/sim/out_<date>/<simulator>/<test-name>.log`. + + +### Running the model + +```bash +python3 model.py verif/sim/out_<date>/<simulator>/<test-name>.log +``` + + +### Exploring design space + +In `model.py`, the `main` function runs the model with arguments which override default values. +Generic parameters are available in `Model.__init__`.
+You can add new parameters to explore here. + +To perform exploration, run the model in a loop, like `issue_commit_graph` does. +The `display_scores` function is meant to print a 3D plot if you have `matplotlib`. +`issue_commit_graph` prints the scores so that you can store them and display the figure without re-running the model. + + +## Files + +| Name | Description | +| :--- | :--- | +| `cycle_diff.py` | Calculates the duration of each instruction in an RVFI trace | +| `isa.py` | Module to create Python objects from RISC-V instructions | +| `model.py` | The CVA6 performance model | + + +## Citing + +```bibtex +@inproceedings{cf24, + author = {Allart, C\^{o}me and Coulon, Jean-Roch and Sintzoff, Andr\'{e} and Potin, Olivier and Rigaud, Jean-Baptiste}, + title = {Using a Performance Model to Implement a Superscalar CVA6}, + year = {2024}, + isbn = {9798400704925}, + publisher = {Association for Computing Machinery}, + url = {https://doi.org/10.1145/3637543.3652871}, + doi = {10.1145/3637543.3652871}, + abstract = {A performance model of CVA6 RISC-V processor is built to evaluate performance-related modifications before implementing them in RTL. Its accuracy is 99.2\% on CoreMark. This model is used to evaluate a superscalar feature for CVA6. During design phase, the model helped detecting and fixing performance bugs.
The superscalar feature resulted in a CVA6 performance improvement of 40\% on CoreMark.}, + booktitle = {Proceedings of the 21st ACM International Conference on Computing Frontiers: Workshops and Special Sessions}, + pages = {43–46}, + numpages = {4}, + keywords = {CVA6, Cycle-Based Model, Multi-Issue, Performance, RISC-V, Superscalar}, + location = {Ischia, Italy}, + series = {CF '24 Companion} +} +``` diff --git a/perf-model/cycle_diff.py b/perf-model/cycle_diff.py new file mode 100644 index 000000000..f9b67bd4c --- /dev/null +++ b/perf-model/cycle_diff.py @@ -0,0 +1,80 @@ +# Copyright 2024 Thales Silicon Security +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Côme ALLART - Thales + +import re +import sys + +re_csrr_minstret = re.compile(r"^csrr\s+\w+,\s*minstret$") +re_full = re.compile( + r"([a-z]+)\s+0:\s*0x00000000([0-9a-f]+)\s*\(([0-9a-fx]+)\)\s*(\S*)@\s*([0-9]+)\s*(.*)" +) + +class Trace: + def __init__(self, addr, cycle, mnemo, flags): + self.addr = addr + self.cycle = cycle + self.mnemo = mnemo + self.flags = flags + self.delta = None + + def report(self): + """Format the trace as '+<delta> <flags> 0x<addr>: <mnemo>' (one output line, without newline)""" + return f"+{self.delta} {self.flags} 0x{self.addr}: {self.mnemo}" + +def print_data(name, value): + "Prints 'name = data' with alignment of the '='" + spaces = ' ' * (24 - len(name)) + print(f"{name}{spaces} = {value}") + +def read_traces(input_file): + "Collect traces from file, keeping only those seen while accepting; each 'csrr ..., minstret' line toggles accepting and is itself dropped" + l = [] + def filter_add(trace): + if not hasattr(filter_add, "accepting"): + filter_add.accepting = False + if re_csrr_minstret.search(trace.mnemo): + filter_add.accepting = not filter_add.accepting + return + if filter_add.accepting: + l.append(trace) + with open(input_file, "r", encoding="utf8") as f: + for line
in [l.strip() for l in f]: + found = re_full.search(line) + if found: + addr = found.group(2) + flags = found.group(4) + cycle = int(found.group(5)) + mnemo = found.group(6) + filter_add(Trace(addr, cycle, mnemo, flags)) + #l.append(Trace(addr, cycle, mnemo, flags)) + return l + +def write_traces(outfile, traces): + "Write all instructions to output file" + print("output file:", outfile) + with open(outfile, "w", encoding="utf8") as f: + for trace in traces: + f.write(trace.report() + "\n") + +def main(input_file: str): + "Main function" + traces = read_traces(input_file) + cycle = traces[0].cycle + cycle_number = traces[-1].cycle - cycle + 1 + for trace in traces: + trace.delta = trace.cycle - cycle + cycle = trace.cycle + print_data("cycle number", cycle_number) + print_data("Coremark/MHz", 1000000 / cycle_number) + print_data("instruction number", len(traces)) + print_data("IPC", len(traces) / cycle_number) + write_traces("traceout.log", traces) + +if __name__ == "__main__": + main(sys.argv[1]) diff --git a/perf-model/isa.py b/perf-model/isa.py new file mode 100644 index 000000000..90cd772a4 --- /dev/null +++ b/perf-model/isa.py @@ -0,0 +1,574 @@ +# Copyright 2024 Thales Silicon Security +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Côme ALLART - Thales + +""" +Represents the instruction set +""" + +from dataclasses import dataclass + +class Reg: + """Constants to represent registers""" + # ABI names + zero = 0 + ra = 1 + sp = 2 + gp = 3 + tp = 4 + t0 = 5 + t1 = 6 + t2 = 7 + s0 = 8 + fp = 8 + s1 = 9 + a0 = 10 + a1 = 11 + a2 = 12 + a3 = 13 + a4 = 14 + a5 = 15 + a6 = 16 + a7 = 17 + s2 = 18 + s3 = 19 + s4 = 20 + s5 = 21 + s6 = 22 + s7 = 23 + s8 = 24 + s9 = 25 + s10 = 26 + s11 = 27 + t3 = 28 + t4 = 29 + t5 = 30 + t6 = 31 + # Register names + x0 = 0 + x1 = 1 + x2 = 2 + x3 = 3 + x4 = 4 + x5 = 5 + x6 = 6 + x7 = 7 + x8 = 8 + x9 = 9 + x10 = 10 + x11 = 11 + x12 = 12 + x13 = 13 + x14 = 14 + x15 = 15 + x16 = 16 + x17 = 17 + x18 = 18 + x19 = 19 + x20 = 20 + x21 = 21 + x22 = 22 + x23 = 23 + x24 = 24 + x25 = 25 + x26 = 26 + x27 = 27 + x28 = 28 + x29 = 29 + x30 = 30 + x31 = 31 + +def sign_ext(imm, index, xlen=32): + """ + Sign extends a value (the result is the non-negative xlen-bit two's-complement pattern) + imm: value to sign extend, must have no bit set above the sign bit + index: index of the sign bit of the value + xlen: target bit width of the sign-extended value + """ + imm_bits = index + 1 + assert (imm >> imm_bits) == 0 + neg = imm >> index + sext_bits = xlen - imm_bits + sext_ones = (1 << sext_bits) - 1 + sext = neg * sext_ones << imm_bits + return sext | imm + +@dataclass +class AddrFields: + """Represents the data used to build a memory address""" + base_reg: int + offset: int + +class Rtype: + """R-type instructions (opcode is the 7-bit field in bits 6:0)""" + def __init__(self, instr): + self.funct7 = instr.bin >> 25 + self.rs2 = (instr.bin >> 20) & 31 + self.rs1 = (instr.bin >> 15) & 31 + self.funct3 = (instr.bin >> 12) & 7 + self.rd = (instr.bin >> 7) & 31 + self.opcode = instr.bin & 127 + +class Itype: + """I-type instructions (imm is the sign-extended 12-bit immediate from bits 31:20)""" + def __init__(self, instr): + self.rs1 = (instr.bin >> 15) & 31 + self.funct3 = (instr.bin >> 12) & 7 + self.rd = (instr.bin >> 7) & 31 + self.opcode = instr.bin & 127 +
self.imm = sign_ext(instr.bin >> 20, 11) + +class Stype: + """S-type instructions (imm is the sign-extended 12-bit immediate split over bits 31:25 and 11:7)""" + def __init__(self, instr): + self.rs2 = (instr.bin >> 20) & 31 + self.rs1 = (instr.bin >> 15) & 31 + self.funct3 = (instr.bin >> 12) & 7 + self.opcode = instr.bin & 127 + self.imm = sign_ext( + ((instr.bin >> 25) << 5) \ + | ((instr.bin >> 7) & 31) + , 11) + +class Btype: + """B-type instructions (imm is the sign-extended 13-bit branch offset, bit 0 always clear)""" + def __init__(self, instr): + self.rs2 = (instr.bin >> 20) & 31 + self.rs1 = (instr.bin >> 15) & 31 + self.funct3 = (instr.bin >> 12) & 7 + self.opcode = instr.bin & 127 + self.imm = sign_ext( + ((instr.bin >> 31) << 12) \ + | (((instr.bin >> 7) & 1) << 11) \ + | (((instr.bin >> 25) & 0x3f) << 5) \ + | (((instr.bin >> 8) & 15) << 1) + , 12) + +class Utype: + """U-type instructions (imm is bits 31:12 kept in place, low 12 bits clear)""" + def __init__(self, instr): + self.imm_31_12 = instr.bin >> 12 + self.imm_4_0 = (instr.bin >> 7) & 31 + self.rd = (instr.bin >> 7) & 31 + self.opcode = instr.bin & 127 + self.imm = self.imm_31_12 << 12 + +class Jtype: + """J-type instructions (imm is the sign-extended 21-bit jump offset, bit 0 always clear)""" + def __init__(self, instr): + self.rd = (instr.bin >> 7) & 31 + self.opcode = instr.bin & 127 + self.imm = sign_ext( + ((instr.bin >> 31) << 20) \ + | (((instr.bin >> 12) & 0xff) << 12) \ + | (((instr.bin >> 20) & 1) << 11) \ + | (((instr.bin >> 21) & 0x3ff) << 1) + , 20) + +class MOItype: + """Memory ordering instructions""" + def __init__(self, instr): + self.fm = instr.bin >> 28 + self.PI = (instr.bin >> 27) & 1 + self.PO = (instr.bin >> 26) & 1 + self.PR = (instr.bin >> 25) & 1 + self.PW = (instr.bin >> 24) & 1 + self.SI = (instr.bin >> 23) & 1 + self.SO = (instr.bin >> 22) & 1 + self.SR = (instr.bin >> 21) & 1 + self.SW = (instr.bin >> 20) & 1 + self.rs1 = (instr.bin >> 15) & 31 + self.funct3 = (instr.bin >> 12) & 7 + self.rd = (instr.bin >> 7) & 31 + self.opcode = instr.bin & 127 + +class CRtype: + """Compressed register""" + def __init__(self, instr): + self.funct4 = instr.bin >> 12 + r = (instr.bin >> 7) & 31 + self.rs2 = (instr.bin >> 2) & 31 + self.op =
instr.bin & 3 + self.rs1 = r + base = instr.base() + if base == 'C.J[AL]R/C.MV/C.ADD': + if self.funct4 & 1: + if self.rs2 == 0: + if r == 0: + base = 'C.EBREAK' + else: + base = 'C.JALR' + else: + base = 'C.ADD' + else: + if self.rs2 == 0: + base = 'C.JR' + else: + base = 'C.MV' + if base in CRtype.regreg: + self.rd = r + self.name = base + + control = ['C.JR', 'C.JALR'] + regreg = ['C.MV', 'C.ADD'] + +class CItype: + """Compressed immediate""" + def __init__(self, instr): + self.funct3 = instr.bin >> 13 + r = (instr.bin >> 7) & 31 + self.op = instr.bin & 3 + base = instr.base() + if base == 'C.LUI/C.ADDI16SP': + if r == Reg.sp: + base = 'C.ADDI16SP' + else: + base = 'C.LUI' + if base in CItype.SPload + CItype.constgen: + self.rd = r + if base in CItype.SPload: + self.rs1 = Reg.sp + self.offset = CItype.offset[base](instr.bin) + # zero-extended offset + if base == 'C.LI': + self.imm = sign_ext(CItype.imm(instr.bin), 5) + if base == 'C.LUI': + self.nzimm = sign_ext(CItype.imm(instr.bin) << 12, 17) + if base in CItype.regimm: + self.rd = r + self.rs1 = r + if base == 'C.ADDI': + self.nzimm = sign_ext(CItype.imm(instr.bin), 5) + if base == 'C.ADDIW': + self.imm = sign_ext(CItype.imm(instr.bin), 5) + if base == 'C.ADDI16SP': + self.nzimm = sign_ext(CItype.immsp(instr.bin), 9) + if base == 'C.SLLI': + self.shamt = CItype.imm(instr.bin) + + SPload = ['C.LWSP', 'C.LDSP', 'C.LQSP', 'C.FLWSP', 'C.FLDSP'] + constgen = ['C.LI', 'C.LUI'] + regimm = ['C.ADDI', 'C.ADDIW', 'C.ADDI16SP', 'C.SLLI'] + + Woffset = lambda i: (((i >> 12) & 1) << 5) | (((i >> 4) & 7) << 2) \ + | (((i >> 2) & 3) << 6) + Doffset = lambda i: (((i >> 12) & 1) << 5) | (((i >> 5) & 3) << 3) \ + | (((i >> 2) & 7) << 6) + Qoffset = lambda i: (((i >> 12) & 1) << 5) | (((i >> 6) & 1) << 4) \ + | (((i >> 2) & 15) << 6) + imm = lambda i: (((i >> 12) & 1) << 5) | ((i >> 2) & 31) + immsp = lambda i: (((i >> 12) & 1) << 9) | (((i >> 6) & 1) << 4) \ + | (((i >> 5) & 1) << 6) | (((i >> 3) & 3) << 7) \ + | (((i >> 2) & 
1) << 5) + + offset = { + 'C.LWSP': Woffset, + 'C.LDSP': Doffset, + 'C.LQSP': Qoffset, + 'C.FLWSP': Woffset, + 'C.FLDSP': Doffset, + } + +class CSStype: + """Compressed stack-relative store""" + def __init__(self, instr): + self.funct3 = instr.bin >> 13 + self.rs1 = Reg.sp + self.rs2 = (instr.bin >> 2) & 31 + self.op = instr.bin & 3 + self.offset = CSStype.offset[instr.base()](instr.bin) + # zero-extended offset + + Woffset = lambda i: (((i >> 9) & 15) << 2) | (((i >> 7) & 3) << 6) + Doffset = lambda i: (((i >> 10) & 7) << 3) | (((i >> 7) & 7) << 6) + Qoffset = lambda i: (((i >> 11) & 3) << 4) | (((i >> 7) & 15) << 6) + + offset = { + 'C.SWSP': Woffset, + 'C.SDSP': Doffset, + 'C.SQSP': Qoffset, + 'C.FSWSP': Woffset, + 'C.FSDSP': Doffset, + } + +class CIWtype: + """Compressed wide immediate""" + def __init__(self, instr): + i = instr.bin + self.funct3 = i >> 13 + rd_ = (i >> 2) & 7 + self.rd = rd_ + 8 + self.op = i & 3 + self.nzuimm = (((i >> 11) & 3) << 4) | (((i >> 7) & 15) << 6) \ + | (((i >> 6) & 1) << 2) | (((i >> 5) & 1) << 3) + # zero-extended (unsigned) non-zero immediate + if instr.base() == 'C.ADDI4SPN': + self.rs1 = Reg.sp + +CLS_Woffset = lambda i: (((i >> 10) & 7) << 3) | (((i >> 6) & 1) << 2) \ + | (((i >> 5) & 1) << 6) +CLS_Doffset = lambda i: (((i >> 10) & 7) << 3) | (((i >> 5) & 3) << 6) +CLS_Qoffset = lambda i: (((i >> 11) & 3) << 4) | (((i >> 10) & 1) << 8) \ + | (((i >> 5) & 3) << 6) + +class CLtype: + """Compressed load""" + def __init__(self, instr): + self.funct3 = instr.bin >> 13 + rs1_ = (instr.bin >> 7) & 7 + rd_ = (instr.bin >> 2) & 7 + self.rs1 = rs1_ + 8 + self.rd = rd_ + 8 + self.op = instr.bin & 3 + self.offset = CLtype.offset[instr.base()](instr.bin) + # zero-extended offset + + offset = { + 'C.LW': CLS_Woffset, + 'C.LD': CLS_Doffset, + 'C.LQ': CLS_Qoffset, + 'C.FLW': CLS_Woffset, + 'C.FLD': CLS_Doffset, + } + +class CStype: + """Compressed store""" + def __init__(self, instr): + self.funct3 = instr.bin >> 13 + rs1_ = (instr.bin >> 7) 
& 7 + rs2_ = (instr.bin >> 2) & 7 + self.rs1 = rs1_ + 8 + self.rs2 = rs2_ + 8 + self.op = instr.bin & 3 + self.offset = CStype.offset[instr.base()](instr.bin) + # zero-extended offset + + offset = { + 'C.SW': CLS_Woffset, + 'C.SD': CLS_Doffset, + 'C.SQ': CLS_Qoffset, + 'C.FSW': CLS_Woffset, + 'C.FSD': CLS_Doffset, + } + +class CAtype: + """Compressed arithmetic""" + def __init__(self, instr): + self.funct6 = instr.bin >> 10 + r = (instr.bin >> 7) & 7 + self.rd = r + 8 + self.rs1 = r + 8 + self.funct2 = (instr.bin >> 5) & 3 + self.rs2 = ((instr.bin >> 2) & 7) + 8 + self.op = instr.bin & 3 + +class CBtype: + """Compressed branch""" + def __init__(self, instr): + i = instr.bin + base = instr.base() + self.funct3 = i >> 13 + self.offset = (i >> 10) & 7 + rs1_ = (i >> 7) & 7 + self.rs1 = rs1_ + 8 + self.op = instr.bin & 3 + if base in CBtype.branch: + self.offset = sign_ext( + (((i >> 12) & 1) << 8) \ + | (((i >> 10) & 3) << 3) \ + | (((i >> 5) & 3) << 6) \ + | (((i >> 3) & 3) << 1) \ + | (((i >> 2) & 1) << 5) + , 8) + if base in CBtype.regimm: + if base == 'C.ANDI': + self.shamt = sign_ext(CItype.imm(i), 5) + else: + self.shamt = CItype.imm(i) + self.rd = self.rs1 + + branch = ['C.BEQZ', 'C.BNEZ'] + regimm = ['C.SRLI', 'C.SRAI', 'C.ANDI'] + +class CJtype: + """Compressed jump (C.J, C.JAL); offset bits [11|4|9:8|10|6|7|3:1|5] are taken from instruction bits 12:2 and sign-extended""" + def __init__(self, instr): + self.funct3 = instr.bin >> 13 + assert instr.base() in ['C.J', 'C.JAL'] + self.offset = sign_ext(CJtype.offset(instr.bin), 11) + self.jump_target = (instr.bin >> 2) & 0x7ff + self.op = instr.bin & 3 + + offset = lambda i: (((i >> 12) & 1) << 11) | (((i >> 11) & 1) << 4) \ + | (((i >> 9) & 3) << 8) | (((i >> 8) & 1) << 10) \ + | (((i >> 7) & 1) << 6) | (((i >> 6) & 1) << 7) \ + | (((i >> 3) & 7) << 1) | (((i >> 2) & 1) << 5) + +class Instr: + """Instructions""" + + table_16_4_RV32 = [ + ['C.ADDI4SPN', 'C.FLD', 'C.LW', 'C.FLW', + 'Reserved', 'C.FSD', 'C.SW', 'C.FSW'], + ['C.ADDI', 'C.JAL', 'C.LI', 'C.LUI/C.ADDI16SP', + 'MISC-ALU', 'C.J', 'C.BEQZ', 'C.BNEZ'], +
['C.SLLI', 'C.FLDSP', 'C.LWSP', 'C.FLWSP', + 'C.J[AL]R/C.MV/C.ADD', 'C.FSDSP', 'C.SWSP', 'C.FSWSP'], + ] + + table_24_1 = [ + ['LOAD', 'LOAD-FP', 'custom-0', 'MISC-MEM', 'OP-IMM', 'AUIPC', 'OP-IMM-32', '48b'], + ['STORE', 'STORE-FP', 'custom-1', 'AMO', 'OP', 'LUI', 'OP-32', '64b'], + ['MADD', 'MSUB', 'NMSUB', 'NMADD', 'OP-FP', 'reserved', 'custom-2/rv128', '48b'], + ['BRANCH', 'JALR', 'reserved', 'JAL', 'SYSTEM', 'reserved', 'custom-3/rv128', '80b'], + ] + type_of_base = { + 'OP-IMM': Itype, + 'LUI': Utype, + 'AUIPC': Utype, + 'OP': Rtype, + 'OP-32': Rtype, + 'JAL': Jtype, + 'JALR': Itype, + 'BRANCH': Btype, + 'LOAD': Itype, + 'STORE': Stype, + 'SYSTEM': Itype, + 'C.LWSP': CItype, + 'C.LDSP': CItype, + 'C.LQSP': CItype, + 'C.FLWSP': CItype, + 'C.FLDSP': CItype, + 'C.SWSP': CSStype, + 'C.SDSP': CSStype, + 'C.SQSP': CSStype, + 'C.FSWSP': CSStype, + 'C.FSDSP': CSStype, + 'C.LW': CLtype, + 'C.LD': CLtype, + 'C.LQ': CLtype, + 'C.FLW': CLtype, + 'C.FLD': CLtype, + 'C.SW': CStype, + 'C.SD': CStype, + 'C.SQ': CStype, + 'C.FSW': CStype, + 'C.FSD': CStype, + 'C.J': CJtype, + 'C.JAL': CJtype, + 'C.J[AL]R/C.MV/C.ADD': CRtype, + 'C.BEQZ': CBtype, + 'C.BNEZ': CBtype, + 'C.LI': CItype, + 'C.LUI/C.ADDI16SP': CItype, + 'C.ADDI': CItype, + 'C.ADDIW': CItype, + 'C.ADDI4SPN': CIWtype, + 'C.SLLI': CItype, + 'MISC-ALU': CAtype, + } + iloads = ['C.LW', 'C.LWSP', 'LOAD'] + floads = ['C.FLD', 'C.FLW', 'C.FLDSP', 'C.FLWSP', 'LOAD-FP'] + istores = ['C.SW', 'C.SWSP', 'STORE'] + fstores = ['C.FSD', 'C.FSW', 'C.FSDSP', 'C.FSWSP', 'STORE-FP'] + loads = iloads + floads + stores = istores + fstores + + def __init__(self, bincode): + self.bin = bincode + self.inst_1_0 = self.bin & 3 + + def base(self): + """Get the name of the base instruction""" + result = "" + if self.is_compressed(): + line = self.bin & 3 + col = (self.bin >> 13) & 7 + result = Instr.table_16_4_RV32[line][col] + else: + line = (self.bin >> 5) & 3 + col = (self.bin >> 2) & 7 + result = Instr.table_24_1[line][col] + return result 
+ + def fields(self): + """Get an object with the fields of the instruction""" + return Instr.type_of_base[self.base()](self) + + def is_compressed(self): + """Is the instruction from the C extension?""" + return (self.bin & 3) < 3 + + def size(self): + """Size of the instruction in bytes""" + return 2 if self.is_compressed() else 4 + + def is_load(self): + """Is the instruction a load?""" + return self.base() in Instr.loads + + def is_store(self): + """Is the instruction a store?""" + return self.base() in Instr.stores + + def is_branch(self): + """Is it a taken/not taken branch?""" + return self.base() in ['C.BEQZ', 'C.BNEZ', 'BRANCH'] + + def is_regjump(self): + """Is it a register jump?""" + if self.base() in ['JALR']: + return True + if self.base() == 'C.J[AL]R/C.MV/C.ADD': + return self.fields().name in ['C.JALR', 'C.JR'] + return False + + def is_jump(self): + """Is it an immediate jump?""" + return self.base() in ['JAL', 'C.JAL', 'C.J'] + + def is_muldiv(self): + """Is it a muldiv instruction?""" + return self.base() in ['OP', 'OP-32'] and self.fields().funct7 == 1 + + def offset(self): + """Get offset from instr (sometimes it is just 'imm' in RISCV spec)""" + fields = self.fields() + return fields.offset if hasattr(fields, 'offset') else fields.imm + + def addr_fields(self): + """Get the register and offset to build an address""" + return AddrFields(self.fields().rs1, self.offset()) + + def has_WAW_from(self, other): + """b.has_WAW_from(a) if a.rd == b.rd""" + a = other.fields() + b = self.fields() + if not (hasattr(a, 'rd') and hasattr(b, 'rd')): + return False + return a.rd == b.rd and a.rd != Reg.zero + + def has_RAW_from(self, other): + """b.has_RAW_from(a) if b.rsX == a.rd""" + a = other.fields() + b = self.fields() + if not hasattr(a, 'rd') or a.rd == Reg.zero: + return False + if hasattr(b, 'rs1') and a.rd == b.rs1: + return True + return hasattr(b, 'rs2') and a.rd == b.rs2 + + def has_WAR_from(self, other): + """b.has_WAR_from(a) if b.rd == 
a.rsX""" + a = other.fields() + b = self.fields() + if not hasattr(b, 'rd') or b.rd == Reg.zero: + return False + if hasattr(a, 'rs1') and a.rs1 == b.rd: + return True + return hasattr(a, 'rs2') and a.rs2 == b.rd diff --git a/perf-model/model.py b/perf-model/model.py new file mode 100644 index 000000000..736298a89 --- /dev/null +++ b/perf-model/model.py @@ -0,0 +1,666 @@ +# Copyright 2024 Thales Silicon Security +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Côme ALLART - Thales + +""" +Performance model of the cva6 +""" + +import sys +import re + +from dataclasses import dataclass +from enum import Enum +from collections import defaultdict + +#from matplotlib import pyplot as plt + +from isa import Instr, Reg + +EventKind = Enum('EventKind', [ + 'WAW', 'WAR', 'RAW', + 'BMISS', 'BHIT', + 'STRUCT', + 'issue', 'done', 'commit', +]) + +def to_signed(value, xlen=32): + signed = value + if signed >> (xlen - 1): + signed -= 1 << xlen + return signed + +class Event: + """Represents an event on an instruction""" + def __init__(self, kind, cycle): + self.kind = kind + self.cycle = cycle + + def __repr__(self): + return f"@{self.cycle}: {self.kind}" + +class Instruction(Instr): + """Represents a RISC-V instruction with annotations""" + + def __init__(self, line, address, hex_code, mnemo): + Instr.__init__(self, int(hex_code, base=16)) + self.line = line + self.address = int(address, base=16) + self.hex_code = hex_code + self.mnemo = mnemo + self.events = [] + + def mnemo_name(self): + """The name of the instruction (first word of the mnemo)""" + return self.mnemo.split()[0] + + def next_addr(self): + """Address of next instruction""" + return self.address + self.size() + + _ret_regs = [Reg.ra, Reg.t0] + + def is_ret(self): +
"Does CVA6 consider this instruction as a ret?" + f = self.fields() + # Strange conditions, no imm check, no rd-discard check + return self.is_regjump() \ + and f.rs1 in Instruction._ret_regs \ + and (self.is_compressed() or f.rs1 != f.rd) + + def is_call(self): + "Does CVA6 consider this instruction as a call?" + base = self.base() + f = self.fields() + return base == 'C.JAL' \ + or base == 'C.J[AL]R/C.MV/C.ADD' and f.name == 'C.JALR' \ + or base in ['JAL', 'JALR'] and f.rd in Instruction._ret_regs + + def __repr__(self): + return self.mnemo + +@dataclass +class Entry: + """A scoreboard entry""" + instr: Instruction + cycles_since_issue = 0 + done: bool = False + + def __repr__(self): + status = "DONE" if self.done else "WIP " + addr = f"0x{self.instr.address:08X}" + return f"{status} {addr}:`{self.instr}` for {self.cycles_since_issue}" + +@dataclass +class LastIssue: + """To store the last issued instruction""" + instr: Instruction + issue_cycle: int + +class IqLen: + """Model of the instruction queue with only a size counter""" + def __init__(self, fetch_size, debug=False): + self.fetch_size = 4 + while self.fetch_size < fetch_size: + self.fetch_size <<= 1 + self.debug = debug + self.len = self.fetch_size + self.new_fetch = True + + def fetch(self): + """Fetch bytes""" + self.len += self.fetch_size + self._debug(f"fetched {self.fetch_size}, got {self.len}") + self.new_fetch = True + + def flush(self): + """Flush instruction queue (bmiss or exception)""" + self.len = 0 + self._debug(f"flushed, got {self.len}") + self.new_fetch = False + + def jump(self): + """Lose a fetch cycle and truncate (jump, branch hit taken)""" + if self.new_fetch: + self.len -= self.fetch_size + self._debug(f"jumping, removed {self.fetch_size}, got {self.len}") + self.new_fetch = False + self._truncate() + self._debug(f"jumped, got {self.len}") + + def has(self, instr): + """Does the instruction queue have this instruction?""" + length = self.len + if self._is_crossword(instr): + length
-= (self.fetch_size - 2) + self._debug(f"comparing {length} to {instr.size()} ({instr})") + return length >= instr.size() + + def remove(self, instr): + """Remove instruction from queue""" + self.len -= instr.size() + self._debug(f"removed {instr.size()}, got {self.len}") + self._truncate(self._addr_index(instr.next_addr())) + if instr.is_jump(): + self.jump() + + def _addr_index(self, addr): + return addr & (self.fetch_size - 1) + + def _is_crossword(self, instr): + is_last = self._addr_index(instr.address) == self.fetch_size - 2 + return is_last and not instr.is_compressed() + + def _truncate(self, index=0): + occupancy = self.fetch_size - self._addr_index(self.len) + to_remove = index - occupancy + if to_remove < 0: + to_remove += self.fetch_size + self.len -= to_remove + self._debug(f"truncated, removed {to_remove}, got {self.len}") + + def _debug(self, message): + if self.debug: + print(f"iq: {message}") + +class Ras: + "Return Address Stack" + def __init__(self, depth=2, debug=False): + self.depth = depth - 1 + self.stack = [] + self.debug = debug + self.last_dropped = None + + def push(self, addr): + "Push an address on the stack, forget oldest entry if full" + self.stack.append(addr) + self._debug(f"pushed 0x{addr:08X}") + if len(self.stack) > self.depth: + self.stack.pop(0) + self._debug("overflown") + + def drop(self): + "Drop an address from the stack" + self._debug("dropping") + if len(self.stack) > 0: + self.last_dropped = self.stack.pop() + else: + self.last_dropped = None + self._debug("was already empty") + + def read(self): + "Read the top of the stack without modifying it" + self._debug("reading") + if self.last_dropped is not None: + addr = self.last_dropped + self._debug(f"read 0x{addr:08X}") + return addr + self._debug("was empty") + return None + + def resolve(self, instr): + "Push or pop depending on the instruction" + self._debug(f"issuing {instr}") + if instr.is_ret(): + self._debug("detected ret") + self.drop() + if instr.is_call(): + 
self._debug("detected call") + self.push(instr.next_addr()) + + def _debug(self, message): + if self.debug: + print(f"RAS: {message}") + +class Bht: + "Branch History Table" + + @dataclass + class Entry: + "A BHT entry" + valid: bool = False + sat_counter: int = 0 + + def __init__(self, entries=128): + self.contents = [Bht.Entry() for _ in range(entries)] + + def predict(self, addr): + "Is the branch taken? True when the counter is >= 2, None if the entry is not valid yet" + entry = self.contents[self._index(addr)] + if entry.valid: + return entry.sat_counter >= 2 + return None + + def resolve(self, addr, taken): + "Update branch prediction: mark the entry valid and move its counter (saturating in 0..3) towards the outcome" + index = self._index(addr) + entry = self.contents[index] + entry.valid = True + if taken: + if entry.sat_counter < 3: + entry.sat_counter += 1 + else: + if entry.sat_counter > 0: + entry.sat_counter -= 1 + + def _index(self, addr): + return (addr >> 1) % len(self.contents) + +Fu = Enum('Fu', ['ALU', 'MUL', 'BRANCH', 'LDU', 'STU']) + +# We have +# - FLU gathering ALU + BRANCH (+ CSR, not significant in CoreMark) +# - LSU for loads and stores +# - FP gathering MUL + second ALU (+ Floating, unused in CoreMark) +# This way we do not have more write-back ports than currently with F + +def to_fu(instr): + if instr.is_branch() or instr.is_regjump(): + return Fu.BRANCH + if instr.is_muldiv(): + return Fu.MUL + if instr.is_load(): + return Fu.LDU + if instr.is_store(): + return Fu.STU + return Fu.ALU + +class FusBusy: + "Is each functional unit busy" + def __init__(self, has_alu2 = False): + self.has_alu2 = has_alu2 + + self.alu = False + self.mul = False + self.branch = False + self.ldu = False + self.stu = False + self.alu2 = False + + self.issued_mul = False + + def _alu2_ready(self): + return self.has_alu2 and not self.alu2 + + def is_ready(self, fu): + return { + Fu.ALU: self._alu2_ready() or not self.alu, + Fu.MUL: not self.mul, + Fu.BRANCH: not self.branch, + Fu.LDU: not self.ldu, + Fu.STU: not self.stu, + }[fu] + + def is_ready_for(self, instr): + return
self.is_ready(to_fu(instr)) + + def issue(self, instr): + return { + Fu.ALU: FusBusy.issue_alu, + Fu.MUL: FusBusy.issue_mul, + Fu.BRANCH: FusBusy.issue_branch, + Fu.LDU: FusBusy.issue_ldu, + Fu.STU: FusBusy.issue_stu, + }[to_fu(instr)](self) + + def issue_mul(self): + self.mul = True + self.issued_mul = True + + def issue_alu(self): + if not self._alu2_ready(): + assert not self.alu + self.alu = True + self.branch = True + else: + self.alu2 = True + + def issue_branch(self): + self.alu = True + self.branch = True + # Stores are not allowed yet + self.stu = True + + def issue_ldu(self): + self.ldu = True + self.stu = True + + def issue_stu(self): + self.stu = True + self.ldu = True + + def cycle(self): + self.alu = self.issued_mul + self.mul = False + self.branch = self.issued_mul + self.ldu = False + self.stu = False + self.alu2 = False + self.issued_mul = False + +class Model: + """Models the scheduling of CVA6""" + + re_instr = re.compile( + r"([a-z]+)\s+0:\s*0x00000000([0-9a-f]+)\s*\(([0-9a-fx]+)\)\s*@\s*([0-9]+)\s*(.*)" + ) + + def __init__( + self, + debug=False, + issue=1, + commit=2, + sb_len=8, + fetch_size=None, + has_forwarding=True, + has_renaming=True): + self.ras = Ras(debug=debug) + self.bht = Bht() + self.instr_queue = [] + self.scoreboard = [] + self.fus = FusBusy(issue > 1) + self.last_issued = None + self.last_committed = None + self.retired = [] + self.sb_len = sb_len + self.debug = debug + self.iqlen = IqLen(fetch_size or 4 * issue, debug) + self.issue_width = issue + self.commit_width = commit + self.has_forwarding = has_forwarding + self.has_renaming = has_renaming + self.log = [] + + def log_event_on(self, instr, kind, cycle): + """Log an event on the instruction""" + if self.debug: + print(f"{instr}: {kind}") + event = Event(kind, cycle) + instr.events.append(event) + self.log.append((event, instr)) + + def predict_branch(self, instr): + """Predict if branch is taken or not""" + pred = self.bht.predict(instr.address) + if pred is not None: + 
return pred + return instr.offset() >> 31 != 0 + + def predict_regjump(self, instr): + """Predict destination address of indirect jump""" + if instr.is_ret(): + return self.ras.read() or 0 + return 0 # always miss, as there is no btb yet + + def predict_pc(self, last): + """Predict next program counter depending on last issued instruction""" + if last.is_branch(): + taken = self.predict_branch(last) + offset = to_signed(last.offset()) if taken else last.size() + return last.address + offset + if last.is_regjump(): + return self.predict_regjump(last) + return None + + def issue_manage_last_branch(self, instr, cycle): + """Flush IQ if branch miss, jump if branch hit""" + if self.last_issued is not None: + last = self.last_issued.instr + pred = self.predict_pc(last) + if pred is not None: + bmiss = pred != instr.address + resolved = cycle >= self.last_issued.issue_cycle + 6 + if bmiss and not resolved: + self.iqlen.flush() + branch = EventKind.BMISS if bmiss else EventKind.BHIT + if branch not in [e.kind for e in instr.events]: + self.log_event_on(instr, branch, cycle) + taken = instr.address != last.next_addr() + if taken and not bmiss: + # last (not instr) was like a jump + self.iqlen.jump() + + def commit_manage_last_branch(self, instr, cycle): + "Resolve branch prediction" + if self.last_committed is not None: + last = self.last_committed + if last.is_branch(): + taken = instr.address != last.next_addr() + self.bht.resolve(last.address, taken) + self.last_committed = instr + + def find_data_hazards(self, instr, cycle): + """Detect and log data hazards""" + found = False + for entry in self.scoreboard: + if instr.has_WAW_from(entry.instr) and not self.has_renaming: + self.log_event_on(instr, EventKind.WAW, cycle) + found = True + can_forward = self.has_forwarding and entry.done + if instr.has_RAW_from(entry.instr) and not can_forward: + self.log_event_on(instr, EventKind.RAW, cycle) + found = True + return found + + def find_structural_hazard(self, instr, cycle): + 
"""Detect and log structural hazards""" + if not self.fus.is_ready_for(instr): + self.log_event_on(instr, EventKind.STRUCT, cycle) + return True + return False + + def try_issue(self, cycle): + """Try to issue an instruction""" + if len(self.instr_queue) == 0 or len(self.scoreboard) >= self.sb_len: + return + can_issue = True + instr = self.instr_queue[0] + if self.find_data_hazards(instr, cycle): + can_issue = False + if self.find_structural_hazard(instr, cycle): + can_issue = False + self.issue_manage_last_branch(instr, cycle) + if not self.iqlen.has(instr): + can_issue = False + if can_issue: + self.iqlen.remove(instr) + instr = self.instr_queue.pop(0) + self.log_event_on(instr, EventKind.issue, cycle) + entry = Entry(instr) + self.scoreboard.append(entry) + self.fus.issue(instr) + self.last_issued = LastIssue(instr, cycle) + self.ras.resolve(instr) + + def try_execute(self, cycle): + """Try to execute instructions""" + for entry in self.scoreboard: + entry.cycles_since_issue += 1 + instr = entry.instr + duration = 1 + if instr.is_load() or instr.is_store(): + duration = 2 + if instr.is_muldiv(): + duration = 2 + if entry.cycles_since_issue == duration: + self.log_event_on(instr, EventKind.done, cycle) + entry.done = True + + def try_commit(self, cycle, commit_port): + """Try to commit an instruction""" + if len(self.scoreboard) == 0: + return + entry = self.scoreboard[0] + can_commit = True + if commit_port > 0: + if entry.instr.is_store(): + can_commit = False + if not entry.done: + can_commit = False + if can_commit: + instr = self.scoreboard.pop(0).instr + self.log_event_on(instr, EventKind.commit, cycle) + self.retired.append(instr) + self.commit_manage_last_branch(instr, cycle) + + def run_cycle(self, cycle): + """Runs a cycle""" + self.fus.cycle() + for commit_port in range(self.commit_width): + self.try_commit(cycle, commit_port) + self.try_execute(cycle) + for _ in range(self.issue_width): + self.try_issue(cycle) + self.iqlen.fetch() + + def 
load_file(self, path): + """Fill a model from a trace file""" + with open(path, "r", encoding="utf8") as file: + for line in [l.strip() for l in file]: + found = Model.re_instr.search(line) + if found: + address = found.group(2) + hex_code = found.group(3) + mnemo = found.group(5) + instr = Instruction(line, address, hex_code, mnemo) + self.instr_queue.append(instr) + + def run(self, cycles=None): + """Run until completion""" + cycle = 0 + while len(self.instr_queue) > 0 or len(self.scoreboard) > 0: + self.run_cycle(cycle) + if self.debug: + print(f"Scoreboard @{cycle}") + for entry in self.scoreboard: + print(f" {entry}") + print(f"iqlen = {self.iqlen.len}") + print() + cycle += 1 + + if cycles is not None and cycle > cycles: + break + return cycle + +def write_trace(output_file, instructions): + """Write cycle-annotated trace""" + pattern = re.compile(r"@\s*[0-9]+") + + lines = [] + for instr in instructions: + commit_event = instr.events[-1] + assert commit_event.kind == EventKind.commit + cycle = commit_event.cycle + annotated = re.sub(pattern, f"@ {cycle}", instr.line) + #if EventKind.STRUCT in [e.kind for e in instr.events]: + # annotated += " #STRUCT" + #if EventKind.RAW in [e.kind for e in instr.events]: + # annotated += " #RAW" + lines.append(f"{annotated}\n") + + with open(output_file, 'w') as f: + f.writelines(lines) + +def print_data(name, value, ts=24, sep='='): + "Prints 'name = data' with alignment of the '='" + + spaces = ' ' * (ts - len(name)) + print(f"{name}{spaces} {sep} {value}") + +def display_scores(scores): + """Display a 3D graph of scores against commit/issue-wide""" + bars = [] + for x, l in enumerate(scores): + for y, z in enumerate(l): + bars.append((x, y, z)) + + x, y, z, dx, dy, dz = [], [], [], [], [], [] + for bx, by, bz in bars: + x.append(bx) + y.append(by) + z.append(0) + dx.append(.5) + dy.append(.5) + dz.append(bz) + + #fig = plt.figure() + #ax1 = fig.add_subplot(111, projection='3d') + #ax1.bar3d(x, y, z, dx, dy, dz) + 
#ax1.set_xlabel("issue") + #ax1.set_ylabel("commit") + #ax1.set_zlabel("CoreMark/MHz") + #plt.show() + +def issue_commit_graph(input_file, n = 3): + """Plot the issue/commit graph""" + + r = range(n + 1) + scores = [[0 for _ in r] for _ in r] + + if input_file is None: + scores = [[0, 0, 0, 0, 0, 0], [0, 2.651936045910317, 2.651936045910317, 2.651936045910317, 2.651936045910317, 2.651936045910317], [0, 3.212779150348426, 3.6292766488711137, 3.6292766488711137, 3.6292766488711137, 3.6292766488711137], [0, 3.2550388000624966, 3.900216852056974, 3.914997572701505, 3.914997572701505, 3.914997572701505], [0, 3.2596436557555526, 3.9257869239889134, 3.9420984578510834, 3.9421606193922765, 3.9421606193922765], [0, 3.260695897718491, 3.944757614368385, 3.9623576027736505, 3.9625460150656, 3.9625460150656]] # pylint: disable=line-too-long + else: + r = range(1, n + 1) + for issue in r: + for commit in r: + print("running", issue, commit) + model = Model(issue=issue, commit=commit) + model.load_file(input_file) + model.run() + n_cycles = count_cycles(filter_timed_part(model.retired)) + score = 1000000 / n_cycles + scores[issue][commit] = score + print(scores) + display_scores(scores) + +def filter_timed_part(all_instructions): + "Keep only timed part from a trace" + filtered = [] + re_csrr_minstret = re.compile(r"^csrr\s+\w\w,\s*minstret$") + accepting = False + for instr in all_instructions: + if re_csrr_minstret.search(instr.mnemo): + accepting = not accepting + continue + if accepting: + filtered.append(instr) + return filtered + +def count_cycles(retired): + start = min(e.cycle for e in retired[0].events) + end = max(e.cycle for e in retired[-1].events) + return end - start + +def print_stats(instructions): + ecount = defaultdict(lambda: 0) + + for instr in instructions: + for e in instr.events: + ecount[e.kind] += 1 + cycle = e.cycle + n_instr = len(instructions) + n_cycles = count_cycles(instructions) + + print_data("cycle number", n_cycles) + print_data("Coremark/MHz", 
1000000 / n_cycles) + print_data("instruction number", n_instr) + for ek, count in ecount.items(): + print_data(f"{ek}/instr", f"{100 * count / n_instr:.2f}%") + +def main(input_file: str): + "Entry point" + + model = Model(debug=True, issue=2, commit=2) + model.load_file(input_file) + model.run() + + write_trace('annotated.log', model.retired) + print_stats(filter_timed_part(model.retired)) + +if __name__ == "__main__": + main(sys.argv[1])