added benchmarking scripts to check for performance hits

SadhviNarayanan 2025-04-26 01:32:29 -07:00
parent 0ca90e74a4
commit b2958f228f
5 changed files with 257 additions and 3 deletions

benchmarks/coremark/coremark_sweep.py

@@ -33,6 +33,13 @@ import csv
 import os
 import re
+WALLY = os.environ.get("WALLY")
+# Set working directory to where the Makefile is
+coremark_dir = os.path.join(WALLY, "benchmarks/coremark")
+os.chdir(coremark_dir)
 # list of architectures to run.
 arch_list = [
     "rv32i_zicsr",
@@ -54,7 +61,8 @@ mt_regex = r"Elapsed MTIME: (\d+).*?Elapsed MINSTRET: (\d+).*?COREMARK/MHz Score
 #cpi_regex = r"CPI: \d+ / \d+ = (\d+\.\d+)"
 #cmhz_regex = r"COREMARK/MHz Score: [\d,]+ / [\d,]+ = (\d+\.\d+)"
 # Open a CSV file to write the results
-resultfile = 'coremark_results.csv'
+resultfile = os.path.join(coremark_dir, 'coremark_results.csv')
+# resultfile = 'coremark_results.csv'
 with open(resultfile, mode='w', newline='') as csvfile:
     fieldnames = ['Architecture', 'CM / MHz','CPI','MTIME','MINSTRET','Load Stalls','Store Stalls','D$ Accesses',
                   'D$ Misses','I$ Accesses','I$ Misses','Branches','Branch Mispredicts','BTB Misses',
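Because the sweep now anchors its working directory and output path to $WALLY, it can be invoked from any directory, which the regression hook added below depends on. A minimal sketch of such an invocation (the subprocess wrapper is illustrative, not part of this commit):

import os
import subprocess

WALLY = os.environ["WALLY"]  # must be set, just as coremark_sweep.py assumes
sweep = os.path.join(WALLY, "benchmarks/coremark/coremark_sweep.py")
subprocess.run(["python3", sweep], check=True)
# coremark_results.csv lands in $WALLY/benchmarks/coremark regardless of the caller's cwd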

benchmarks/coremark/expected_coremark_results.csv Normal file

@@ -0,0 +1,13 @@
Architecture,CM / MHz,CPI,MTIME,MINSTRET,Load Stalls,Store Stalls,D$ Accesses,D$ Misses,I$ Accesses,I$ Misses,Branches,Branch Mispredicts,BTB Misses,Jump/JR,RAS Wrong,Returns,BP Class Pred Wrong
rv32i_zicsr,1.20,1.16,8269385,7124630,261886,22307,716317,73,7827908,1040,2009578,443447,5476,122015,3468,113645,20699
rv32im_zicsr,3.26,1.12,3061233,2716910,264489,22198,690506,73,2975498,827,543885,45067,5483,30033,69,15237,20898
rv32imc_zicsr,3.24,1.13,3085767,2716550,264404,23853,690507,75,3011253,285,543761,44223,5615,29675,171,15237,20295
rv32im_zicsr_zba_zbb_zbs,3.38,1.12,2954414,2624181,266850,22258,689232,75,2878922,494,544408,42375,4295,29685,18,15249,14980
rv32gc,3.24,1.13,3085783,2716550,264134,23852,690500,74,3010201,286,543485,44182,5563,29668,162,15230,20108
rv32gc_zba_zbb_zbs,3.32,1.14,3003843,2624181,272635,22141,689226,74,2929468,280,543245,44490,6087,29680,23,15242,22189
rv64i_zicsr,1.02,1.13,9731538,8559698,273929,22198,720375,85,9242621,588,2340171,459594,4842,128954,178,109383,15941
rv64im_zicsr,2.86,1.10,3493939,3156218,271101,22149,691714,83,3406099,340,547671,42901,4651,34669,14,15099,15726
rv64imc_zicsr,2.82,1.12,3545301,3156218,271029,25304,691715,86,3457798,263,547535,43990,4970,34671,5,15099,15889
rv64im_zicsr_zba_zbb_zbs,3.08,1.11,3241375,2901479,273442,24665,689242,85,3150626,424,547796,41798,4635,34680,43,15111,16143
rv64gc,2.82,1.12,3545281,3156218,270740,25304,691708,86,3456812,264,547229,43969,4970,34664,5,15092,15889
rv64gc_zba_zbb_zbs,3.03,1.13,3297540,2901479,273107,26696,689236,83,3200848,250,547359,46238,6197,34675,73,15104,21328

bin/expected_results.json Normal file

@@ -0,0 +1,23 @@
{
    "coremark": {
        "coremark/mhz": 3.38
    },
    "embench_rv32imc": {
        "wallySizeOpt_size": {
            "size geometric mean": 1.04,
            "size geometric standard deviation": 1.26
        },
        "wallySizeOpt_speed": {
            "size geometric mean": 1.07,
            "size geometric standard deviation": 1.51
        },
        "wallySpeedOpt_size": {
            "size geometric mean": 1.21,
            "size geometric standard deviation": 1.28
        },
        "wallySpeedOpt_speed": {
            "size geometric mean": 1.15,
            "size geometric standard deviation": 1.61
        }
    }
}

bin/regression-wally

@@ -395,6 +395,8 @@ def parse_args():
     parser.add_argument("--fp", help="Include floating-point tests in coverage (slower runtime)", action="store_true") # Currently not used
     parser.add_argument("--breker", help="Run Breker tests", action="store_true") # Requires a license for the breker tool. See tests/breker/README.md for details
     parser.add_argument("--dryrun", help="Print commands invoked to console without running regression", action="store_true")
+    parser.add_argument("--performance", help="Check for performance changes or discrepancies in embench and coremark", action="store_true")
     return parser.parse_args()
@@ -415,7 +417,7 @@ def process_args(args):
         TIMEOUT_DUR = 30*60
         shutil.rmtree(f"{regressionDir}/questa/fcov_ucdb", ignore_errors=True)
         os.makedirs(f"{regressionDir}/questa/fcov_ucdb", exist_ok=True)
-    elif args.buildroot:
+    elif args.buildroot or args.performance: # TODO: fix timing on this because performance should also be limited on nightly
         TIMEOUT_DUR = 60*3600 # 2.5 days
     elif args.testfloat:
         sims = [testfloatsim]
@@ -423,7 +425,7 @@ def process_args(args):
     elif args.branch:
         TIMEOUT_DUR = 120*60 # seconds
     elif args.nightly:
-        TIMEOUT_DUR = 30*60 # seconds
+        TIMEOUT_DUR = 5*3600 # seconds # NOTE: changed to 5 hours for nightly regression
     else:
         TIMEOUT_DUR = 10*60 # seconds
@@ -505,6 +507,41 @@ def selectTests(args, sims, coverStr):
             grepstr="All Tests completed with 0 errors",
             grepfile = sim_log)
         configs.append(tc)
+    if args.performance or args.nightly:
+        # Run the embench benchmark
+        embench_test = TestCase(
+            name="embench",
+            variant="rv32gc", # is this the correct variant here? or rv32imac_zicsr
+            cmd="cd $WALLY/benchmarks/embench && make run | tee run.log", # tee keeps a log for grepfile
+            grepstr="SUCCESS", # not sure what keyword to put here
+            grepfile=os.path.expandvars("$WALLY/benchmarks/embench/run.log")
+        )
+        configs.append(embench_test)
+        validation_log = f"{WALLY}/bin/logs/validation.log"
+        os.makedirs(os.path.dirname(validation_log), exist_ok=True)
+        validate_script = f"{WALLY}/bin/validate_performance.py"
+        coremark_sweep_script = f"{WALLY}/benchmarks/coremark/coremark_sweep.py"
+        # Use absolute paths, ensure the log file exists before writing,
+        # and chain the commands so a failed sweep aborts validation
+        performance_test = TestCase(
+            name="validate_performance",
+            variant="performance check",
+            cmd=(
+                f"touch {validation_log} && "
+                f"python3 {coremark_sweep_script} && "
+                f"python3 {validate_script} | tee {validation_log}"
+            ),
+            grepstr="Validation Tests completed with 0 errors", # adjust if the success message differs
+            grepfile=validation_log
+        )
+        configs.append(performance_test)
     return configs
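Both new entries rely on the regression harness's grep-based pass/fail convention: after cmd finishes, the runner searches grepfile for grepstr. A minimal sketch of that convention, assuming this is how regression-wally judges a TestCase (the helper name below is hypothetical, not the repo's code):

def test_passed(grepfile: str, grepstr: str) -> bool:
    # Pass only if the expected success string appears in the log file
    with open(grepfile, errors="ignore") as f:
        return any(grepstr in line for line in f)

This is why grepstr="Validation Tests completed with 0 errors" has to match the success message printed by validate_performance.py exactly.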

bin/validate_performance.py Executable file

@@ -0,0 +1,173 @@
#!/usr/bin/env python3
import csv
import json
import os

# global variables
WALLY = os.environ.get('WALLY')
coremark_csv = f'{WALLY}/benchmarks/coremark/coremark_results.csv'
expected_coremark_csv = f'{WALLY}/benchmarks/coremark/expected_coremark_results.csv'
embenchDir = f'{WALLY}/benchmarks/embench'
def create_expected_results_json():
    # Create a dictionary to hold the expected results
    validation_data = {
        "coremark": {
            "coremark/mhz": 3.38
        },
        "embench_rv32imc": {
            "wallySizeOpt_size": {
                "size geometric mean": 1.04,
                "size geometric standard deviation": 1.26
            },
            "wallySizeOpt_speed": {
                "size geometric mean": 1.07,
                "size geometric standard deviation": 1.51
            },
            "wallySpeedOpt_size": {
                "size geometric mean": 1.21,
                "size geometric standard deviation": 1.28
            },
            "wallySpeedOpt_speed": {
                "size geometric mean": 1.15,
                "size geometric standard deviation": 1.61
            }
        }
    }
    # Write the data to bin/expected_results.json so validate_results() reads the
    # same file regardless of the current working directory
    with open(f'{WALLY}/bin/expected_results.json', 'w') as json_file:
        json.dump(validation_data, json_file, indent=4)
def validate_results():
    failing_value = ""
    # Load the expected results once from expected_results.json
    expected_results_path = f"{WALLY}/bin/expected_results.json"
    with open(expected_results_path) as file:
        expected_data = json.load(file)

    # EMBENCH VALIDATION
    # The four JSON result files generated by embench
    embench_json_files = ["wallySizeOpt_size", "wallySizeOpt_speed", "wallySpeedOpt_size", "wallySpeedOpt_speed"]
    for json_file in embench_json_files:
        filepath = f"{embenchDir}/{json_file}.json"
        # Open and read the JSON file
        with open(filepath) as file:
            embench_data = json.load(file)
        # Extract whether the file is a speed or size test (based on the filename)
        test_type = json_file.split("_")[1]
        # Build the keys used in the embench JSON data
        level1 = f"{test_type} results"
        key_level2_mean = f"{test_type} geometric mean"
        key_level2_std_dev = f"{test_type} geometric standard deviation"
        # Extract the actual geometric mean and std. dev. from the JSON data
        actual_geometric_mean = embench_data[level1][key_level2_mean]
        actual_geometric_std_dev = embench_data[level1][key_level2_std_dev]
        # Extract geometric means from the expected results for comparison
        expected_wally_geometric_mean = expected_data['embench_rv32imc'][json_file]['size geometric mean']
        expected_wally_geometric_std_dev = expected_data['embench_rv32imc'][json_file]['size geometric standard deviation']
        # Compare the actual and expected results
        if actual_geometric_mean != expected_wally_geometric_mean:
            failing_value += f"embench {json_file}'s geometric mean of {actual_geometric_mean} does not match expected value of {expected_wally_geometric_mean}\n"
            # # Update expected results file if smaller / better results
            # if actual_geometric_mean < expected_wally_geometric_mean:
            #     updated_expected_json = True
            #     expected_data['embench_rv32imc'][json_file]['size geometric mean'] = actual_geometric_mean
            #     print(f"Updated expected geometric mean for {json_file} to {actual_geometric_mean}")
        if actual_geometric_std_dev != expected_wally_geometric_std_dev:
            failing_value += f"embench {json_file}'s geometric std. dev. of {actual_geometric_std_dev} does not match expected value of {expected_wally_geometric_std_dev}\n"
            # # Update expected results file if smaller / better results
            # if actual_geometric_std_dev < expected_wally_geometric_std_dev:
            #     updated_expected_json = True
            #     expected_data['embench_rv32imc'][json_file]['size geometric standard deviation'] = actual_geometric_std_dev
            #     print(f"Updated expected std. dev. for {json_file} to {actual_geometric_std_dev}")
    # if updated_expected_json:
    #     with open(expected_results_path, 'w') as f:
    #         json.dump(expected_data, f, indent=4)
    #     # automatically push the expected_results.json file to github
    #     subprocess.run(["git", "add", expected_results_path])
    #     subprocess.run(["git", "commit", "-m", "Update expected results with improved metrics"])
    #     subprocess.run(["git", "push"])
    # COREMARK VALIDATION
    # coremark_run = {}
    # with open(coremark_csv, newline='') as csvfile:
    #     reader = csv.DictReader(csvfile)
    #     for row in reader:
    #         arch = row["Architecture"]
    #         coremark_run[arch] = row
    # # Index directly into the per-architecture results
    # actual_CM_MHz = coremark_run["rv32im_zicsr_zba_zbb_zbs"]["CM / MHz"]
    # expected_wally_CM_Mhz = expected_data['coremark']['coremark/mhz']
    # if str(actual_CM_MHz) != str(expected_wally_CM_Mhz):
    #     failure = True
    #     failing_value += f"coremark's actual CM/MHz of {actual_CM_MHz} does not match expected value of {expected_wally_CM_Mhz}\n"

    # Read in the expected results from expected_coremark_results.csv,
    # as well as the new CSV created by the test
    actual_results = read_csv_as_sorted_list(coremark_csv)
    expected_results = read_csv_as_sorted_list(expected_coremark_csv)
    # Compare the two CSV files; accumulate the failure so it is reported below
    if actual_results != expected_results:
        failing_value += "Coremark results do not match expected results.\n"
        failing_value += f"Coremark results:\n{actual_results}\n"
        failing_value += f"Expected results:\n{expected_results}\n"
    # Check if there were any failures
    if failing_value == "":
        print("Validation Tests completed with 0 errors")
    else:
        print(failing_value)
        exit(1)
def read_csv_as_sorted_list(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        rows = list(reader)
    rows.sort() # sort rows for consistent ordering
    return rows
def main():
    create_expected_results_json() # regenerate expected_results.json so validation compares against the current expected values
    validate_results()

if __name__ == "__main__":
    main()
# do we only want to trigger with nightly - yes
# is there a reason we only care about the 3.38 from the rv32im_zicsr_zba_zbb_zbs arch - most complete
# how do I know the two tests that produce the results I scrape are running - just run these defaults:
# cd $WALLY/benchmarks/coremark
# ./coremark_sweep.py
# cd $WALLY/benchmarks/embench
# make run
# automatically push to github if better results?
# coremark sweep - creates the csv of values for diff arch
# embench benchmark - creates the 4 json files for speed/size
# check if there are differences between runs of coremark sweep on the csv --> done
# need to standardize timeout duration between performance flag and nightly
# need to make sure it is failing when different
# need to check if I need to validate more values in this file (maybe do a diff for the csv) --> done this part (more to come in future likely)
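The "diff for the csv" note above points at a limitation of the current exact-match check: any change in float formatting, or a one-cycle wobble in a counter, fails the whole comparison. A possible refinement, sketched here with hypothetical names (compare_csv_with_tolerance and rel_tol are not part of this commit), compares numeric fields per architecture with a relative tolerance:

import csv

def compare_csv_with_tolerance(actual_file, expected_file, rel_tol=0.01):
    """Return a list of mismatch messages; an empty list means the files agree."""
    mismatches = []
    with open(actual_file, newline='') as fa, open(expected_file, newline='') as fe:
        actual_rows = {row["Architecture"]: row for row in csv.DictReader(fa)}
        expected_rows = {row["Architecture"]: row for row in csv.DictReader(fe)}
    for arch, expected in expected_rows.items():
        actual = actual_rows.get(arch)
        if actual is None:
            mismatches.append(f"{arch}: missing from actual results")
            continue
        for field, exp_val in expected.items():
            if field == "Architecture":
                continue
            act_val = actual[field]
            try:
                exp_num, act_num = float(exp_val), float(act_val)
                # Numeric fields may drift slightly; allow a relative tolerance
                if abs(act_num - exp_num) > rel_tol * max(abs(exp_num), 1e-9):
                    mismatches.append(f"{arch} {field}: {act_val} vs expected {exp_val}")
            except ValueError:
                # Non-numeric fields must still match exactly
                if act_val != exp_val:
                    mismatches.append(f"{arch} {field}: {act_val} vs expected {exp_val}")
    return mismatches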