mirror of https://github.com/openhwgroup/cvw.git
synced 2025-06-28 01:32:49 -04:00
added benchmarking scripts to check for performance hits
This commit is contained in:
parent 0ca90e74a4
commit b2958f228f
5 changed files with 257 additions and 3 deletions
@@ -33,6 +33,13 @@ import csv
import os
import re

WALLY = os.environ.get("WALLY")

# Set working directory to where the Makefile is
coremark_dir = os.path.join(WALLY, "benchmarks/coremark")
os.chdir(coremark_dir)


# list of architectures to run.
arch_list = [
    "rv32i_zicsr",
@@ -54,7 +61,8 @@ mt_regex = r"Elapsed MTIME: (\d+).*?Elapsed MINSTRET: (\d+).*?COREMARK/MHz Score
#cpi_regex = r"CPI: \d+ / \d+ = (\d+\.\d+)"
#cmhz_regex = r"COREMARK/MHz Score: [\d,]+ / [\d,]+ = (\d+\.\d+)"
# Open a CSV file to write the results
resultfile = 'coremark_results.csv'
resultfile = os.path.join(coremark_dir, 'coremark_results.csv')
# resultfile = 'coremark_results.csv'
with open(resultfile, mode='w', newline='') as csvfile:
    fieldnames = ['Architecture', 'CM / MHz','CPI','MTIME','MINSTRET','Load Stalls','Store Stalls','D$ Accesses',
                  'D$ Misses','I$ Accesses','I$ Misses','Branches','Branch Mispredicts','BTB Misses',
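The hunk above belongs to the sweep script that scrapes performance counters out of each CoreMark simulation log with mt_regex and writes them to a results CSV. A minimal, self-contained sketch of that scrape-and-write pattern follows; the sample log text, the shortened regex, and the demo filename are illustrative assumptions, since the full mt_regex is truncated in the hunk header and the real script records many more counters.

import csv
import re

# Illustrative log text and simplified regex; the real mt_regex also captures
# the COREMARK/MHz score, and the real script records many more fields.
sample_log = "Elapsed MTIME: 2954414 ... Elapsed MINSTRET: 2624181"
m = re.search(r"Elapsed MTIME: (\d+).*?Elapsed MINSTRET: (\d+)", sample_log, re.DOTALL)
if m:
    mtime, minstret = m.groups()
    with open("coremark_results_demo.csv", "w", newline="") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["Architecture", "MTIME", "MINSTRET"])
        writer.writeheader()
        writer.writerow({"Architecture": "rv32im_zicsr_zba_zbb_zbs",
                         "MTIME": mtime, "MINSTRET": minstret})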
benchmarks/coremark/expected_coremark_results.csv (Normal file, 13 lines added)
@@ -0,0 +1,13 @@
Architecture,CM / MHz,CPI,MTIME,MINSTRET,Load Stalls,Store Stalls,D$ Accesses,D$ Misses,I$ Accesses,I$ Misses,Branches,Branch Mispredicts,BTB Misses,Jump/JR,RAS Wrong,Returns,BP Class Pred Wrong
rv32i_zicsr,1.20,1.16,8269385,7124630,261886,22307,716317,73,7827908,1040,2009578,443447,5476,122015,3468,113645,20699
rv32im_zicsr,3.26,1.12,3061233,2716910,264489,22198,690506,73,2975498,827,543885,45067,5483,30033,69,15237,20898
rv32imc_zicsr,3.24,1.13,3085767,2716550,264404,23853,690507,75,3011253,285,543761,44223,5615,29675,171,15237,20295
rv32im_zicsr_zba_zbb_zbs,3.38,1.12,2954414,2624181,266850,22258,689232,75,2878922,494,544408,42375,4295,29685,18,15249,14980
rv32gc,3.24,1.13,3085783,2716550,264134,23852,690500,74,3010201,286,543485,44182,5563,29668,162,15230,20108
rv32gc_zba_zbb_zbs,3.32,1.14,3003843,2624181,272635,22141,689226,74,2929468,280,543245,44490,6087,29680,23,15242,22189
rv64i_zicsr,1.02,1.13,9731538,8559698,273929,22198,720375,85,9242621,588,2340171,459594,4842,128954,178,109383,15941
rv64im_zicsr,2.86,1.10,3493939,3156218,271101,22149,691714,83,3406099,340,547671,42901,4651,34669,14,15099,15726
rv64imc_zicsr,2.82,1.12,3545301,3156218,271029,25304,691715,86,3457798,263,547535,43990,4970,34671,5,15099,15889
rv64im_zicsr_zba_zbb_zbs,3.08,1.11,3241375,2901479,273442,24665,689242,85,3150626,424,547796,41798,4635,34680,43,15111,16143
rv64gc,2.82,1.12,3545281,3156218,270740,25304,691708,86,3456812,264,547229,43969,4970,34664,5,15092,15889
rv64gc_zba_zbb_zbs,3.03,1.13,3297540,2901479,273107,26696,689236,83,3200848,250,547359,46238,6197,34675,73,15104,21328
bin/expected_results.json (Normal file, 23 lines added)
@@ -0,0 +1,23 @@
{
    "coremark": {
        "coremark/mhz": 3.38
    },
    "embench_rv32imc": {
        "wallySizeOpt_size": {
            "size geometric mean": 1.04,
            "size geometric standard deviation": 1.26
        },
        "wallySizeOpt_speed": {
            "size geometric mean": 1.07,
            "size geometric standard deviation": 1.51
        },
        "wallySpeedOpt_size": {
            "size geometric mean": 1.21,
            "size geometric standard deviation": 1.28
        },
        "wallySpeedOpt_speed": {
            "size geometric mean": 1.15,
            "size geometric standard deviation": 1.61
        }
    }
}
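These expected values are read back later in this commit by bin/validate_performance.py. A short sketch of the lookups, with the nested keys taken from the JSON above and the path resolved from the WALLY environment variable as the other scripts in this commit do:

import json
import os

WALLY = os.environ.get("WALLY")
with open(f"{WALLY}/bin/expected_results.json") as f:
    expected = json.load(f)

# Nested keys follow the structure of the file above.
print(expected["coremark"]["coremark/mhz"])                                     # 3.38
print(expected["embench_rv32imc"]["wallySizeOpt_size"]["size geometric mean"])  # 1.04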
@@ -395,6 +395,8 @@ def parse_args():
    parser.add_argument("--fp", help="Include floating-point tests in coverage (slower runtime)", action="store_true") # Currently not used
    parser.add_argument("--breker", help="Run Breker tests", action="store_true") # Requires a license for the breker tool. See tests/breker/README.md for details
    parser.add_argument("--dryrun", help="Print commands invoked to console without running regression", action="store_true")

    parser.add_argument("--performance", help="Check for performance changes or discrepancies in embench and coremark", action="store_true")
    return parser.parse_args()
@@ -415,7 +417,7 @@ def process_args(args):
        TIMEOUT_DUR = 30*60
        shutil.rmtree(f"{regressionDir}/questa/fcov_ucdb", ignore_errors=True)
        os.makedirs(f"{regressionDir}/questa/fcov_ucdb", exist_ok=True)
    elif args.buildroot:
    elif args.buildroot or args.performance: # TODO: fix timing on this because performance should also be limited on nightly
        TIMEOUT_DUR = 60*3600 # 2.5 days
    elif args.testfloat:
        sims = [testfloatsim]
@@ -423,7 +425,7 @@
    elif args.branch:
        TIMEOUT_DUR = 120*60 # seconds
    elif args.nightly:
        TIMEOUT_DUR = 30*60 # seconds
        TIMEOUT_DUR = 5*3600 # seconds # NOTE: changed this to 5 hours for nightly regression
    else:
        TIMEOUT_DUR = 10*60 # seconds
@@ -505,6 +507,41 @@ def selectTests(args, sims, coverStr):
            grepstr="All Tests completed with 0 errors",
            grepfile = sim_log)
        configs.append(tc)

    if (args.performance or args.nightly):
        # RUNNING THE EMBENCH TEST
        embench_test = TestCase(
            name="embench",
            variant="rv32gc", # is this the correct variant here? or rv32imac_zicsr
            cmd="cd $WALLY/benchmarks/embench && make run | tee run.log", # do we want to pipe result out? | tee run.log
            grepstr="SUCCESS", # not sure what keyword to put here
            grepfile=os.path.expandvars("$WALLY/benchmarks/embench/run.log")
        )
        configs.append(embench_test)

        validation_log = f"{WALLY}/bin/logs/validation.log"
        os.makedirs(os.path.dirname(validation_log), exist_ok=True)

        validate_test_dir = f"{WALLY}/bin/validate_performance.py"
        coremark_sweep_test_dir = f"{WALLY}/benchmarks/coremark/coremark_sweep.py"

        # Remove './' since you're using full paths
        # Ensure the log file exists before writing (with `touch`)
        # Also chain commands properly and safely
        performance_test = TestCase(
            name="validate_performance",
            variant="performance check",
            cmd=(
                f"touch {validation_log} && "
                f"python3 {coremark_sweep_test_dir} && "
                f"python3 {validate_test_dir} | tee {validation_log}"
            ),
            grepstr="Validation Tests completed with 0 errors", # adjust if message differs
            grepfile=validation_log
        )

        configs.append(performance_test)

    return configs
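TestCase itself is defined elsewhere in the regression script and is not part of this diff. A minimal stand-in with the same five fields used above, plus the grep-style pass check that the grepstr/grepfile pair implies, might look like the sketch below; the class name and the passed() method are assumptions, not the repository's actual implementation.

from dataclasses import dataclass

@dataclass
class TestCaseSketch:
    name: str
    variant: str
    cmd: str       # shell command the regression runner launches
    grepstr: str   # string that must appear in the log for the test to pass
    grepfile: str  # log file to search

    def passed(self) -> bool:
        # Pass if the expected success string appears anywhere in the log.
        with open(self.grepfile) as log:
            return self.grepstr in log.read()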
bin/validate_performance.py (Executable file, 173 lines added)
@@ -0,0 +1,173 @@
#!/usr/bin/env python3
import csv
import json
import os

# global variables
WALLY = os.environ.get('WALLY')
coremarkDir = f'{WALLY}/benchmarks/coremark/coremark_results.csv'
coremarkDir_expected = f'{WALLY}/benchmarks/coremark/expected_coremark_results.csv'
embenchDir = f'{WALLY}/benchmarks/embench'


def create_expected_results_json():
    # Create a dictionary to hold the expected results
    validation_data = {
        "coremark": {
            "coremark/mhz": 3.38
        },
        "embench_rv32imc": {
            "wallySizeOpt_size": {
                "size geometric mean": 1.04,
                "size geometric standard deviation": 1.26
            },
            "wallySizeOpt_speed": {
                "size geometric mean": 1.07,
                "size geometric standard deviation": 1.51
            },
            "wallySpeedOpt_size": {
                "size geometric mean": 1.21,
                "size geometric standard deviation": 1.28
            },
            "wallySpeedOpt_speed": {
                "size geometric mean": 1.15,
                "size geometric standard deviation": 1.61
            }
        }
    }

    # Write the data to a JSON file (use the same path that validate_results reads)
    with open(f'{WALLY}/bin/expected_results.json', 'w') as json_file:
        json.dump(validation_data, json_file, indent=4)  # Use validation_data instead of 'data'


def validate_results():
    # EMBENCH VALIDATION
    failing_value = ""
    # Create a list to keep track of all the JSON result files generated by embench
    embench_csv_files = ["wallySizeOpt_size", "wallySizeOpt_speed", "wallySpeedOpt_size", "wallySpeedOpt_speed"]

    for json_file in embench_csv_files:
        directory = f"{embenchDir}/{json_file}.json"
        # Open and read the JSON file
        with open(directory) as file:
            embench_log_csv_data = json.load(file)

        # Extract whether the file is a speed or size test (based on the filename)
        type_speed_size = json_file.split("_")[1]

        # Create the keys for the JSON data
        level1 = f"{type_speed_size} results"
        key_level2_mean = f"{type_speed_size} geometric mean"
        key_level2_std_dev = f"{type_speed_size} geometric standard deviation"

        # Extract the actual geometric mean and std. dev. from the JSON data
        actual_size_geometric_mean = embench_log_csv_data[level1][key_level2_mean]
        actual_size_geometric_std_dev = embench_log_csv_data[level1][key_level2_std_dev]

        # Load the expected results from the expected_results.json file
        expected_results_path = f"{WALLY}/bin/expected_results.json"
        with open(expected_results_path) as file:
            expected_data = json.load(file)

        # Extract geometric means from the expected results for comparison
        expected_wally_geometric_mean = expected_data['embench_rv32imc'][json_file]['size geometric mean']
        expected_wally_geometric_std_dev = expected_data['embench_rv32imc'][json_file]['size geometric standard deviation']

        # Compare the actual and expected results
        if (actual_size_geometric_mean != expected_wally_geometric_mean):
            failing_value += f"embench {json_file}'s geometric mean of {actual_size_geometric_mean} does not match expected value of {expected_wally_geometric_mean}\n"

        # # Update expected results file if smaller / better results
        # if actual_size_geometric_mean < expected_wally_geometric_mean:
        #     updated_expected_json = True
        #     expected_data['embench_rv32imc'][json_file]['size geometric mean'] = actual_size_geometric_mean
        #     print(f"Updated expected geometric mean for {json_file} to {actual_size_geometric_mean}")

        if (actual_size_geometric_std_dev != expected_wally_geometric_std_dev):
            failing_value += f"embench {json_file}'s geometric std. dev. of {actual_size_geometric_std_dev} does not match expected value of {expected_wally_geometric_std_dev}\n"

        # # Update expected results file if smaller / better results
        # if actual_size_geometric_std_dev < expected_wally_geometric_std_dev:
        #     updated_expected_json = True
        #     expected_data['embench_rv32imc'][json_file]['size geometric standard deviation'] = actual_size_geometric_std_dev
        #     print(f"Updated expected std. dev. for {json_file} to {actual_size_geometric_std_dev}")

    # if (updated_expected_json):
    #     with open(expected_results_path, 'w') as f:
    #         json.dump(expected_data, f, indent=4)

    #     # automatically push the expected_results.json file to github
    #     subprocess.run(["git", "add", expected_results_path])
    #     subprocess.run(["git", "commit", "-m", "Update expected results with improved metrics"])
    #     subprocess.run(["git", "push"])

    # COREMARK VALIDATION
    # coremark_run = {}
    # with open(coremarkDir, newline='') as csvfile:
    #     reader = csv.DictReader(csvfile)
    #     for row in reader:
    #         arch = row["Architecture"]
    #         coremark_run[arch] = row
    # Now you can directly index into it
    # actual_CM_MHz = coremark_run["rv32im_zicsr_zba_zbb_zbs"]["CM / MHz"]
    # expected_wally_CM_Mhz = expected_data['coremark']['coremark/mhz']
    # if (str(actual_CM_MHz) != str(expected_wally_CM_Mhz)):
    #     failure = True
    #     failing_value += f"coremark's actual CM/MHz of {actual_CM_MHz} does not match expected value of {expected_wally_CM_Mhz}\n"

    # Read in the expected results from the expected_coremark_results.csv file,
    # as well as the new one created by the test
    actual_results = read_csv_as_sorted_list(coremarkDir)
    expected_results = read_csv_as_sorted_list(coremarkDir_expected)

    # Compare the two CSV files
    if actual_results != expected_results:
        failing_value = "Coremark results do not match expected results.\n"
        print(failing_value)
        failing_value += f"Coremark results:\n{actual_results}\n"
        failing_value += f"Expected results:\n{expected_results}\n"
        exit(1)

    # Check if there were any failures
    if (failing_value == ""):
        print("Validation Tests completed with 0 errors")
    else:
        print(failing_value)
        exit(1)


def read_csv_as_sorted_list(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        rows = list(reader)
    rows.sort()  # sort rows for consistent ordering
    return rows


def main():
    create_expected_results_json()  # NOTE: regenerates the expected_results.json file on every run
    validate_results()


if __name__ == "__main__":
    main()


# do we only want to trigger with nightly - yes
# is there a reason we only care about the 3.38 from the rv32im_zicsr_zba_zbb_zbs arch - most complete
# how do I know if the two tests that produce the results I scrape from are running - just running these by default
# cd $WALLY/benchmarks/coremark
# ./coremark_sweep.py

# cd $WALLY/benchmarks/embench
# make run

# automatically push to github if better results?

# coremark sweep - creates the csv of values for diff arch
# embench benchmark - creates the 4 json files for speed/size

# check if there are differences between runs of coremark sweep on the csv --> done
# need to standardize timeout duration between the performance flag and nightly
# need to make sure it is failing when results differ
# need to check if I need to validate more values in this file (maybe do a diff for the csv) --> done this part (more to come in the future likely)
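For reference, a sketch of the embench summary JSON shape that the validate_results() loop assumes, reconstructed from how level1 and the key_level2_* names are built; the numbers are placeholders, not measured Wally results.

# Placeholder values only; keys mirror the f-strings in validate_results().
example_speed_json = {
    "speed results": {
        "speed geometric mean": 1.00,
        "speed geometric standard deviation": 1.00
    }
}
# validate_results() indexes it as:
#   example_speed_json["speed results"]["speed geometric mean"]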