diff --git a/benchmarks/coremark/coremark_sweep.py b/benchmarks/coremark/coremark_sweep.py
index 2e2ab4555..5e6b099e7 100755
--- a/benchmarks/coremark/coremark_sweep.py
+++ b/benchmarks/coremark/coremark_sweep.py
@@ -33,6 +33,13 @@
 import csv
 import os
 import re
+WALLY = os.environ.get("WALLY")
+
+# Set working directory to where the Makefile is
+coremark_dir = os.path.join(WALLY, "benchmarks/coremark")
+os.chdir(coremark_dir)
+
+
 # list of architectures to run.
 arch_list = [
     "rv32i_zicsr",
@@ -54,7 +61,8 @@
 mt_regex = r"Elapsed MTIME: (\d+).*?Elapsed MINSTRET: (\d+).*?COREMARK/MHz Score
 #cpi_regex = r"CPI: \d+ / \d+ = (\d+\.\d+)"
 #cmhz_regex = r"COREMARK/MHz Score: [\d,]+ / [\d,]+ = (\d+\.\d+)"
 # Open a CSV file to write the results
-resultfile = 'coremark_results.csv'
+resultfile = os.path.join(coremark_dir, 'coremark_results.csv')
+# resultfile = 'coremark_results.csv'
 with open(resultfile, mode='w', newline='') as csvfile:
     fieldnames = ['Architecture', 'CM / MHz','CPI','MTIME','MINSTRET','Load Stalls','Store Stalls','D$ Accesses', 'D$ Misses','I$ Accesses','I$ Misses','Branches','Branch Mispredicts','BTB Misses',
diff --git a/benchmarks/coremark/expected_coremark_results.csv b/benchmarks/coremark/expected_coremark_results.csv
new file mode 100644
index 000000000..fff7b0055
--- /dev/null
+++ b/benchmarks/coremark/expected_coremark_results.csv
@@ -0,0 +1,13 @@
+Architecture,CM / MHz,CPI,MTIME,MINSTRET,Load Stalls,Store Stalls,D$ Accesses,D$ Misses,I$ Accesses,I$ Misses,Branches,Branch Mispredicts,BTB Misses,Jump/JR,RAS Wrong,Returns,BP Class Pred Wrong
+rv32i_zicsr,1.20,1.16,8269385,7124630,261886,22307,716317,73,7827908,1040,2009578,443447,5476,122015,3468,113645,20699
+rv32im_zicsr,3.26,1.12,3061233,2716910,264489,22198,690506,73,2975498,827,543885,45067,5483,30033,69,15237,20898
+rv32imc_zicsr,3.24,1.13,3085767,2716550,264404,23853,690507,75,3011253,285,543761,44223,5615,29675,171,15237,20295
+rv32im_zicsr_zba_zbb_zbs,3.38,1.12,2954414,2624181,266850,22258,689232,75,2878922,494,544408,42375,4295,29685,18,15249,14980
+rv32gc,3.24,1.13,3085783,2716550,264134,23852,690500,74,3010201,286,543485,44182,5563,29668,162,15230,20108
+rv32gc_zba_zbb_zbs,3.32,1.14,3003843,2624181,272635,22141,689226,74,2929468,280,543245,44490,6087,29680,23,15242,22189
+rv64i_zicsr,1.02,1.13,9731538,8559698,273929,22198,720375,85,9242621,588,2340171,459594,4842,128954,178,109383,15941
+rv64im_zicsr,2.86,1.10,3493939,3156218,271101,22149,691714,83,3406099,340,547671,42901,4651,34669,14,15099,15726
+rv64imc_zicsr,2.82,1.12,3545301,3156218,271029,25304,691715,86,3457798,263,547535,43990,4970,34671,5,15099,15889
+rv64im_zicsr_zba_zbb_zbs,3.08,1.11,3241375,2901479,273442,24665,689242,85,3150626,424,547796,41798,4635,34680,43,15111,16143
+rv64gc,2.82,1.12,3545281,3156218,270740,25304,691708,86,3456812,264,547229,43969,4970,34664,5,15092,15889
+rv64gc_zba_zbb_zbs,3.03,1.13,3297540,2901479,273107,26696,689236,83,3200848,250,547359,46238,6197,34675,73,15104,21328
diff --git a/bin/expected_results.json b/bin/expected_results.json
new file mode 100644
index 000000000..8ffd582a8
--- /dev/null
+++ b/bin/expected_results.json
@@ -0,0 +1,23 @@
+{
+    "coremark": {
+        "coremark/mhz": 3.38
+    },
+    "embench_rv32imc": {
+        "wallySizeOpt_size": {
+            "size geometric mean": 1.04,
+            "size geometric standard deviation": 1.26
+        },
+        "wallySizeOpt_speed": {
+            "size geometric mean": 1.07,
+            "size geometric standard deviation": 1.51
+        },
+        "wallySpeedOpt_size": {
+            "size geometric mean": 1.21,
+            "size geometric standard deviation": 1.28
+        },
"wallySpeedOpt_speed": { + "size geometric mean": 1.15, + "size geometric standard deviation": 1.61 + } + } +} \ No newline at end of file diff --git a/bin/regression-wally b/bin/regression-wally index 88b88a000..ae12761e8 100755 --- a/bin/regression-wally +++ b/bin/regression-wally @@ -395,6 +395,8 @@ def parse_args(): parser.add_argument("--fp", help="Include floating-point tests in coverage (slower runtime)", action="store_true") # Currently not used parser.add_argument("--breker", help="Run Breker tests", action="store_true") # Requires a license for the breker tool. See tests/breker/README.md for details parser.add_argument("--dryrun", help="Print commands invoked to console without running regression", action="store_true") + + parser.add_argument("--performance", help="Check for performance changes or discrepencies in embench and coremark", action="store_true") return parser.parse_args() @@ -415,7 +417,7 @@ def process_args(args): TIMEOUT_DUR = 30*60 shutil.rmtree(f"{regressionDir}/questa/fcov_ucdb", ignore_errors=True) os.makedirs(f"{regressionDir}/questa/fcov_ucdb", exist_ok=True) - elif args.buildroot: + elif args.buildroot or args.performance: # TODO: fix timing on this bc performance shodl also be limit on nightly TIMEOUT_DUR = 60*3600 # 2.5 days elif args.testfloat: sims = [testfloatsim] @@ -423,7 +425,7 @@ def process_args(args): elif args.branch: TIMEOUT_DUR = 120*60 # seconds elif args.nightly: - TIMEOUT_DUR = 30*60 # seconds + TIMEOUT_DUR = 5*3600 # seconds # NOTE: changed this to 5 hours for nightly regression else: TIMEOUT_DUR = 10*60 # seconds @@ -505,6 +507,41 @@ def selectTests(args, sims, coverStr): grepstr="All Tests completed with 0 errors", grepfile = sim_log) configs.append(tc) + + if (args.performance or args.nightly): + # RUNNING THE EMBENCH TEST + embench_test = TestCase( + name="embench", + variant="rv32gc", # is this the correct variant here? or rv32imac_zicsr + cmd="cd $WALLY/benchmarks/embench && make run | tee run.log", # do we want to pipe result out? 
+            grepstr="SUCCESS", # TODO: confirm the success string printed by the embench run
+            grepfile=os.path.expandvars("$WALLY/benchmarks/embench/run.log")
+        )
+        configs.append(embench_test)
+
+        validation_log = f"{WALLY}/bin/logs/validation.log"
+        os.makedirs(os.path.dirname(validation_log), exist_ok=True)
+
+        validate_script = f"{WALLY}/bin/validate_performance.py"
+        coremark_sweep_script = f"{WALLY}/benchmarks/coremark/coremark_sweep.py"
+
+        # Use absolute paths so this works from any directory.
+        # Touch the log so grepfile exists even if a command fails early,
+        # then run the coremark sweep followed by the validation script.
+        performance_test = TestCase(
+            name="validate_performance",
+            variant="performance check",
+            cmd=(
+                f"touch {validation_log} && "
+                f"python3 {coremark_sweep_script} && "
+                f"python3 {validate_script} | tee {validation_log}"
+            ),
+            grepstr="Validation Tests completed with 0 errors", # must match the message printed by validate_performance.py
+            grepfile=validation_log
+        )
+
+        configs.append(performance_test)
+
     return configs
diff --git a/bin/validate_performance.py b/bin/validate_performance.py
new file mode 100755
index 000000000..ea5598215
--- /dev/null
+++ b/bin/validate_performance.py
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+import csv
+import json
+import os
+
+# global variables
+WALLY = os.environ.get('WALLY')
+coremarkDir = f'{WALLY}/benchmarks/coremark/coremark_results.csv'
+coremarkDir_expected = f'{WALLY}/benchmarks/coremark/expected_coremark_results.csv'
+embenchDir = f'{WALLY}/benchmarks/embench'
+
+
+def create_expected_results_json():
+    # Create a dictionary to hold the expected results
+    validation_data = {
+        "coremark": {
+            "coremark/mhz": 3.38
+        },
+        "embench_rv32imc": {
+            "wallySizeOpt_size": {
+                "size geometric mean": 1.04,
+                "size geometric standard deviation": 1.26
+            },
+            "wallySizeOpt_speed": {
+                "size geometric mean": 1.07,
+                "size geometric standard deviation": 1.51
+            },
+            "wallySpeedOpt_size": {
+                "size geometric mean": 1.21,
+                "size geometric standard deviation": 1.28
+            },
+            "wallySpeedOpt_speed": {
+                "size geometric mean": 1.15,
+                "size geometric standard deviation": 1.61
+            }
+        }
+    }
+
+    # Write the data to a JSON file
+    with open('expected_results.json', 'w') as json_file:
+        json.dump(validation_data, json_file, indent=4)
+
+def validate_results():
+    # EMBENCH VALIDATION
+    failing_value = ""
+    # List of the JSON result files generated by embench
+    embench_json_files = ["wallySizeOpt_size", "wallySizeOpt_speed", "wallySpeedOpt_size", "wallySpeedOpt_speed"]
+
+    for json_file in embench_json_files:
+        directory = f"{embenchDir}/{json_file}.json"
+        # Open and read the JSON file
+        with open(directory) as file:
+            embench_log_csv_data = json.load(file)
+
+        # Extract whether the file is a speed or size test (based on the filename)
+        type_speed_size = json_file.split("_")[1]
+
+        # Create the keys for the JSON data
+        level1 = f"{type_speed_size} results"
+        key_level2_mean = f"{type_speed_size} geometric mean"
+        key_level2_std_dev = f"{type_speed_size} geometric standard deviation"
+
+        # Extract the actual geometric mean and std. dev. from the JSON data
+        actual_size_geometric_mean = embench_log_csv_data[level1][key_level2_mean]
+        actual_size_geometric_std_dev = embench_log_csv_data[level1][key_level2_std_dev]
+
+        # Load the expected results from the expected_results.json file
+        expected_results_path = f"{WALLY}/bin/expected_results.json"
+        with open(expected_results_path) as file:
+            expected_data = json.load(file)
+
+        # Extract geometric means from the expected results for comparison
+        expected_wally_geometric_mean = expected_data['embench_rv32imc'][json_file]['size geometric mean']
+        expected_wally_geometric_std_dev = expected_data['embench_rv32imc'][json_file]['size geometric standard deviation']
+
+        # Compare the actual and expected results
+        if (actual_size_geometric_mean != expected_wally_geometric_mean):
+            failing_value += f"embench {json_file}'s geometric mean of {actual_size_geometric_mean} does not match expected value of {expected_wally_geometric_mean}\n"
+
+        # # Update expected results file if smaller / better results
+        # if actual_size_geometric_mean < expected_wally_geometric_mean:
+        #     updated_expected_json = True
+        #     expected_data['embench_rv32imc'][json_file]['size geometric mean'] = actual_size_geometric_mean
+        #     print(f"Updated expected geometric mean for {json_file} to {actual_size_geometric_mean}")
+
+
+        if (actual_size_geometric_std_dev != expected_wally_geometric_std_dev):
+            failing_value += f"embench {json_file}'s geometric std. dev. of {actual_size_geometric_std_dev} does not match expected value of {expected_wally_geometric_std_dev}\n"
+
+        # # Update expected results file if smaller / better results
+        # if actual_size_geometric_std_dev < expected_wally_geometric_std_dev:
+        #     updated_expected_json = True
+        #     expected_data['embench_rv32imc'][json_file]['size geometric standard deviation'] = actual_size_geometric_std_dev
+        #     print(f"Updated expected std. dev. for {json_file} to {actual_size_geometric_std_dev}")
+
+        # if (updated_expected_json):
+        #     with open(expected_results_path, 'w') as f:
+        #         json.dump(expected_data, f, indent=4)
+
+        #     # automatically push the expected_results.json file to github
+        #     subprocess.run(["git", "add", expected_results_path])
+        #     subprocess.run(["git", "commit", "-m", "Update expected results with improved metrics"])
+        #     subprocess.run(["git", "push"])
+
+    # COREMARK VALIDATION
+    # coremark_run = {}
+    # with open(coremarkDir, newline='') as csvfile:
+    #     reader = csv.DictReader(csvfile)
+    #     for row in reader:
+    #         arch = row["Architecture"]
+    #         coremark_run[arch] = row
+    # Now you can directly index into it
+    # actual_CM_MHz = coremark_run["rv32im_zicsr_zba_zbb_zbs"]["CM / MHz"]
+    # expected_wally_CM_Mhz = expected_data['coremark']['coremark/mhz']
+    # if (str(actual_CM_MHz) != str(expected_wally_CM_Mhz)):
+    #     failure = True
+    #     failing_value += f"coremark's actual CM/MHz of {actual_CM_MHz} does not match expected value of {expected_wally_CM_Mhz}\n"
+
+
+    # Read in the expected results from the expected_coremark_results.csv file,
+    # as well as the new one created by the test
+    actual_results = read_csv_as_sorted_list(coremarkDir)
+    expected_results = read_csv_as_sorted_list(coremarkDir_expected)
+
+    # Compare the two CSV files; append to failing_value so embench failures are not lost
+    if actual_results != expected_results:
+        failing_value += "Coremark results do not match expected results.\n"
+        failing_value += f"Coremark results:\n{actual_results}\n"
+        failing_value += f"Expected results:\n{expected_results}\n"
+
+    # Check if there were any failures
+    if (failing_value == ""):
+        print("Validation Tests completed with 0 errors")
+    else:
+        print(failing_value)
+        exit(1)
+
+
+def read_csv_as_sorted_list(filename):
+    with open(filename, newline='') as f:
+        reader = csv.reader(f)
+        rows = list(reader)
+        rows.sort() # sort rows for consistent ordering
+        return rows
+
+def main():
+    create_expected_results_json() # regenerates expected_results.json (written to the current working directory)
+    validate_results()
+
+if __name__ == "__main__":
+    main()
+
+
+# do we only want to trigger with nightly? - yes
+# is there a reason we only care about the 3.38 from the rv32im_zicsr_zba_zbb_zbs arch? - it is the most complete configuration
+# how do I know the two tests that produce the results scraped here are running? - just run their defaults:
+    # cd $WALLY/benchmarks/coremark
+    # ./coremark_sweep.py
+
+    # cd $WALLY/benchmarks/embench
+    # make run
+
+# automatically push to github if better results?
+
+# coremark sweep - creates the csv of values for the different architectures
+# embench benchmark - creates the 4 json files for speed/size
+
+# check if there are differences between runs of coremark sweep on the csv --> done
+# need to standardize the timeout duration between the performance flag and nightly
+# need to make sure it fails when the results differ
+# need to check whether more values in this file should be validated (maybe do a diff for the csv) --> done for this part (more to come in the future, likely)
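One open item in the notes above is making sure the validation fails when results genuinely drift without tripping on harmless run-to-run noise. If exact equality on the floating-point metrics turns out to be too strict, a relative-tolerance check is one option. The sketch below is illustrative only and is not part of this patch; `metric_matches`, the 1% tolerance, and the sample values are placeholders.

```python
import math

def metric_matches(actual: float, expected: float, rel_tol: float = 0.01) -> bool:
    """Return True if actual is within rel_tol (here 1%) of expected."""
    return math.isclose(actual, expected, rel_tol=rel_tol)

# Example: compare an embench geometric mean against its expected baseline
failures = []
actual_mean, expected_mean = 1.07, 1.04   # placeholder values, not real measurements
if not metric_matches(actual_mean, expected_mean):
    failures.append(f"geometric mean {actual_mean} is outside tolerance of expected {expected_mean}")
print(failures if failures else "within tolerance")
```

Exact comparison (as used for the CoreMark CSV diff) still makes sense for integer counters; a tolerance would only apply to the derived floating-point metrics.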