cvw/bin/validate_performance.py


#!/usr/bin/env python3
import csv
import json
import os
# global variables
WALLY = os.environ.get('WALLY')
coremarkDir = f'{WALLY}/benchmarks/coremark/coremark_results.csv'
coremarkDir_expected = f'{WALLY}/benchmarks/coremark/expected_coremark_results.csv'
embenchDir = f'{WALLY}/benchmarks/embench'
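# Optional sanity check (a sketch, not part of the original script): the paths above assume
# the WALLY environment variable is set (typically by sourcing the repository's setup script).
# If it is unset, os.environ.get() returns None and every path silently becomes "None/...".
# A guard along these lines could fail fast instead:
# if WALLY is None:
#     raise SystemExit("WALLY environment variable is not set")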
def create_expected_results_json():
    # Create a dictionary to hold the expected results
    validation_data = {
        "coremark": {
            "coremark/mhz": 3.38
        },
        "embench_rv32imc": {
            "wallySizeOpt_size": {
                "size geometric mean": 1.04,
                "size geometric standard deviation": 1.26
            },
            "wallySizeOpt_speed": {
                "size geometric mean": 1.07,
                "size geometric standard deviation": 1.51
            },
            "wallySpeedOpt_size": {
                "size geometric mean": 1.21,
                "size geometric standard deviation": 1.28
            },
            "wallySpeedOpt_speed": {
                "size geometric mean": 1.15,
                "size geometric standard deviation": 1.61
            }
        }
    }

    # Write the data to a JSON file
    with open('expected_results.json', 'w') as json_file:
        json.dump(validation_data, json_file, indent=4)
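    # Note (added observation): this writes expected_results.json to the current working
    # directory, while validate_results() reads it from f"{WALLY}/bin/expected_results.json",
    # so the two only refer to the same file when the script is run from $WALLY/bin.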
def validate_results():
    failing_value = ""

    # Load the expected results from the expected_results.json file
    expected_results_path = f"{WALLY}/bin/expected_results.json"
    with open(expected_results_path) as file:
        expected_data = json.load(file)

    # EMBENCH VALIDATION
    # The four JSON result files generated by embench
    embench_json_files = ["wallySizeOpt_size", "wallySizeOpt_speed", "wallySpeedOpt_size", "wallySpeedOpt_speed"]
    for json_file in embench_json_files:
        json_path = f"{embenchDir}/{json_file}.json"

        # Open and read the JSON file
        with open(json_path) as file:
            embench_log_data = json.load(file)

        # Extract whether the file is a speed or size test (based on the filename)
        type_speed_size = json_file.split("_")[1]

        # Create the keys for the JSON data
        level1 = f"{type_speed_size} results"
        key_level2_mean = f"{type_speed_size} geometric mean"
        key_level2_std_dev = f"{type_speed_size} geometric standard deviation"

        # Extract the actual geometric mean and std. dev. from the JSON data
        actual_size_geometric_mean = embench_log_data[level1][key_level2_mean]
        actual_size_geometric_std_dev = embench_log_data[level1][key_level2_std_dev]

        # Extract geometric means from the expected results for comparison
        expected_wally_geometric_mean = expected_data['embench_rv32imc'][json_file]['size geometric mean']
        expected_wally_geometric_std_dev = expected_data['embench_rv32imc'][json_file]['size geometric standard deviation']

        # Compare the actual and expected results
        if actual_size_geometric_mean != expected_wally_geometric_mean:
            failing_value += f"embench {json_file}'s geometric mean of {actual_size_geometric_mean} does not match expected value of {expected_wally_geometric_mean}\n"
            # # Update expected results file if smaller / better results
            # if actual_size_geometric_mean < expected_wally_geometric_mean:
            #     updated_expected_json = True
            #     expected_data['embench_rv32imc'][json_file]['size geometric mean'] = actual_size_geometric_mean
            #     print(f"Updated expected geometric mean for {json_file} to {actual_size_geometric_mean}")
        if actual_size_geometric_std_dev != expected_wally_geometric_std_dev:
            failing_value += f"embench {json_file}'s geometric std. dev. of {actual_size_geometric_std_dev} does not match expected value of {expected_wally_geometric_std_dev}\n"
            # # Update expected results file if smaller / better results
            # if actual_size_geometric_std_dev < expected_wally_geometric_std_dev:
            #     updated_expected_json = True
            #     expected_data['embench_rv32imc'][json_file]['size geometric standard deviation'] = actual_size_geometric_std_dev
            #     print(f"Updated expected std. dev. for {json_file} to {actual_size_geometric_std_dev}")

    # if updated_expected_json:
    #     with open(expected_results_path, 'w') as f:
    #         json.dump(expected_data, f, indent=4)
    #     # Automatically push the expected_results.json file to GitHub (would also need "import subprocess")
    #     subprocess.run(["git", "add", expected_results_path])
    #     subprocess.run(["git", "commit", "-m", "Update expected results with improved metrics"])
    #     subprocess.run(["git", "push"])
    # COREMARK VALIDATION
    # coremark_run = {}
    # with open(coremarkDir, newline='') as csvfile:
    #     reader = csv.DictReader(csvfile)
    #     for row in reader:
    #         arch = row["Architecture"]
    #         coremark_run[arch] = row
    # # Now you can directly index into it
    # actual_CM_MHz = coremark_run["rv32im_zicsr_zba_zbb_zbs"]["CM / MHz"]
    # expected_wally_CM_Mhz = expected_data['coremark']['coremark/mhz']
    # if str(actual_CM_MHz) != str(expected_wally_CM_Mhz):
    #     failure = True
    #     failing_value += f"coremark's actual CM/MHz of {actual_CM_MHz} does not match expected value of {expected_wally_CM_Mhz}\n"

    # Read both the expected_coremark_results.csv file and the new CSV created by the test
    actual_results = read_csv_as_sorted_list(coremarkDir)
    expected_results = read_csv_as_sorted_list(coremarkDir_expected)

    # Compare the two CSV files
    if actual_results != expected_results:
        failing_value += "Coremark results do not match expected results.\n"
        failing_value += f"Coremark results:\n{actual_results}\n"
        failing_value += f"Expected results:\n{expected_results}\n"

    # Check if there were any failures
    if failing_value == "":
        print("Validation Tests completed with 0 errors")
    else:
        print(failing_value)
        exit(1)
def read_csv_as_sorted_list(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        rows = list(reader)
    rows.sort()  # sort rows for consistent ordering
    return rows
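# Sketch, not wired into the current flow: the embench checks in validate_results() use exact
# equality on floating-point geometric means, so a change in the last decimal place fails the
# run. If a tolerance is ever wanted, a helper like this (hypothetical name, 1% tolerance
# chosen arbitrarily) could replace the != comparisons:
def within_tolerance(actual, expected, rel_tol=0.01):
    # True when actual is within rel_tol (as a fraction) of expected
    return abs(actual - expected) <= rel_tol * abs(expected)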
def main():
    create_expected_results_json()  # regenerates expected_results.json in the current directory from the values above
    validate_results()

if __name__ == "__main__":
    main()
# do we only want to trigger with nightly - yes
# is there a reason we only care about the 3.38 from the rv32im_zicsr_zba_zbb_zbs arch - it is the most complete configuration
# how do I know that the two tests that produce the results scraped here have run - just run these defaults:
#   cd $WALLY/benchmarks/coremark
#   ./coremark_sweep.py
#   cd $WALLY/benchmarks/embench
#   make run
# automatically push to github if better results?
# coremark sweep - creates the csv of values for the different architectures
# embench benchmark - creates the 4 json files for speed/size
# check if there are differences between runs of coremark sweep on the csv --> done
# need to standardize the timeout duration between the performance flag and nightly
# need to make sure it is failing when results differ
# need to check whether more values in this file should be validated (maybe do a diff for the csv) --> done for now (more likely to come in the future)