cvw/bin/validate_performance.py


#!/usr/bin/env python3
import csv
import json
import os
# global variables
WALLY = os.environ.get('WALLY')
coremarkDir = f'{WALLY}/benchmarks/coremark/coremark_results.csv'
coremarkDir_expected = f'{WALLY}/benchmarks/coremark/expected_coremark_results.csv'
embenchDir = f'{WALLY}/benchmarks/embench'
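# Optional sanity check (a sketch, not part of the original script): the paths above assume
# the WALLY environment variable is set (typically by sourcing the repository's setup script).
# If it is unset, os.environ.get() returns None and every path silently becomes "None/...".
# A guard along these lines could fail fast instead:
# if WALLY is None:
#     raise SystemExit("WALLY environment variable is not set")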
def create_expected_results_json():
    # Create a dictionary to hold the expected results
    validation_data = {
        "coremark": {
            "coremark/mhz": 3.38
        },
        "embench_rv32imc": {
            "wallySizeOpt_size": {
                "size geometric mean": 1.04,
                "size geometric standard deviation": 1.26
            },
            "wallySizeOpt_speed": {
                "size geometric mean": 1.07,
                "size geometric standard deviation": 1.51
            },
            "wallySpeedOpt_size": {
                "size geometric mean": 1.21,
                "size geometric standard deviation": 1.28
            },
            "wallySpeedOpt_speed": {
                "size geometric mean": 1.15,
                "size geometric standard deviation": 1.61
            }
        }
    }

    # Write the data to a JSON file
    with open('expected_results.json', 'w') as json_file:
        json.dump(validation_data, json_file, indent=4)
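    # Note (added observation): this writes expected_results.json to the current working
    # directory, while validate_results() reads it from f"{WALLY}/bin/expected_results.json",
    # so the two only refer to the same file when the script is run from $WALLY/bin.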
def validate_results():
    failing_value = ""

    # Load the expected results from the expected_results.json file
    expected_results_path = f"{WALLY}/bin/expected_results.json"
    with open(expected_results_path) as file:
        expected_data = json.load(file)

    # EMBENCH VALIDATION
    # The four JSON result files generated by embench
    embench_json_files = ["wallySizeOpt_size", "wallySizeOpt_speed", "wallySpeedOpt_size", "wallySpeedOpt_speed"]
    for json_file in embench_json_files:
        json_path = f"{embenchDir}/{json_file}.json"

        # Open and read the JSON file
        with open(json_path) as file:
            embench_log_data = json.load(file)

        # Extract whether the file is a speed or size test (based on the filename)
        type_speed_size = json_file.split("_")[1]

        # Create the keys for the JSON data
        level1 = f"{type_speed_size} results"
        key_level2_mean = f"{type_speed_size} geometric mean"
        key_level2_std_dev = f"{type_speed_size} geometric standard deviation"

        # Extract the actual geometric mean and std. dev. from the JSON data
        actual_size_geometric_mean = embench_log_data[level1][key_level2_mean]
        actual_size_geometric_std_dev = embench_log_data[level1][key_level2_std_dev]

        # Extract geometric means from the expected results for comparison
        expected_wally_geometric_mean = expected_data['embench_rv32imc'][json_file]['size geometric mean']
        expected_wally_geometric_std_dev = expected_data['embench_rv32imc'][json_file]['size geometric standard deviation']

        # Compare the actual and expected results
        if actual_size_geometric_mean != expected_wally_geometric_mean:
            failing_value += f"embench {json_file}'s geometric mean of {actual_size_geometric_mean} does not match expected value of {expected_wally_geometric_mean}\n"
            # # Update expected results file if smaller / better results
            # if actual_size_geometric_mean < expected_wally_geometric_mean:
            #     updated_expected_json = True
            #     expected_data['embench_rv32imc'][json_file]['size geometric mean'] = actual_size_geometric_mean
            #     print(f"Updated expected geometric mean for {json_file} to {actual_size_geometric_mean}")
        if actual_size_geometric_std_dev != expected_wally_geometric_std_dev:
            failing_value += f"embench {json_file}'s geometric std. dev. of {actual_size_geometric_std_dev} does not match expected value of {expected_wally_geometric_std_dev}\n"
            # # Update expected results file if smaller / better results
            # if actual_size_geometric_std_dev < expected_wally_geometric_std_dev:
            #     updated_expected_json = True
            #     expected_data['embench_rv32imc'][json_file]['size geometric standard deviation'] = actual_size_geometric_std_dev
            #     print(f"Updated expected std. dev. for {json_file} to {actual_size_geometric_std_dev}")

    # if updated_expected_json:
    #     with open(expected_results_path, 'w') as f:
    #         json.dump(expected_data, f, indent=4)
    #     # Automatically push the expected_results.json file to GitHub (would also need "import subprocess")
    #     subprocess.run(["git", "add", expected_results_path])
    #     subprocess.run(["git", "commit", "-m", "Update expected results with improved metrics"])
    #     subprocess.run(["git", "push"])
    # COREMARK VALIDATION
    # coremark_run = {}
    # with open(coremarkDir, newline='') as csvfile:
    #     reader = csv.DictReader(csvfile)
    #     for row in reader:
    #         arch = row["Architecture"]
    #         coremark_run[arch] = row
    # # Now you can directly index into it
    # actual_CM_MHz = coremark_run["rv32im_zicsr_zba_zbb_zbs"]["CM / MHz"]
    # expected_wally_CM_Mhz = expected_data['coremark']['coremark/mhz']
    # if str(actual_CM_MHz) != str(expected_wally_CM_Mhz):
    #     failure = True
    #     failing_value += f"coremark's actual CM/MHz of {actual_CM_MHz} does not match expected value of {expected_wally_CM_Mhz}\n"

    # Read both the expected_coremark_results.csv file and the new CSV created by the test
    actual_results = read_csv_as_sorted_list(coremarkDir)
    expected_results = read_csv_as_sorted_list(coremarkDir_expected)

    # Compare the two CSV files
    if actual_results != expected_results:
        failing_value += "Coremark results do not match expected results.\n"
        failing_value += f"Coremark results:\n{actual_results}\n"
        failing_value += f"Expected results:\n{expected_results}\n"

    # Check if there were any failures
    if failing_value == "":
        print("Validation Tests completed with 0 errors")
    else:
        print(failing_value)
        exit(1)
def read_csv_as_sorted_list(filename):
    with open(filename, newline='') as f:
        reader = csv.reader(f)
        rows = list(reader)
    rows.sort()  # sort rows for consistent ordering
    return rows
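# Sketch, not wired into the current flow: the embench checks in validate_results() use exact
# equality on floating-point geometric means, so a change in the last decimal place fails the
# run. If a tolerance is ever wanted, a helper like this (hypothetical name, 1% tolerance
# chosen arbitrarily) could replace the != comparisons:
def within_tolerance(actual, expected, rel_tol=0.01):
    # True when actual is within rel_tol (as a fraction) of expected
    return abs(actual - expected) <= rel_tol * abs(expected)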
def main():
    create_expected_results_json()  # regenerates expected_results.json in the current directory from the values above
    validate_results()

if __name__ == "__main__":
    main()
# do we only want to trigger with nightly - yes
# is there a reason we only care about the 3.38 from the rv32im_zicsr_zba_zbb_zbs arch - it is the most complete configuration
# how do I know that the two tests that produce the results scraped here have run - just run these defaults:
#   cd $WALLY/benchmarks/coremark
#   ./coremark_sweep.py
#   cd $WALLY/benchmarks/embench
#   make run
# automatically push to github if better results?
# coremark sweep - creates the csv of values for the different architectures
# embench benchmark - creates the 4 json files for speed/size
# check if there are differences between runs of coremark sweep on the csv --> done
# need to standardize the timeout duration between the performance flag and nightly
# need to make sure it is failing when results differ
# need to check whether more values in this file should be validated (maybe do a diff for the csv) --> done for now (more likely to come in the future)