mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Script checkin and code cleanup
This commit is contained in:
parent
99c6a1af5a
commit
0e3badf723
2 changed files with 31 additions and 19 deletions
22
run_final.sh
Executable file
22
run_final.sh
Executable file
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env bash
# Sweep the matmul regression test over matrix sizes and tensor-core
# settings (tc_size, tc_num) using the simx driver with 4 cores,
# 32 warps and 32 threads. The output of every run is captured in
#   sim_final/mat<size>/tcsize<tcsize>_tcnum<tcnum>_32w32t
#
# All relative paths (./ci/blackbox.sh, sim_final/, the matmul Makefile)
# are resolved against the current working directory.

# Sweep parameters: matrix sizes, tc_size values and tc_num values
matrix_sizes=(16 32 64 128 256 512)
tcsizes=(8 16 32)
tcnums=(4 8 16 32)
#lsulanes=(4 16)
#cores=(32)

# Candidate locations of the matmul Makefile. Every candidate that exists is
# patched; missing ones are skipped instead of making sed fail on each pass.
makefiles=(
  ../tests/regression/matmul/Makefile
  tests/regression/matmul/Makefile
)

# Loop through each combination of matrix size, tc_size and tc_num
for size in "${matrix_sizes[@]}"; do
  patched=0
  for makefile in "${makefiles[@]}"; do
    if [ -f "$makefile" ]; then
      sed -i "s/OPTS ?= -n[0-9]\+/OPTS ?= -n${size}/" "$makefile"
      patched=1
    fi
  done
  if [ "$patched" -eq 0 ]; then
    echo "warning: no matmul Makefile found; matrix size was not updated" >&2
  fi
  echo "Matrix size changed to ${size} in Makefile"

  # The redirection below cannot create missing parent directories, so make
  # sure the per-size log directory exists before the first run.
  mkdir -p "sim_final/mat${size}"

  for tcsize in "${tcsizes[@]}"; do
    for tcnum in "${tcnums[@]}"; do
      log_name="sim_final/mat${size}/tcsize${tcsize}_tcnum${tcnum}_32w32t"
      # Keep the command as an argument array so it can be run directly
      # (no eval) while still echoing the exact command line being executed.
      run_cmd=(./ci/blackbox.sh --cores=4 --app=matmul --driver=simx --threads=32 --warps=32 "--tc_size=${tcsize}" "--tc_num=${tcnum}" --rebuild=1 --perf=1)
      echo "${run_cmd[*]} > ${log_name} 2>&1"
      "${run_cmd[@]}" > "${log_name}" 2>&1
    done
  done
done
|
|
@ -1432,8 +1432,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
uint32_t data_bytes_store;
|
||||
uint32_t num_threads_per_tc = MAX (1, num_threads/TC_per_warp);
|
||||
|
||||
//int num_warps = MIN()
|
||||
//int active_tcs = MIN (TC_per_warp, num_output_tiles/num_warps)
|
||||
//LOAD
|
||||
if(num_threads > tc_size*tc_size*n_tiles*TC_per_warp)
|
||||
{
|
||||
|
@ -1448,11 +1446,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
data_bytes_load = mem_bytes*num_data_per_thread;
|
||||
|
||||
//STORE
|
||||
|
||||
// DP(3, "DEBUG :: num_threads = " << num_threads);
|
||||
// DP(3, "DEBUG :: tc_size*tc_size = " << tc_size*tc_size);
|
||||
//DP(3, "imm = " << immsrc);
|
||||
|
||||
if(num_threads > tc_size*tc_size*TC_per_warp)
|
||||
{
|
||||
num_threads_actv_st = tc_size*tc_size*TC_per_warp;
|
||||
|
@ -1499,8 +1492,6 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n] = *temp_ref;
|
||||
DP(3, "Scratchpad Index: " << loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n << ", Value: " << scratchpad[loop_offset + (immsrc*(n_tiles)*tc_size*tc_size) + (t*num_data_per_thread) + n]);
|
||||
}
|
||||
//loop_offset += tc_size*tc_size;
|
||||
//}
|
||||
}
|
||||
rd_write = true;
|
||||
} break;
|
||||
|
@ -1531,7 +1522,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
uint32_t csr_index = (2*num_data_per_thread_st) + n;
|
||||
uint32_t scratchpad_index = (tc_size*tc_size*2) + (t*num_data_per_thread) + n;
|
||||
|
||||
//scratchpad -> csr (TODO :: can intermediate step of moving to CSR be skipped?)
|
||||
//scratchpad -> csr (TODO :: removed intermediate CSR stage ; incorporate limited scratchpad implementation)
|
||||
//core_->set_csr(csr_addr[(2*num_data_per_thread) + n], scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread) + n], t, warp_id_);
|
||||
Word* temp_ref = &(warp.ireg_file.at(t).at(rsrc0));
|
||||
*temp_ref = scratchpad[(n_tiles*tc_size*tc_size*2) + (t*num_data_per_thread_st) + n];
|
||||
|
@ -1562,14 +1553,14 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
//TC operation [only 1 thread in 1 warp needs to do this]
|
||||
if (t%threads_per_tc == 0)
|
||||
{
|
||||
//TODO - change to systolic array implementation
|
||||
//TODO : change to systolic array implementation
|
||||
uint32_t thread_offset = t*(tc_size*tc_size);
|
||||
int loop_offset = 0;
|
||||
int offset_b = n_tiles*n_tiles*n_tiles*tc_size*tc_size;
|
||||
// Loop over all tiles - output stationary
|
||||
//for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation?
|
||||
//{
|
||||
/*
|
||||
/*
|
||||
// TODO : Fix needed for functional correctness
|
||||
for(int tiles = 0 ; tiles < n_tiles ; tiles++) //What's the HW implication of this?? A counter implementation?
|
||||
{
|
||||
for (int i = 0; i < tc_size; i++) { //ROW-1
|
||||
for (int j = 0; j < tc_size; j++) { //COL-2
|
||||
int sum = 0;
|
||||
|
@ -1579,12 +1570,11 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
|
|||
}
|
||||
scratchpad[accu_offset + thread_offset +(i * tc_size + j)] += sum; //[i * col2 + j] = sum
|
||||
DP(3, "Scratchpad Index: " << accu_offset + (i * tc_size + j) << " , Value=" << scratchpad[accu_offset + (i * tc_size + j)]);
|
||||
|
||||
}
|
||||
}
|
||||
*/
|
||||
//loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment
|
||||
//}
|
||||
loop_offset += tc_size*tc_size; //Move to the next tiled matmul fragment
|
||||
}
|
||||
*/
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue