Added matrix multiplication simx updates

This commit is contained in:
Varun Saxena 2023-02-02 16:26:57 -05:00
parent 88ed687557
commit 86e4ba9e4f
20 changed files with 60306 additions and 32 deletions

View file

@ -198,6 +198,26 @@ inline void vx_fence() {
asm volatile ("fence iorw, iorw");
}
// Matrix-load (ML) tensor-core intrinsic: emits an S-type instruction on the
// custom opcode 0x7b with funct3=0.  `addr` is the base address of the source
// matrix in memory; `dest` is packed into the S-type immediate field and
// selects the destination tile register (callers pass 0 for matrix A and
// 1 for matrix B).
// NOTE(review): `dest` must fit the S-type immediate encoding — confirm range.
inline void ml(unsigned dest, unsigned addr) {
asm volatile (".insn s 0x7b, 0, x0, %0(%1)" :: "i"(dest), "r"(addr));
}
// Matrix-store (MS) tensor-core intrinsic: emits an S-type instruction on the
// custom opcode 0x7b with funct3=1.  Writes the accumulator tile (matrix C)
// back to memory starting at `addr`; the immediate offset is fixed at 0.
inline void ms(unsigned addr) {
asm volatile (".insn s 0x7b, 1, x0, 0(%0)" :: "r"(addr));
}
// Matrix-multiply (MM) tensor-core intrinsic: emits an S-type instruction on
// the custom opcode 0x7b with funct3=2.  Takes no register operands (x0/0 are
// placeholders); multiplies the previously loaded A and B tiles into C.
inline void mm() {
asm volatile (".insn s 0x7b, 2, x0, 0(x0)");
}
//inline void vx_prefetch(unsigned addr) {
// asm volatile (".insn s 0x6b, 5, x0, 0(%0)" :: "r"(addr) );
//}
//inline void vx_barrier(unsigned barrier_id, unsigned num_warps) {
// asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barrier_id), "r"(num_warps));
//}
#define __if(b) vx_split(b); \
if (b)
@ -211,4 +231,4 @@ inline void vx_fence() {
}
#endif
#endif
#endif

View file

@ -181,11 +181,17 @@ void Core::cout_flush() {
}
void Core::tick() {
//std::cout << "coreid=" << id_ << " Start tick. Before commit" << std::endl;
this->commit();
//std::cout << "coreid=" << id_ << " After Commit. Before execute" << std::endl;
this->execute();
//std::cout << "coreid=" << id_ << " After execute. Before decode" << std::endl;
this->decode();
//std::cout << "coreid=" << id_ << " After decode. Before fetch" << std::endl;
this->fetch();
//std::cout << "coreid=" << id_ << " After fetch. Before schedule" << std::endl;
this->schedule();
//std::cout << "coreid=" << id_ << " After schedule" << std::endl;
// update perf counter
perf_stats_.mem_latency += perf_mem_pending_reads_;

View file

@ -1,7 +1,7 @@
#pragma once
#ifndef DEBUG_LEVEL
#define DEBUG_LEVEL 3
#define DEBUG_LEVEL 4
#endif
#define DEBUG_HEADER << "DEBUG "
@ -10,7 +10,7 @@
#define TRACE_HEADER << "TRACE "
//#define TRACE_HEADER << "DEBUG " << __FILE__ << ':' << std::dec << __LINE__ << ": "
#ifndef NDEBUG
#ifdef NDEBUG
#include <iostream>
#include <iomanip>

View file

@ -42,6 +42,7 @@ static const std::unordered_map<Opcode, struct InstTableEntry_t> sc_instTable =
{Opcode::VSET, {false, InstType::V_TYPE}},
{Opcode::GPGPU, {false, InstType::R_TYPE}},
{Opcode::GPU, {false, InstType::R4_TYPE}},
{Opcode::TCU, {false, InstType::S_TYPE}},
{Opcode::R_INST_W, {false, InstType::R_TYPE}},
{Opcode::I_INST_W, {false, InstType::I_TYPE}},
};
@ -368,6 +369,14 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case Opcode::TCU:
switch(func3){
case 0: return "ML";
case 1: return "MS";
case 2: return "MM";
default:
std::abort();
}
default:
std::abort();
}
@ -430,7 +439,7 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
auto op_it = sc_instTable.find(op);
if (op_it == sc_instTable.end()) {
std::cout << std::hex << "Error: invalid opcode: 0x" << op << std::endl;
std::cout << std::hex << "Error: asdada invalid opcode: 0x" << op << std::endl;
return nullptr;
}
@ -546,8 +555,10 @@ std::shared_ptr<Instr> Decoder::decode(uint32_t code) const {
}
instr->setFunc3(func3);
auto imm = (func7 << width_reg) | rd;
instr->setImm(sext(imm, width_i_imm));
} break;
instr->setImm(sext(imm, width_i_imm));
if (op == Opcode::TCU)
std::cout << "TCUDEBUG: immediate val: " << imm << ", address in reg# " << rs1 << ", zero: " << rs2 << std::endl;
} break;
case InstType::B_TYPE: {
instr->setSrcReg(rs1, RegType::Integer);

View file

@ -124,6 +124,7 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
std::vector<reg_data_t[3]> rsdata(num_threads);
std::vector<reg_data_t> rddata(num_threads);
std::vector<reg_data_t[SIZE_SQ]> rddata_arr(num_threads);
auto num_rsrcs = instr.getNRSrc();
if (num_rsrcs) {
@ -1451,6 +1452,141 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
std::abort();
}
} break;
// Tensor-core (TCU) instruction emulation, dispatched on funct3:
//   0 = ML (matrix load), 1 = MS (matrix store), 2 = MM (matrix multiply).
// Tiles are SIZE x SIZE words, laid out row-major in memory with a 4-byte
// element stride.
case TCU: {
switch (func3) {
case 0: { //Matrix Load
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::LOAD;
trace->used_iregs.set(rsrc0);
trace->used_iregs.set(rsrc1);
// (2 & 0x3) == 2, so mem_bytes is always 4 — one 32-bit word per element.
uint32_t mem_bytes = 1 << (2 & 0x3);
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
// Base address of the tile comes from the first source register.
uint64_t mem_addr = rsdata[t][0].i ;
uint64_t mem_addr_arr[SIZE][SIZE];
uint64_t base_addr = rsdata[t][0].i ;
//get the memory addresses
// Row-major: element (i,j) lives at base + (SIZE*i + j) * 4 bytes.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
mem_addr_arr[i][j] = base_addr + ((SIZE*i) + j)*4;
}
}
// mem_data stays 0: the scalar read below is commented out, so the
// scalar rddata[t] written later is always sext(0).
uint64_t mem_data = 0;
uint64_t mem_data_arr[SIZE][SIZE];
//core_->dcache_read(&mem_data, mem_addr, mem_bytes);
//load memory addresses
// Read every tile element through the data cache.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
uint64_t* temp_ref = &mem_data_arr[i][j];
//core_->dcache_read(mem_data_arr[i][j], mem_addr_arr[i][j], mem_bytes);
core_->dcache_read(temp_ref, mem_addr_arr[i][j], mem_bytes);
DP(4, "TCU LOAD MEM: ADDRESS=0x" << std::hex << mem_addr_arr[i][j] << ", DATA=0x" << mem_data_arr[i][j]);
}
}
// NOTE(review): only the base address is recorded in the trace, not the
// SIZE*SIZE individual element accesses — confirm this is intended for
// the timing model.
trace->mem_addrs.at(t).push_back({mem_addr, mem_bytes});
DP(4, "TCU LOAD MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
//load 32 bit data into rdata. Now what?
// RV32I: LW
rddata[t].i = sext((Word)mem_data, 32);
//put into rddata_arr[]
// Flatten the tile into rddata_arr; the writeback stage later copies it
// into tcore_ireg_a or tcore_ireg_b based on the immediate.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
rddata_arr[t][SIZE*i + j].i = sext((Word)mem_data_arr[i][j], 32);
}
}
}
rd_write = true;
} break;
case 1: { //Matrix Store
trace->exe_type = ExeType::LSU;
trace->lsu.type = LsuType::STORE;
trace->used_iregs.set(rsrc0);
// NOTE(review): stores only consume the base-address register — confirm
// marking rsrc1 as used is intentional.
trace->used_iregs.set(rsrc1);
DP(4, "TCU STORE MEM: ADDRESS=0x");// << std::hex << mem_addr << ", DATA=0x" << mem_data);
// Always 4 bytes per element (see load case).
uint32_t mem_bytes = 1 << (2 & 0x3);
uint64_t mask = ((uint64_t(1) << (8 * mem_bytes))-1);
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
uint64_t mem_addr = rsdata[t][0].i;
uint64_t base_addr = rsdata[t][0].i;
uint64_t mem_data = tcore_ireg_c[t][0];
uint64_t mem_addr_arr[SIZE][SIZE];
uint64_t mem_data_arr[SIZE][SIZE];
//memory addr array
// Same row-major addressing as the load case.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
mem_addr_arr[i][j] = base_addr + ((SIZE*i) + j)*4;
}
}
//data array from tcore reg c
// Source data is the accumulator tile C produced by the MM case.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
mem_data_arr[i][j] = tcore_ireg_c.at(t)[i*SIZE + j];
//tcore_ireg_a.at(t)[i] = rddata_arr[t].i;
}
}
// Truncate the scalar sample to the store width (masks mem_data only;
// mem_data_arr elements are written unmasked below).
if (mem_bytes < 8) {
mem_data &= mask;
}
trace->mem_addrs.at(t).push_back({mem_addr, mem_bytes});
DP(4, "STORE MEM: ADDRESS=0x" << std::hex << mem_addr << ", DATA=0x" << mem_data);
//core_->dcache_write(&mem_data, mem_addr, mem_bytes);
// Write every tile element through the data cache.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
uint64_t* temp_ref = &mem_data_arr[i][j];
core_->dcache_write(temp_ref, mem_addr_arr[i][j], mem_bytes);
DP(4, "TCU STORE MEM: ADDRESS=0x" << std::hex << mem_addr_arr[i][j] << ", DATA=0x" << mem_data_arr[i][j]);
}
}
}
} break;
case 2: { //Matrix Multiply
DP(4, "TCU MULTIPLY MAT");// << std::hex << mem_addr << ", DATA=0x" << mem_data);
trace->exe_type = ExeType::ALU;
trace->alu.type = AluType::ARITH;
trace->used_tcore_iregs_a.set(rsrc0);
// NOTE(review): B's mask is also set from rsrc0 — looks like it should
// be rsrc1; confirm against the scoreboard's expectations.
trace->used_tcore_iregs_b.set(rsrc0);
for (uint32_t t = 0; t < num_threads; ++t) {
if (!tmask_.test(t))
continue;
//tcore_ireg_c.at(t)[0] = tcore_ireg_a.at(t)[0] * tcore_ireg_b.at(t)[0] + tcore_ireg_a.at(t)[1] * tcore_ireg_b.at(t)[2];
//tcore_ireg_c.at(t)[1] = tcore_ireg_a.at(t)[0] * tcore_ireg_b.at(t)[1] + tcore_ireg_a.at(t)[1] * tcore_ireg_b.at(t)[3];
//tcore_ireg_c.at(t)[2] = tcore_ireg_a.at(t)[2] * tcore_ireg_b.at(t)[0] + tcore_ireg_a.at(t)[3] * tcore_ireg_b.at(t)[2];
//tcore_ireg_c.at(t)[3] = tcore_ireg_a.at(t)[2] * tcore_ireg_b.at(t)[1] + tcore_ireg_a.at(t)[3] * tcore_ireg_b.at(t)[3];
// Textbook O(SIZE^3) matmul: C = A x B over the flattened tiles.
// NOTE(review): `int sum` can overflow for large 32-bit operands
// (signed overflow is UB) — consider a wider accumulator.
for (int i = 0; i < SIZE; i++) { //ROW-1
for (int j = 0; j < SIZE; j++) { //COL-2
int sum = 0;
for (int k = 0; k < SIZE; k++){ //COL-1
sum = sum + tcore_ireg_a.at(t)[i * SIZE + k] * tcore_ireg_b.at(t)[k * SIZE + j]; //sum = [i * col1 + k] * [k * col2 + j]
}
tcore_ireg_c.at(t)[i * SIZE + j] = sum; //[i * col2 + j] = sum
}
}
// Mark every element of the C tile as written for the scoreboard.
for (int i = 0; i < SIZE_SQ; i++){
trace->used_tcore_iregs_c[i] = 1;
std::cout << "TCU MM: Multiplication result: " << std::hex << tcore_ireg_c.at(t)[i] << std::endl;
}
}
rd_write = true;
}break;
default:
std::abort();
}
} break;
case VSET: {
uint32_t VLEN = core_->arch().vsize() * 8;
uint32_t VLMAX = (instr.getVlmul() * VLEN) / instr.getVsew();
@ -2307,40 +2443,81 @@ void Warp::execute(const Instr &instr, pipeline_trace_t *trace) {
trace->wb = true;
DPH(2, "Dest Reg: ");
auto type = instr.getRDType();
switch (type) {
case RegType::Integer:
if (rdest) {
DPN(2, type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
ireg_file_.at(t)[rdest] = rddata[t].i;
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_iregs[rdest] = 1;
}
break;
case RegType::Float:
DPN(2, type << std::dec << rdest << "={");
if(opcode == Opcode::TCU){ //tensor core
//iterate over threads
//put in tensor core reg.
std::cout << "TCU if condition" << std::endl;
DPN(2, type << std::dec << immsrc << "={"); //FIX
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
freg_file_.at(t)[rdest] = rddata[t].f;
DPN(2, "0x" << std::hex << rddata[t].f);
//check immediate value
if(immsrc == 0){ // 0 => A; Load A
//iterate over all regs in A
for (int i = 0; i < SIZE_SQ; i++){
tcore_ireg_a.at(t)[i] = rddata_arr[t][i].i;
trace->used_tcore_iregs_a[i] = 1;
}
}
else if(immsrc == 1){ // 0 => B; Load B
//iterate over all regs in B
for (int i = 0; i < SIZE_SQ; i++){
tcore_ireg_b.at(t)[i] = rddata_arr[t][i].i;
trace->used_tcore_iregs_b[i] = 1;
}
}
/*
else if(){ // Mul A x B
for (int i = 0; i < SIZE_SQ; i++){
tcore_ireg_c.at(t)[i] = rddata[t].i;
trace->used_tcore_iregs_b[i] = 1;
}
}
*/
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_fregs[rdest] = 1;
break;
default:
std::abort();
break;
}
else{
switch (type) {
case RegType::Integer:
if (rdest) {
DPN(2, type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
ireg_file_.at(t)[rdest] = rddata[t].i;
DPN(2, "0x" << std::hex << rddata[t].i);
}
DPN(2, "}" << std::endl);
trace->used_iregs[rdest] = 1;
}
break;
case RegType::Float:
DPN(2, type << std::dec << rdest << "={");
for (uint32_t t = 0; t < num_threads; ++t) {
if (t) DPN(2, ", ");
if (!tmask_.test(t)) {
DPN(2, "-");
continue;
}
freg_file_.at(t)[rdest] = rddata[t].f;
DPN(2, "0x" << std::hex << rddata[t].f);
}
DPN(2, "}" << std::endl);
trace->used_fregs[rdest] = 1;
break;
default:
std::abort();
break;
}
}
}

View file

@ -32,6 +32,8 @@ enum Opcode {
// GPGPU Extension
GPGPU = 0x6b,
GPU = 0x5b,
// tensorcore Extension
TCU = 0x7b,
// RV64 Standard Extensions
R_INST_W = 0x3b,
I_INST_W = 0x1b,

View file

@ -76,6 +76,7 @@ int main(int argc, char **argv) {
// attach memory module
processor.attach_ram(&ram);
//std::cout << "Test msg" << " Num of cores: " << num_cores << ". Num of warps: " << num_warps << ". Num of threads: " << num_threads << std::endl;
// run simulation
exitcode = processor.run();

View file

@ -32,6 +32,9 @@ struct pipeline_trace_t {
RegMask used_iregs;
RegMask used_fregs;
RegMask used_vregs;
RegMask used_tcore_iregs_a;
RegMask used_tcore_iregs_b;
RegMask used_tcore_iregs_c;
//-
ExeType exe_type;

View file

@ -16,6 +16,9 @@ private:
std::vector<RegMask> in_use_iregs_;
std::vector<RegMask> in_use_fregs_;
std::vector<RegMask> in_use_vregs_;
std::vector<RegMask> in_use_tcore_iregs_a;
std::vector<RegMask> in_use_tcore_iregs_b;
std::vector<RegMask> in_use_tcore_iregs_c;
std::unordered_map<uint32_t, uint64_t> owners_;
public:
@ -23,6 +26,9 @@ public:
: in_use_iregs_(arch.num_warps())
, in_use_fregs_(arch.num_warps())
, in_use_vregs_(arch.num_warps())
, in_use_tcore_iregs_a(arch.num_warps())
, in_use_tcore_iregs_b(arch.num_warps())
, in_use_tcore_iregs_c(arch.num_warps())
{
this->clear();
}
@ -32,6 +38,9 @@ public:
in_use_iregs_.at(i).reset();
in_use_fregs_.at(i).reset();
in_use_vregs_.at(i).reset();
in_use_tcore_iregs_a.at(i).reset();
in_use_tcore_iregs_b.at(i).reset();
in_use_tcore_iregs_c.at(i).reset();
}
owners_.clear();
}

View file

@ -16,6 +16,9 @@ Warp::Warp(Core *core, uint32_t id)
, ireg_file_(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, freg_file_(core->arch().num_threads(), std::vector<FWord>(core->arch().num_regs()))
, vreg_file_(core->arch().num_threads(), std::vector<Byte>(core->arch().vsize()))
, tcore_ireg_a(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, tcore_ireg_b(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
, tcore_ireg_c(core->arch().num_threads(), std::vector<Word>(core->arch().num_regs()))
{
this->clear();
}
@ -34,6 +37,30 @@ void Warp::clear() {
for (auto& reg : vreg_file_.at(i)) {
reg = 0;
}
/*
for (auto& reg : treg_file_.at(i)) {
reg = 0;
}
*/
for (auto& reg : tcore_ireg_a.at(i)) {
reg = 0;
}
for (auto& reg : tcore_ireg_b.at(i)) {
reg = 0;
}
for (auto& reg : tcore_ireg_c.at(i)) {
reg = 0;
}
//clear the tensorcore regs
//for (int j = 0; j < SIZE_SQ; j++){
// //for (int j = 0; j < 2; j++){
// tcore_ireg_a[j] = 0;
// tcore_ireg_b[j] = 0;
// tcore_ireg_c[j] = 0;
// //}
//}
}
}

View file

@ -5,6 +5,10 @@
#include <stack>
#include "types.h"
#define SIZE 2
#define SIZE_SQ SIZE*SIZE
namespace vortex {
class Core;
@ -105,6 +109,12 @@ private:
std::vector<std::vector<Word>> ireg_file_;
std::vector<std::vector<FWord>> freg_file_;
std::vector<std::vector<Byte>> vreg_file_;
//tensorcore registers
std::vector<std::vector<Word>> tcore_ireg_a;
std::vector<std::vector<Word>> tcore_ireg_b;
std::vector<std::vector<Word>> tcore_ireg_c; //accumulator
std::stack<DomStackEntry> dom_stack_;
struct vtype vtype_;

View file

@ -0,0 +1,55 @@
# Build script for the tensor-core matmul kernel (runs on the Vortex RISC-V GPGPU).
# Target word size; 32-bit RISC-V by default.
XLEN ?= 32
# Select the matching GNU toolchain install for the chosen XLEN.
ifeq ($(XLEN),32)
RISCV_TOOLCHAIN_PATH = /opt/riscv-gnu-toolchain
else
RISCV_TOOLCHAIN_PATH = /opt/riscv64-gnu-toolchain
endif
RISCV_PREFIX ?= riscv$(XLEN)-unknown-elf-
# Location of the Vortex runtime (headers, linker script, libvortexrt.a).
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
# Cross tools: compiler, archiver, disassembler, object copier.
CC = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc
AR = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)gcc-ar
DP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objdump
CP = $(RISCV_TOOLCHAIN_PATH)/bin/$(RISCV_PREFIX)objcopy
SIM_DIR=../../../sim
# ISA/ABI selection per word size (F extension on rv32, FD on rv64).
ifeq ($(XLEN),32)
CFLAGS += -march=rv32imf -mabi=ilp32f
else
CFLAGS += -march=rv64imfd -mabi=lp64d
endif
CFLAGS += -O3 -Wstack-usage=1024 -mcmodel=medany -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw -I./
LDFLAGS += -lm -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link$(XLEN).ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a
PROJECT = matmul
SRCS = vx_mat_mulint32.s main.cpp
all: $(PROJECT).elf $(PROJECT).bin $(PROJECT).dump
# Disassembly listing for debugging the custom TCU instructions.
$(PROJECT).dump: $(PROJECT).elf
	$(DP) -D $(PROJECT).elf > $(PROJECT).dump
# Raw binary image loaded by the simulators.
$(PROJECT).bin: $(PROJECT).elf
	$(CP) -O binary $(PROJECT).elf $(PROJECT).bin
$(PROJECT).elf: $(SRCS)
	$(CC) $(CFLAGS) $(SRCS) $(LDFLAGS) -o $(PROJECT).elf
# Run under the RTL simulator.
run-rtlsim: $(PROJECT).bin
	$(SIM_DIR)/rtlsim/rtlsim $(PROJECT).bin
# Run under the cycle-level simulator (1 core).
run-simx: $(PROJECT).bin
	$(SIM_DIR)/simx/simx -c 1 -i $(PROJECT).bin
# NOTE(review): .depend is generated but never read — a `-include .depend`
# line is missing, so header-dependency tracking has no effect.
.depend: $(SRCS)
	$(CC) $(CFLAGS) -MM $^ > .depend;
clean:
	rm -rf *.elf *.bin *.dump .depend

View file

@ -0,0 +1,89 @@
#include <stdio.h>
#include <vx_print.h>
#include <vx_intrinsics.h>
#include "vx_mat_mulint32.h"
// SIZE x SIZE (2x2) input matrix A for the tensor-core matmul test.
int A[SIZE][SIZE] =
{
{3,5},
{7,9}
};
// Input matrix B.
int B[SIZE][SIZE] =
{
{11,13},
{15,17}
};
// Expected result of A x B, computed by hand:
//   [3*11+5*15, 3*13+5*17]   [108, 124]
//   [7*11+9*15, 7*13+9*17] = [212, 244]
int Ans[SIZE][SIZE] =
{
{108,124},
{212,244}
};
// Tensor-core matmul smoke test: loads A and B into the TCU tile registers
// via the ml() intrinsic, multiplies with mm(), stores the result to C with
// ms(), then compares C against the precomputed Ans matrix.
// Returns 0 on match, 1 on mismatch.
int main() {
int errors = 0;
vx_printf("KDEBUG Initializing output matrix\n");
// Output matrix, zero-initialized; filled by the ms() store below.
int C[SIZE][SIZE] =
{
{0,0},
{0,0}
};
// NOTE(review): casting a pointer to uint32_t truncates on a 64-bit build
// (XLEN=64) — confirm this test is rv32-only or widen the intrinsic args.
uint32_t a_addr = (uint32_t)A ;
uint32_t b_addr = (uint32_t)B ;
uint32_t c_addr = (uint32_t)C;
vx_printf("KDEBUG Done Initializing output matrix\n");
vx_printf("KDEBUG Starting Matmul\n");
//matmul on vortex
// Reference software matmul, kept for comparison/debugging:
// for(int i = 0; i < SIZE; i++){
// for(int j = 0; j < SIZE; j++){
// for(int k = 0; k < SIZE; k++)
// {
// vx_printf("KDEBUG Just before multiply add\n");
// C[i][j] += A[i][k] * B[k][j];
// vx_printf("KDEBUG Just after multiply add\n");
// }
// }
// }
// vx_printf("KDEBUG TEST matrix address A = %u, B = %u, C = %u\n", a_addr, b_addr, c_addr);
// Hardware path: immediate 0 loads tile A, 1 loads tile B (see ml()).
ml(0,a_addr);
ml(1,b_addr);
mm();
ms(c_addr);
// vx_printf("KDEBUG Finished Matmul\n");
// Note: prints only the [0][0] elements, not the full matrices.
vx_printf("KDEBUG Result of mul(%dx%d) = %d\n", A[0][0], B[0][0], C[0][0]);
//comparison
vx_printf("KDEBUG Starting Comparison\n");
bool flag = true;
// Element-wise compare of C against the expected answer.
// NOTE(review): `break` exits only the inner loop; the outer loop keeps
// scanning after a mismatch. Harmless (flag stays false) but wasteful.
for(int i = 0; i < SIZE; i++){
for(int j = 0; j < SIZE; j++){
if(C[i][j] != Ans[i][j]){
flag = false;
break;
}
}
}
vx_printf("KDEBUG Finished Comparison\n");
if (flag) {
vx_printf("Passed!\n");
} else {
// NOTE(review): missing trailing '\n' — inconsistent with "Passed!\n".
vx_printf("Failed!");
errors = 1;
}
return errors;
}

Binary file not shown.

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -0,0 +1,278 @@
ramulator.active_cycles_0 1372 # Total active cycles for level _0
ramulator.busy_cycles_0 1372 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0
ramulator.serving_requests_0 1372 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0
ramulator.average_serving_requests_0 0.006794 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0
ramulator.active_cycles_0_0 1372 # Total active cycles for level _0_0
ramulator.busy_cycles_0_0 7924 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0
ramulator.serving_requests_0_0 1372 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0
ramulator.average_serving_requests_0_0 0.006794 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0
ramulator.active_cycles_0_0_0 952 # Total active cycles for level _0_0_0
ramulator.busy_cycles_0_0_0 952 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0
ramulator.serving_requests_0_0_0 952 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0
ramulator.average_serving_requests_0_0_0 0.004714 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0
ramulator.active_cycles_0_0_0_0 952 # Total active cycles for level _0_0_0_0
ramulator.busy_cycles_0_0_0_0 952 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_0
ramulator.serving_requests_0_0_0_0 952 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_0
ramulator.average_serving_requests_0_0_0_0 0.004714 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_0
ramulator.active_cycles_0_0_0_1 0 # Total active cycles for level _0_0_0_1
ramulator.busy_cycles_0_0_0_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_1
ramulator.serving_requests_0_0_0_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_1
ramulator.average_serving_requests_0_0_0_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_1
ramulator.active_cycles_0_0_0_2 0 # Total active cycles for level _0_0_0_2
ramulator.busy_cycles_0_0_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_2
ramulator.serving_requests_0_0_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_2
ramulator.average_serving_requests_0_0_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_2
ramulator.active_cycles_0_0_0_3 0 # Total active cycles for level _0_0_0_3
ramulator.busy_cycles_0_0_0_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_0_3
ramulator.serving_requests_0_0_0_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_3
ramulator.average_serving_requests_0_0_0_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_0_3
ramulator.active_cycles_0_0_1 420 # Total active cycles for level _0_0_1
ramulator.busy_cycles_0_0_1 420 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1
ramulator.serving_requests_0_0_1 420 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1
ramulator.average_serving_requests_0_0_1 0.002080 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1
ramulator.active_cycles_0_0_1_0 420 # Total active cycles for level _0_0_1_0
ramulator.busy_cycles_0_0_1_0 420 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_0
ramulator.serving_requests_0_0_1_0 420 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_0
ramulator.average_serving_requests_0_0_1_0 0.002080 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_0
ramulator.active_cycles_0_0_1_1 0 # Total active cycles for level _0_0_1_1
ramulator.busy_cycles_0_0_1_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_1
ramulator.serving_requests_0_0_1_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_1
ramulator.average_serving_requests_0_0_1_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_1
ramulator.active_cycles_0_0_1_2 0 # Total active cycles for level _0_0_1_2
ramulator.busy_cycles_0_0_1_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_2
ramulator.serving_requests_0_0_1_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_2
ramulator.average_serving_requests_0_0_1_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_2
ramulator.active_cycles_0_0_1_3 0 # Total active cycles for level _0_0_1_3
ramulator.busy_cycles_0_0_1_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_1_3
ramulator.serving_requests_0_0_1_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_3
ramulator.average_serving_requests_0_0_1_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_1_3
ramulator.active_cycles_0_0_2 0 # Total active cycles for level _0_0_2
ramulator.busy_cycles_0_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2
ramulator.serving_requests_0_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2
ramulator.average_serving_requests_0_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2
ramulator.active_cycles_0_0_2_0 0 # Total active cycles for level _0_0_2_0
ramulator.busy_cycles_0_0_2_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_0
ramulator.serving_requests_0_0_2_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_0
ramulator.average_serving_requests_0_0_2_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_0
ramulator.active_cycles_0_0_2_1 0 # Total active cycles for level _0_0_2_1
ramulator.busy_cycles_0_0_2_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_1
ramulator.serving_requests_0_0_2_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_1
ramulator.average_serving_requests_0_0_2_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_1
ramulator.active_cycles_0_0_2_2 0 # Total active cycles for level _0_0_2_2
ramulator.busy_cycles_0_0_2_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_2
ramulator.serving_requests_0_0_2_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_2
ramulator.average_serving_requests_0_0_2_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_2
ramulator.active_cycles_0_0_2_3 0 # Total active cycles for level _0_0_2_3
ramulator.busy_cycles_0_0_2_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_2_3
ramulator.serving_requests_0_0_2_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_3
ramulator.average_serving_requests_0_0_2_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_2_3
ramulator.active_cycles_0_0_3 0 # Total active cycles for level _0_0_3
ramulator.busy_cycles_0_0_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3
ramulator.serving_requests_0_0_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3
ramulator.average_serving_requests_0_0_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3
ramulator.active_cycles_0_0_3_0 0 # Total active cycles for level _0_0_3_0
ramulator.busy_cycles_0_0_3_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_0
ramulator.serving_requests_0_0_3_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_0
ramulator.average_serving_requests_0_0_3_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_0
ramulator.active_cycles_0_0_3_1 0 # Total active cycles for level _0_0_3_1
ramulator.busy_cycles_0_0_3_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_1
ramulator.serving_requests_0_0_3_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_1
ramulator.average_serving_requests_0_0_3_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_1
ramulator.active_cycles_0_0_3_2 0 # Total active cycles for level _0_0_3_2
ramulator.busy_cycles_0_0_3_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_2
ramulator.serving_requests_0_0_3_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_2
ramulator.average_serving_requests_0_0_3_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_2
ramulator.active_cycles_0_0_3_3 0 # Total active cycles for level _0_0_3_3
ramulator.busy_cycles_0_0_3_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _0_0_3_3
ramulator.serving_requests_0_0_3_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_3
ramulator.average_serving_requests_0_0_3_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _0_0_3_3
ramulator.read_transaction_bytes_0 3008 # The total byte of read transaction per channel
ramulator.write_transaction_bytes_0 2304 # The total byte of write transaction per channel
ramulator.row_hits_channel_0_core 64 # Number of row hits per channel per core
ramulator.row_misses_channel_0_core 11 # Number of row misses per channel per core
ramulator.row_conflicts_channel_0_core 8 # Number of row conflicts per channel per core
ramulator.read_row_hits_channel_0_core 32 # Number of row hits for read requests per channel per core
[0] 32.0 #
ramulator.read_row_misses_channel_0_core 11 # Number of row misses for read requests per channel per core
[0] 11.0 #
ramulator.read_row_conflicts_channel_0_core 4 # Number of row conflicts for read requests per channel per core
[0] 4.0 #
ramulator.write_row_hits_channel_0_core 32 # Number of row hits for write requests per channel per core
[0] 32.0 #
ramulator.write_row_misses_channel_0_core 0 # Number of row misses for write requests per channel per core
[0] 0.0 #
ramulator.write_row_conflicts_channel_0_core 4 # Number of row conflicts for write requests per channel per core
[0] 4.0 #
ramulator.useless_activates_0_core 0 # Number of useless activations. E.g, ACT -> PRE w/o RD or WR
ramulator.read_latency_avg_0 28.234043 # The average memory latency cycles (in memory time domain) per request for all read requests in this channel
ramulator.read_latency_sum_0 1327 # The memory latency cycles (in memory time domain) sum for all read requests in this channel
ramulator.req_queue_length_avg_0 0.006571 # Average of read and write queue length per memory cycle per channel.
ramulator.req_queue_length_sum_0 1327 # Sum of read and write queue length per memory cycle per channel.
ramulator.read_req_queue_length_avg_0 0.005382 # Read queue length average per memory cycle per channel.
ramulator.read_req_queue_length_sum_0 1087 # Read queue length sum per memory cycle per channel.
ramulator.write_req_queue_length_avg_0 0.001188 # Write queue length average per memory cycle per channel.
ramulator.write_req_queue_length_sum_0 240 # Write queue length sum per memory cycle per channel.
ramulator.record_read_hits 0.0 # record read hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_misses 0.0 # record_read_miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_conflicts 0.0 # record read conflict count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_hits 0.0 # record write hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_misses 0.0 # record write miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_conflicts 0.0 # record write conflict for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.active_cycles_1 1442 # Total active cycles for level _1
ramulator.busy_cycles_1 1442 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1
ramulator.serving_requests_1 1464 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1
ramulator.average_serving_requests_1 0.007249 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1
ramulator.active_cycles_1_0 1442 # Total active cycles for level _1_0
ramulator.busy_cycles_1_0 7994 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0
ramulator.serving_requests_1_0 1464 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0
ramulator.average_serving_requests_1_0 0.007249 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0
ramulator.active_cycles_1_0_0 790 # Total active cycles for level _1_0_0
ramulator.busy_cycles_1_0_0 790 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0
ramulator.serving_requests_1_0_0 812 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0
ramulator.average_serving_requests_1_0_0 0.004021 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0
ramulator.active_cycles_1_0_0_0 790 # Total active cycles for level _1_0_0_0
ramulator.busy_cycles_1_0_0_0 790 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_0
ramulator.serving_requests_1_0_0_0 812 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_0
ramulator.average_serving_requests_1_0_0_0 0.004021 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_0
ramulator.active_cycles_1_0_0_1 0 # Total active cycles for level _1_0_0_1
ramulator.busy_cycles_1_0_0_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_1
ramulator.serving_requests_1_0_0_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_1
ramulator.average_serving_requests_1_0_0_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_1
ramulator.active_cycles_1_0_0_2 0 # Total active cycles for level _1_0_0_2
ramulator.busy_cycles_1_0_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_2
ramulator.serving_requests_1_0_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_2
ramulator.average_serving_requests_1_0_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_2
ramulator.active_cycles_1_0_0_3 0 # Total active cycles for level _1_0_0_3
ramulator.busy_cycles_1_0_0_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_0_3
ramulator.serving_requests_1_0_0_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_3
ramulator.average_serving_requests_1_0_0_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_0_3
ramulator.active_cycles_1_0_1 300 # Total active cycles for level _1_0_1
ramulator.busy_cycles_1_0_1 300 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1
ramulator.serving_requests_1_0_1 300 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1
ramulator.average_serving_requests_1_0_1 0.001486 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1
ramulator.active_cycles_1_0_1_0 300 # Total active cycles for level _1_0_1_0
ramulator.busy_cycles_1_0_1_0 300 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_0
ramulator.serving_requests_1_0_1_0 300 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_0
ramulator.average_serving_requests_1_0_1_0 0.001486 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_0
ramulator.active_cycles_1_0_1_1 0 # Total active cycles for level _1_0_1_1
ramulator.busy_cycles_1_0_1_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_1
ramulator.serving_requests_1_0_1_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_1
ramulator.average_serving_requests_1_0_1_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_1
ramulator.active_cycles_1_0_1_2 0 # Total active cycles for level _1_0_1_2
ramulator.busy_cycles_1_0_1_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_2
ramulator.serving_requests_1_0_1_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_2
ramulator.average_serving_requests_1_0_1_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_2
ramulator.active_cycles_1_0_1_3 0 # Total active cycles for level _1_0_1_3
ramulator.busy_cycles_1_0_1_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_1_3
ramulator.serving_requests_1_0_1_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_3
ramulator.average_serving_requests_1_0_1_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_1_3
ramulator.active_cycles_1_0_2 0 # Total active cycles for level _1_0_2
ramulator.busy_cycles_1_0_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2
ramulator.serving_requests_1_0_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2
ramulator.average_serving_requests_1_0_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2
ramulator.active_cycles_1_0_2_0 0 # Total active cycles for level _1_0_2_0
ramulator.busy_cycles_1_0_2_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_0
ramulator.serving_requests_1_0_2_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_0
ramulator.average_serving_requests_1_0_2_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_0
ramulator.active_cycles_1_0_2_1 0 # Total active cycles for level _1_0_2_1
ramulator.busy_cycles_1_0_2_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_1
ramulator.serving_requests_1_0_2_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_1
ramulator.average_serving_requests_1_0_2_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_1
ramulator.active_cycles_1_0_2_2 0 # Total active cycles for level _1_0_2_2
ramulator.busy_cycles_1_0_2_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_2
ramulator.serving_requests_1_0_2_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_2
ramulator.average_serving_requests_1_0_2_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_2
ramulator.active_cycles_1_0_2_3 0 # Total active cycles for level _1_0_2_3
ramulator.busy_cycles_1_0_2_3 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_2_3
ramulator.serving_requests_1_0_2_3 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_3
ramulator.average_serving_requests_1_0_2_3 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_2_3
ramulator.active_cycles_1_0_3 352 # Total active cycles for level _1_0_3
ramulator.busy_cycles_1_0_3 352 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3
ramulator.serving_requests_1_0_3 352 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3
ramulator.average_serving_requests_1_0_3 0.001743 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3
ramulator.active_cycles_1_0_3_0 0 # Total active cycles for level _1_0_3_0
ramulator.busy_cycles_1_0_3_0 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_0
ramulator.serving_requests_1_0_3_0 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_0
ramulator.average_serving_requests_1_0_3_0 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_0
ramulator.active_cycles_1_0_3_1 0 # Total active cycles for level _1_0_3_1
ramulator.busy_cycles_1_0_3_1 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_1
ramulator.serving_requests_1_0_3_1 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_1
ramulator.average_serving_requests_1_0_3_1 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_1
ramulator.active_cycles_1_0_3_2 0 # Total active cycles for level _1_0_3_2
ramulator.busy_cycles_1_0_3_2 0 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_2
ramulator.serving_requests_1_0_3_2 0 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_2
ramulator.average_serving_requests_1_0_3_2 0.000000 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_2
ramulator.active_cycles_1_0_3_3 352 # Total active cycles for level _1_0_3_3
ramulator.busy_cycles_1_0_3_3 352 # (All-bank refresh only. busy cycles only include refresh time in rank level) The sum of cycles that the DRAM part is active or under refresh for level _1_0_3_3
ramulator.serving_requests_1_0_3_3 352 # The sum of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_3
ramulator.average_serving_requests_1_0_3_3 0.001743 # The average of read and write requests that are served in this DRAM element per memory cycle for level _1_0_3_3
ramulator.read_transaction_bytes_1 2688 # The total byte of read transaction per channel
ramulator.write_transaction_bytes_1 46528 # The total byte of write transaction per channel
ramulator.row_hits_channel_1_core 733 # Number of row hits per channel per core
ramulator.row_misses_channel_1_core 33 # Number of row misses per channel per core
ramulator.row_conflicts_channel_1_core 3 # Number of row conflicts per channel per core
ramulator.read_row_hits_channel_1_core 30 # Number of row hits for read requests per channel per core
[0] 30.0 #
ramulator.read_row_misses_channel_1_core 11 # Number of row misses for read requests per channel per core
[0] 11.0 #
ramulator.read_row_conflicts_channel_1_core 1 # Number of row conflicts for read requests per channel per core
[0] 1.0 #
ramulator.write_row_hits_channel_1_core 703 # Number of row hits for write requests per channel per core
[0] 703.0 #
ramulator.write_row_misses_channel_1_core 22 # Number of row misses for write requests per channel per core
[0] 22.0 #
ramulator.write_row_conflicts_channel_1_core 2 # Number of row conflicts for write requests per channel per core
[0] 2.0 #
ramulator.useless_activates_1_core 0 # Number of useless activations. E.g, ACT -> PRE w/o RD or WR
ramulator.read_latency_avg_1 32.261905 # The average memory latency cycles (in memory time domain) per request for all read requests in this channel
ramulator.read_latency_sum_1 1355 # The memory latency cycles (in memory time domain) sum for all read requests in this channel
ramulator.req_queue_length_avg_1 0.029056 # Average of read and write queue length per memory cycle per channel.
ramulator.req_queue_length_sum_1 5868 # Sum of read and write queue length per memory cycle per channel.
ramulator.read_req_queue_length_avg_1 0.005759 # Read queue length average per memory cycle per channel.
ramulator.read_req_queue_length_sum_1 1163 # Read queue length sum per memory cycle per channel.
ramulator.write_req_queue_length_avg_1 0.023298 # Write queue length average per memory cycle per channel.
ramulator.write_req_queue_length_sum_1 4705 # Write queue length sum per memory cycle per channel.
ramulator.record_read_hits 0.0 # record read hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_misses 0.0 # record_read_miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_read_conflicts 0.0 # record read conflict count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_hits 0.0 # record write hit count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_misses 0.0 # record write miss count for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_conflicts 0.0 # record write conflict for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.dram_capacity 8589934592 # Number of bytes in simulated DRAM
ramulator.dram_cycles 201952 # Number of DRAM cycles simulated
ramulator.incoming_requests 852 # Number of incoming requests to DRAM
ramulator.read_requests 89 # Number of incoming read requests to DRAM per core
[0] 89.0 #
ramulator.write_requests 763 # Number of incoming write requests to DRAM per core
[0] 763.0 #
ramulator.ramulator_active_cycles 2780 # The total number of cycles that the DRAM part is active (serving R/W)
ramulator.incoming_requests_per_channel 852.0 # Number of incoming requests to each DRAM channel
[0] 83.0 #
[1] 769.0 #
ramulator.incoming_read_reqs_per_channel 89.0 # Number of incoming read requests to each DRAM channel
[0] 47.0 #
[1] 42.0 #
ramulator.physical_page_replacement 0 # The number of times that physical page replacement happens.
ramulator.maximum_bandwidth 38400000000 # The theoretical maximum bandwidth (Bps)
ramulator.in_queue_req_num_sum 7195 # Sum of read/write queue length
ramulator.in_queue_read_req_num_sum 2250 # Sum of read queue length
ramulator.in_queue_write_req_num_sum 4945 # Sum of write queue length
ramulator.in_queue_req_num_avg 0.035627 # Average of read/write queue length per memory cycle
ramulator.in_queue_read_req_num_avg 0.011141 # Average of read queue length per memory cycle
ramulator.in_queue_write_req_num_avg 0.024486 # Average of write queue length per memory cycle
ramulator.record_read_requests 0.0 # record read requests for this core when it reaches request limit or to the end
[0] 0.0 #
ramulator.record_write_requests 0.0 # record write requests for this core when it reaches request limit or to the end
[0] 0.0 #

View file

@ -0,0 +1,16 @@
// vx_matmul.h — shared configuration for the matrix-multiplication demo.
// Defines the compile-time matrix dimension used by host and kernel code.
#ifndef VX_MATMUL_H
#define VX_MATMUL_H
#include <stdarg.h> // NOTE(review): nothing in this header uses stdarg — confirm before removing
#ifdef __cplusplus
extern "C" {
#endif
// Matrices are SIZE x SIZE (currently 2x2).
#define SIZE 2
#ifdef __cplusplus
}
#endif
#endif // VX_MATMUL_H

View file

@ -0,0 +1,53 @@
.type vx_mat_mulint32, @function
.global vx_mat_mulint32
# 32-bit integer matrix-multiply kernel (work in progress).
# void vx_mat_mulint32(int *C, const int *A, const int *B)
#
# a0 = C (destination), a1 = A, a2 = B
# The scalar/ML-instruction body below is still commented out; the routine
# currently returns immediately without touching memory.
# Non-vector instructions are indented.
vx_mat_mulint32:
#load from a1 to r1
#mla
#   #load from a1+1 to r2
#   mla s2, (a1)
#   #load from a1+2 to r3
#   mla s3, (a1)
#   #load from a1+3 to r4
#   lw s4, (a1)
#   #load from a2 to r5
#   lw s5, (a2)
#   #load from a2+1 to r6
#   lw s6, (a2)
#   #load from a2+2 to r7
#   lw s7, (a2)
#   #load from a2+3 to r8
#   lw s8, (a2)
#   #multiply and store in regs t1, t2, t3, t4
#   #store r9 in a0
#   sw t1, (a0)
#   #store r10 in a0+1
#   sw t2, (a0)
#   #store r11 in a0+2
#   sw t3, (a0)
#   #store r12 in a0+3
#   sw t4, (a0)
#return
	ret
#loop:
#  vlw.v v0, (a1)       # Get first vector
#    sub a0, a0, t0     # Decrement number done
#    slli t0, t0, 2     # Multiply number done by 4 bytes
#    add a1, a1, t0     # Bump pointer
#  vlw.v v1, (a2)       # Get second vector
#    add a2, a2, t0     # Bump pointer
#  vadd.vv v2, v0, v1   # Sum vectors
#  vsw.v v2, (a3)       # Store result
#    add a3, a3, t0     # Bump pointer
#    bnez a0, loop      # Loop back
#    ret                # Finished

View file

@ -0,0 +1,22 @@
.type vx_vec_vvaddint32, @function
.global vx_vec_vvaddint32
# vector-vector add routine of 32-bit integers
# void vvaddint32(size_t n, const int*x, const int*y, int*z)
# { for (size_t i=0; i<n; i++) { z[i]=x[i]+y[i]; } }
#
# a0 = n, a1 = x, a2 = y, a3 = z
# Non-vector instructions are indented.
#
# Strip-mined: vsetvli is re-executed every pass so t0 is reset to the
# element count for the current slice. (It must be inside the loop: the
# slli below turns t0 into a byte offset, so reusing it next iteration
# would corrupt both the remaining count and the pointer bumps.)
vx_vec_vvaddint32:
loop:
  vsetvli t0, a0, e32  # t0 = elements handled this pass (<= remaining n)
  vlw.v v0, (a1)       # Get first vector
    sub a0, a0, t0     # Decrement number remaining
    slli t0, t0, 2     # Element count -> byte offset (4 bytes each)
    add a1, a1, t0     # Bump x pointer
  vlw.v v1, (a2)       # Get second vector
    add a2, a2, t0     # Bump y pointer
  vadd.vv v2, v0, v1   # Sum vectors
  vsw.v v2, (a3)       # Store result
    add a3, a3, t0     # Bump z pointer
    bnez a0, loop      # Loop back while elements remain
    ret                # Finished