minor update

This commit is contained in:
Blaise Tine 2021-03-20 19:25:11 -04:00
parent 8c0c4e2b6e
commit 96a03a3edf
2 changed files with 36 additions and 473 deletions

View file

@ -1,432 +0,0 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2021-03-20 18:29:13.211392
// User-configuration marker: defines VX_USER_CONFIG exactly once unless the
// includer has already defined it. No other settings are introduced here.
#if !defined(VX_USER_CONFIG)
#define VX_USER_CONFIG
#endif
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2021-03-20 18:29:13.214396
// Translated from VX_config.vh:
#ifndef VX_CONFIG
#define VX_CONFIG
// ---------------------------------------------------------------------------
// Platform defaults. Every guarded macro below keeps a value that was defined
// earlier (e.g. on the compiler command line) and only supplies a default
// when none exists.
// ---------------------------------------------------------------------------

// Processor topology
#if !defined(NUM_CLUSTERS)
#define NUM_CLUSTERS 1
#endif
#if !defined(NUM_CORES)
#define NUM_CORES 1
#endif
#if !defined(NUM_WARPS)
#define NUM_WARPS 4
#endif
#if !defined(NUM_THREADS)
#define NUM_THREADS 4
#endif
#if !defined(NUM_BARRIERS)
#define NUM_BARRIERS 4
#endif

// Optional memory-hierarchy components (0 = off, 1 = on)
#if !defined(L2_ENABLE)
#define L2_ENABLE 0
#endif
#if !defined(L3_ENABLE)
#define L3_ENABLE 0
#endif
#if !defined(SM_ENABLE)
#define SM_ENABLE 1
#endif

// Block sizes; the L1 default is derived from NUM_THREADS
#if !defined(GLOBAL_BLOCK_SIZE)
#define GLOBAL_BLOCK_SIZE 64
#endif
#if !defined(L1_BLOCK_SIZE)
#define L1_BLOCK_SIZE (NUM_THREADS * 4)
#endif

// Address map
// NOTE(review): IO_BUS_BASE_ADDR, SHARED_MEM_BASE_ADDR (which aliases it) and
// FRAME_BUFFER_BASE_ADDR all default to 0xFF000000 - confirm the overlap is
// intended.
#if !defined(STARTUP_ADDR)
#define STARTUP_ADDR 0x80000000
#endif
#if !defined(IO_BUS_BASE_ADDR)
#define IO_BUS_BASE_ADDR 0xFF000000
#endif
#if !defined(SHARED_MEM_BASE_ADDR)
#define SHARED_MEM_BASE_ADDR IO_BUS_BASE_ADDR
#endif
#if !defined(SHARED_MEM_BASE_ADDR_ALIGN)
#define SHARED_MEM_BASE_ADDR_ALIGN 64
#endif
#if !defined(IO_BUS_ADDR_COUT)
#define IO_BUS_ADDR_COUT 0xFFFFFFFC
#endif

// Frame buffer geometry; FRAME_BUFFER_SIZE is the width/height product and is
// defined unconditionally (it has no override guard).
#if !defined(FRAME_BUFFER_BASE_ADDR)
#define FRAME_BUFFER_BASE_ADDR 0xFF000000
#endif
#if !defined(FRAME_BUFFER_WIDTH)
#define FRAME_BUFFER_WIDTH 1920
#endif
#if !defined(FRAME_BUFFER_HEIGHT)
#define FRAME_BUFFER_HEIGHT 1080
#endif
#define FRAME_BUFFER_SIZE (FRAME_BUFFER_WIDTH * FRAME_BUFFER_HEIGHT)

// ISA extensions: each EXT_*_ENABLE flag is defined unless the matching
// EXT_*_DISABLE macro was defined beforehand.
#if !defined(EXT_M_DISABLE)
#define EXT_M_ENABLE
#endif
#if !defined(EXT_F_DISABLE)
#define EXT_F_ENABLE
#endif
#if !defined(EXT_TEX_DISABLE)
#define EXT_TEX_ENABLE
#endif

// Device identification
#define VENDOR_ID 0
#define ARCHITECTURE_ID 0
#define IMPLEMENTATION_ID 0
///////////////////////////////////////////////////////////////////////////////
// Functional-unit latency defaults. Any LATENCY_* macro defined before this
// point is kept as-is; otherwise the value below applies.
#if !defined(LATENCY_IMUL)
#define LATENCY_IMUL 3
#endif
#if !defined(LATENCY_FNCP)
#define LATENCY_FNCP 2
#endif
#if !defined(LATENCY_FMA)
#define LATENCY_FMA 4
#endif
// FDIV and FSQRT use a different default when ALTERA_S10 is defined.
#if !defined(LATENCY_FDIV) && defined(ALTERA_S10)
#define LATENCY_FDIV 34
#elif !defined(LATENCY_FDIV)
#define LATENCY_FDIV 15
#endif
#if !defined(LATENCY_FSQRT) && defined(ALTERA_S10)
#define LATENCY_FSQRT 25
#elif !defined(LATENCY_FSQRT)
#define LATENCY_FSQRT 10
#endif
#if !defined(LATENCY_FDIVSQRT)
#define LATENCY_FDIVSQRT 32
#endif
#if !defined(LATENCY_FCVT)
#define LATENCY_FCVT 4
#endif
// CSR Addresses //////////////////////////////////////////////////////////////
// Numeric addresses of the control/status registers referenced by this
// configuration. All values are plain constants with no override guards.
// User Floating-Point CSRs
#define CSR_FFLAGS 0x001
#define CSR_FRM 0x002
#define CSR_FCSR 0x003
// Supervisor/machine-level CSRs. The addresses below match the RISC-V
// privileged specification (satp, pmpcfg0, pmpaddr0, mstatus, misa, medeleg,
// mideleg, mie, mtvec, mepc).
#define CSR_SATP 0x180
#define CSR_PMPCFG0 0x3A0
#define CSR_PMPADDR0 0x3B0
#define CSR_MSTATUS 0x300
#define CSR_MISA 0x301
#define CSR_MEDELEG 0x302
#define CSR_MIDELEG 0x303
#define CSR_MIE 0x304
#define CSR_MTVEC 0x305
#define CSR_MEPC 0x341
// Machine Counter/Timers
// NOTE(review): 0xC00/0xC02 (and 0xC80/0xC82) are the user-readable
// cycle/instret (and cycleh/instreth) addresses in the RISC-V specification;
// the machine-mode counterparts live at 0xB00/0xB02 - confirm the heading.
#define CSR_CYCLE 0xC00
#define CSR_CYCLE_H 0xC80
#define CSR_INSTRET 0xC02
#define CSR_INSTRET_H 0xC82
// Machine Performance-monitoring counters
// Counters occupy consecutive addresses 0xB03..0xB1C; every *_H companion is
// the base address + 0x80 (presumably the upper 32 bits of a 64-bit counter,
// following the RISC-V mhpmcounterN / mhpmcounterNh layout).
// PERF: pipeline
#define CSR_MPM_IBUF_ST 0xB03
#define CSR_MPM_IBUF_ST_H 0xB83
#define CSR_MPM_SCRB_ST 0xB04
#define CSR_MPM_SCRB_ST_H 0xB84
#define CSR_MPM_ALU_ST 0xB05
#define CSR_MPM_ALU_ST_H 0xB85
#define CSR_MPM_LSU_ST 0xB06
#define CSR_MPM_LSU_ST_H 0xB86
#define CSR_MPM_CSR_ST 0xB07
#define CSR_MPM_CSR_ST_H 0xB87
#define CSR_MPM_FPU_ST 0xB08
#define CSR_MPM_FPU_ST_H 0xB88
#define CSR_MPM_GPU_ST 0xB09
#define CSR_MPM_GPU_ST_H 0xB89
// PERF: icache
#define CSR_MPM_ICACHE_READS 0xB0A // total reads
#define CSR_MPM_ICACHE_READS_H 0xB8A
#define CSR_MPM_ICACHE_MISS_R 0xB0B // total misses
#define CSR_MPM_ICACHE_MISS_R_H 0xB8B
#define CSR_MPM_ICACHE_PIPE_ST 0xB0C // pipeline stalls
#define CSR_MPM_ICACHE_PIPE_ST_H 0xB8C
#define CSR_MPM_ICACHE_CRSP_ST 0xB0D // core response stalls
#define CSR_MPM_ICACHE_CRSP_ST_H 0xB8D
// PERF: dcache
#define CSR_MPM_DCACHE_READS 0xB0E // total reads
#define CSR_MPM_DCACHE_READS_H 0xB8E
#define CSR_MPM_DCACHE_WRITES 0xB0F // total writes
#define CSR_MPM_DCACHE_WRITES_H 0xB8F
#define CSR_MPM_DCACHE_MISS_R 0xB10 // read misses
#define CSR_MPM_DCACHE_MISS_R_H 0xB90
#define CSR_MPM_DCACHE_MISS_W 0xB11 // write misses
#define CSR_MPM_DCACHE_MISS_W_H 0xB91
#define CSR_MPM_DCACHE_BANK_ST 0xB12 // bank conflicts stalls
#define CSR_MPM_DCACHE_BANK_ST_H 0xB92
#define CSR_MPM_DCACHE_MSHR_ST 0xB13 // MSHR stalls
#define CSR_MPM_DCACHE_MSHR_ST_H 0xB93
#define CSR_MPM_DCACHE_PIPE_ST 0xB14 // pipeline stalls
#define CSR_MPM_DCACHE_PIPE_ST_H 0xB94
#define CSR_MPM_DCACHE_CRSP_ST 0xB15 // core response stalls
#define CSR_MPM_DCACHE_CRSP_ST_H 0xB95
// PERF: smem
#define CSR_MPM_SMEM_READS 0xB16 // total reads
#define CSR_MPM_SMEM_READS_H 0xB96
#define CSR_MPM_SMEM_WRITES 0xB17 // total writes
#define CSR_MPM_SMEM_WRITES_H 0xB97
#define CSR_MPM_SMEM_BANK_ST 0xB18 // bank conflicts stalls
#define CSR_MPM_SMEM_BANK_ST_H 0xB98
// PERF: memory
#define CSR_MPM_DRAM_READS 0xB19 // dram reads
#define CSR_MPM_DRAM_READS_H 0xB99
#define CSR_MPM_DRAM_WRITES 0xB1A // dram writes
#define CSR_MPM_DRAM_WRITES_H 0xB9A
#define CSR_MPM_DRAM_ST 0xB1B // dram request stalls
#define CSR_MPM_DRAM_ST_H 0xB9B
#define CSR_MPM_DRAM_LAT 0xB1C // dram latency (total)
#define CSR_MPM_DRAM_LAT_H 0xB9C
// Machine Information Registers
#define CSR_MVENDORID 0xF11
#define CSR_MARCHID 0xF12
#define CSR_MIMPID 0xF13
#define CSR_MHARTID 0xF14
// User SIMT CSRs
// Presumably thread/warp/core identifiers (W/L/G = warp-local/local/global) -
// TODO confirm against the hardware documentation.
// CSR_GWID has no address of its own: it aliases CSR_MHARTID (0xF14), which
// is why 0xCC4 is skipped in the 0xCC0.. sequence.
#define CSR_WTID 0xCC0
#define CSR_LTID 0xCC1
#define CSR_GTID 0xCC2
#define CSR_LWID 0xCC3
#define CSR_GWID CSR_MHARTID
#define CSR_GCID 0xCC5
// Machine SIMT CSRs
// Presumably the configured number of threads / warps / cores - TODO confirm.
#define CSR_NT 0xFC0
#define CSR_NW 0xFC1
#define CSR_NC 0xFC2
////////// Texture Units //////////////////////////////////////////////////////
// Each texture unit x owns CSR_TEX_STATES (8) consecutive CSR addresses
// starting at CSR_TEX_BEGIN(x) = 0xFD0 + x * 8. With NUM_TEX_UNITS = 2 the
// units cover 0xFD0..0xFD7 (unit 0) and 0xFD8..0xFDF (unit 1).
// The macros do not range-check x; callers are expected to pass an index
// below NUM_TEX_UNITS.
#define NUM_TEX_UNITS 2
#define CSR_TEX_STATES 8
// First CSR address of texture unit x.
#define CSR_TEX_BEGIN(x) (0xFD0 + (x) * CSR_TEX_STATES)
// Per-unit registers, at offsets 0x00..0x07 from CSR_TEX_BEGIN(x).
#define CSR_TEX_ADDR(x) (CSR_TEX_BEGIN(x) + 0x00)
#define CSR_TEX_FORMAT(x) (CSR_TEX_BEGIN(x) + 0x01)
#define CSR_TEX_WIDTH(x) (CSR_TEX_BEGIN(x) + 0x02)
#define CSR_TEX_HEIGHT(x) (CSR_TEX_BEGIN(x) + 0x03)
#define CSR_TEX_STRIDE(x) (CSR_TEX_BEGIN(x) + 0x04)
#define CSR_TEX_WRAP_U(x) (CSR_TEX_BEGIN(x) + 0x05)
#define CSR_TEX_WRAP_V(x) (CSR_TEX_BEGIN(x) + 0x06)
#define CSR_TEX_FILTER(x) (CSR_TEX_BEGIN(x) + 0x07)
// Pipeline Queues ////////////////////////////////////////////////////////////
// Default depths of the execution-unit request queues; a value defined before
// this point takes precedence.

// LSU request queue depth
#if !defined(LSUQ_SIZE)
#define LSUQ_SIZE 8
#endif

// FPU request queue depth
#if !defined(FPUQ_SIZE)
#define FPUQ_SIZE 8
#endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Instruction-cache defaults; each knob keeps any value defined earlier.

// Cache capacity in bytes
#if !defined(ICACHE_SIZE)
#define ICACHE_SIZE 16384
#endif

// Depth of the core request queue
#if !defined(ICREQ_SIZE)
#define ICREQ_SIZE 4
#endif

// Miss handling register (MSHR) size; defaults to NUM_WARPS
#if !defined(IMSHR_SIZE)
#define IMSHR_SIZE NUM_WARPS
#endif

// Depth of the DRAM request queue
#if !defined(IDREQ_SIZE)
#define IDREQ_SIZE 4
#endif

// Depth of the DRAM response queue
#if !defined(IDRSQ_SIZE)
#define IDRSQ_SIZE 4
#endif
// Helper macros used by the derived defaults below (DDRSQ_SIZE, L2NUM_BANKS,
// L2DRSQ_SIZE, L3NUM_BANKS, L3DRSQ_SIZE). This header does not define them
// anywhere else, so guarded fallbacks are provided here to keep the header
// self-contained; an existing MIN/MAX definition is left untouched.
// Arguments are evaluated twice - pass side-effect-free expressions only.
// NOTE(review): this file is generated; the same fallback should be emitted
// by gen_config.py so it survives regeneration.
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Every knob below keeps a value defined earlier and only supplies a default.
// Size of cache in bytes
#ifndef DCACHE_SIZE
#define DCACHE_SIZE 16384
#endif
// Number of banks (defaults to one bank per thread)
#ifndef DNUM_BANKS
#define DNUM_BANKS NUM_THREADS
#endif
// Number of bank ports
#ifndef DNUM_PORTS
#define DNUM_PORTS 1
#endif
// Core Request Queue Size
#ifndef DCREQ_SIZE
#define DCREQ_SIZE 4
#endif
// Miss Handling Register Size (defaults to the LSU request queue size)
#ifndef DMSHR_SIZE
#define DMSHR_SIZE LSUQ_SIZE
#endif
// DRAM Request Queue Size
#ifndef DDREQ_SIZE
#define DDREQ_SIZE 4
#endif
// DRAM Response Queue Size: two entries per bank, but never fewer than 4
#ifndef DDRSQ_SIZE
#define DDRSQ_SIZE MAX(4, (DNUM_BANKS * 2))
#endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// per thread stack size
#ifndef STACK_SIZE
#define STACK_SIZE 1024
#endif
// Size of shared memory in bytes: one stack per thread of every warp
#ifndef SMEM_SIZE
#define SMEM_SIZE (STACK_SIZE * NUM_WARPS * NUM_THREADS)
#endif
// Number of banks (defaults to one bank per thread)
#ifndef SNUM_BANKS
#define SNUM_BANKS NUM_THREADS
#endif
// Core Request Queue Size
#ifndef SCREQ_SIZE
#define SCREQ_SIZE 4
#endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
// Size of cache in bytes
#ifndef L2CACHE_SIZE
#define L2CACHE_SIZE 65536
#endif
// Number of banks: one per core, capped at 4
#ifndef L2NUM_BANKS
#define L2NUM_BANKS MIN(NUM_CORES, 4)
#endif
// Core Request Queue Size
#ifndef L2CREQ_SIZE
#define L2CREQ_SIZE 4
#endif
// Miss Handling Register Size
#ifndef L2MSHR_SIZE
#define L2MSHR_SIZE 16
#endif
// DRAM Request Queue Size
#ifndef L2DREQ_SIZE
#define L2DREQ_SIZE 4
#endif
// DRAM Response Queue Size: two entries per bank, but never fewer than 4
#ifndef L2DRSQ_SIZE
#define L2DRSQ_SIZE MAX(4, (L2NUM_BANKS * 2))
#endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Size of cache in bytes
#ifndef L3CACHE_SIZE
#define L3CACHE_SIZE 131072
#endif
// Number of banks: one per cluster, capped at 4
#ifndef L3NUM_BANKS
#define L3NUM_BANKS MIN(NUM_CLUSTERS, 4)
#endif
// Core Request Queue Size
#ifndef L3CREQ_SIZE
#define L3CREQ_SIZE 4
#endif
// Miss Handling Register Size
#ifndef L3MSHR_SIZE
#define L3MSHR_SIZE 16
#endif
// DRAM Request Queue Size
#ifndef L3DREQ_SIZE
#define L3DREQ_SIZE 4
#endif
// DRAM Response Queue Size: two entries per bank, but never fewer than 4
#ifndef L3DRSQ_SIZE
#define L3DRSQ_SIZE MAX(4, (L3NUM_BANKS * 2))
#endif
#endif

View file

@ -776,51 +776,46 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) {
case FMSUB:
case FMNMADD:
case FMNMSUB: {
// select FP format
if (core_->get_csr(CSR_FPMODE, t, id_) == 1) {
// CODE
} else {
// multiplicands are infinity and zero, them set FCSR
if (fpBinIsZero(rsdata[0]) || fpBinIsZero(rsdata[1]) || fpBinIsInf(rsdata[0]) || fpBinIsInf(rsdata[1])) {
// multiplicands are infinity and zero, them set FCSR
if (fpBinIsZero(rsdata[0]) || fpBinIsZero(rsdata[1]) || fpBinIsInf(rsdata[0]) || fpBinIsInf(rsdata[1])) {
core_->set_csr(CSR_FCSR, core_->get_csr(CSR_FCSR, t, id_) | 0x10, t, id_); // set NV bit
core_->set_csr(CSR_FFLAGS, core_->get_csr(CSR_FFLAGS, t, id_) | 0x10, t, id_); // set NV bit
}
if (fpBinIsNan(rsdata[0]) || fpBinIsNan(rsdata[1]) || fpBinIsNan(rsdata[2])) {
// if one of op is NaN, if addend is not quiet NaN, them set FCSR
if ((fpBinIsNan(rsdata[0])==2) | (fpBinIsNan(rsdata[1])==2) | (fpBinIsNan(rsdata[1])==2)) {
core_->set_csr(CSR_FCSR, core_->get_csr(CSR_FCSR, t, id_) | 0x10, t, id_); // set NV bit
core_->set_csr(CSR_FFLAGS, core_->get_csr(CSR_FFLAGS, t, id_) | 0x10, t, id_); // set NV bit
core_->set_csr(CSR_FFLAGS, core_->get_csr(CSR_FFLAGS, t, id_) | 0x10, t, id_); // set NV bit
}
if (fpBinIsNan(rsdata[0]) || fpBinIsNan(rsdata[1]) || fpBinIsNan(rsdata[2])) {
// if one of op is NaN, if addend is not quiet NaN, them set FCSR
if ((fpBinIsNan(rsdata[0])==2) | (fpBinIsNan(rsdata[1])==2) | (fpBinIsNan(rsdata[1])==2)) {
core_->set_csr(CSR_FCSR, core_->get_csr(CSR_FCSR, t, id_) | 0x10, t, id_); // set NV bit
core_->set_csr(CSR_FFLAGS, core_->get_csr(CSR_FFLAGS, t, id_) | 0x10, t, id_); // set NV bit
}
rddata = 0x7fc00000; // canonical(quiet) NaN
} else {
float rs1 = intregToFloat(rsdata[0]);
float rs2 = intregToFloat(rsdata[1]);
float rs3 = intregToFloat(rsdata[2]);
float fpDest(0.0);
feclearexcept(FE_ALL_EXCEPT);
switch (opcode) {
case FMADD:
// rd = (rs1*rs2)+rs3
fpDest = (rs1 * rs2) + rs3; break;
case FMSUB:
// rd = (rs1*rs2)-rs3
fpDest = (rs1 * rs2) - rs3; break;
case FMNMADD:
// rd = -(rs1*rs2)+rs3
fpDest = -1*(rs1 * rs2) - rs3; break;
case FMNMSUB:
// rd = -(rs1*rs2)-rs3
fpDest = -1*(rs1 * rs2) + rs3; break;
default:
std::abort();
break;
}
rddata = 0x7fc00000; // canonical(quiet) NaN
} else {
float rs1 = intregToFloat(rsdata[0]);
float rs2 = intregToFloat(rsdata[1]);
float rs3 = intregToFloat(rsdata[2]);
float fpDest(0.0);
feclearexcept(FE_ALL_EXCEPT);
switch (opcode) {
case FMADD:
// rd = (rs1*rs2)+rs3
fpDest = (rs1 * rs2) + rs3; break;
case FMSUB:
// rd = (rs1*rs2)-rs3
fpDest = (rs1 * rs2) - rs3; break;
case FMNMADD:
// rd = -(rs1*rs2)+rs3
fpDest = -1*(rs1 * rs2) - rs3; break;
case FMNMSUB:
// rd = -(rs1*rs2)-rs3
fpDest = -1*(rs1 * rs2) + rs3; break;
default:
std::abort();
break;
}
// update fcsrs
update_fcrs(core_, t, id_);
// update fcsrs
update_fcrs(core_, t, id_);
rddata = floatToBin(fpDest);
}
rddata = floatToBin(fpDest);
}
}
break;