mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 13:27:29 -04:00
speeding up simulation using dedicated full dpi-based FPU core
This commit is contained in:
parent
2058718f0f
commit
2b8435471a
26 changed files with 990 additions and 430 deletions
|
@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread
|
|||
TOP = vortex_afu_shim
|
||||
|
||||
RTL_DIR=../../../hw/rtl
|
||||
DPI_DIR=../../../hw/dpi
|
||||
|
||||
SRCS = fpga.cpp opae_sim.cpp
|
||||
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
|
||||
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@
|
|||
#define CCI_RQ_SIZE 16
|
||||
#define CCI_WQ_SIZE 16
|
||||
|
||||
#define RESET_DELAY 2
|
||||
#define RESET_DELAY 4
|
||||
|
||||
#define ENABLE_DRAM_STALLS
|
||||
#define DRAM_LATENCY 24
|
||||
|
@ -135,19 +135,14 @@ void opae_sim::reset() {
|
|||
|
||||
vortex_afu_->reset = 1;
|
||||
|
||||
vortex_afu_->clk = 0;
|
||||
this->eval();
|
||||
vortex_afu_->clk = 1;
|
||||
this->eval();
|
||||
|
||||
vortex_afu_->reset = 0;
|
||||
|
||||
for (int i = 0; i < RESET_DELAY; ++i) {
|
||||
vortex_afu_->clk = 0;
|
||||
this->eval();
|
||||
vortex_afu_->clk = 1;
|
||||
this->eval();
|
||||
}
|
||||
|
||||
vortex_afu_->reset = 0;
|
||||
|
||||
// Turn on assertion after reset
|
||||
Verilated::assertOn(true);
|
||||
|
|
|
@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread
|
|||
TOP = Vortex
|
||||
|
||||
RTL_DIR = ../../hw/rtl
|
||||
DPI_DIR = ../../hw/dpi
|
||||
|
||||
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
|
||||
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)
|
||||
|
|
264
hw/dpi/float_dpi.cpp
Normal file
264
hw/dpi/float_dpi.cpp
Normal file
|
@ -0,0 +1,264 @@
|
|||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <iostream>
|
||||
#include "svdpi.h"
|
||||
#include "verilated_vpi.h"
|
||||
#include "VX_config.h"
|
||||
|
||||
extern "C" {
|
||||
void dpi_fadd(int a, int b, int frm, int* result, int* fflags);
|
||||
void dpi_fsub(int a, int b, int frm, int* result, int* fflags);
|
||||
void dpi_fmul(int a, int b, int frm, int* result, int* fflags);
|
||||
void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags);
|
||||
void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags);
|
||||
void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags);
|
||||
void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags);
|
||||
|
||||
void dpi_fdiv(int a, int b, int frm, int* result, int* fflags);
|
||||
void dpi_fsqrt(int a, int frm, int* result, int* fflags);
|
||||
|
||||
void dpi_ftoi(int a, int frm, int* result, int* fflags);
|
||||
void dpi_ftou(int a, int frm, int* result, int* fflags);
|
||||
void dpi_itof(int a, int frm, int* result, int* fflags);
|
||||
void dpi_utof(int a, int frm, int* result, int* fflags);
|
||||
|
||||
void dpi_fclss(int a, int* result);
|
||||
void dpi_fsgnj(int a, int* result);
|
||||
void dpi_fsgnjn(int a, int* result);
|
||||
void dpi_fsgnjx(int a, int* result);
|
||||
|
||||
void dpi_flt(int a, int b, int* result, int* fflags);
|
||||
void dpi_fle(int a, int b, int* result, int* fflags);
|
||||
void dpi_feq(int a, int b, int* result, int* fflags);
|
||||
void dpi_fmin(int a, int b, int* result, int* fflags);
|
||||
void dpi_fmax(int a, int b, int* result, int* fflags);
|
||||
}
|
||||
|
||||
/// Bit-level view of a 32-bit float as exchanged with the RTL.
///
/// `f` and `i` alias the same 32 bits; `parts` exposes mantissa, exponent and
/// sign as bit-fields. Reading a member other than the one last written relies
/// on the union type-punning extension of the host compiler (as the original
/// code did).
union Float_t {
    float f;
    int   i;
    struct {
        uint32_t man  : 23;
        uint32_t exp  : 8;
        uint32_t sign : 1;
    } parts;
};

// Bit pattern of the canonical quiet NaN (RISC-V "F" extension).
static const int kCanonicalNaN = 0x7fc00000;

// Reinterprets a raw 32-bit pattern as a float.
static inline float dpi_bits_to_float(int bits) {
    Float_t v;
    v.i = bits;
    return v.f;
}

// Reinterprets a float as its raw 32-bit pattern.
static inline int dpi_float_to_bits(float value) {
    Float_t v;
    v.f = value;
    return v.i;
}

// True when the pattern encodes a NaN (exponent all ones, mantissa non-zero).
static inline bool dpi_is_nan_bits(int bits) {
    const uint32_t u = static_cast<uint32_t>(bits);
    return ((u >> 23) & 0xffu) == 0xffu && (u & 0x7fffffu) != 0;
}

// General notes for every entry point below:
//  * `a`, `b`, `c` carry raw binary32 bit patterns unless stated otherwise.
//  * `frm` is accepted but ignored: arithmetic uses the host's current rounding
//    mode and conversions to integer truncate.
//    NOTE(review): the SystemVerilog import declares frm as bit[2:0]; confirm
//    how the simulator marshals that argument before reading it here.
//  * `*fflags` is always written as 0; exception flags are not modelled.

/// result = a + b
void dpi_fadd(int a, int b, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(dpi_bits_to_float(a) + dpi_bits_to_float(b));
    *fflags = 0;
}

/// result = a - b
void dpi_fsub(int a, int b, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(dpi_bits_to_float(a) - dpi_bits_to_float(b));
    *fflags = 0;
}

/// result = a * b
void dpi_fmul(int a, int b, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(dpi_bits_to_float(a) * dpi_bits_to_float(b));
    *fflags = 0;
}

/// result = (a * b) + c, computed with a single rounding (fused).
void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags) {
    const float x = dpi_bits_to_float(a);
    const float y = dpi_bits_to_float(b);
    const float z = dpi_bits_to_float(c);
    *result = dpi_float_to_bits(fmaf(x, y, z));
    *fflags = 0;
}

/// result = (a * b) - c, computed with a single rounding (fused).
void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags) {
    const float x = dpi_bits_to_float(a);
    const float y = dpi_bits_to_float(b);
    const float z = dpi_bits_to_float(c);
    *result = dpi_float_to_bits(fmaf(x, y, -z));
    *fflags = 0;
}

/// result = -(a * b) - c, fused. The product is negated before the add so an
/// exact zero result gets the same sign as the hardware operation would give,
/// rather than the sign obtained by negating the final sum.
void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags) {
    const float x = dpi_bits_to_float(a);
    const float y = dpi_bits_to_float(b);
    const float z = dpi_bits_to_float(c);
    *result = dpi_float_to_bits(fmaf(-x, y, -z));
    *fflags = 0;
}

/// result = -(a * b) + c, fused (see dpi_fnmadd for the zero-sign rationale).
void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags) {
    const float x = dpi_bits_to_float(a);
    const float y = dpi_bits_to_float(b);
    const float z = dpi_bits_to_float(c);
    *result = dpi_float_to_bits(fmaf(-x, y, z));
    *fflags = 0;
}

/// result = a / b
void dpi_fdiv(int a, int b, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(dpi_bits_to_float(a) / dpi_bits_to_float(b));
    *fflags = 0;
}

/// result = sqrt(a)
void dpi_fsqrt(int a, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(sqrtf(dpi_bits_to_float(a)));
    *fflags = 0;
}

/// Float -> signed 32-bit integer, truncating toward zero.
/// NaN and values at or above 2^31 saturate to INT32_MAX; values at or below
/// -2^31 saturate to INT32_MIN. This avoids the undefined behaviour of a plain
/// cast on out-of-range input.
void dpi_ftoi(int a, int frm, int* result, int* fflags) {
    const float value = dpi_bits_to_float(a);
    int converted;
    if (dpi_is_nan_bits(a) || value >= 2147483648.0f) {
        converted = 0x7fffffff;
    } else if (value <= -2147483648.0f) {
        converted = -0x7fffffff - 1;
    } else {
        converted = static_cast<int>(value);
    }
    *result = converted;
    *fflags = 0;
}

/// Float -> unsigned 32-bit integer, truncating toward zero.
/// NaN and values at or above 2^32 saturate to 0xffffffff; negative values
/// yield 0. The unsigned result is returned through the int-typed `result`.
void dpi_ftou(int a, int frm, int* result, int* fflags) {
    const float value = dpi_bits_to_float(a);
    unsigned converted;
    if (dpi_is_nan_bits(a) || value >= 4294967296.0f) {
        converted = 0xffffffffu;
    } else if (value <= 0.0f) {
        converted = 0u;
    } else {
        converted = static_cast<unsigned>(value);
    }
    *result = static_cast<int>(converted);
    *fflags = 0;
}

/// Signed 32-bit integer `a` -> float.
void dpi_itof(int a, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(static_cast<float>(a));
    *fflags = 0;
}

/// `a` reinterpreted as unsigned 32-bit integer -> float.
void dpi_utof(int a, int frm, int* result, int* fflags) {
    *result = dpi_float_to_bits(static_cast<float>(static_cast<unsigned>(a)));
    *fflags = 0;
}

/// result = (a < b) as the integer 0 or 1 (false whenever an operand is NaN).
void dpi_flt(int a, int b, int* result, int* fflags) {
    *result = (dpi_bits_to_float(a) < dpi_bits_to_float(b)) ? 1 : 0;
    *fflags = 0;
}

/// result = (a <= b) as the integer 0 or 1 (false whenever an operand is NaN).
void dpi_fle(int a, int b, int* result, int* fflags) {
    *result = (dpi_bits_to_float(a) <= dpi_bits_to_float(b)) ? 1 : 0;
    *fflags = 0;
}

/// result = (a == b) as the integer 0 or 1 (false whenever an operand is NaN).
void dpi_feq(int a, int b, int* result, int* fflags) {
    *result = (dpi_bits_to_float(a) == dpi_bits_to_float(b)) ? 1 : 0;
    *fflags = 0;
}

/// Minimum of a and b. A single NaN operand is ignored, two NaNs give the
/// canonical NaN, and -0.0 is treated as smaller than +0.0.
void dpi_fmin(int a, int b, int* result, int* fflags) {
    const bool a_nan = dpi_is_nan_bits(a);
    const bool b_nan = dpi_is_nan_bits(b);
    int selected;
    if (a_nan && b_nan) {
        selected = kCanonicalNaN;
    } else if (a_nan) {
        selected = b;
    } else if (b_nan) {
        selected = a;
    } else {
        const float x = dpi_bits_to_float(a);
        const float y = dpi_bits_to_float(b);
        if (x == y) {
            // Only +/-0 can compare equal with different bits: prefer the negative one.
            selected = (a < 0) ? a : b;
        } else {
            selected = (x < y) ? a : b;
        }
    }
    *result = selected;
    *fflags = 0;
}

/// Maximum of a and b. A single NaN operand is ignored, two NaNs give the
/// canonical NaN, and +0.0 is treated as larger than -0.0.
void dpi_fmax(int a, int b, int* result, int* fflags) {
    const bool a_nan = dpi_is_nan_bits(a);
    const bool b_nan = dpi_is_nan_bits(b);
    int selected;
    if (a_nan && b_nan) {
        selected = kCanonicalNaN;
    } else if (a_nan) {
        selected = b;
    } else if (b_nan) {
        selected = a;
    } else {
        const float x = dpi_bits_to_float(a);
        const float y = dpi_bits_to_float(b);
        if (x == y) {
            // Only +/-0 can compare equal with different bits: prefer the positive one.
            selected = (a < 0) ? b : a;
        } else {
            selected = (x > y) ? a : b;
        }
    }
    *result = selected;
    *fflags = 0;
}

/// Classifies `a` and returns a one-hot mask using the RISC-V FCLASS layout:
///   bit 0: -inf        bit 1: negative normal   bit 2: negative subnormal
///   bit 3: -0          bit 4: +0                bit 5: positive subnormal
///   bit 6: positive normal   bit 7: +inf   bit 8: signaling NaN   bit 9: quiet NaN
void dpi_fclss(int a, int* result) {
    const uint32_t bits     = static_cast<uint32_t>(a);
    const bool     negative = (bits >> 31) != 0;
    const uint32_t exponent = (bits >> 23) & 0xffu;
    const uint32_t mantissa = bits & 0x7fffffu;

    int mask;
    if (exponent == 0xffu) {
        if (mantissa == 0) {
            mask = negative ? (1 << 0) : (1 << 7);
        } else {
            // The mantissa MSB distinguishes quiet from signaling NaNs.
            mask = (mantissa & 0x400000u) ? (1 << 9) : (1 << 8);
        }
    } else if (exponent == 0) {
        if (mantissa == 0) {
            mask = negative ? (1 << 3) : (1 << 4);
        } else {
            mask = negative ? (1 << 2) : (1 << 5);
        }
    } else {
        mask = negative ? (1 << 1) : (1 << 6);
    }
    *result = mask;
}

/// Sign-injection placeholder: always writes 0.
/// TODO: the operation needs a second operand to take the sign from, which
/// this signature does not provide.
void dpi_fsgnj(int a, int* result) {
    *result = 0;
}

/// Negated sign-injection placeholder: always writes 0.
/// TODO: needs a second operand, see dpi_fsgnj.
void dpi_fsgnjn(int a, int* result) {
    *result = 0;
}

/// XOR sign-injection placeholder: always writes 0.
/// TODO: needs a second operand, see dpi_fsgnj.
void dpi_fsgnjx(int a, int* result) {
    *result = 0;
}
|
31
hw/dpi/float_dpi.vh
Normal file
31
hw/dpi/float_dpi.vh
Normal file
|
@ -0,0 +1,31 @@
|
|||
`ifndef FLOAT_DPI
|
||||
`define FLOAT_DPI
|
||||
|
||||
import "DPI-C" context function void dpi_fadd(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fsub(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fmul(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fnmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fnmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
|
||||
import "DPI-C" context function void dpi_fdiv(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fsqrt(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
|
||||
import "DPI-C" context function void dpi_ftoi(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_ftou(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_itof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_utof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
|
||||
|
||||
import "DPI-C" context function void dpi_fclss(input int a, output int result);
|
||||
import "DPI-C" context function void dpi_fsgnj(input int a, output int result);
|
||||
import "DPI-C" context function void dpi_fsgnjn(input int a, output int result);
|
||||
import "DPI-C" context function void dpi_fsgnjx(input int a, output int result);
|
||||
|
||||
import "DPI-C" context function void dpi_flt(input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fle(input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_feq(input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fmin(input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
import "DPI-C" context function void dpi_fmax(input int a, input int b, output int result, output bit[4:0] fflags);
|
||||
|
||||
`endif
|
84
hw/dpi/util_dpi.cpp
Normal file
84
hw/dpi/util_dpi.cpp
Normal file
|
@ -0,0 +1,84 @@
|
|||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <iostream>
|
||||
#include "svdpi.h"
|
||||
#include "verilated_vpi.h"
|
||||
#include "VX_config.h"
|
||||
|
||||
extern "C" {
|
||||
int dpi_register();
|
||||
void dpi_assert(int inst, bool cond, int delay);
|
||||
}
|
||||
|
||||
/// Fixed-depth delay line of ints.
///
/// Values enter at the back on push() and move one slot toward the front on
/// every enabled push; top() reads the front slot. The depth is fixed by the
/// first ensure_init() call; until then the register is empty, push() is a
/// no-op and top() reads 0.
class ShiftRegister {
public:
    ShiftRegister() : init_(false), depth_(0) {}

    /// Sizes the register on the first call; later calls are ignored.
    /// Non-positive depths are clamped to 1 so that push()/top() always have a
    /// valid slot to work with.
    void ensure_init(int depth) {
        if (init_)
            return;
        depth_ = (depth > 0) ? static_cast<unsigned>(depth) : 1u;
        buffer_.assign(depth_, 0);
        init_ = true;
    }

    /// Shifts every entry one slot toward the front and stores `value` in the
    /// last slot. Does nothing when `enable` is false or the register has not
    /// been initialised yet.
    void push(int value, bool enable) {
        if (!enable || depth_ == 0)
            return;
        // `i + 1 < depth_` instead of `i < depth_ - 1`: no unsigned underflow.
        for (unsigned i = 0; i + 1 < depth_; ++i) {
            buffer_[i] = buffer_[i + 1];
        }
        buffer_[depth_ - 1] = value;
    }

    /// Oldest entry still in the register (0 while uninitialised).
    int top() const {
        return buffer_.empty() ? 0 : buffer_.front();
    }

private:
    std::vector<int> buffer_;
    bool             init_;
    unsigned         depth_;
};
|
||||
|
||||
class Instances {
|
||||
public:
|
||||
ShiftRegister& get(int inst) {
|
||||
return instances_.at(inst);
|
||||
}
|
||||
|
||||
int allocate() {
|
||||
mutex_.lock();
|
||||
int inst = instances_.size();
|
||||
instances_.resize(inst + 1);
|
||||
mutex_.unlock();
|
||||
return inst;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<ShiftRegister> instances_;
|
||||
std::mutex mutex_;
|
||||
};
|
||||
|
||||
Instances instances;
|
||||
|
||||
int dpi_register() {
|
||||
return instances.allocate();
|
||||
}
|
||||
|
||||
void dpi_assert(int inst, bool cond, int delay) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(!cond, 1);
|
||||
|
||||
auto status = sr.top();
|
||||
if (status) {
|
||||
printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope()));
|
||||
std::abort();
|
||||
}
|
||||
}
|
7
hw/dpi/util_dpi.vh
Normal file
7
hw/dpi/util_dpi.vh
Normal file
|
@ -0,0 +1,7 @@
|
|||
`ifndef UTIL_DPI
|
||||
`define UTIL_DPI
|
||||
|
||||
import "DPI-C" context function int dpi_register();
|
||||
import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
|
||||
|
||||
`endif
|
|
@ -40,7 +40,8 @@ module VX_cluster #(
|
|||
// Status
|
||||
output wire busy,
|
||||
output wire ebreak
|
||||
);
|
||||
);
|
||||
|
||||
wire [`NUM_CORES-1:0] per_core_dram_req_valid;
|
||||
wire [`NUM_CORES-1:0] per_core_dram_req_rw;
|
||||
wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen;
|
||||
|
@ -70,15 +71,13 @@ module VX_cluster #(
|
|||
for (genvar i = 0; i < `NUM_CORES; i++) begin
|
||||
|
||||
wire core_reset;
|
||||
if (`NUM_CORES > 1) begin
|
||||
reg core_reset_r;
|
||||
always @(posedge clk) begin
|
||||
core_reset_r <= reset;
|
||||
end
|
||||
assign core_reset = core_reset_r;
|
||||
end else begin
|
||||
assign core_reset = reset;
|
||||
end
|
||||
VX_reset_relay #(
|
||||
.PASSTHRU (`NUM_CORES == 1)
|
||||
) reset_relay (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset_out (core_reset)
|
||||
);
|
||||
|
||||
VX_core #(
|
||||
.CORE_ID(i + (CLUSTER_ID * `NUM_CORES))
|
||||
|
|
|
@ -92,8 +92,8 @@
|
|||
`define LATENCY_IMUL 3
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FNCOMP
|
||||
`define LATENCY_FNCOMP 2
|
||||
`ifndef LATENCY_FNCP
|
||||
`define LATENCY_FNCP 2
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FMA
|
||||
|
@ -128,8 +128,8 @@
|
|||
`define LATENCY_FDIVSQRT 32
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_FCONV
|
||||
`define LATENCY_FCONV 4
|
||||
`ifndef LATENCY_FCVT
|
||||
`define LATENCY_FCVT 4
|
||||
`endif
|
||||
|
||||
// CSR Addresses //////////////////////////////////////////////////////////////
|
||||
|
|
|
@ -63,11 +63,11 @@ module VX_fpu_unit #(
|
|||
|
||||
// resolve dynamic FRM from CSR
|
||||
assign fpu_to_csr_if.read_wid = fpu_req_if.wid;
|
||||
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
|
||||
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
|
||||
|
||||
`ifdef FPU_FAST
|
||||
|
||||
VX_fp_fpga #(
|
||||
VX_fp_dpi #(
|
||||
.TAGW (FPUQ_BITS)
|
||||
) fp_core (
|
||||
.clk (clk),
|
||||
|
@ -91,21 +91,51 @@ module VX_fpu_unit #(
|
|||
|
||||
.tag_out (tag_out),
|
||||
|
||||
.ready_out (ready_out),
|
||||
.valid_out (valid_out)
|
||||
);
|
||||
|
||||
`elsif FPU_FPNEW
|
||||
|
||||
VX_fpnew #(
|
||||
.FMULADD (1),
|
||||
.FDIVSQRT (1),
|
||||
.FNONCOMP (1),
|
||||
.FCONV (1),
|
||||
.TAGW (FPUQ_BITS)
|
||||
) fp_core (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
|
||||
.tag_in (tag_in),
|
||||
|
||||
.op_type (fpu_req_if.op_type),
|
||||
.frm (fpu_frm),
|
||||
|
||||
.dataa (fpu_req_if.rs1_data),
|
||||
.datab (fpu_req_if.rs2_data),
|
||||
.datac (fpu_req_if.rs3_data),
|
||||
.result (result),
|
||||
|
||||
.has_fflags (has_fflags),
|
||||
.fflags (fflags),
|
||||
|
||||
.tag_out (tag_out),
|
||||
|
||||
.ready_out (ready_out),
|
||||
.valid_out (valid_out)
|
||||
);
|
||||
|
||||
`else
|
||||
|
||||
VX_fpnew #(
|
||||
.FMULADD (1),
|
||||
.FDIVSQRT (1),
|
||||
.FNONCOMP (1),
|
||||
.FCONV (1),
|
||||
.TAGW (FPUQ_BITS)
|
||||
VX_fp_fpga #(
|
||||
.TAGW (FPUQ_BITS)
|
||||
) fp_core (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (fpu_reset),
|
||||
|
||||
.valid_in (valid_in),
|
||||
.ready_in (ready_in),
|
||||
|
|
|
@ -64,7 +64,7 @@ module VX_lsu_unit #(
|
|||
assign mem_req_addr[i] = full_address[i][31:2];
|
||||
assign mem_req_offset[i] = full_address[i][1:0];
|
||||
assign mem_req_byteen[i] = wmask << full_address[i][1:0];
|
||||
assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0};
|
||||
assign mem_req_data[i] = lsu_req_if.store_data[i] << {full_address[i][1:0], 3'b0};
|
||||
end
|
||||
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
|
|
|
@ -79,7 +79,17 @@ module VX_mem_unit # (
|
|||
.cache_rsp_if (dcache_rsp_if),
|
||||
.smem_rsp_if (smem_rsp_if),
|
||||
.core_rsp_if (core_dcache_rsp_if)
|
||||
);
|
||||
);
|
||||
|
||||
wire icache_reset, dcache_reset;
|
||||
|
||||
VX_reset_relay #(
|
||||
.NUM_NODES (2)
|
||||
) reset_relay (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset_out ({dcache_reset, icache_reset})
|
||||
);
|
||||
|
||||
VX_cache #(
|
||||
.CACHE_ID (`ICACHE_ID),
|
||||
|
@ -102,7 +112,7 @@ module VX_mem_unit # (
|
|||
`SCOPE_BIND_VX_mem_unit_icache
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (icache_reset),
|
||||
|
||||
// Core request
|
||||
.core_req_valid (core_icache_req_if.valid),
|
||||
|
@ -160,7 +170,7 @@ module VX_mem_unit # (
|
|||
`SCOPE_BIND_VX_mem_unit_dcache
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (dcache_reset),
|
||||
|
||||
// Core req
|
||||
.core_req_valid (dcache_req_if.valid),
|
||||
|
@ -199,6 +209,14 @@ module VX_mem_unit # (
|
|||
|
||||
if (`SM_ENABLE) begin
|
||||
|
||||
wire scache_reset;
|
||||
|
||||
VX_reset_relay reset_relay (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset_out (scache_reset)
|
||||
);
|
||||
|
||||
VX_cache #(
|
||||
.CACHE_ID (`SCACHE_ID),
|
||||
.CACHE_SIZE (`SMEM_SIZE),
|
||||
|
@ -220,7 +238,7 @@ module VX_mem_unit # (
|
|||
`SCOPE_BIND_VX_mem_unit_smem
|
||||
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset (scache_reset),
|
||||
|
||||
// Core request
|
||||
.core_req_valid (smem_req_if.valid),
|
||||
|
|
|
@ -72,15 +72,13 @@ module Vortex (
|
|||
for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin
|
||||
|
||||
wire cluster_reset;
|
||||
if (`NUM_CLUSTERS > 1) begin
|
||||
reg cluster_reset_r;
|
||||
always @(posedge clk) begin
|
||||
cluster_reset_r <= reset;
|
||||
end
|
||||
assign cluster_reset = cluster_reset_r;
|
||||
end else begin
|
||||
assign cluster_reset = reset;
|
||||
end
|
||||
VX_reset_relay #(
|
||||
.PASSTHRU (`NUM_CLUSTERS == 1)
|
||||
) reset_relay (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset_out (cluster_reset)
|
||||
);
|
||||
|
||||
VX_cluster #(
|
||||
.CLUSTER_ID(i)
|
||||
|
|
|
@ -37,7 +37,7 @@ module vortex_afu #(
|
|||
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
|
||||
);
|
||||
|
||||
localparam RESET_DELAY = 2;
|
||||
localparam RESET_DELAY = 3;
|
||||
|
||||
localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr);
|
||||
localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data);
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
module VX_fp_div #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
|
@ -32,26 +28,24 @@ module VX_fp_div #(
|
|||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
|
||||
wire _reset;
|
||||
|
||||
VX_reset_relay reset_relay (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset_out (_reset)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
`ifdef QUARTUS
|
||||
acl_fdiv fdiv (
|
||||
.clk (clk),
|
||||
.areset (reset),
|
||||
.areset (_reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.b (datab[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
integer fdiv_h;
|
||||
initial begin
|
||||
fdiv_h = dpi_register();
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
dpi_fdiv (fdiv_h, enable, dataa[i], datab[i], `LATENCY_FDIV, result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
|
|
415
hw/rtl/fp_cores/VX_fp_dpi.v
Normal file
415
hw/rtl/fp_cores/VX_fp_dpi.v
Normal file
|
@ -0,0 +1,415 @@
|
|||
`ifndef SYNTHESIS
|
||||
|
||||
`include "VX_define.vh"
|
||||
`include "float_dpi.vh"
|
||||
|
||||
module VX_fp_dpi #(
|
||||
parameter TAGW = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire valid_in,
|
||||
output wire ready_in,
|
||||
|
||||
input wire [TAGW-1:0] tag_in,
|
||||
|
||||
input wire [`FPU_BITS-1:0] op_type,
|
||||
input wire [`MOD_BITS-1:0] frm,
|
||||
|
||||
input wire [`NUM_THREADS-1:0][31:0] dataa,
|
||||
input wire [`NUM_THREADS-1:0][31:0] datab,
|
||||
input wire [`NUM_THREADS-1:0][31:0] datac,
|
||||
output wire [`NUM_THREADS-1:0][31:0] result,
|
||||
|
||||
output wire has_fflags,
|
||||
output fflags_t [`NUM_THREADS-1:0] fflags,
|
||||
|
||||
output wire [TAGW-1:0] tag_out,
|
||||
|
||||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam FPU_FMA = 0;
|
||||
localparam FPU_DIV = 1;
|
||||
localparam FPU_SQRT = 2;
|
||||
localparam FPU_CVT = 3;
|
||||
localparam FPU_NCP = 4;
|
||||
localparam NUM_FPC = 5;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_ready_in;
|
||||
wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result;
|
||||
wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
|
||||
reg [NUM_FPC-1:0] per_core_ready_out;
|
||||
wire [NUM_FPC-1:0] per_core_valid_out;
|
||||
|
||||
wire [NUM_FPC-1:0] per_core_has_fflags;
|
||||
fflags_t [NUM_FPC-1:0][`NUM_THREADS-1:0] per_core_fflags;
|
||||
|
||||
reg [FPC_BITS-1:0] core_select;
|
||||
|
||||
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
|
||||
reg is_itof, is_utof, is_ftoi, is_ftou;
|
||||
reg is_fclss, is_flt, is_fle, is_feq, is_fmin, is_fmax, is_fsgnj, is_fsgnjn, is_fsgnjx;
|
||||
|
||||
always @(*) begin
|
||||
is_fadd = 0;
|
||||
is_fsub = 0;
|
||||
is_fmul = 0;
|
||||
is_fmadd = 0;
|
||||
is_fmsub = 0;
|
||||
is_fnmadd = 0;
|
||||
is_fnmsub = 0;
|
||||
is_itof = 0;
|
||||
is_utof = 0;
|
||||
is_ftoi = 0;
|
||||
is_ftou = 0;
|
||||
is_fclss = 0;
|
||||
is_flt = 0;
|
||||
is_fle = 0;
|
||||
is_feq = 0;
|
||||
is_fmin = 0;
|
||||
is_fmax = 0;
|
||||
is_fsgnj = 0;
|
||||
is_fsgnjn = 0;
|
||||
is_fsgnjx = 0;
|
||||
|
||||
case (op_type)
|
||||
`FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
|
||||
`FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
|
||||
`FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
|
||||
`FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
|
||||
`FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
|
||||
`FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
|
||||
`FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
|
||||
`FPU_DIV: begin core_select = FPU_DIV; end
|
||||
`FPU_SQRT: begin core_select = FPU_SQRT; end
|
||||
`FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end
|
||||
`FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end
|
||||
`FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
`FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end
|
||||
`FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end
|
||||
`FPU_CMP: begin core_select = FPU_NCP;
|
||||
is_fle = (frm == 0);
|
||||
is_flt = (frm == 1);
|
||||
is_feq = (frm == 2);
|
||||
end
|
||||
default: begin core_select = FPU_NCP;
|
||||
is_fsgnj = (frm == 0);
|
||||
is_fsgnjn = (frm == 1);
|
||||
is_fsgnjx = (frm == 2);
|
||||
is_fmin = (frm == 3);
|
||||
is_fmax = (frm == 4);
|
||||
end
|
||||
endcase
|
||||
end
|
||||
|
||||
generate
|
||||
begin : fma
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fma;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fadd;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fsub;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fmul;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fmadd;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fmsub;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fnmadd;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fnmsub;
|
||||
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fma;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fadd;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fsub;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fmul;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fmadd;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fmsub;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fnmadd;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fnmsub;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < `NUM_THREADS; i++) begin
|
||||
dpi_fadd (dataa[i], datab[i], frm, result_fadd[i], fflags_fadd[i]);
|
||||
dpi_fsub (dataa[i], datab[i], frm, result_fsub[i], fflags_fsub[i]);
|
||||
dpi_fmul (dataa[i], datab[i], frm, result_fmul[i], fflags_fmul[i]);
|
||||
dpi_fmadd (dataa[i], datab[i], datac[i], frm, result_fmadd[i], fflags_fmadd[i]);
|
||||
dpi_fmsub (dataa[i], datab[i], datac[i], frm, result_fmsub[i], fflags_fmsub[i]);
|
||||
dpi_fnmadd (dataa[i], datab[i], datac[i], frm, result_fnmadd[i], fflags_fnmadd[i]);
|
||||
dpi_fnmsub (dataa[i], datab[i], datac[i], frm, result_fnmsub[i], fflags_fnmsub[i]);
|
||||
end
|
||||
end
|
||||
|
||||
assign result_fma = is_fadd ? result_fadd :
|
||||
is_fsub ? result_fsub :
|
||||
is_fmul ? result_fmul :
|
||||
is_fmadd ? result_fmadd :
|
||||
is_fmsub ? result_fmsub :
|
||||
is_fnmadd ? result_fnmadd :
|
||||
is_fnmsub ? result_fnmsub :
|
||||
0;
|
||||
|
||||
assign fflags_fma = is_fadd ? fflags_fadd :
|
||||
is_fsub ? fflags_fsub :
|
||||
is_fmul ? fflags_fmul :
|
||||
is_fmadd ? fflags_fmadd :
|
||||
is_fmsub ? fflags_fmsub :
|
||||
is_fnmadd ? fflags_fnmadd :
|
||||
is_fnmsub ? fflags_fnmsub :
|
||||
0;
|
||||
|
||||
wire enable = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
|
||||
wire valid = (valid_in && core_select == FPU_FMA);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
|
||||
.DEPTH (`LATENCY_FMA),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid, tag_in, result_fma, fflags_fma}),
|
||||
.data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
|
||||
);
|
||||
|
||||
assign per_core_has_fflags[FPU_FMA] = 1;
|
||||
assign per_core_ready_in[FPU_FMA] = enable;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fdiv
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fdiv;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fdiv;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < `NUM_THREADS; i++) begin
|
||||
dpi_fdiv (dataa[i], datab[i], frm, result_fdiv[i], fflags_fdiv[i]);
|
||||
end
|
||||
end
|
||||
|
||||
wire enable = per_core_ready_out[FPU_DIV] || ~per_core_valid_out[FPU_DIV];
|
||||
wire valid = (valid_in && core_select == FPU_DIV);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
|
||||
.DEPTH (`LATENCY_FDIV),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid, tag_in, result_fdiv, fflags_fdiv}),
|
||||
.data_out ({per_core_valid_out[FPU_DIV], per_core_tag_out[FPU_DIV], per_core_result[FPU_DIV], per_core_fflags[FPU_DIV]})
|
||||
);
|
||||
|
||||
assign per_core_has_fflags[FPU_DIV] = 1;
|
||||
assign per_core_ready_in[FPU_DIV] = enable;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fsqrt
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fsqrt;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fsqrt;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < `NUM_THREADS; i++) begin
|
||||
dpi_fsqrt (dataa[i], frm, result_fsqrt[i], fflags_fsqrt[i]);
|
||||
end
|
||||
end
|
||||
|
||||
wire enable = per_core_ready_out[FPU_SQRT] || ~per_core_valid_out[FPU_SQRT];
|
||||
wire valid = (valid_in && core_select == FPU_SQRT);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
|
||||
.DEPTH (`LATENCY_FSQRT),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid, tag_in, result_fsqrt, fflags_fsqrt}),
|
||||
.data_out ({per_core_valid_out[FPU_SQRT], per_core_tag_out[FPU_SQRT], per_core_result[FPU_SQRT], per_core_fflags[FPU_SQRT]})
|
||||
);
|
||||
|
||||
assign per_core_has_fflags[FPU_SQRT] = 1;
|
||||
assign per_core_ready_in[FPU_SQRT] = enable;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fcvt
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fcvt;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_itof;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_utof;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_ftoi;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_ftou;
|
||||
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fcvt;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_itof;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_utof;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_ftoi;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_ftou;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < `NUM_THREADS; i++) begin
|
||||
dpi_itof (dataa[i], frm, result_itof[i], fflags_itof[i]);
|
||||
dpi_utof (dataa[i], frm, result_utof[i], fflags_utof[i]);
|
||||
dpi_ftoi (dataa[i], frm, result_ftoi[i], fflags_ftoi[i]);
|
||||
dpi_ftou (dataa[i], frm, result_ftou[i], fflags_ftou[i]);
|
||||
end
|
||||
end
|
||||
|
||||
assign result_fcvt = is_itof ? result_itof :
|
||||
is_utof ? result_utof :
|
||||
is_ftoi ? result_ftoi :
|
||||
is_ftou ? result_ftou :
|
||||
0;
|
||||
|
||||
assign fflags_fcvt = is_itof ? fflags_itof :
|
||||
is_utof ? fflags_utof :
|
||||
is_ftoi ? fflags_ftoi :
|
||||
is_ftou ? fflags_ftou :
|
||||
0;
|
||||
|
||||
wire enable = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
|
||||
wire valid = (valid_in && core_select == FPU_CVT);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
|
||||
.DEPTH (`LATENCY_FCVT),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid, tag_in, result_fcvt, fflags_fcvt}),
|
||||
.data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
|
||||
);
|
||||
|
||||
assign per_core_has_fflags[FPU_CVT] = 1;
|
||||
assign per_core_ready_in[FPU_CVT] = enable;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
generate
|
||||
begin : fncp
|
||||
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fncp;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fclss;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_flt;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fle;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_feq;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fmin;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fmax;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fsgnj;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fsgnjn;
|
||||
wire [`NUM_THREADS-1:0][31:0] result_fsgnjx;
|
||||
reg [`NUM_THREADS-1:0][31:0] result_fmv;
|
||||
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fncp;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_flt;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fle;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_feq;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fmin;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_fmax;
|
||||
|
||||
always @(*) begin
|
||||
for (integer i = 0; i < `NUM_THREADS; i++) begin
|
||||
dpi_fclss (dataa[i], result_fclss[i]);
|
||||
dpi_flt (dataa[i], datab[i], result_flt[i], fflags_flt[i]);
|
||||
dpi_fle (dataa[i], datab[i], result_fle[i], fflags_fle[i]);
|
||||
dpi_feq (dataa[i], datab[i], result_feq[i], fflags_feq[i]);
|
||||
dpi_fmin (dataa[i], datab[i], result_fmin[i], fflags_fmin[i]);
|
||||
dpi_fmax (dataa[i], datab[i], result_fmax[i], fflags_fmax[i]);
|
||||
dpi_fsgnj (dataa[i], result_fsgnj[i]);
|
||||
dpi_fsgnjn (dataa[i], result_fsgnjn[i]);
|
||||
dpi_fsgnjx (dataa[i], result_fsgnjx[i]);
|
||||
result_fmv[i] = dataa[i];
|
||||
end
|
||||
end
|
||||
|
||||
assign result_fncp = is_fclss ? result_fclss :
|
||||
is_flt ? result_flt :
|
||||
is_fle ? result_fle :
|
||||
is_feq ? result_feq :
|
||||
is_fmin ? result_fmin :
|
||||
is_fmax ? result_fmax :
|
||||
is_fsgnj ? result_fsgnj :
|
||||
is_fsgnjn ? result_fsgnjn :
|
||||
is_fsgnjx ? result_fsgnjx :
|
||||
result_fmv;
|
||||
|
||||
wire has_fflags_fncp = (is_flt || is_fle || is_feq || is_fmin || is_fmax);
|
||||
|
||||
assign fflags_fncp = is_flt ? fflags_flt :
|
||||
is_fle ? fflags_fle :
|
||||
is_feq ? fflags_feq :
|
||||
is_fmin ? fflags_fmin :
|
||||
is_fmax ? fflags_fmax :
|
||||
0;
|
||||
|
||||
wire enable = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
|
||||
wire valid = (valid_in && core_select == FPU_NCP);
|
||||
|
||||
VX_shift_register #(
|
||||
.DATAW (1 + TAGW + 1 + `NUM_THREADS * (32 + $bits(fflags_t))),
|
||||
.DEPTH (`LATENCY_FNCP),
|
||||
.RESETW (1)
|
||||
) shift_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (enable),
|
||||
.data_in ({valid, tag_in, has_fflags_fncp, result_fncp, fflags_fncp}),
|
||||
.data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
|
||||
);
|
||||
|
||||
assign per_core_ready_in[FPU_NCP] = enable;
|
||||
|
||||
end
|
||||
endgenerate
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
reg has_fflags_n;
|
||||
fflags_t [`NUM_THREADS-1:0] fflags_n;
|
||||
reg [`NUM_THREADS-1:0][31:0] result_n;
|
||||
reg [TAGW-1:0] tag_out_n;
|
||||
|
||||
always @(*) begin
|
||||
per_core_ready_out = 0;
|
||||
has_fflags_n = 'x;
|
||||
fflags_n = 'x;
|
||||
result_n = 'x;
|
||||
tag_out_n = 'x;
|
||||
for (integer i = 0; i < NUM_FPC; i++) begin
|
||||
if (per_core_valid_out[i]) begin
|
||||
has_fflags_n = per_core_has_fflags[i];
|
||||
fflags_n = per_core_fflags[i];
|
||||
result_n = per_core_result[i];
|
||||
tag_out_n = per_core_tag_out[i];
|
||||
per_core_ready_out[i] = ready_out;
|
||||
break;
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
assign valid_out = (| per_core_valid_out);
|
||||
assign has_fflags = has_fflags_n;
|
||||
assign tag_out = tag_out_n;
|
||||
assign result = result_n;
|
||||
assign fflags = fflags_n;
|
||||
|
||||
assign ready_in = per_core_ready_in[core_select];
|
||||
|
||||
endmodule
|
||||
|
||||
`endif
|
|
@ -1,9 +1,5 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
module VX_fp_fma #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
|
@ -63,7 +59,6 @@ module VX_fp_fma #(
|
|||
end
|
||||
end
|
||||
|
||||
`ifdef QUARTUS
|
||||
acl_fmadd fmadd (
|
||||
.clk (clk),
|
||||
.areset (reset),
|
||||
|
@ -73,15 +68,6 @@ module VX_fp_fma #(
|
|||
.c (c),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
integer fmadd_h;
|
||||
initial begin
|
||||
fmadd_h = dpi_register();
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
dpi_fmadd (fmadd_h, enable, a, b, c, `LATENCY_FMA, result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
|
|
|
@ -27,6 +27,11 @@ module VX_fp_fpga #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam FPU_FMA = 0;
|
||||
localparam FPU_DIV = 1;
|
||||
localparam FPU_SQRT = 2;
|
||||
localparam FPU_CVT = 3;
|
||||
localparam FPU_NCP = 4;
|
||||
localparam NUM_FPC = 5;
|
||||
localparam FPC_BITS = `LOG2UP(NUM_FPC);
|
||||
|
||||
|
@ -49,20 +54,20 @@ module VX_fp_fpga #(
|
|||
is_itof = 'x;
|
||||
is_signed = 'x;
|
||||
case (op_type)
|
||||
`FPU_ADD: begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 0; end
|
||||
`FPU_SUB: begin core_select = 0; do_madd = 0; do_sub = 1; do_neg = 0; end
|
||||
`FPU_MUL: begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 1; end
|
||||
`FPU_MADD: begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 0; end
|
||||
`FPU_MSUB: begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 0; end
|
||||
`FPU_NMADD: begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 1; end
|
||||
`FPU_NMSUB: begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 1; end
|
||||
`FPU_DIV: begin core_select = 1; end
|
||||
`FPU_SQRT: begin core_select = 2; end
|
||||
`FPU_CVTWS: begin core_select = 3; is_itof = 0; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = 3; is_itof = 0; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = 3; is_itof = 1; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = 3; is_itof = 1; is_signed = 0; end
|
||||
default: begin core_select = 4; end
|
||||
`FPU_ADD: begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 0; end
|
||||
`FPU_SUB: begin core_select = FPU_FMA; do_madd = 0; do_sub = 1; do_neg = 0; end
|
||||
`FPU_MUL: begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 1; end
|
||||
`FPU_MADD: begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 0; end
|
||||
`FPU_MSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 0; end
|
||||
`FPU_NMADD: begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 1; end
|
||||
`FPU_NMSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end
|
||||
`FPU_DIV: begin core_select = FPU_DIV; end
|
||||
`FPU_SQRT: begin core_select = FPU_SQRT; end
|
||||
`FPU_CVTWS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 1; end
|
||||
`FPU_CVTWUS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 0; end
|
||||
`FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
|
||||
`FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; is_signed = 0; end
|
||||
default: begin core_select = FPU_NCP; end
|
||||
endcase
|
||||
end
|
||||
|
||||
|
@ -72,8 +77,8 @@ module VX_fp_fpga #(
|
|||
) fp_fma (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 0)),
|
||||
.ready_in (per_core_ready_in[0]),
|
||||
.valid_in (valid_in && (core_select == FPU_FMA)),
|
||||
.ready_in (per_core_ready_in[FPU_FMA]),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.do_madd (do_madd),
|
||||
|
@ -82,12 +87,12 @@ module VX_fp_fpga #(
|
|||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.datac (datac),
|
||||
.has_fflags (per_core_has_fflags[0]),
|
||||
.fflags (per_core_fflags[0]),
|
||||
.result (per_core_result[0]),
|
||||
.tag_out (per_core_tag_out[0]),
|
||||
.ready_out (per_core_ready_out[0]),
|
||||
.valid_out (per_core_valid_out[0])
|
||||
.has_fflags (per_core_has_fflags[FPU_FMA]),
|
||||
.fflags (per_core_fflags[FPU_FMA]),
|
||||
.result (per_core_result[FPU_FMA]),
|
||||
.tag_out (per_core_tag_out[FPU_FMA]),
|
||||
.ready_out (per_core_ready_out[FPU_FMA]),
|
||||
.valid_out (per_core_valid_out[FPU_FMA])
|
||||
);
|
||||
|
||||
VX_fp_div #(
|
||||
|
@ -96,18 +101,18 @@ module VX_fp_fpga #(
|
|||
) fp_div (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 1)),
|
||||
.ready_in (per_core_ready_in[1]),
|
||||
.valid_in (valid_in && (core_select == FPU_DIV)),
|
||||
.ready_in (per_core_ready_in[FPU_DIV]),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.has_fflags (per_core_has_fflags[1]),
|
||||
.fflags (per_core_fflags[1]),
|
||||
.result (per_core_result[1]),
|
||||
.tag_out (per_core_tag_out[1]),
|
||||
.ready_out (per_core_ready_out[1]),
|
||||
.valid_out (per_core_valid_out[1])
|
||||
.has_fflags (per_core_has_fflags[FPU_DIV]),
|
||||
.fflags (per_core_fflags[FPU_DIV]),
|
||||
.result (per_core_result[FPU_DIV]),
|
||||
.tag_out (per_core_tag_out[FPU_DIV]),
|
||||
.ready_out (per_core_ready_out[FPU_DIV]),
|
||||
.valid_out (per_core_valid_out[FPU_DIV])
|
||||
);
|
||||
|
||||
VX_fp_sqrt #(
|
||||
|
@ -116,17 +121,17 @@ module VX_fp_fpga #(
|
|||
) fp_sqrt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 2)),
|
||||
.ready_in (per_core_ready_in[2]),
|
||||
.valid_in (valid_in && (core_select == FPU_SQRT)),
|
||||
.ready_in (per_core_ready_in[FPU_SQRT]),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.has_fflags (per_core_has_fflags[2]),
|
||||
.fflags (per_core_fflags[2]),
|
||||
.result (per_core_result[2]),
|
||||
.tag_out (per_core_tag_out[2]),
|
||||
.ready_out (per_core_ready_out[2]),
|
||||
.valid_out (per_core_valid_out[2])
|
||||
.has_fflags (per_core_has_fflags[FPU_SQRT]),
|
||||
.fflags (per_core_fflags[FPU_SQRT]),
|
||||
.result (per_core_result[FPU_SQRT]),
|
||||
.tag_out (per_core_tag_out[FPU_SQRT]),
|
||||
.ready_out (per_core_ready_out[FPU_SQRT]),
|
||||
.valid_out (per_core_valid_out[FPU_SQRT])
|
||||
);
|
||||
|
||||
VX_fp_cvt #(
|
||||
|
@ -135,19 +140,19 @@ module VX_fp_fpga #(
|
|||
) fp_cvt (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 3)),
|
||||
.ready_in (per_core_ready_in[3]),
|
||||
.valid_in (valid_in && (core_select == FPU_CVT)),
|
||||
.ready_in (per_core_ready_in[FPU_CVT]),
|
||||
.tag_in (tag_in),
|
||||
.frm (frm),
|
||||
.is_itof (is_itof),
|
||||
.is_signed (is_signed),
|
||||
.dataa (dataa),
|
||||
.has_fflags (per_core_has_fflags[3]),
|
||||
.fflags (per_core_fflags[3]),
|
||||
.result (per_core_result[3]),
|
||||
.tag_out (per_core_tag_out[3]),
|
||||
.ready_out (per_core_ready_out[3]),
|
||||
.valid_out (per_core_valid_out[3])
|
||||
.has_fflags (per_core_has_fflags[FPU_CVT]),
|
||||
.fflags (per_core_fflags[FPU_CVT]),
|
||||
.result (per_core_result[FPU_CVT]),
|
||||
.tag_out (per_core_tag_out[FPU_CVT]),
|
||||
.ready_out (per_core_ready_out[FPU_CVT]),
|
||||
.valid_out (per_core_valid_out[FPU_CVT])
|
||||
);
|
||||
|
||||
VX_fp_ncomp #(
|
||||
|
@ -156,19 +161,19 @@ module VX_fp_fpga #(
|
|||
) fp_ncomp (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in && (core_select == 4)),
|
||||
.ready_in (per_core_ready_in[4]),
|
||||
.valid_in (valid_in && (core_select == FPU_NCP)),
|
||||
.ready_in (per_core_ready_in[FPU_NCP]),
|
||||
.tag_in (tag_in),
|
||||
.op_type (op_type),
|
||||
.frm (frm),
|
||||
.dataa (dataa),
|
||||
.datab (datab),
|
||||
.result (per_core_result[4]),
|
||||
.has_fflags (per_core_has_fflags[4]),
|
||||
.fflags (per_core_fflags[4]),
|
||||
.tag_out (per_core_tag_out[4]),
|
||||
.ready_out (per_core_ready_out[4]),
|
||||
.valid_out (per_core_valid_out[4])
|
||||
.result (per_core_result[FPU_NCP]),
|
||||
.has_fflags (per_core_has_fflags[FPU_NCP]),
|
||||
.fflags (per_core_fflags[FPU_NCP]),
|
||||
.tag_out (per_core_tag_out[FPU_NCP]),
|
||||
.ready_out (per_core_ready_out[FPU_NCP]),
|
||||
.valid_out (per_core_valid_out[FPU_NCP])
|
||||
);
|
||||
|
||||
reg has_fflags_n;
|
||||
|
|
|
@ -160,7 +160,7 @@ module VX_fp_ncomp #(
|
|||
for (genvar i = 0; i < LANES; i++) begin
|
||||
always @(*) begin
|
||||
case (frm_s0)
|
||||
`FRM_RNE: begin
|
||||
`FRM_RNE: begin // LE
|
||||
fcmp_fflags[i] = 5'h0;
|
||||
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
|
||||
fcmp_res[i] = 32'h0;
|
||||
|
@ -169,7 +169,7 @@ module VX_fp_ncomp #(
|
|||
fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])};
|
||||
end
|
||||
end
|
||||
`FRM_RTZ: begin
|
||||
`FRM_RTZ: begin // LS
|
||||
fcmp_fflags[i] = 5'h0;
|
||||
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
|
||||
fcmp_res[i] = 32'h0;
|
||||
|
@ -178,7 +178,7 @@ module VX_fp_ncomp #(
|
|||
fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])};
|
||||
end
|
||||
end
|
||||
`FRM_RDN: begin
|
||||
`FRM_RDN: begin // EQ
|
||||
fcmp_fflags[i] = 5'h0;
|
||||
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
|
||||
fcmp_res[i] = 32'h0;
|
||||
|
@ -223,7 +223,7 @@ module VX_fp_ncomp #(
|
|||
tmp_fflags[i] = 0;
|
||||
tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
|
||||
end
|
||||
//5,6,7:
|
||||
//5,6,7: MOVE
|
||||
default: begin
|
||||
tmp_result[i] = dataa[i];
|
||||
tmp_fflags[i] = 'x;
|
||||
|
|
|
@ -1,9 +1,5 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
`ifndef SYNTHESIS
|
||||
`include "float_dpi.vh"
|
||||
`endif
|
||||
|
||||
module VX_fp_sqrt #(
|
||||
parameter TAGW = 1,
|
||||
parameter LANES = 1
|
||||
|
@ -30,26 +26,24 @@ module VX_fp_sqrt #(
|
|||
output wire valid_out
|
||||
);
|
||||
wire stall = ~ready_out && valid_out;
|
||||
wire enable = ~stall;
|
||||
wire enable = ~stall;
|
||||
|
||||
wire _reset;
|
||||
|
||||
VX_reset_relay reset_relay (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.reset_out (_reset)
|
||||
);
|
||||
|
||||
for (genvar i = 0; i < LANES; i++) begin
|
||||
`ifdef QUARTUS
|
||||
acl_fsqrt fsqrt (
|
||||
.clk (clk),
|
||||
.areset (reset),
|
||||
.areset (_reset),
|
||||
.en (enable),
|
||||
.a (dataa[i]),
|
||||
.q (result[i])
|
||||
);
|
||||
`else
|
||||
integer fsqrt_h;
|
||||
initial begin
|
||||
fsqrt_h = dpi_register();
|
||||
end
|
||||
always @(posedge clk) begin
|
||||
dpi_fsqrt (fsqrt_h, enable, dataa[i], `LATENCY_FSQRT, result[i]);
|
||||
end
|
||||
`endif
|
||||
end
|
||||
|
||||
VX_shift_register #(
|
||||
|
|
|
@ -56,13 +56,13 @@ module VX_fpnew
|
|||
|
||||
localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
|
||||
PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0}, // ADDMUL
|
||||
'{default: `LATENCY_FDIVSQRT}, // DIVSQRT
|
||||
'{default: `LATENCY_FNCOMP}, // NONCOMP
|
||||
'{default: `LATENCY_FCONV}}, // CONV
|
||||
UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL
|
||||
'{default: UNIT_FDIVSQRT}, // DIVSQRT
|
||||
'{default: UNIT_FNONCOMP}, // NONCOMP
|
||||
'{default: UNIT_FCONV}}, // CONV
|
||||
'{default: `LATENCY_FDIVSQRT}, // DIVSQRT
|
||||
'{default: `LATENCY_FNCP}, // NONCOMP
|
||||
'{default: `LATENCY_FCVT}}, // CONV
|
||||
UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL
|
||||
'{default: UNIT_FDIVSQRT}, // DIVSQRT
|
||||
'{default: UNIT_FNONCOMP}, // NONCOMP
|
||||
'{default: UNIT_FCONV}}, // CONV
|
||||
PipeConfig: fpnew_pkg::DISTRIBUTED
|
||||
};
|
||||
|
||||
|
|
|
@ -1,239 +0,0 @@
|
|||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
#include <iostream>
|
||||
#include "svdpi.h"
|
||||
#include "verilated_vpi.h"
|
||||
#include "VX_config.h"
|
||||
|
||||
// C-linkage prototypes for the DPI entry points defined later in this file.
// Every arithmetic/conversion entry takes an instance handle (`inst`, as
// returned by dpi_register), an `enable` flag, 32-bit operands passed as raw
// int bit patterns, a `delay`, and an output pointer `result`.
// NOTE(review): float_dpi.vh also imports `dpi_fmsub`, which has no prototype
// here - confirm whether it is intentionally unimplemented.
extern "C" {
|
||||
int dpi_register();
|
||||
void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result);
|
||||
void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result);
|
||||
void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result);
|
||||
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result);
|
||||
void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result);
|
||||
void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result);
|
||||
void dpi_ftoi(int inst, bool enable, int a, int delay, int* result);
|
||||
void dpi_ftou(int inst, bool enable, int a, int delay, int* result);
|
||||
void dpi_itof(int inst, bool enable, int a, int delay, int* result);
|
||||
void dpi_utof(int inst, bool enable, int a, int delay, int* result);
|
||||
void dpi_assert(int inst, bool cond, int delay);
|
||||
}
|
||||
|
||||
// Fixed-depth shift register of ints.
//
// A value pushed with enable=true enters at the back and becomes visible at
// top() once `depth` enabled pushes (including its own) have happened. Slots
// start out as 0.
class ShiftRegister {
public:
  ShiftRegister() : init_(false), depth_(0) {}

  // Sizes the register on the first call only; later calls are ignored, so
  // the depth given first is the one that sticks.
  // A non-positive depth yields an empty register (a negative int would
  // otherwise wrap to an enormous size when handed to resize()).
  void ensure_init(int depth) {
    if (init_)
      return;
    depth_ = (depth > 0) ? static_cast<unsigned>(depth) : 0u;
    buffer_.resize(depth_);
    init_ = true;
  }

  // Shifts every slot one step towards the front and stores `value` in the
  // last slot. Does nothing when `enable` is false.
  // Also does nothing on an empty/uninitialised register: `depth_ - 1` on an
  // unsigned zero would wrap around and index far out of bounds.
  void push(int value, bool enable) {
    if (!enable || depth_ == 0)
      return;
    for (unsigned i = 0; i + 1 < depth_; ++i) {
      buffer_[i] = buffer_[i + 1];
    }
    buffer_[depth_ - 1] = value;
  }

  // Returns the oldest slot, or 0 when the register holds no slots.
  int top() const {
    return buffer_.empty() ? 0 : buffer_[0];
  }

private:
  std::vector<int> buffer_;  // slot 0 is the oldest entry
  bool init_;                // true once ensure_init() has run
  unsigned depth_;           // number of slots in buffer_
};
|
||||
|
||||
// Overlay of a float (`f`), an int (`i`) and a bitfield view (`parts`) on the
// same storage. The DPI functions in this file write a raw bit pattern through
// `i` and read it back as a float through `f` (and vice versa); `parts` is not
// used by any function visible in this file.
// NOTE(review): `man`/`exp`/`sign` line up with the IEEE-754 single-precision
// fields only if the compiler allocates bitfields starting at the least
// significant bit - confirm for the target toolchain.
union Float_t {
|
||||
float f;
|
||||
int i;
|
||||
struct {
|
||||
uint32_t man : 23;
|
||||
uint32_t exp : 8;
|
||||
uint32_t sign : 1;
|
||||
} parts;
|
||||
};
|
||||
|
||||
class Instances {
|
||||
public:
|
||||
ShiftRegister& get(int inst) {
|
||||
return instances_.at(inst);
|
||||
}
|
||||
|
||||
int allocate() {
|
||||
mutex_.lock();
|
||||
int inst = instances_.size();
|
||||
instances_.resize(inst + 1);
|
||||
mutex_.unlock();
|
||||
return inst;
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<ShiftRegister> instances_;
|
||||
std::mutex mutex_;
|
||||
};
|
||||
|
||||
Instances instances;
|
||||
|
||||
int dpi_register() {
|
||||
return instances.allocate();
|
||||
}
|
||||
|
||||
void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f + fb.f;
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f - fb.f;
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f * fb.f;
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
// Computes a*b + c on float bit patterns, pushes the value into the shift
// register selected by `inst` when `enable` is set, and writes the register's
// oldest entry to `*result`. `delay` sizes the register on first use.
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fa, fb, fc, fr;

  fa.i = a;
  fb.i = b;
  fc.i = c;
  // fmaf rounds once, as a fused multiply-add must. The plain expression
  // `fa.f * fb.f + fc.f` may round the product first, depending on how the
  // compiler contracts it, which gives results that differ in the last bit.
  fr.f = fmaf(fa.f, fb.f, fc.f);

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
|
||||
|
||||
void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
Float_t fa, fb, fr;
|
||||
|
||||
fa.i = a;
|
||||
fb.i = b;
|
||||
fr.f = fa.f / fb.f;
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
Float_t fa, fr;
|
||||
|
||||
fa.i = a;
|
||||
fr.f = sqrtf(fa.f);
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(fr.i, enable);
|
||||
*result = sr.top();
|
||||
}
|
||||
|
||||
// Converts `a` (a float bit pattern) to a signed 32-bit integer with
// truncation, pushes it into the shift register selected by `inst` when
// `enable` is set, and writes the register's oldest entry to `*result`.
// `delay` sizes the register on first use.
//
// A plain float-to-int cast is undefined for NaN, infinities and values
// outside the int range, so those inputs saturate instead, following the
// RISC-V FCVT.W.S result table: NaN / +inf / too large -> 2^31-1,
// -inf / too small -> -2^31.
void dpi_ftoi(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fa;
  fa.i = a;

  int converted;
  if (fa.f != fa.f || fa.f >= 2147483648.0f) {
    // `x != x` is true only for NaN; 2^31 is the first value that overflows.
    converted = 0x7fffffff;
  } else if (fa.f < -2147483648.0f) {
    converted = -0x7fffffff - 1;
  } else {
    converted = static_cast<int>(fa.f);
  }

  sr.ensure_init(delay);
  sr.push(converted, enable);
  *result = sr.top();
}
|
||||
|
||||
// Converts `a` (a float bit pattern) to an unsigned 32-bit integer with
// truncation, pushes its bit pattern into the shift register selected by
// `inst` when `enable` is set, and writes the register's oldest entry to
// `*result`. `delay` sizes the register on first use.
//
// A plain float-to-unsigned cast is undefined for NaN, negative values whose
// truncation is below zero, and values of 2^32 or more, so those inputs
// saturate instead, following the RISC-V FCVT.WU.S result table:
// NaN / +inf / too large -> 2^32-1, negative out of range / -inf -> 0.
void dpi_ftou(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fa;
  fa.i = a;

  unsigned converted;
  if (fa.f != fa.f || fa.f >= 4294967296.0f) {
    // `x != x` is true only for NaN; 2^32 is the first value that overflows.
    converted = 0xffffffffu;
  } else if (fa.f <= -1.0f) {
    converted = 0u;
  } else {
    // Values in (-1, 0) truncate to 0, which is a valid conversion.
    converted = static_cast<unsigned>(fa.f);
  }

  sr.ensure_init(delay);
  sr.push(static_cast<int>(converted), enable);
  *result = sr.top();
}
|
||||
|
||||
// Converts the signed integer `a` to float, pushes the float's bit pattern
// into the shift register selected by `inst` when `enable` is set, and writes
// the register's oldest entry to `*result`. `delay` sizes the register on
// first use.
void dpi_itof(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fr;  // the unused `fa` local of the earlier version is dropped
  fr.f = static_cast<float>(a);

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
|
||||
|
||||
// Reinterprets `a` as an unsigned 32-bit integer, converts it to float, pushes
// the float's bit pattern into the shift register selected by `inst` when
// `enable` is set, and writes the register's oldest entry to `*result`.
// `delay` sizes the register on first use.
void dpi_utof(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fr;  // the unused `fa` local of the earlier version is dropped
  const unsigned ua = a;
  fr.f = static_cast<float>(ua);

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
|
||||
|
||||
void dpi_assert(int inst, bool cond, int delay) {
|
||||
ShiftRegister& sr = instances.get(inst);
|
||||
|
||||
sr.ensure_init(delay);
|
||||
sr.push(!cond, 1);
|
||||
|
||||
auto status = sr.top();
|
||||
if (status) {
|
||||
printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope()));
|
||||
std::abort();
|
||||
}
|
||||
}
|
|
@ -1,20 +0,0 @@
|
|||
// DPI-C import declarations for the floating-point simulation helpers,
// wrapped in the FLOAT_DPI include guard.
// dpi_register returns an int handle; the other imports take that handle as
// `inst`, plus an `enable`/`cond` bit, 32-bit operands, and a `delay`.
// NOTE(review): `dpi_fmsub` is imported below, but the C++ side shown alongside
// this header (float_dpi.cpp) neither declares nor defines it - confirm it is
// never called, otherwise the simulation will fail to link.
`ifndef FLOAT_DPI
|
||||
`define FLOAT_DPI
|
||||
|
||||
import "DPI-C" context function int dpi_register();
|
||||
|
||||
import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, input int delay, output int result);
|
||||
import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, input int delay, output int result);
|
||||
|
||||
import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
|
||||
|
||||
`endif
|
|
@ -1,7 +1,7 @@
|
|||
`include "VX_platform.vh"
|
||||
|
||||
module VX_lzc #(
|
||||
parameter DATAW = 1,
|
||||
parameter DATAW = 32,
|
||||
parameter LDATAW = `LOG2UP(DATAW)
|
||||
) (
|
||||
input wire [DATAW-1:0] data_in,
|
||||
|
|
|
@ -31,12 +31,13 @@ MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
|
|||
TOP = Vortex
|
||||
|
||||
RTL_DIR=../rtl
|
||||
DPI_DIR=../dpi
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE)
|
||||
|
||||
SRCS = simulator.cpp testbench.cpp
|
||||
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#include <fstream>
|
||||
#include <iomanip>
|
||||
|
||||
#define RESET_DELAY 2
|
||||
#define RESET_DELAY 4
|
||||
|
||||
#define ENABLE_DRAM_STALLS
|
||||
#define DRAM_LATENCY 24
|
||||
|
@ -75,13 +75,6 @@ void Simulator::reset() {
|
|||
vortex_->csr_rsp_ready = 0;
|
||||
|
||||
vortex_->reset = 1;
|
||||
|
||||
vortex_->clk = 0;
|
||||
this->eval();
|
||||
vortex_->clk = 1;
|
||||
this->eval();
|
||||
|
||||
vortex_->reset = 0;
|
||||
|
||||
for (int i = 0; i < RESET_DELAY; ++i) {
|
||||
vortex_->clk = 0;
|
||||
|
@ -89,8 +82,11 @@ void Simulator::reset() {
|
|||
vortex_->clk = 1;
|
||||
this->eval();
|
||||
}
|
||||
|
||||
vortex_->reset = 0;
|
||||
|
||||
// Turn on assertion after reset
|
||||
printf("*** enabling assertion at tick: %ld", timestamp);
|
||||
Verilated::assertOn(true);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue