speeding up simulation using dedicated full dpi-based FPU core

This commit is contained in:
Blaise Tine 2021-01-06 18:44:06 -08:00
parent 2058718f0f
commit 2b8435471a
26 changed files with 990 additions and 430 deletions

View file

@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread
TOP = vortex_afu_shim
RTL_DIR=../../../hw/rtl
DPI_DIR=../../../hw/dpi
SRCS = fpga.cpp opae_sim.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip

View file

@ -8,7 +8,7 @@
#define CCI_RQ_SIZE 16
#define CCI_WQ_SIZE 16
#define RESET_DELAY 2
#define RESET_DELAY 4
#define ENABLE_DRAM_STALLS
#define DRAM_LATENCY 24
@ -135,19 +135,14 @@ void opae_sim::reset() {
vortex_afu_->reset = 1;
vortex_afu_->clk = 0;
this->eval();
vortex_afu_->clk = 1;
this->eval();
vortex_afu_->reset = 0;
for (int i = 0; i < RESET_DELAY; ++i) {
vortex_afu_->clk = 0;
this->eval();
vortex_afu_->clk = 1;
this->eval();
}
vortex_afu_->reset = 0;
// Turn on assertion after reset
Verilated::assertOn(true);

View file

@ -39,11 +39,12 @@ LDFLAGS += -shared -pthread
TOP = Vortex
RTL_DIR = ../../hw/rtl
DPI_DIR = ../../hw/dpi
SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic $(CONFIGS)

264
hw/dpi/float_dpi.cpp Normal file
View file

@ -0,0 +1,264 @@
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <iostream>
#include <mutex>
#include <unordered_map>
#include <vector>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
// C-linkage prototypes for the floating-point DPI entry points defined below.
// Operands and results are passed as raw 32-bit patterns in `int`; every
// function that takes `fflags` writes it through the pointer.
// NOTE(review): the matching SystemVerilog imports declare `frm` as bit[2:0]
// and `fflags` as bit[4:0]; confirm that the simulator's DPI type mapping for
// packed bit vectors agrees with the `int` / `int*` parameter types used here.
extern "C" {
void dpi_fadd(int a, int b, int frm, int* result, int* fflags);
void dpi_fsub(int a, int b, int frm, int* result, int* fflags);
void dpi_fmul(int a, int b, int frm, int* result, int* fflags);
void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags);
void dpi_fdiv(int a, int b, int frm, int* result, int* fflags);
void dpi_fsqrt(int a, int frm, int* result, int* fflags);
void dpi_ftoi(int a, int frm, int* result, int* fflags);
void dpi_ftou(int a, int frm, int* result, int* fflags);
void dpi_itof(int a, int frm, int* result, int* fflags);
void dpi_utof(int a, int frm, int* result, int* fflags);
void dpi_fclss(int a, int* result);
void dpi_fsgnj(int a, int* result);
void dpi_fsgnjn(int a, int* result);
void dpi_fsgnjx(int a, int* result);
void dpi_flt(int a, int b, int* result, int* fflags);
void dpi_fle(int a, int b, int* result, int* fflags);
void dpi_feq(int a, int b, int* result, int* fflags);
void dpi_fmin(int a, int b, int* result, int* fflags);
void dpi_fmax(int a, int b, int* result, int* fflags);
}
// Overlay of a 32-bit value as a float, as a signed integer, and as three
// bit-fields (23-bit `man`, 8-bit `exp`, 1-bit `sign`).
// NOTE(review): bit-field allocation order is implementation-defined, so the
// `parts` view presumably targets a little-endian ABI where `man` occupies the
// low bits — confirm before relying on it.
union Float_t {
float f;
int i;
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
void dpi_fadd(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f + fb.f;
*result = fr.i;
*fflags = 0;
}
void dpi_fsub(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f - fb.f;
*result = fr.i;
*fflags = 0;
}
void dpi_fmul(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f * fb.f;
*result = fr.i;
*fflags = 0;
}
// Fused multiply-add: result = (a * b) + c with a single rounding.
//   a, b, c : IEEE-754 binary32 operands passed as raw 32-bit patterns
//   frm     : accepted but not used by this model
//   result  : receives the bit pattern of the fused result
//   fflags  : always written as 0 (exception flags are not modeled)
void dpi_fmadd(int a, int b, int c, int frm, int* result, int* fflags) {
  float fa, fb, fc;
  // memcpy is the portable way to reinterpret bits in C++17.
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  memcpy(&fc, &c, sizeof(fc));

  // fmaf rounds once; `fa * fb + fc` would round the product first unless
  // the compiler happens to contract it, which makes results build-dependent.
  const float fr = fmaf(fa, fb, fc);

  memcpy(result, &fr, sizeof(fr));
  *fflags = 0;
}
// Fused multiply-subtract: result = (a * b) - c with a single rounding.
//   a, b, c : IEEE-754 binary32 operands passed as raw 32-bit patterns
//   frm     : accepted but not used by this model
//   result  : receives the bit pattern of the fused result
//   fflags  : always written as 0 (exception flags are not modeled)
void dpi_fmsub(int a, int b, int c, int frm, int* result, int* fflags) {
  float fa, fb, fc;
  // memcpy is the portable way to reinterpret bits in C++17.
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  memcpy(&fc, &c, sizeof(fc));

  // Negating the addend is exact, so this is (a * b) - c rounded once.
  const float fr = fmaf(fa, fb, -fc);

  memcpy(result, &fr, sizeof(fr));
  *fflags = 0;
}
// Negated fused multiply-add: result = -(a * b) - c with a single rounding.
//   a, b, c : IEEE-754 binary32 operands passed as raw 32-bit patterns
//   frm     : accepted but not used by this model
//   result  : receives the bit pattern of the fused result
//   fflags  : always written as 0 (exception flags are not modeled)
void dpi_fnmadd(int a, int b, int c, int frm, int* result, int* fflags) {
  float fa, fb, fc;
  // memcpy is the portable way to reinterpret bits in C++17.
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  memcpy(&fc, &c, sizeof(fc));

  // Negate the product and the addend rather than the final sum: negating
  // afterwards flips the sign of an exact-zero result (+0 would become -0).
  const float fr = fmaf(-fa, fb, -fc);

  memcpy(result, &fr, sizeof(fr));
  *fflags = 0;
}
// Negated fused multiply-subtract: result = -(a * b) + c with a single rounding.
//   a, b, c : IEEE-754 binary32 operands passed as raw 32-bit patterns
//   frm     : accepted but not used by this model
//   result  : receives the bit pattern of the fused result
//   fflags  : always written as 0 (exception flags are not modeled)
void dpi_fnmsub(int a, int b, int c, int frm, int* result, int* fflags) {
  float fa, fb, fc;
  // memcpy is the portable way to reinterpret bits in C++17.
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  memcpy(&fc, &c, sizeof(fc));

  // Negate the product instead of the final difference so that an
  // exact-zero result keeps the sign the sum would naturally have.
  const float fr = fmaf(-fa, fb, fc);

  memcpy(result, &fr, sizeof(fr));
  *fflags = 0;
}
void dpi_fdiv(int a, int b, int frm, int* result, int* fflags) {
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f / fb.f;
*result = fr.i;
*fflags = 0;
}
void dpi_fsqrt(int a, int frm, int* result, int* fflags) {
Float_t fa, fr;
fa.i = a;
fr.f = sqrtf(fa.f);
*result = fr.i;
*fflags = 0;
}
// Float to signed 32-bit integer conversion (truncating toward zero).
//   a       : IEEE-754 binary32 operand as a raw bit pattern
//   frm     : accepted but not used by this model
//   result  : receives the converted integer
//   fflags  : always written as 0 (exception flags are not modeled)
// NaN and values >= 2^31 saturate to INT_MAX, values <= -2^31 to INT_MIN.
// A plain cast is undefined behaviour for those inputs in C++.
void dpi_ftoi(int a, int frm, int* result, int* fflags) {
  float fa;
  memcpy(&fa, &a, sizeof(fa));

  int converted;
  if (fa != fa || fa >= 2147483648.0f) {
    // NaN (the only value unequal to itself) or positive overflow
    converted = 2147483647;
  } else if (fa <= -2147483648.0f) {
    converted = -2147483647 - 1;
  } else {
    converted = static_cast<int>(fa);
  }

  *result = converted;
  *fflags = 0;
}
// Float to unsigned 32-bit integer conversion (truncating toward zero).
//   a       : IEEE-754 binary32 operand as a raw bit pattern
//   frm     : accepted but not used by this model
//   result  : receives the converted value's 32-bit pattern
//   fflags  : always written as 0 (exception flags are not modeled)
// NaN and values >= 2^32 saturate to 0xFFFFFFFF, negative values to 0.
// A plain cast is undefined behaviour for those inputs in C++.
void dpi_ftou(int a, int frm, int* result, int* fflags) {
  float fa;
  memcpy(&fa, &a, sizeof(fa));

  unsigned converted;
  if (fa != fa || fa >= 4294967296.0f) {
    // NaN (the only value unequal to itself) or positive overflow
    converted = 4294967295u;
  } else if (fa <= 0.0f) {
    converted = 0u;
  } else {
    converted = static_cast<unsigned>(fa);
  }

  // Copy the bits instead of converting unsigned -> int by value.
  memcpy(result, &converted, sizeof(converted));
  *fflags = 0;
}
// Signed 32-bit integer to float conversion.
//   a       : signed integer operand
//   frm     : accepted but not used by this model
//   result  : receives the bit pattern of (float)a
//   fflags  : always written as 0 (exception flags are not modeled)
void dpi_itof(int a, int frm, int* result, int* fflags) {
  const float fr = static_cast<float>(a);
  // memcpy is the portable way to reinterpret bits in C++17.
  memcpy(result, &fr, sizeof(fr));
  *fflags = 0;
}
// Unsigned 32-bit integer to float conversion.
//   a       : operand; its 32 bits are interpreted as an unsigned value
//   frm     : accepted but not used by this model
//   result  : receives the bit pattern of (float)(unsigned)a
//   fflags  : always written as 0 (exception flags are not modeled)
void dpi_utof(int a, int frm, int* result, int* fflags) {
  const unsigned ua = static_cast<unsigned>(a);
  const float fr = static_cast<float>(ua);
  // memcpy is the portable way to reinterpret bits in C++17.
  memcpy(result, &fr, sizeof(fr));
  *fflags = 0;
}
// Floating-point "less than" comparison.
//   a, b    : IEEE-754 binary32 operands as raw bit patterns
//   result  : receives integer 1 when a < b, otherwise 0
//   fflags  : always written as 0 (exception flags are not modeled)
// The outcome is an integer; storing the boolean through a float would
// return the bit pattern of 1.0f (0x3F800000) for "true".
void dpi_flt(int a, int b, int* result, int* fflags) {
  float fa, fb;
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  *result = (fa < fb) ? 1 : 0;
  *fflags = 0;
}
// Floating-point "less than or equal" comparison.
//   a, b    : IEEE-754 binary32 operands as raw bit patterns
//   result  : receives integer 1 when a <= b, otherwise 0
//   fflags  : always written as 0 (exception flags are not modeled)
// The outcome is an integer; storing the boolean through a float would
// return the bit pattern of 1.0f (0x3F800000) for "true".
void dpi_fle(int a, int b, int* result, int* fflags) {
  float fa, fb;
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  *result = (fa <= fb) ? 1 : 0;
  *fflags = 0;
}
// Floating-point equality comparison.
//   a, b    : IEEE-754 binary32 operands as raw bit patterns
//   result  : receives integer 1 when a == b, otherwise 0
//   fflags  : always written as 0 (exception flags are not modeled)
// The outcome is an integer; storing the boolean through a float would
// return the bit pattern of 1.0f (0x3F800000) for "true".
void dpi_feq(int a, int b, int* result, int* fflags) {
  float fa, fb;
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));
  *result = (fa == fb) ? 1 : 0;
  *fflags = 0;
}
// Floating-point minimum following the RISC-V FMIN.S rules.
//   a, b    : IEEE-754 binary32 operands as raw bit patterns
//   result  : receives the bit pattern of the smaller operand
//   fflags  : always written as 0 (exception flags are not modeled)
// If exactly one operand is NaN the other one is returned; if both are NaN
// the canonical quiet NaN is returned; -0 is treated as smaller than +0.
// std::min handles none of these cases (its NaN result depends on argument
// order) and needs <algorithm>, which this file does not include.
void dpi_fmin(int a, int b, int* result, int* fflags) {
  float fa, fb;
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));

  const bool a_is_nan = (fa != fa);
  const bool b_is_nan = (fb != fb);

  int selected;
  if (a_is_nan && b_is_nan) {
    selected = 0x7FC00000;          // canonical quiet NaN
  } else if (a_is_nan) {
    selected = b;
  } else if (b_is_nan) {
    selected = a;
  } else if (fa == 0.0f && fb == 0.0f) {
    selected = a | b;               // keeps the sign bit if either zero is -0
  } else {
    selected = (fb < fa) ? b : a;
  }

  *result = selected;
  *fflags = 0;
}
// Floating-point maximum following the RISC-V FMAX.S rules.
//   a, b    : IEEE-754 binary32 operands as raw bit patterns
//   result  : receives the bit pattern of the larger operand
//   fflags  : always written as 0 (exception flags are not modeled)
// If exactly one operand is NaN the other one is returned; if both are NaN
// the canonical quiet NaN is returned; +0 is treated as larger than -0.
// std::max handles none of these cases (its NaN result depends on argument
// order) and needs <algorithm>, which this file does not include.
void dpi_fmax(int a, int b, int* result, int* fflags) {
  float fa, fb;
  memcpy(&fa, &a, sizeof(fa));
  memcpy(&fb, &b, sizeof(fb));

  const bool a_is_nan = (fa != fa);
  const bool b_is_nan = (fb != fb);

  int selected;
  if (a_is_nan && b_is_nan) {
    selected = 0x7FC00000;          // canonical quiet NaN
  } else if (a_is_nan) {
    selected = b;
  } else if (b_is_nan) {
    selected = a;
  } else if (fa == 0.0f && fb == 0.0f) {
    selected = a & b;               // sign bit survives only if both are -0
  } else {
    selected = (fa < fb) ? b : a;
  }

  *result = selected;
  *fflags = 0;
}
// Classifies a binary32 value using the RISC-V FCLASS.S encoding.
//   a       : IEEE-754 binary32 operand as a raw bit pattern
//   result  : receives a mask with exactly one of these bits set:
//               bit 0  -infinity            bit 5  positive subnormal
//               bit 1  negative normal      bit 6  positive normal
//               bit 2  negative subnormal   bit 7  +infinity
//               bit 3  -0                   bit 8  signaling NaN
//               bit 4  +0                   bit 9  quiet NaN
void dpi_fclss(int a, int* result) {
  const unsigned bits     = static_cast<unsigned>(a);
  const bool     negative = (bits >> 31) != 0;
  const unsigned exponent = (bits >> 23) & 0xFFu;
  const unsigned mantissa = bits & 0x7FFFFFu;

  int mask;
  if (exponent == 0xFFu) {
    if (mantissa == 0) {
      mask = negative ? (1 << 0) : (1 << 7);
    } else {
      // The top mantissa bit distinguishes quiet from signaling NaNs.
      mask = (mantissa & 0x400000u) ? (1 << 9) : (1 << 8);
    }
  } else if (exponent == 0) {
    if (mantissa == 0) {
      mask = negative ? (1 << 3) : (1 << 4);
    } else {
      mask = negative ? (1 << 2) : (1 << 5);
    }
  } else {
    mask = negative ? (1 << 1) : (1 << 6);
  }

  *result = mask;
}
// Placeholder for sign injection: not implemented yet, always reports 0.
// NOTE(review): only one operand is passed in, so the sign source is not
// available through this interface — confirm the intended signature.
void dpi_fsgnj(int a, int* result) {
  // TODO
  result[0] = 0;
}
// Placeholder for negated sign injection: not implemented yet, always reports 0.
// NOTE(review): only one operand is passed in, so the sign source is not
// available through this interface — confirm the intended signature.
void dpi_fsgnjn(int a, int* result) {
  // TODO
  result[0] = 0;
}
// Placeholder for XOR sign injection: not implemented yet, always reports 0.
// NOTE(review): only one operand is passed in, so the sign source is not
// available through this interface — confirm the intended signature.
void dpi_fsgnjx(int a, int* result) {
  // TODO
  result[0] = 0;
}

31
hw/dpi/float_dpi.vh Normal file
View file

@ -0,0 +1,31 @@
// DPI-C import declarations for the floating-point helper functions.
// Operands and results are 32-bit `int` values; arithmetic and conversion
// functions additionally take a 3-bit `frm` input and return a 5-bit `fflags`
// output, while the comparison/min/max functions return `fflags` without `frm`.
// NOTE(review): `frm` and `fflags` are packed bit vectors here; confirm that
// the C-side parameter types match the simulator's DPI mapping for them.
`ifndef FLOAT_DPI
`define FLOAT_DPI
import "DPI-C" context function void dpi_fadd(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fsub(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmul(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fnmadd(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fnmsub(input int a, input int b, input int c, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fdiv(input int a, input int b, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fsqrt(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_ftoi(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_ftou(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_itof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_utof(input int a, input bit[2:0] frm, output int result, output bit[4:0] fflags);
// Classification and sign-injection helpers: single operand, no flags.
import "DPI-C" context function void dpi_fclss(input int a, output int result);
import "DPI-C" context function void dpi_fsgnj(input int a, output int result);
import "DPI-C" context function void dpi_fsgnjn(input int a, output int result);
import "DPI-C" context function void dpi_fsgnjx(input int a, output int result);
// Comparisons and min/max: two operands, flags but no rounding mode.
import "DPI-C" context function void dpi_flt(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fle(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_feq(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmin(input int a, input int b, output int result, output bit[4:0] fflags);
import "DPI-C" context function void dpi_fmax(input int a, input int b, output int result, output bit[4:0] fflags);
`endif

84
hw/dpi/util_dpi.cpp Normal file
View file

@ -0,0 +1,84 @@
#include <stdio.h>
#include <math.h>
#include <unordered_map>
#include <vector>
#include <mutex>
#include <iostream>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
// C-linkage prototypes for the utility DPI entry points defined below:
// dpi_register hands out an instance handle, dpi_assert consumes one.
extern "C" {
int dpi_register();
void dpi_assert(int inst, bool cond, int delay);
}
// Fixed-depth shift register of ints used to delay a value by a number of
// pushes. New values enter at the back; top() reads the oldest slot, so a
// pushed value becomes visible at top() on the depth-th push.
class ShiftRegister {
public:
  ShiftRegister() : init_(false), depth_(0) {}

  // Sizes the register on the first call only; later calls are ignored.
  // A non-positive depth is clamped to 1 so the register always has a slot:
  // a depth of 0 would make `depth_ - 1` wrap around in push(), and a
  // negative depth would turn into an enormous allocation request.
  void ensure_init(int depth) {
    if (init_)
      return;
    depth_ = (depth > 0) ? static_cast<unsigned>(depth) : 1u;
    buffer_.assign(depth_, 0);
    init_ = true;
  }

  // Shifts every slot one position toward the front and stores `value` in
  // the last slot. Does nothing when `enable` is false or before
  // ensure_init() has sized the buffer.
  void push(int value, bool enable) {
    if (!enable || depth_ == 0)
      return;
    for (unsigned i = 1; i < depth_; ++i) {
      buffer_[i - 1] = buffer_[i];
    }
    buffer_[depth_ - 1] = value;
  }

  // Returns the oldest slot, or 0 while the register is still unsized.
  int top() const {
    return buffer_.empty() ? 0 : buffer_[0];
  }

private:
  std::vector<int> buffer_;
  bool init_;
  unsigned depth_;
};
class Instances {
public:
ShiftRegister& get(int inst) {
return instances_.at(inst);
}
int allocate() {
mutex_.lock();
int inst = instances_.size();
instances_.resize(inst + 1);
mutex_.unlock();
return inst;
}
private:
std::vector<ShiftRegister> instances_;
std::mutex mutex_;
};
Instances instances;
// DPI entry point: reserves a new slot in the global `instances` registry
// and returns its handle.
int dpi_register() {
  const int handle = instances.allocate();
  return handle;
}
// DPI entry point for a delayed assertion.
//   inst  : handle previously returned by dpi_register
//   cond  : condition sampled on this call
//   delay : depth of the delay line, applied on the first call for `inst`
// Each call pushes the failure status (!cond) into the instance's shift
// register and checks the oldest entry; a recorded failure that has reached
// the front prints the calling scope's name and aborts the process.
void dpi_assert(int inst, bool cond, int delay) {
  ShiftRegister& sr = instances.get(inst);
  sr.ensure_init(delay);
  sr.push(!cond, true);
  if (sr.top()) {
    printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope()));
    // abort() does not flush stdio buffers, so without this the message can
    // be lost when stdout is redirected to a file or pipe.
    fflush(stdout);
    std::abort();
  }
}

7
hw/dpi/util_dpi.vh Normal file
View file

@ -0,0 +1,7 @@
// DPI-C import declarations for the simulation utility functions.
`ifndef UTIL_DPI
`define UTIL_DPI
// Returns an integer handle to pass as `inst` to dpi_assert.
import "DPI-C" context function int dpi_register();
// Delayed assertion on `cond` for the instance identified by `inst`.
import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
`endif

View file

@ -40,7 +40,8 @@ module VX_cluster #(
// Status
output wire busy,
output wire ebreak
);
);
wire [`NUM_CORES-1:0] per_core_dram_req_valid;
wire [`NUM_CORES-1:0] per_core_dram_req_rw;
wire [`NUM_CORES-1:0][`DDRAM_BYTEEN_WIDTH-1:0] per_core_dram_req_byteen;
@ -70,15 +71,13 @@ module VX_cluster #(
for (genvar i = 0; i < `NUM_CORES; i++) begin
wire core_reset;
if (`NUM_CORES > 1) begin
reg core_reset_r;
always @(posedge clk) begin
core_reset_r <= reset;
end
assign core_reset = core_reset_r;
end else begin
assign core_reset = reset;
end
VX_reset_relay #(
.PASSTHRU (`NUM_CORES == 1)
) reset_relay (
.clk (clk),
.reset (reset),
.reset_out (core_reset)
);
VX_core #(
.CORE_ID(i + (CLUSTER_ID * `NUM_CORES))

View file

@ -92,8 +92,8 @@
`define LATENCY_IMUL 3
`endif
`ifndef LATENCY_FNCOMP
`define LATENCY_FNCOMP 2
`ifndef LATENCY_FNCP
`define LATENCY_FNCP 2
`endif
`ifndef LATENCY_FMA
@ -128,8 +128,8 @@
`define LATENCY_FDIVSQRT 32
`endif
`ifndef LATENCY_FCONV
`define LATENCY_FCONV 4
`ifndef LATENCY_FCVT
`define LATENCY_FCVT 4
`endif
// CSR Addresses //////////////////////////////////////////////////////////////

View file

@ -63,11 +63,11 @@ module VX_fpu_unit #(
// resolve dynamic FRM from CSR
assign fpu_to_csr_if.read_wid = fpu_req_if.wid;
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
wire [`FRM_BITS-1:0] fpu_frm = (fpu_req_if.op_mod == `FRM_DYN) ? fpu_to_csr_if.read_frm : fpu_req_if.op_mod;
`ifdef FPU_FAST
VX_fp_fpga #(
VX_fp_dpi #(
.TAGW (FPUQ_BITS)
) fp_core (
.clk (clk),
@ -91,21 +91,51 @@ module VX_fpu_unit #(
.tag_out (tag_out),
.ready_out (ready_out),
.valid_out (valid_out)
);
`elsif FPU_FPNEW
VX_fpnew #(
.FMULADD (1),
.FDIVSQRT (1),
.FNONCOMP (1),
.FCONV (1),
.TAGW (FPUQ_BITS)
) fp_core (
.clk (clk),
.reset (reset),
.valid_in (valid_in),
.ready_in (ready_in),
.tag_in (tag_in),
.op_type (fpu_req_if.op_type),
.frm (fpu_frm),
.dataa (fpu_req_if.rs1_data),
.datab (fpu_req_if.rs2_data),
.datac (fpu_req_if.rs3_data),
.result (result),
.has_fflags (has_fflags),
.fflags (fflags),
.tag_out (tag_out),
.ready_out (ready_out),
.valid_out (valid_out)
);
`else
VX_fpnew #(
.FMULADD (1),
.FDIVSQRT (1),
.FNONCOMP (1),
.FCONV (1),
.TAGW (FPUQ_BITS)
VX_fp_fpga #(
.TAGW (FPUQ_BITS)
) fp_core (
.clk (clk),
.reset (reset),
.reset (fpu_reset),
.valid_in (valid_in),
.ready_in (ready_in),

View file

@ -64,7 +64,7 @@ module VX_lsu_unit #(
assign mem_req_addr[i] = full_address[i][31:2];
assign mem_req_offset[i] = full_address[i][1:0];
assign mem_req_byteen[i] = wmask << full_address[i][1:0];
assign mem_req_data[i] = lsu_req_if.store_data[i] << {mem_req_offset[i], 3'b0};
assign mem_req_data[i] = lsu_req_if.store_data[i] << {full_address[i][1:0], 3'b0};
end
`IGNORE_WARNINGS_BEGIN

View file

@ -79,7 +79,17 @@ module VX_mem_unit # (
.cache_rsp_if (dcache_rsp_if),
.smem_rsp_if (smem_rsp_if),
.core_rsp_if (core_dcache_rsp_if)
);
);
wire icache_reset, dcache_reset;
VX_reset_relay #(
.NUM_NODES (2)
) reset_relay (
.clk (clk),
.reset (reset),
.reset_out ({dcache_reset, icache_reset})
);
VX_cache #(
.CACHE_ID (`ICACHE_ID),
@ -102,7 +112,7 @@ module VX_mem_unit # (
`SCOPE_BIND_VX_mem_unit_icache
.clk (clk),
.reset (reset),
.reset (icache_reset),
// Core request
.core_req_valid (core_icache_req_if.valid),
@ -160,7 +170,7 @@ module VX_mem_unit # (
`SCOPE_BIND_VX_mem_unit_dcache
.clk (clk),
.reset (reset),
.reset (dcache_reset),
// Core req
.core_req_valid (dcache_req_if.valid),
@ -199,6 +209,14 @@ module VX_mem_unit # (
if (`SM_ENABLE) begin
wire scache_reset;
VX_reset_relay reset_relay (
.clk (clk),
.reset (reset),
.reset_out (scache_reset)
);
VX_cache #(
.CACHE_ID (`SCACHE_ID),
.CACHE_SIZE (`SMEM_SIZE),
@ -220,7 +238,7 @@ module VX_mem_unit # (
`SCOPE_BIND_VX_mem_unit_smem
.clk (clk),
.reset (reset),
.reset (scache_reset),
// Core request
.core_req_valid (smem_req_if.valid),

View file

@ -72,15 +72,13 @@ module Vortex (
for (genvar i = 0; i < `NUM_CLUSTERS; i++) begin
wire cluster_reset;
if (`NUM_CLUSTERS > 1) begin
reg cluster_reset_r;
always @(posedge clk) begin
cluster_reset_r <= reset;
end
assign cluster_reset = cluster_reset_r;
end else begin
assign cluster_reset = reset;
end
VX_reset_relay #(
.PASSTHRU (`NUM_CLUSTERS == 1)
) reset_relay (
.clk (clk),
.reset (reset),
.reset_out (cluster_reset)
);
VX_cluster #(
.CLUSTER_ID(i)

View file

@ -37,7 +37,7 @@ module vortex_afu #(
output logic [$clog2(NUM_LOCAL_MEM_BANKS)-1:0] mem_bank_select
);
localparam RESET_DELAY = 2;
localparam RESET_DELAY = 3;
localparam DRAM_ADDR_WIDTH = $bits(t_local_mem_addr);
localparam DRAM_LINE_WIDTH = $bits(t_local_mem_data);

View file

@ -1,9 +1,5 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
module VX_fp_div #(
parameter TAGW = 1,
parameter LANES = 1
@ -32,26 +28,24 @@ module VX_fp_div #(
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
wire _reset;
VX_reset_relay reset_relay (
.clk (clk),
.reset (reset),
.reset_out (_reset)
);
for (genvar i = 0; i < LANES; i++) begin
`ifdef QUARTUS
acl_fdiv fdiv (
.clk (clk),
.areset (reset),
.areset (_reset),
.en (enable),
.a (dataa[i]),
.b (datab[i]),
.q (result[i])
);
`else
integer fdiv_h;
initial begin
fdiv_h = dpi_register();
end
always @(posedge clk) begin
dpi_fdiv (fdiv_h, enable, dataa[i], datab[i], `LATENCY_FDIV, result[i]);
end
`endif
end
VX_shift_register #(

415
hw/rtl/fp_cores/VX_fp_dpi.v Normal file
View file

@ -0,0 +1,415 @@
`ifndef SYNTHESIS
`include "VX_define.vh"
`include "float_dpi.vh"
module VX_fp_dpi #(
parameter TAGW = 1
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
input wire [TAGW-1:0] tag_in,
input wire [`FPU_BITS-1:0] op_type,
input wire [`MOD_BITS-1:0] frm,
input wire [`NUM_THREADS-1:0][31:0] dataa,
input wire [`NUM_THREADS-1:0][31:0] datab,
input wire [`NUM_THREADS-1:0][31:0] datac,
output wire [`NUM_THREADS-1:0][31:0] result,
output wire has_fflags,
output fflags_t [`NUM_THREADS-1:0] fflags,
output wire [TAGW-1:0] tag_out,
input wire ready_out,
output wire valid_out
);
localparam FPU_FMA = 0;
localparam FPU_DIV = 1;
localparam FPU_SQRT = 2;
localparam FPU_CVT = 3;
localparam FPU_NCP = 4;
localparam NUM_FPC = 5;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
wire [NUM_FPC-1:0] per_core_ready_in;
wire [NUM_FPC-1:0][`NUM_THREADS-1:0][31:0] per_core_result;
wire [NUM_FPC-1:0][TAGW-1:0] per_core_tag_out;
reg [NUM_FPC-1:0] per_core_ready_out;
wire [NUM_FPC-1:0] per_core_valid_out;
wire [NUM_FPC-1:0] per_core_has_fflags;
fflags_t [NUM_FPC-1:0][`NUM_THREADS-1:0] per_core_fflags;
reg [FPC_BITS-1:0] core_select;
reg is_fadd, is_fsub, is_fmul, is_fmadd, is_fmsub, is_fnmadd, is_fnmsub;
reg is_itof, is_utof, is_ftoi, is_ftou;
reg is_fclss, is_flt, is_fle, is_feq, is_fmin, is_fmax, is_fsgnj, is_fsgnjn, is_fsgnjx;
// Combinational decode of op_type (and frm for the compare / default groups)
// into the target sub-unit index `core_select` and per-operation is_* flags.
// Every is_* flag is cleared first, and each case arm then sets only the flag
// or flags for its own operation.
always @(*) begin
is_fadd = 0;
is_fsub = 0;
is_fmul = 0;
is_fmadd = 0;
is_fmsub = 0;
is_fnmadd = 0;
is_fnmsub = 0;
is_itof = 0;
is_utof = 0;
is_ftoi = 0;
is_ftou = 0;
is_fclss = 0;
is_flt = 0;
is_fle = 0;
is_feq = 0;
is_fmin = 0;
is_fmax = 0;
is_fsgnj = 0;
is_fsgnjn = 0;
is_fsgnjx = 0;
// core_select is assigned in every arm, including the default one.
case (op_type)
`FPU_ADD: begin core_select = FPU_FMA; is_fadd = 1; end
`FPU_SUB: begin core_select = FPU_FMA; is_fsub = 1; end
`FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
`FPU_MADD: begin core_select = FPU_FMA; is_fmadd = 1; end
`FPU_MSUB: begin core_select = FPU_FMA; is_fmsub = 1; end
`FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = 1; end
`FPU_NMSUB: begin core_select = FPU_FMA; is_fnmsub = 1; end
// Divide and square root have a dedicated sub-unit each and need no flag.
`FPU_DIV: begin core_select = FPU_DIV; end
`FPU_SQRT: begin core_select = FPU_SQRT; end
`FPU_CVTWS: begin core_select = FPU_CVT; is_ftoi = 1; end
`FPU_CVTWUS:begin core_select = FPU_CVT; is_ftou = 1; end
`FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; end
`FPU_CVTSWU:begin core_select = FPU_CVT; is_utof = 1; end
`FPU_CLASS: begin core_select = FPU_NCP; is_fclss = 1; end
// For compares, frm carries the sub-operation: 0 = fle, 1 = flt, 2 = feq.
`FPU_CMP: begin core_select = FPU_NCP;
is_fle = (frm == 0);
is_flt = (frm == 1);
is_feq = (frm == 2);
end
// Every remaining op_type goes to the non-computational unit, where frm
// selects sign injection (0..2) or min/max (3, 4). Any other frm value
// leaves all flags at 0.
default: begin core_select = FPU_NCP;
is_fsgnj = (frm == 0);
is_fsgnjn = (frm == 1);
is_fsgnjx = (frm == 2);
is_fmin = (frm == 3);
is_fmax = (frm == 4);
end
endcase
end
// FMA sub-unit: evaluates every add/sub/mul/fused variant through DPI for each
// thread, selects the one named by the is_* decode flags, and delays the
// selected result together with the valid bit and tag by `LATENCY_FMA stages.
generate
begin : fma
wire [`NUM_THREADS-1:0][31:0] result_fma;
wire [`NUM_THREADS-1:0][31:0] result_fadd;
wire [`NUM_THREADS-1:0][31:0] result_fsub;
wire [`NUM_THREADS-1:0][31:0] result_fmul;
wire [`NUM_THREADS-1:0][31:0] result_fmadd;
wire [`NUM_THREADS-1:0][31:0] result_fmsub;
wire [`NUM_THREADS-1:0][31:0] result_fnmadd;
wire [`NUM_THREADS-1:0][31:0] result_fnmsub;
fflags_t [`NUM_THREADS-1:0] fflags_fma;
fflags_t [`NUM_THREADS-1:0] fflags_fadd;
fflags_t [`NUM_THREADS-1:0] fflags_fsub;
fflags_t [`NUM_THREADS-1:0] fflags_fmul;
fflags_t [`NUM_THREADS-1:0] fflags_fmadd;
fflags_t [`NUM_THREADS-1:0] fflags_fmsub;
fflags_t [`NUM_THREADS-1:0] fflags_fnmadd;
fflags_t [`NUM_THREADS-1:0] fflags_fnmsub;
// NOTE(review): the result_* signals are declared as wires but are written
// here as DPI output arguments inside a procedural block — confirm the
// target tools accept this.
always @(*) begin
for (integer i = 0; i < `NUM_THREADS; i++) begin
dpi_fadd (dataa[i], datab[i], frm, result_fadd[i], fflags_fadd[i]);
dpi_fsub (dataa[i], datab[i], frm, result_fsub[i], fflags_fsub[i]);
dpi_fmul (dataa[i], datab[i], frm, result_fmul[i], fflags_fmul[i]);
dpi_fmadd (dataa[i], datab[i], datac[i], frm, result_fmadd[i], fflags_fmadd[i]);
dpi_fmsub (dataa[i], datab[i], datac[i], frm, result_fmsub[i], fflags_fmsub[i]);
dpi_fnmadd (dataa[i], datab[i], datac[i], frm, result_fnmadd[i], fflags_fnmadd[i]);
dpi_fnmsub (dataa[i], datab[i], datac[i], frm, result_fnmsub[i], fflags_fnmsub[i]);
end
end
// Priority mux over the decode flags; yields 0 when none is set.
assign result_fma = is_fadd ? result_fadd :
is_fsub ? result_fsub :
is_fmul ? result_fmul :
is_fmadd ? result_fmadd :
is_fmsub ? result_fmsub :
is_fnmadd ? result_fnmadd :
is_fnmsub ? result_fnmsub :
0;
assign fflags_fma = is_fadd ? fflags_fadd :
is_fsub ? fflags_fsub :
is_fmul ? fflags_fmul :
is_fmadd ? fflags_fmadd :
is_fmsub ? fflags_fmsub :
is_fnmadd ? fflags_fnmadd :
is_fnmsub ? fflags_fnmsub :
0;
// The pipeline advances when its output is being accepted or holds nothing valid.
wire enable = per_core_ready_out[FPU_FMA] || ~per_core_valid_out[FPU_FMA];
// A request enters this pipeline only when the decoder selected the FMA unit.
wire valid = (valid_in && core_select == FPU_FMA);
VX_shift_register #(
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
.DEPTH (`LATENCY_FMA),
.RESETW (1)
) shift_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({valid, tag_in, result_fma, fflags_fma}),
.data_out ({per_core_valid_out[FPU_FMA], per_core_tag_out[FPU_FMA], per_core_result[FPU_FMA], per_core_fflags[FPU_FMA]})
);
// This unit always reports flags, and accepts input whenever the pipeline can advance.
assign per_core_has_fflags[FPU_FMA] = 1;
assign per_core_ready_in[FPU_FMA] = enable;
end
endgenerate
// Divide sub-unit: computes dataa / datab per thread through dpi_fdiv and
// delays the result, valid bit and tag by `LATENCY_FDIV stages.
generate
begin : fdiv
wire [`NUM_THREADS-1:0][31:0] result_fdiv;
fflags_t [`NUM_THREADS-1:0] fflags_fdiv;
always @(*) begin
for (integer i = 0; i < `NUM_THREADS; i++) begin
dpi_fdiv (dataa[i], datab[i], frm, result_fdiv[i], fflags_fdiv[i]);
end
end
// The pipeline advances when its output is being accepted or holds nothing valid.
wire enable = per_core_ready_out[FPU_DIV] || ~per_core_valid_out[FPU_DIV];
// A request enters this pipeline only when the decoder selected the divide unit.
wire valid = (valid_in && core_select == FPU_DIV);
VX_shift_register #(
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
.DEPTH (`LATENCY_FDIV),
.RESETW (1)
) shift_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({valid, tag_in, result_fdiv, fflags_fdiv}),
.data_out ({per_core_valid_out[FPU_DIV], per_core_tag_out[FPU_DIV], per_core_result[FPU_DIV], per_core_fflags[FPU_DIV]})
);
// This unit always reports flags, and accepts input whenever the pipeline can advance.
assign per_core_has_fflags[FPU_DIV] = 1;
assign per_core_ready_in[FPU_DIV] = enable;
end
endgenerate
// Square-root sub-unit: computes sqrt(dataa) per thread through dpi_fsqrt and
// delays the result, valid bit and tag by `LATENCY_FSQRT stages.
generate
begin : fsqrt
wire [`NUM_THREADS-1:0][31:0] result_fsqrt;
fflags_t [`NUM_THREADS-1:0] fflags_fsqrt;
always @(*) begin
for (integer i = 0; i < `NUM_THREADS; i++) begin
dpi_fsqrt (dataa[i], frm, result_fsqrt[i], fflags_fsqrt[i]);
end
end
// The pipeline advances when its output is being accepted or holds nothing valid.
wire enable = per_core_ready_out[FPU_SQRT] || ~per_core_valid_out[FPU_SQRT];
// A request enters this pipeline only when the decoder selected the sqrt unit.
wire valid = (valid_in && core_select == FPU_SQRT);
VX_shift_register #(
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
.DEPTH (`LATENCY_FSQRT),
.RESETW (1)
) shift_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({valid, tag_in, result_fsqrt, fflags_fsqrt}),
.data_out ({per_core_valid_out[FPU_SQRT], per_core_tag_out[FPU_SQRT], per_core_result[FPU_SQRT], per_core_fflags[FPU_SQRT]})
);
// This unit always reports flags, and accepts input whenever the pipeline can advance.
assign per_core_has_fflags[FPU_SQRT] = 1;
assign per_core_ready_in[FPU_SQRT] = enable;
end
endgenerate
// Conversion sub-unit: evaluates all four int<->float conversions of dataa
// through DPI for each thread, selects the one named by the decode flags, and
// delays the result, valid bit and tag by `LATENCY_FCVT stages.
generate
begin : fcvt
wire [`NUM_THREADS-1:0][31:0] result_fcvt;
wire [`NUM_THREADS-1:0][31:0] result_itof;
wire [`NUM_THREADS-1:0][31:0] result_utof;
wire [`NUM_THREADS-1:0][31:0] result_ftoi;
wire [`NUM_THREADS-1:0][31:0] result_ftou;
fflags_t [`NUM_THREADS-1:0] fflags_fcvt;
fflags_t [`NUM_THREADS-1:0] fflags_itof;
fflags_t [`NUM_THREADS-1:0] fflags_utof;
fflags_t [`NUM_THREADS-1:0] fflags_ftoi;
fflags_t [`NUM_THREADS-1:0] fflags_ftou;
always @(*) begin
for (integer i = 0; i < `NUM_THREADS; i++) begin
dpi_itof (dataa[i], frm, result_itof[i], fflags_itof[i]);
dpi_utof (dataa[i], frm, result_utof[i], fflags_utof[i]);
dpi_ftoi (dataa[i], frm, result_ftoi[i], fflags_ftoi[i]);
dpi_ftou (dataa[i], frm, result_ftou[i], fflags_ftou[i]);
end
end
// Priority mux over the decode flags; yields 0 when none is set.
assign result_fcvt = is_itof ? result_itof :
is_utof ? result_utof :
is_ftoi ? result_ftoi :
is_ftou ? result_ftou :
0;
assign fflags_fcvt = is_itof ? fflags_itof :
is_utof ? fflags_utof :
is_ftoi ? fflags_ftoi :
is_ftou ? fflags_ftou :
0;
// The pipeline advances when its output is being accepted or holds nothing valid.
wire enable = per_core_ready_out[FPU_CVT] || ~per_core_valid_out[FPU_CVT];
// A request enters this pipeline only when the decoder selected the convert unit.
wire valid = (valid_in && core_select == FPU_CVT);
VX_shift_register #(
.DATAW (1 + TAGW + `NUM_THREADS * (32 + $bits(fflags_t))),
.DEPTH (`LATENCY_FCVT),
.RESETW (1)
) shift_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({valid, tag_in, result_fcvt, fflags_fcvt}),
.data_out ({per_core_valid_out[FPU_CVT], per_core_tag_out[FPU_CVT], per_core_result[FPU_CVT], per_core_fflags[FPU_CVT]})
);
// This unit always reports flags, and accepts input whenever the pipeline can advance.
assign per_core_has_fflags[FPU_CVT] = 1;
assign per_core_ready_in[FPU_CVT] = enable;
end
endgenerate
// Non-computational sub-unit: classify, compare, min/max, sign injection and
// plain move. All candidates are evaluated through DPI for each thread, one is
// selected by the decode flags, and the result is delayed by `LATENCY_FNCP
// stages. Unlike the other sub-units, has_fflags travels through the pipeline
// because only the compare and min/max operations report flags.
generate
begin : fncp
wire [`NUM_THREADS-1:0][31:0] result_fncp;
wire [`NUM_THREADS-1:0][31:0] result_fclss;
wire [`NUM_THREADS-1:0][31:0] result_flt;
wire [`NUM_THREADS-1:0][31:0] result_fle;
wire [`NUM_THREADS-1:0][31:0] result_feq;
wire [`NUM_THREADS-1:0][31:0] result_fmin;
wire [`NUM_THREADS-1:0][31:0] result_fmax;
wire [`NUM_THREADS-1:0][31:0] result_fsgnj;
wire [`NUM_THREADS-1:0][31:0] result_fsgnjn;
wire [`NUM_THREADS-1:0][31:0] result_fsgnjx;
reg [`NUM_THREADS-1:0][31:0] result_fmv;
fflags_t [`NUM_THREADS-1:0] fflags_fncp;
fflags_t [`NUM_THREADS-1:0] fflags_flt;
fflags_t [`NUM_THREADS-1:0] fflags_fle;
fflags_t [`NUM_THREADS-1:0] fflags_feq;
fflags_t [`NUM_THREADS-1:0] fflags_fmin;
fflags_t [`NUM_THREADS-1:0] fflags_fmax;
always @(*) begin
for (integer i = 0; i < `NUM_THREADS; i++) begin
dpi_fclss (dataa[i], result_fclss[i]);
dpi_flt (dataa[i], datab[i], result_flt[i], fflags_flt[i]);
dpi_fle (dataa[i], datab[i], result_fle[i], fflags_fle[i]);
dpi_feq (dataa[i], datab[i], result_feq[i], fflags_feq[i]);
dpi_fmin (dataa[i], datab[i], result_fmin[i], fflags_fmin[i]);
dpi_fmax (dataa[i], datab[i], result_fmax[i], fflags_fmax[i]);
// The sign-injection imports take only dataa; datab is not passed to them.
dpi_fsgnj (dataa[i], result_fsgnj[i]);
dpi_fsgnjn (dataa[i], result_fsgnjn[i]);
dpi_fsgnjx (dataa[i], result_fsgnjx[i]);
// Move is a straight copy of the first operand.
result_fmv[i] = dataa[i];
end
end
// Priority mux over the decode flags; falls back to the move result when
// no flag is set.
assign result_fncp = is_fclss ? result_fclss :
is_flt ? result_flt :
is_fle ? result_fle :
is_feq ? result_feq :
is_fmin ? result_fmin :
is_fmax ? result_fmax :
is_fsgnj ? result_fsgnj :
is_fsgnjn ? result_fsgnjn :
is_fsgnjx ? result_fsgnjx :
result_fmv;
// Only the comparisons and min/max carry exception flags.
wire has_fflags_fncp = (is_flt || is_fle || is_feq || is_fmin || is_fmax);
assign fflags_fncp = is_flt ? fflags_flt :
is_fle ? fflags_fle :
is_feq ? fflags_feq :
is_fmin ? fflags_fmin :
is_fmax ? fflags_fmax :
0;
// The pipeline advances when its output is being accepted or holds nothing valid.
wire enable = per_core_ready_out[FPU_NCP] || ~per_core_valid_out[FPU_NCP];
// A request enters this pipeline only when the decoder selected this unit.
wire valid = (valid_in && core_select == FPU_NCP);
// DATAW has one extra bit compared with the other sub-units for has_fflags.
VX_shift_register #(
.DATAW (1 + TAGW + 1 + `NUM_THREADS * (32 + $bits(fflags_t))),
.DEPTH (`LATENCY_FNCP),
.RESETW (1)
) shift_reg (
.clk (clk),
.reset (reset),
.enable (enable),
.data_in ({valid, tag_in, has_fflags_fncp, result_fncp, fflags_fncp}),
.data_out ({per_core_valid_out[FPU_NCP], per_core_tag_out[FPU_NCP], per_core_has_fflags[FPU_NCP], per_core_result[FPU_NCP], per_core_fflags[FPU_NCP]})
);
assign per_core_ready_in[FPU_NCP] = enable;
end
endgenerate
///////////////////////////////////////////////////////////////////////////
// Output arbitration: forwards the response of the lowest-indexed sub-unit
// that currently has a valid output and passes ready_out back to that
// sub-unit only.
reg has_fflags_n;
fflags_t [`NUM_THREADS-1:0] fflags_n;
reg [`NUM_THREADS-1:0][31:0] result_n;
reg [TAGW-1:0] tag_out_n;
always @(*) begin
// Defaults: no sub-unit is acknowledged, and the payload is 'x when
// nothing is valid.
per_core_ready_out = 0;
has_fflags_n = 'x;
fflags_n = 'x;
result_n = 'x;
tag_out_n = 'x;
for (integer i = 0; i < NUM_FPC; i++) begin
if (per_core_valid_out[i]) begin
has_fflags_n = per_core_has_fflags[i];
fflags_n = per_core_fflags[i];
result_n = per_core_result[i];
tag_out_n = per_core_tag_out[i];
per_core_ready_out[i] = ready_out;
// Stop at the first valid sub-unit, so lower indices have priority.
break;
end
end
end
// valid_out is raised when any sub-unit has a result pending.
assign valid_out = (| per_core_valid_out);
assign has_fflags = has_fflags_n;
assign tag_out = tag_out_n;
assign result = result_n;
assign fflags = fflags_n;
// Input back-pressure comes from the sub-unit the current op_type decodes to.
assign ready_in = per_core_ready_in[core_select];
endmodule
`endif

View file

@ -1,9 +1,5 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
module VX_fp_fma #(
parameter TAGW = 1,
parameter LANES = 1
@ -63,7 +59,6 @@ module VX_fp_fma #(
end
end
`ifdef QUARTUS
acl_fmadd fmadd (
.clk (clk),
.areset (reset),
@ -73,15 +68,6 @@ module VX_fp_fma #(
.c (c),
.q (result[i])
);
`else
integer fmadd_h;
initial begin
fmadd_h = dpi_register();
end
always @(posedge clk) begin
dpi_fmadd (fmadd_h, enable, a, b, c, `LATENCY_FMA, result[i]);
end
`endif
end
VX_shift_register #(

View file

@ -27,6 +27,11 @@ module VX_fp_fpga #(
input wire ready_out,
output wire valid_out
);
localparam FPU_FMA = 0;
localparam FPU_DIV = 1;
localparam FPU_SQRT = 2;
localparam FPU_CVT = 3;
localparam FPU_NCP = 4;
localparam NUM_FPC = 5;
localparam FPC_BITS = `LOG2UP(NUM_FPC);
@ -49,20 +54,20 @@ module VX_fp_fpga #(
is_itof = 'x;
is_signed = 'x;
case (op_type)
`FPU_ADD: begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 0; end
`FPU_SUB: begin core_select = 0; do_madd = 0; do_sub = 1; do_neg = 0; end
`FPU_MUL: begin core_select = 0; do_madd = 0; do_sub = 0; do_neg = 1; end
`FPU_MADD: begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 0; end
`FPU_MSUB: begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 0; end
`FPU_NMADD: begin core_select = 0; do_madd = 1; do_sub = 0; do_neg = 1; end
`FPU_NMSUB: begin core_select = 0; do_madd = 1; do_sub = 1; do_neg = 1; end
`FPU_DIV: begin core_select = 1; end
`FPU_SQRT: begin core_select = 2; end
`FPU_CVTWS: begin core_select = 3; is_itof = 0; is_signed = 1; end
`FPU_CVTWUS: begin core_select = 3; is_itof = 0; is_signed = 0; end
`FPU_CVTSW: begin core_select = 3; is_itof = 1; is_signed = 1; end
`FPU_CVTSWU: begin core_select = 3; is_itof = 1; is_signed = 0; end
default: begin core_select = 4; end
`FPU_ADD: begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 0; end
`FPU_SUB: begin core_select = FPU_FMA; do_madd = 0; do_sub = 1; do_neg = 0; end
`FPU_MUL: begin core_select = FPU_FMA; do_madd = 0; do_sub = 0; do_neg = 1; end
`FPU_MADD: begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 0; end
`FPU_MSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 0; end
`FPU_NMADD: begin core_select = FPU_FMA; do_madd = 1; do_sub = 0; do_neg = 1; end
`FPU_NMSUB: begin core_select = FPU_FMA; do_madd = 1; do_sub = 1; do_neg = 1; end
`FPU_DIV: begin core_select = FPU_DIV; end
`FPU_SQRT: begin core_select = FPU_SQRT; end
`FPU_CVTWS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 1; end
`FPU_CVTWUS: begin core_select = FPU_CVT; is_itof = 0; is_signed = 0; end
`FPU_CVTSW: begin core_select = FPU_CVT; is_itof = 1; is_signed = 1; end
`FPU_CVTSWU: begin core_select = FPU_CVT; is_itof = 1; is_signed = 0; end
default: begin core_select = FPU_NCP; end
endcase
end
@ -72,8 +77,8 @@ module VX_fp_fpga #(
) fp_fma (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 0)),
.ready_in (per_core_ready_in[0]),
.valid_in (valid_in && (core_select == FPU_FMA)),
.ready_in (per_core_ready_in[FPU_FMA]),
.tag_in (tag_in),
.frm (frm),
.do_madd (do_madd),
@ -82,12 +87,12 @@ module VX_fp_fpga #(
.dataa (dataa),
.datab (datab),
.datac (datac),
.has_fflags (per_core_has_fflags[0]),
.fflags (per_core_fflags[0]),
.result (per_core_result[0]),
.tag_out (per_core_tag_out[0]),
.ready_out (per_core_ready_out[0]),
.valid_out (per_core_valid_out[0])
.has_fflags (per_core_has_fflags[FPU_FMA]),
.fflags (per_core_fflags[FPU_FMA]),
.result (per_core_result[FPU_FMA]),
.tag_out (per_core_tag_out[FPU_FMA]),
.ready_out (per_core_ready_out[FPU_FMA]),
.valid_out (per_core_valid_out[FPU_FMA])
);
VX_fp_div #(
@ -96,18 +101,18 @@ module VX_fp_fpga #(
) fp_div (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 1)),
.ready_in (per_core_ready_in[1]),
.valid_in (valid_in && (core_select == FPU_DIV)),
.ready_in (per_core_ready_in[FPU_DIV]),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa),
.datab (datab),
.has_fflags (per_core_has_fflags[1]),
.fflags (per_core_fflags[1]),
.result (per_core_result[1]),
.tag_out (per_core_tag_out[1]),
.ready_out (per_core_ready_out[1]),
.valid_out (per_core_valid_out[1])
.has_fflags (per_core_has_fflags[FPU_DIV]),
.fflags (per_core_fflags[FPU_DIV]),
.result (per_core_result[FPU_DIV]),
.tag_out (per_core_tag_out[FPU_DIV]),
.ready_out (per_core_ready_out[FPU_DIV]),
.valid_out (per_core_valid_out[FPU_DIV])
);
VX_fp_sqrt #(
@ -116,17 +121,17 @@ module VX_fp_fpga #(
) fp_sqrt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 2)),
.ready_in (per_core_ready_in[2]),
.valid_in (valid_in && (core_select == FPU_SQRT)),
.ready_in (per_core_ready_in[FPU_SQRT]),
.tag_in (tag_in),
.frm (frm),
.dataa (dataa),
.has_fflags (per_core_has_fflags[2]),
.fflags (per_core_fflags[2]),
.result (per_core_result[2]),
.tag_out (per_core_tag_out[2]),
.ready_out (per_core_ready_out[2]),
.valid_out (per_core_valid_out[2])
.has_fflags (per_core_has_fflags[FPU_SQRT]),
.fflags (per_core_fflags[FPU_SQRT]),
.result (per_core_result[FPU_SQRT]),
.tag_out (per_core_tag_out[FPU_SQRT]),
.ready_out (per_core_ready_out[FPU_SQRT]),
.valid_out (per_core_valid_out[FPU_SQRT])
);
VX_fp_cvt #(
@ -135,19 +140,19 @@ module VX_fp_fpga #(
) fp_cvt (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 3)),
.ready_in (per_core_ready_in[3]),
.valid_in (valid_in && (core_select == FPU_CVT)),
.ready_in (per_core_ready_in[FPU_CVT]),
.tag_in (tag_in),
.frm (frm),
.is_itof (is_itof),
.is_signed (is_signed),
.dataa (dataa),
.has_fflags (per_core_has_fflags[3]),
.fflags (per_core_fflags[3]),
.result (per_core_result[3]),
.tag_out (per_core_tag_out[3]),
.ready_out (per_core_ready_out[3]),
.valid_out (per_core_valid_out[3])
.has_fflags (per_core_has_fflags[FPU_CVT]),
.fflags (per_core_fflags[FPU_CVT]),
.result (per_core_result[FPU_CVT]),
.tag_out (per_core_tag_out[FPU_CVT]),
.ready_out (per_core_ready_out[FPU_CVT]),
.valid_out (per_core_valid_out[FPU_CVT])
);
VX_fp_ncomp #(
@ -156,19 +161,19 @@ module VX_fp_fpga #(
) fp_ncomp (
.clk (clk),
.reset (reset),
.valid_in (valid_in && (core_select == 4)),
.ready_in (per_core_ready_in[4]),
.valid_in (valid_in && (core_select == FPU_NCP)),
.ready_in (per_core_ready_in[FPU_NCP]),
.tag_in (tag_in),
.op_type (op_type),
.frm (frm),
.dataa (dataa),
.datab (datab),
.result (per_core_result[4]),
.has_fflags (per_core_has_fflags[4]),
.fflags (per_core_fflags[4]),
.tag_out (per_core_tag_out[4]),
.ready_out (per_core_ready_out[4]),
.valid_out (per_core_valid_out[4])
.result (per_core_result[FPU_NCP]),
.has_fflags (per_core_has_fflags[FPU_NCP]),
.fflags (per_core_fflags[FPU_NCP]),
.tag_out (per_core_tag_out[FPU_NCP]),
.ready_out (per_core_ready_out[FPU_NCP]),
.valid_out (per_core_valid_out[FPU_NCP])
);
reg has_fflags_n;

View file

@ -160,7 +160,7 @@ module VX_fp_ncomp #(
for (genvar i = 0; i < LANES; i++) begin
always @(*) begin
case (frm_s0)
`FRM_RNE: begin
`FRM_RNE: begin // LE
fcmp_fflags[i] = 5'h0;
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
fcmp_res[i] = 32'h0;
@ -169,7 +169,7 @@ module VX_fp_ncomp #(
fcmp_res[i] = {31'h0, (a_smaller_s0[i] | ab_equal_s0[i])};
end
end
`FRM_RTZ: begin
`FRM_RTZ: begin // LS
fcmp_fflags[i] = 5'h0;
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
fcmp_res[i] = 32'h0;
@ -178,7 +178,7 @@ module VX_fp_ncomp #(
fcmp_res[i] = {31'h0, (a_smaller_s0[i] & ~ab_equal_s0[i])};
end
end
`FRM_RDN: begin
`FRM_RDN: begin // EQ
fcmp_fflags[i] = 5'h0;
if (a_type_s0[i].is_nan || b_type_s0[i].is_nan) begin
fcmp_res[i] = 32'h0;
@ -223,7 +223,7 @@ module VX_fp_ncomp #(
tmp_fflags[i] = 0;
tmp_fflags[i].NV = a_type_s0[i].is_signaling | b_type_s0[i].is_signaling;
end
//5,6,7:
//5,6,7: MOVE
default: begin
tmp_result[i] = dataa[i];
tmp_fflags[i] = 'x;

View file

@ -1,9 +1,5 @@
`include "VX_define.vh"
`ifndef SYNTHESIS
`include "float_dpi.vh"
`endif
module VX_fp_sqrt #(
parameter TAGW = 1,
parameter LANES = 1
@ -30,26 +26,24 @@ module VX_fp_sqrt #(
output wire valid_out
);
wire stall = ~ready_out && valid_out;
wire enable = ~stall;
wire enable = ~stall;
wire _reset;
VX_reset_relay reset_relay (
.clk (clk),
.reset (reset),
.reset_out (_reset)
);
for (genvar i = 0; i < LANES; i++) begin
`ifdef QUARTUS
acl_fsqrt fsqrt (
.clk (clk),
.areset (reset),
.areset (_reset),
.en (enable),
.a (dataa[i]),
.q (result[i])
);
`else
integer fsqrt_h;
initial begin
fsqrt_h = dpi_register();
end
always @(posedge clk) begin
dpi_fsqrt (fsqrt_h, enable, dataa[i], `LATENCY_FSQRT, result[i]);
end
`endif
end
VX_shift_register #(

View file

@ -56,13 +56,13 @@ module VX_fpnew
localparam fpnew_pkg::fpu_implementation_t FPU_IMPLEMENTATION = '{
PipeRegs:'{'{`LATENCY_FMA, 0, 0, 0, 0}, // ADDMUL
'{default: `LATENCY_FDIVSQRT}, // DIVSQRT
'{default: `LATENCY_FNCOMP}, // NONCOMP
'{default: `LATENCY_FCONV}}, // CONV
UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL
'{default: UNIT_FDIVSQRT}, // DIVSQRT
'{default: UNIT_FNONCOMP}, // NONCOMP
'{default: UNIT_FCONV}}, // CONV
'{default: `LATENCY_FDIVSQRT}, // DIVSQRT
'{default: `LATENCY_FNCP}, // NONCOMP
'{default: `LATENCY_FCVT}}, // CONV
UnitTypes:'{'{default: UNIT_FMULADD}, // ADDMUL
'{default: UNIT_FDIVSQRT}, // DIVSQRT
'{default: UNIT_FNONCOMP}, // NONCOMP
'{default: UNIT_FCONV}}, // CONV
PipeConfig: fpnew_pkg::DISTRIBUTED
};

View file

@ -1,239 +0,0 @@
#include <stdio.h>
#include <math.h>
#include <unordered_map>
#include <vector>
#include <mutex>
#include <iostream>
#include "svdpi.h"
#include "verilated_vpi.h"
#include "VX_config.h"
// C-linkage prototypes for the functions defined in this file.
// Common parameters, as used by the definitions in this file:
//   inst   - handle returned by dpi_register(), selecting the per-call-site delay line
//   enable - when false the delay line is not advanced on this call
//   a/b/c  - operands as raw 32-bit patterns (float bits or integer, per function)
//   delay  - depth of the delay line; only the first call per handle sizes it
//   result - receives the value currently at the head of the delay line
// NOTE(review): no dpi_fmsub prototype/definition is present in this file - confirm
// that no SystemVerilog import expects one.
extern "C" {
int dpi_register();
void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result);
void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result);
void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result);
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result);
void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result);
void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result);
void dpi_ftoi(int inst, bool enable, int a, int delay, int* result);
void dpi_ftou(int inst, bool enable, int a, int delay, int* result);
void dpi_itof(int inst, bool enable, int a, int delay, int* result);
void dpi_utof(int inst, bool enable, int a, int delay, int* result);
// Aborts the simulation `delay` calls after `cond` was observed false.
void dpi_assert(int inst, bool cond, int delay);
}
// Fixed-depth delay line of 32-bit values.
//
// The depth is chosen lazily by the first ensure_init() call; all slots start
// at zero. push() shifts the line by one slot and appends a value at the tail,
// top() reads the head (the oldest slot).
class ShiftRegister {
public:
  ShiftRegister() : init_(false), depth_(0) {}

  // Sizes the delay line on the first call; subsequent calls are no-ops.
  // A non-positive depth is clamped to 1 so that push()/top() never index an
  // empty buffer (a depth of 1 makes top() return the value just pushed).
  void ensure_init(int depth) {
    if (!init_) {
      depth_ = (depth > 0) ? static_cast<unsigned>(depth) : 1u;
      buffer_.resize(depth_);
      init_ = true;
    }
  }

  // Advances the line by one slot and stores `value` at the tail.
  // Does nothing when `enable` is false or the line has not been sized yet.
  void push(int value, bool enable) {
    if (!enable || buffer_.empty())
      return;
    // `i + 1 < depth_` instead of `i < depth_ - 1` avoids unsigned wrap-around.
    for (unsigned i = 0; i + 1 < depth_; ++i) {
      buffer_[i] = buffer_[i+1];
    }
    buffer_[depth_-1] = value;
  }

  // Returns the head of the line, or 0 if the line has not been sized yet.
  int top() const {
    return buffer_.empty() ? 0 : buffer_[0];
  }

private:
  std::vector<int> buffer_;  // slot 0 is the head (oldest), slot depth_-1 the tail
  bool init_;                // true once ensure_init() has sized the buffer
  unsigned depth_;           // number of slots; 0 until initialised
};
// Overlays a float with its raw 32-bit pattern so DPI `int` arguments can be
// reinterpreted as floats (write `.i`, read `.f`) and back.
// NOTE(review): reading a union member other than the one last written relies
// on compiler support for union type punning - confirm for the toolchain used.
union Float_t {
float f;   // value viewed as a float
int i;     // same storage viewed as a 32-bit integer
// Bit-field view of the same storage.
// NOTE(review): bit-field allocation order is implementation-defined; this
// ordering presumably targets mantissa-in-low-bits layouts - verify.
struct {
uint32_t man : 23;
uint32_t exp : 8;
uint32_t sign : 1;
} parts;
};
// Registry of delay lines, addressed by the integer handle that allocate()
// hands out.
class Instances {
public:
  // Returns the delay line for handle `inst`; std::vector::at throws
  // std::out_of_range for an unknown handle.
  // This accessor takes no lock, and the returned reference is invalidated if
  // a later allocate() reallocates the underlying vector.
  ShiftRegister& get(int inst) {
    return instances_.at(inst);
  }

  // Appends a fresh, un-sized delay line and returns its handle (its index).
  int allocate() {
    // RAII guard: the mutex is released even if resize() throws.
    std::lock_guard<std::mutex> guard(mutex_);
    const int inst = static_cast<int>(instances_.size());
    instances_.resize(inst + 1);
    return inst;
  }

private:
  std::vector<ShiftRegister> instances_;  // handle == index into this vector
  std::mutex mutex_;                      // serialises allocate()
};
// Process-wide registry backing every dpi_* entry point in this file.
Instances instances;

// Reserves a new delay line and hands its handle back to the caller.
int dpi_register() {
  const int handle = instances.allocate();
  return handle;
}
void dpi_fadd(int inst, bool enable, int a, int b, int delay, int* result) {
ShiftRegister& sr = instances.get(inst);
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f + fb.f;
sr.ensure_init(delay);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fsub(int inst, bool enable, int a, int b, int delay, int* result) {
ShiftRegister& sr = instances.get(inst);
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f - fb.f;
sr.ensure_init(delay);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fmul(int inst, bool enable, int a, int b, int delay, int* result) {
ShiftRegister& sr = instances.get(inst);
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f * fb.f;
sr.ensure_init(delay);
sr.push(fr.i, enable);
*result = sr.top();
}
// Single-precision fused multiply-add: pushes a * b + c into the delay line
// selected by `inst` and reports the value currently at its head through
// `result`.
void dpi_fmadd(int inst, bool enable, int a, int b, int c, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fa, fb, fc, fr;
  fa.i = a;
  fb.i = b;
  fc.i = c;
  // fmaf rounds once; a separate multiply and add would round the
  // intermediate product and can differ from a fused unit in the last bit.
  fr.f = fmaf(fa.f, fb.f, fc.f);

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
void dpi_fdiv(int inst, bool enable, int a, int b, int delay, int* result) {
ShiftRegister& sr = instances.get(inst);
Float_t fa, fb, fr;
fa.i = a;
fb.i = b;
fr.f = fa.f / fb.f;
sr.ensure_init(delay);
sr.push(fr.i, enable);
*result = sr.top();
}
void dpi_fsqrt(int inst, bool enable, int a, int delay, int* result) {
ShiftRegister& sr = instances.get(inst);
Float_t fa, fr;
fa.i = a;
fr.f = sqrtf(fa.f);
sr.ensure_init(delay);
sr.push(fr.i, enable);
*result = sr.top();
}
// Float to signed 32-bit integer (truncating): pushes the converted value into
// the delay line selected by `inst` and reports the value currently at its
// head through `result`.
// Inputs that do not fit are saturated instead of hitting the undefined
// behaviour of an out-of-range float-to-int cast: NaN and values >= 2^31 give
// 0x7fffffff, values <= -2^31 give 0x80000000.
void dpi_ftoi(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fa, fr;
  fa.i = a;
  if (!(fa.f < 2147483648.0f)) {
    // Negated comparison so that NaN (which compares false) also lands here.
    fr.i = 0x7fffffff;
  } else if (fa.f <= -2147483648.0f) {
    fr.i = -0x7fffffff - 1;
  } else {
    fr.i = static_cast<int>(fa.f);
  }

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
// Float to unsigned 32-bit integer (truncating): pushes the converted bit
// pattern into the delay line selected by `inst` and reports the value
// currently at its head through `result`.
// Inputs that do not fit are saturated instead of hitting the undefined
// behaviour of an out-of-range float-to-unsigned cast: NaN and values >= 2^32
// give 0xffffffff, values <= -1 give 0.
void dpi_ftou(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fa, fr;
  fa.i = a;
  if (!(fa.f < 4294967296.0f)) {
    // Negated comparison so that NaN (which compares false) also lands here.
    fr.i = -1;  // bit pattern 0xffffffff
  } else if (fa.f <= -1.0f) {
    fr.i = 0;
  } else {
    // Values in (-1, 2^32) truncate to a representable unsigned value.
    fr.i = static_cast<int>(static_cast<unsigned>(fa.f));
  }

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
// Signed 32-bit integer to float: pushes the converted value into the delay
// line selected by `inst` and reports the value currently at its head through
// `result`.
void dpi_itof(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fr;
  fr.f = static_cast<float>(a);

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
// Unsigned 32-bit integer to float: `a` carries the unsigned operand's bit
// pattern. Pushes the converted value into the delay line selected by `inst`
// and reports the value currently at its head through `result`.
void dpi_utof(int inst, bool enable, int a, int delay, int* result) {
  ShiftRegister& sr = instances.get(inst);

  Float_t fr;
  fr.f = static_cast<float>(static_cast<unsigned>(a));

  sr.ensure_init(delay);
  sr.push(fr.i, enable);
  *result = sr.top();
}
// Delayed assertion: records `!cond` in the delay line selected by `inst` on
// every call and aborts the process once a recorded failure reaches the head
// of the line.
void dpi_assert(int inst, bool cond, int delay) {
  ShiftRegister& sr = instances.get(inst);

  sr.ensure_init(delay);
  sr.push(!cond, true);

  auto status = sr.top();
  if (status) {
    printf("delayed assertion at %s!\n", svGetNameFromScope(svGetScope()));
    // abort() does not flush stdio buffers; flush so the message is not lost
    // when stdout is redirected to a file or pipe.
    fflush(stdout);
    std::abort();
  }
}

View file

@ -1,20 +0,0 @@
// DPI-C imports for the C++ floating-point simulation helpers.
// Common arguments:
//   inst   - handle obtained from dpi_register()
//   enable - passed through to the C++ side as its `enable` flag
//   a/b/c  - 32-bit operands
//   delay  - latency argument forwarded to the C++ side
//   result - output value produced by the call
`ifndef FLOAT_DPI
`define FLOAT_DPI
import "DPI-C" context function int dpi_register();
import "DPI-C" context function void dpi_fadd(int inst, input logic enable, input int a, input int b, input int delay, output int result);
import "DPI-C" context function void dpi_fsub(int inst, input logic enable, input int a, input int b, input int delay, output int result);
import "DPI-C" context function void dpi_fmul(int inst, input logic enable, input int a, input int b, input int delay, output int result);
import "DPI-C" context function void dpi_fmadd(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result);
// NOTE(review): dpi_fmsub is imported here - confirm a matching C++ definition exists before calling it.
import "DPI-C" context function void dpi_fmsub(int inst, input logic enable, input int a, input int b, input int c, input int delay, output int result);
import "DPI-C" context function void dpi_fdiv(int inst, input logic enable, input int a, input int b, input int delay, output int result);
import "DPI-C" context function void dpi_fsqrt(int inst, input logic enable, input int a, input int delay, output int result);
import "DPI-C" context function void dpi_ftoi(int inst, input logic enable, input int a, input int delay, output int result);
import "DPI-C" context function void dpi_ftou(int inst, input logic enable, input int a, input int delay, output int result);
import "DPI-C" context function void dpi_itof(int inst, input logic enable, input int a, input int delay, output int result);
import "DPI-C" context function void dpi_utof(int inst, input logic enable, input int a, input int delay, output int result);
// Delayed assertion: takes a condition instead of operands and produces no result.
import "DPI-C" context function void dpi_assert(int inst, input logic cond, input int delay);
`endif

View file

@ -1,7 +1,7 @@
`include "VX_platform.vh"
module VX_lzc #(
parameter DATAW = 1,
parameter DATAW = 32,
parameter LDATAW = `LOG2UP(DATAW)
) (
input wire [DATAW-1:0] data_in,

View file

@ -31,12 +31,13 @@ MULTICORE ?= -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
TOP = Vortex
RTL_DIR=../rtl
DPI_DIR=../dpi
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/svdpi -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE)
SRCS = simulator.cpp testbench.cpp
SRCS += $(RTL_DIR)/fp_cores/svdpi/float_dpi.cpp
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
VL_FLAGS += -Wno-DECLFILENAME

View file

@ -3,7 +3,7 @@
#include <fstream>
#include <iomanip>
#define RESET_DELAY 2
#define RESET_DELAY 4
#define ENABLE_DRAM_STALLS
#define DRAM_LATENCY 24
@ -75,13 +75,6 @@ void Simulator::reset() {
vortex_->csr_rsp_ready = 0;
vortex_->reset = 1;
vortex_->clk = 0;
this->eval();
vortex_->clk = 1;
this->eval();
vortex_->reset = 0;
for (int i = 0; i < RESET_DELAY; ++i) {
vortex_->clk = 0;
@ -89,8 +82,11 @@ void Simulator::reset() {
vortex_->clk = 1;
this->eval();
}
vortex_->reset = 0;
// Turn on assertion after reset
printf("*** enabling assertion at tick: %ld", timestamp);
Verilated::assertOn(true);
}