mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-19 11:34:59 -04:00
partial implementation for SIMD_WIDTH and collector units
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
Some checks are pending
CI / setup (push) Waiting to run
CI / build (32) (push) Blocked by required conditions
CI / build (64) (push) Blocked by required conditions
CI / tests (cache, 32) (push) Blocked by required conditions
CI / tests (cache, 64) (push) Blocked by required conditions
CI / tests (config1, 32) (push) Blocked by required conditions
CI / tests (config1, 64) (push) Blocked by required conditions
CI / tests (config2, 32) (push) Blocked by required conditions
CI / tests (config2, 64) (push) Blocked by required conditions
CI / tests (debug, 32) (push) Blocked by required conditions
CI / tests (debug, 64) (push) Blocked by required conditions
CI / tests (opencl, 32) (push) Blocked by required conditions
CI / tests (opencl, 64) (push) Blocked by required conditions
CI / tests (regression, 32) (push) Blocked by required conditions
CI / tests (regression, 64) (push) Blocked by required conditions
CI / tests (scope, 32) (push) Blocked by required conditions
CI / tests (scope, 64) (push) Blocked by required conditions
CI / tests (stress, 32) (push) Blocked by required conditions
CI / tests (stress, 64) (push) Blocked by required conditions
CI / tests (synthesis, 32) (push) Blocked by required conditions
CI / tests (synthesis, 64) (push) Blocked by required conditions
CI / tests (vector, 32) (push) Blocked by required conditions
CI / tests (vector, 64) (push) Blocked by required conditions
CI / tests (vm, 32) (push) Blocked by required conditions
CI / tests (vm, 64) (push) Blocked by required conditions
CI / complete (push) Blocked by required conditions
This commit is contained in:
parent
9072ec457a
commit
e9c9678d83
104 changed files with 2480 additions and 2102 deletions
|
@ -39,7 +39,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef SCOPE
|
||||
localparam scope_socket = 0;
|
||||
`SCOPE_IO_SWITCH (`NUM_SOCKETS);
|
||||
`SCOPE_IO_SWITCH (NUM_SOCKETS);
|
||||
`endif
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
|
@ -53,12 +53,12 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef GBAR_ENABLE
|
||||
|
||||
VX_gbar_bus_if per_socket_gbar_bus_if[`NUM_SOCKETS]();
|
||||
VX_gbar_bus_if per_socket_gbar_bus_if[NUM_SOCKETS]();
|
||||
VX_gbar_bus_if gbar_bus_if();
|
||||
|
||||
VX_gbar_arb #(
|
||||
.NUM_REQS (`NUM_SOCKETS),
|
||||
.OUT_BUF ((`NUM_SOCKETS > 2) ? 1 : 0) // gbar_unit has no backpressure
|
||||
.NUM_REQS (NUM_SOCKETS),
|
||||
.OUT_BUF ((NUM_SOCKETS > 2) ? 1 : 0) // gbar_unit has no backpressure
|
||||
) gbar_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -79,7 +79,7 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if #(
|
||||
.DATA_SIZE (`L1_LINE_SIZE),
|
||||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) per_socket_mem_bus_if[`NUM_SOCKETS * `L1_MEM_PORTS]();
|
||||
) per_socket_mem_bus_if[NUM_SOCKETS * `L1_MEM_PORTS]();
|
||||
|
||||
`RESET_RELAY (l2_reset, reset);
|
||||
|
||||
|
@ -101,8 +101,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
.WRITEBACK (`L2_WRITEBACK),
|
||||
.DIRTY_BYTES (`L2_DIRTYBYTES),
|
||||
.REPL_POLICY (`L2_REPL_POLICY),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.CORE_OUT_BUF (3),
|
||||
.MEM_OUT_BUF (3),
|
||||
.NC_ENABLE (1),
|
||||
|
@ -119,19 +117,19 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [`NUM_SOCKETS-1:0] per_socket_busy;
|
||||
wire [NUM_SOCKETS-1:0] per_socket_busy;
|
||||
|
||||
// Generate all sockets
|
||||
for (genvar socket_id = 0; socket_id < `NUM_SOCKETS; ++socket_id) begin : g_sockets
|
||||
for (genvar socket_id = 0; socket_id < NUM_SOCKETS; ++socket_id) begin : g_sockets
|
||||
|
||||
`RESET_RELAY (socket_reset, reset);
|
||||
|
||||
VX_dcr_bus_if socket_dcr_bus_if();
|
||||
wire is_base_dcr_addr = (dcr_bus_if.write_addr >= `VX_DCR_BASE_STATE_BEGIN && dcr_bus_if.write_addr < `VX_DCR_BASE_STATE_END);
|
||||
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (`NUM_SOCKETS > 1))
|
||||
`BUFFER_DCR_BUS_IF (socket_dcr_bus_if, dcr_bus_if, is_base_dcr_addr, (NUM_SOCKETS > 1))
|
||||
|
||||
VX_socket #(
|
||||
.SOCKET_ID ((CLUSTER_ID * `NUM_SOCKETS) + socket_id),
|
||||
.SOCKET_ID ((CLUSTER_ID * NUM_SOCKETS) + socket_id),
|
||||
.INSTANCE_ID (`SFORMATF(("%s-socket%0d", INSTANCE_ID, socket_id)))
|
||||
) socket (
|
||||
`SCOPE_IO_BIND (scope_socket+socket_id)
|
||||
|
@ -155,6 +153,6 @@ module VX_cluster import VX_gpu_pkg::*; #(
|
|||
);
|
||||
end
|
||||
|
||||
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (`NUM_SOCKETS > 1));
|
||||
`BUFFER_EX(busy, (| per_socket_busy), 1'b1, 1, (NUM_SOCKETS > 1));
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -114,18 +114,6 @@
|
|||
`define SOCKET_SIZE `MIN(4, `NUM_CORES)
|
||||
`endif
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_ENABLED 1
|
||||
`else
|
||||
`define L2_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_ENABLED 1
|
||||
`else
|
||||
`define L3_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef L1_DISABLE
|
||||
`define ICACHE_DISABLE
|
||||
`define DCACHE_DISABLE
|
||||
|
@ -247,18 +235,10 @@
|
|||
`ifndef IO_MPM_ADDR
|
||||
`define IO_MPM_ADDR (`IO_COUT_ADDR + `IO_COUT_SIZE)
|
||||
`endif
|
||||
`define IO_MPM_SIZE (8 * 32 * `NUM_CORES * `NUM_CLUSTERS)
|
||||
|
||||
`ifndef STACK_LOG2_SIZE
|
||||
`define STACK_LOG2_SIZE 13
|
||||
`endif
|
||||
`define STACK_SIZE (1 << `STACK_LOG2_SIZE)
|
||||
|
||||
`define RESET_DELAY 8
|
||||
|
||||
// Default value for STALL_TIMEOUT, used only when it is not already defined.
// NOTE(review): the scale factor is 1 ** (L2_ENABLED + L3_ENABLED); 1 raised to
// any power is 1, so this evaluates to 100000 regardless of the L2/L3 settings.
// Confirm whether a different base was intended.
`ifndef STALL_TIMEOUT
|
||||
`define STALL_TIMEOUT (100000 * (1 ** (`L2_ENABLED + `L3_ENABLED)))
|
||||
`endif
|
||||
|
||||
`ifndef SV_DPI
|
||||
`ifndef DPI_DISABLE
|
||||
|
@ -296,6 +276,7 @@
|
|||
`ifndef MEM_PAGE_SIZE
|
||||
`define MEM_PAGE_SIZE (4096)
|
||||
`endif
|
||||
|
||||
`ifndef MEM_PAGE_LOG2_SIZE
|
||||
`define MEM_PAGE_LOG2_SIZE (12)
|
||||
`endif
|
||||
|
@ -303,54 +284,53 @@
|
|||
// Virtual Memory Configuration ///////////////////////////////////////////////
|
||||
|
||||
`ifdef VM_ENABLE
|
||||
`ifdef XLEN_32
|
||||
`ifndef VM_ADDR_MODE
|
||||
`define VM_ADDR_MODE SV32 //or BARE
|
||||
`endif
|
||||
`ifndef PT_LEVEL
|
||||
`define PT_LEVEL (2)
|
||||
`endif
|
||||
`ifndef PTE_SIZE
|
||||
`define PTE_SIZE (4)
|
||||
`endif
|
||||
`ifndef NUM_PTE_ENTRY
|
||||
`define NUM_PTE_ENTRY (1024)
|
||||
`endif
|
||||
`ifndef PT_SIZE_LIMIT
|
||||
`define PT_SIZE_LIMIT (1<<23)
|
||||
`endif
|
||||
`else
|
||||
`ifndef VM_ADDR_MODE
|
||||
`define VM_ADDR_MODE SV39 //or BARE
|
||||
`endif
|
||||
`ifndef PT_LEVEL
|
||||
`define PT_LEVEL (3)
|
||||
`endif
|
||||
`ifndef PTE_SIZE
|
||||
`define PTE_SIZE (8)
|
||||
`endif
|
||||
`ifndef NUM_PTE_ENTRY
|
||||
`define NUM_PTE_ENTRY (512)
|
||||
`endif
|
||||
`ifndef PT_SIZE_LIMIT
|
||||
`define PT_SIZE_LIMIT (1<<25)
|
||||
`endif
|
||||
`ifdef XLEN_32
|
||||
`ifndef VM_ADDR_MODE
|
||||
`define VM_ADDR_MODE SV32 //or BARE
|
||||
`endif
|
||||
|
||||
`ifndef PT_SIZE
|
||||
`define PT_SIZE MEM_PAGE_SIZE
|
||||
`ifndef PT_LEVEL
|
||||
`define PT_LEVEL (2)
|
||||
`endif
|
||||
|
||||
`ifndef TLB_SIZE
|
||||
`define TLB_SIZE (32)
|
||||
`ifndef PTE_SIZE
|
||||
`define PTE_SIZE (4)
|
||||
`endif
|
||||
`ifndef NUM_PTE_ENTRY
|
||||
`define NUM_PTE_ENTRY (1024)
|
||||
`endif
|
||||
`ifndef PT_SIZE_LIMIT
|
||||
`define PT_SIZE_LIMIT (1<<23)
|
||||
`endif
|
||||
`else
|
||||
`ifndef VM_ADDR_MODE
|
||||
`define VM_ADDR_MODE SV39 //or BARE
|
||||
`endif
|
||||
`ifndef PT_LEVEL
|
||||
`define PT_LEVEL (3)
|
||||
`endif
|
||||
`ifndef PTE_SIZE
|
||||
`define PTE_SIZE (8)
|
||||
`endif
|
||||
`ifndef NUM_PTE_ENTRY
|
||||
`define NUM_PTE_ENTRY (512)
|
||||
`endif
|
||||
`ifndef PT_SIZE_LIMIT
|
||||
`define PT_SIZE_LIMIT (1<<25)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Default value for PT_SIZE, used only when it is not already defined.
// NOTE(review): MEM_PAGE_SIZE is written here without a leading backtick, so the
// Verilog preprocessor substitutes the bare identifier rather than the
// MEM_PAGE_SIZE macro defined earlier in this header. Confirm this is intended.
`ifndef PT_SIZE
|
||||
`define PT_SIZE MEM_PAGE_SIZE
|
||||
`endif
|
||||
|
||||
`ifndef TLB_SIZE
|
||||
`define TLB_SIZE (32)
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Pipeline Configuration /////////////////////////////////////////////////////
|
||||
|
||||
`ifndef SIMD_WIDTH
|
||||
`define SIMD_WIDTH `MAX(`NUM_THREADS, 16)
|
||||
`define SIMD_WIDTH `MIN(`NUM_THREADS, 2)
|
||||
`endif
|
||||
|
||||
// Issue width
|
||||
|
@ -358,9 +338,19 @@
|
|||
`define ISSUE_WIDTH `UP(`NUM_WARPS / 8)
|
||||
`endif
|
||||
|
||||
// Operand collectors
|
||||
`ifndef NUM_OPCS
|
||||
`define NUM_OPCS 4
|
||||
`endif
|
||||
|
||||
// Register File Banks
|
||||
`ifndef NUM_GPR_BANKS
|
||||
`define NUM_GPR_BANKS `MIN(`NUM_OPCS, 4)
|
||||
`endif
|
||||
|
||||
// Number of ALU units
|
||||
`ifndef NUM_ALU_LANES
|
||||
`define NUM_ALU_LANES `NUM_THREADS
|
||||
`define NUM_ALU_LANES `SIMD_WIDTH
|
||||
`endif
|
||||
`ifndef NUM_ALU_BLOCKS
|
||||
`define NUM_ALU_BLOCKS `ISSUE_WIDTH
|
||||
|
@ -368,7 +358,7 @@
|
|||
|
||||
// Number of FPU units
|
||||
`ifndef NUM_FPU_LANES
|
||||
`define NUM_FPU_LANES `NUM_THREADS
|
||||
`define NUM_FPU_LANES `SIMD_WIDTH
|
||||
`endif
|
||||
`ifndef NUM_FPU_BLOCKS
|
||||
`define NUM_FPU_BLOCKS `ISSUE_WIDTH
|
||||
|
@ -376,7 +366,7 @@
|
|||
|
||||
// Number of LSU units
|
||||
`ifndef NUM_LSU_LANES
|
||||
`define NUM_LSU_LANES `NUM_THREADS
|
||||
`define NUM_LSU_LANES `SIMD_WIDTH
|
||||
`endif
|
||||
`ifndef NUM_LSU_BLOCKS
|
||||
`define NUM_LSU_BLOCKS 1
|
||||
|
@ -384,7 +374,7 @@
|
|||
|
||||
// Number of SFU units
|
||||
`ifndef NUM_SFU_LANES
|
||||
`define NUM_SFU_LANES `NUM_THREADS
|
||||
`define NUM_SFU_LANES `SIMD_WIDTH
|
||||
`endif
|
||||
`ifndef NUM_SFU_BLOCKS
|
||||
`define NUM_SFU_BLOCKS 1
|
||||
|
@ -402,7 +392,7 @@
|
|||
|
||||
// Size of LSU Core Request Queue
|
||||
`ifndef LSUQ_IN_SIZE
|
||||
`define LSUQ_IN_SIZE (2 * (`NUM_THREADS / `NUM_LSU_LANES))
|
||||
`define LSUQ_IN_SIZE (2 * (`SIMD_WIDTH / `NUM_LSU_LANES))
|
||||
`endif
|
||||
|
||||
// Size of LSU Memory Request Queue
|
||||
|
@ -410,12 +400,6 @@
|
|||
`define LSUQ_OUT_SIZE `MAX(`LSUQ_IN_SIZE, `LSU_LINE_SIZE / (`XLEN / 8))
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
`define GBAR_ENABLED 1
|
||||
`else
|
||||
`define GBAR_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifndef LATENCY_IMUL
|
||||
`ifdef VIVADO
|
||||
`define LATENCY_IMUL 4
|
||||
|
@ -432,7 +416,7 @@
|
|||
|
||||
// Size of FPU Request Queue
|
||||
`ifndef FPUQ_SIZE
|
||||
`define FPUQ_SIZE (2 * (`NUM_THREADS / `NUM_FPU_LANES))
|
||||
`define FPUQ_SIZE (2 * (`SIMD_WIDTH / `NUM_FPU_LANES))
|
||||
`endif
|
||||
|
||||
// FNCP Latency
|
||||
|
@ -545,10 +529,8 @@
|
|||
`ifndef ICACHE_DISABLE
|
||||
`define ICACHE_ENABLE
|
||||
`endif
|
||||
`ifdef ICACHE_ENABLE
|
||||
`define ICACHE_ENABLED 1
|
||||
`else
|
||||
`define ICACHE_ENABLED 0
|
||||
|
||||
`ifndef ICACHE_ENABLE
|
||||
`define NUM_ICACHES 0
|
||||
`endif
|
||||
|
||||
|
@ -602,10 +584,8 @@
|
|||
`ifndef DCACHE_DISABLE
|
||||
`define DCACHE_ENABLE
|
||||
`endif
|
||||
`ifdef DCACHE_ENABLE
|
||||
`define DCACHE_ENABLED 1
|
||||
`else
|
||||
`define DCACHE_ENABLED 0
|
||||
|
||||
`ifndef DCACHE_ENABLE
|
||||
`define NUM_DCACHES 0
|
||||
`define DCACHE_NUM_BANKS 1
|
||||
`endif
|
||||
|
@ -680,10 +660,7 @@
|
|||
`define LMEM_ENABLE
|
||||
`endif
|
||||
|
||||
`ifdef LMEM_ENABLE
|
||||
`define LMEM_ENABLED 1
|
||||
`else
|
||||
`define LMEM_ENABLED 0
|
||||
`ifndef LMEM_ENABLE
|
||||
`define LMEM_NUM_BANKS 1
|
||||
`endif
|
||||
|
||||
|
@ -816,6 +793,42 @@
|
|||
|
||||
// ISA Extensions /////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef ICACHE_ENABLE
|
||||
`define ICACHE_ENABLED 1
|
||||
`else
|
||||
`define ICACHE_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef DCACHE_ENABLE
|
||||
`define DCACHE_ENABLED 1
|
||||
`else
|
||||
`define DCACHE_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef LMEM_ENABLE
|
||||
`define LMEM_ENABLED 1
|
||||
`else
|
||||
`define LMEM_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef GBAR_ENABLE
|
||||
`define GBAR_ENABLED 1
|
||||
`else
|
||||
`define GBAR_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef L2_ENABLE
|
||||
`define L2_ENABLED 1
|
||||
`else
|
||||
`define L2_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef L3_ENABLE
|
||||
`define L3_ENABLED 1
|
||||
`else
|
||||
`define L3_ENABLED 0
|
||||
`endif
|
||||
|
||||
`ifdef EXT_A_ENABLE
|
||||
`define EXT_A_ENABLED 1
|
||||
`else
|
||||
|
|
|
@ -18,261 +18,26 @@
|
|||
`include "VX_config.vh"
|
||||
`include "VX_types.vh"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define NC_BITS `CLOG2(`NUM_CORES)
|
||||
`define NW_BITS `CLOG2(`NUM_WARPS)
|
||||
`define NT_BITS `CLOG2(`NUM_THREADS)
|
||||
`define NB_BITS `CLOG2(`NUM_BARRIERS)
|
||||
|
||||
`define NC_WIDTH `UP(`NC_BITS)
|
||||
`define NW_WIDTH `UP(`NW_BITS)
|
||||
`define NT_WIDTH `UP(`NT_BITS)
|
||||
`define NB_WIDTH `UP(`NB_BITS)
|
||||
|
||||
`define NUM_IREGS 32
|
||||
|
||||
`define NRI_BITS `CLOG2(`NUM_IREGS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define REG_TYPES 2
|
||||
`else
|
||||
`define REG_TYPES 1
|
||||
`ifdef ICACHE_ENABLE
|
||||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`define REG_TYPE_BITS `CLOG2(`REG_TYPES)
|
||||
`define REG_TYPE_WIDTH `UP(`REG_TYPE_BITS)
|
||||
|
||||
`define NUM_REGS (`REG_TYPES * `NUM_IREGS)
|
||||
|
||||
`define NR_BITS `CLOG2(`NUM_REGS)
|
||||
|
||||
`define REG_EXT_VAL(ext, type) 32'h1
|
||||
//32'((1 << ((type == 1) ? ext[2:0] : ext[1:0]))-1)
|
||||
|
||||
`define IREG_EXT_BITS 2
|
||||
`define FREG_EXT_BITS 3
|
||||
`define VREG_EXT_BITS 3
|
||||
`define REG_EXT_BITS `MAX(`MAX(`IREG_EXT_BITS, `FREG_EXT_BITS), `VREG_EXT_BITS)
|
||||
|
||||
`define DV_STACK_SIZE `UP(`NUM_THREADS-1)
|
||||
`define DV_STACK_SIZEW `UP(`CLOG2(`DV_STACK_SIZE))
|
||||
|
||||
`define PERF_CTR_BITS 44
|
||||
`ifdef DCACHE_ENABLE
|
||||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`ifndef NDEBUG
|
||||
`define UUID_ENABLE
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`ifdef SCOPE
|
||||
`define UUID_ENABLE
|
||||
`define UUID_WIDTH 44
|
||||
`else
|
||||
`define UUID_WIDTH 1
|
||||
`endif
|
||||
`endif
|
||||
|
||||
`define PC_BITS (`XLEN-1)
|
||||
`define OFFSET_BITS 12
|
||||
`define IMM_BITS `XLEN
|
||||
|
||||
`define NUM_SOCKETS `UP(`NUM_CORES / `SOCKET_SIZE)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define EX_ALU 0
|
||||
`define EX_LSU 1
|
||||
`define EX_SFU 2
|
||||
`define EX_FPU (`EX_SFU + `EXT_F_ENABLED)
|
||||
|
||||
`define NUM_EX_UNITS (3 + `EXT_F_ENABLED)
|
||||
`define EX_BITS `CLOG2(`NUM_EX_UNITS)
|
||||
`define EX_WIDTH `UP(`EX_BITS)
|
||||
|
||||
`define SFU_CSRS 0
|
||||
`define SFU_WCTL 1
|
||||
|
||||
`define NUM_SFU_UNITS (2)
|
||||
`define SFU_BITS `CLOG2(`NUM_SFU_UNITS)
|
||||
`define SFU_WIDTH `UP(`SFU_BITS)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_LUI 7'b0110111
|
||||
`define INST_AUIPC 7'b0010111
|
||||
`define INST_JAL 7'b1101111
|
||||
`define INST_JALR 7'b1100111
|
||||
`define INST_B 7'b1100011 // branch instructions
|
||||
`define INST_L 7'b0000011 // load instructions
|
||||
`define INST_S 7'b0100011 // store instructions
|
||||
`define INST_I 7'b0010011 // immediate instructions
|
||||
`define INST_R 7'b0110011 // register instructions
|
||||
`define INST_FENCE 7'b0001111 // Fence instructions
|
||||
`define INST_SYS 7'b1110011 // system instructions
|
||||
|
||||
// RV64I instruction specific opcodes (for any W instruction)
|
||||
`define INST_I_W 7'b0011011 // W type immediate instructions
|
||||
`define INST_R_W 7'b0111011 // W type register instructions
|
||||
|
||||
`define INST_FL 7'b0000111 // float load instruction
|
||||
`define INST_FS 7'b0100111 // float store instruction
|
||||
`define INST_FMADD 7'b1000011
|
||||
`define INST_FMSUB 7'b1000111
|
||||
`define INST_FNMSUB 7'b1001011
|
||||
`define INST_FNMADD 7'b1001111
|
||||
`define INST_FCI 7'b1010011 // float common instructions
|
||||
|
||||
// Custom extension opcodes
|
||||
`define INST_EXT1 7'b0001011 // 0x0B
|
||||
`define INST_EXT2 7'b0101011 // 0x2B
|
||||
`define INST_EXT3 7'b1011011 // 0x5B
|
||||
`define INST_EXT4 7'b1111011 // 0x7B
|
||||
|
||||
// Opcode extensions
|
||||
`define INST_R_F7_MUL 7'b0000001
|
||||
`define INST_R_F7_ZICOND 7'b0000111
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_FRM_RNE 3'b000 // round to nearest even
|
||||
`define INST_FRM_RTZ 3'b001 // round to zero
|
||||
`define INST_FRM_RDN 3'b010 // round to -inf
|
||||
`define INST_FRM_RUP 3'b011 // round to +inf
|
||||
`define INST_FRM_RMM 3'b100 // round to nearest max magnitude
|
||||
`define INST_FRM_DYN 3'b111 // dynamic mode
|
||||
`define INST_FRM_BITS 3
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_OP_BITS 4
|
||||
`define INST_ARGS_BITS $bits(op_args_t)
|
||||
`define INST_FMT_BITS 2
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define INST_ALU_ADD 4'b0000
|
||||
//`define INST_ALU_UNUSED 4'b0001
|
||||
`define INST_ALU_LUI 4'b0010
|
||||
`define INST_ALU_AUIPC 4'b0011
|
||||
`define INST_ALU_SLTU 4'b0100
|
||||
`define INST_ALU_SLT 4'b0101
|
||||
//`define INST_ALU_UNUSED 4'b0110
|
||||
`define INST_ALU_SUB 4'b0111
|
||||
`define INST_ALU_SRL 4'b1000
|
||||
`define INST_ALU_SRA 4'b1001
|
||||
`define INST_ALU_CZEQ 4'b1010
|
||||
`define INST_ALU_CZNE 4'b1011
|
||||
`define INST_ALU_AND 4'b1100
|
||||
`define INST_ALU_OR 4'b1101
|
||||
`define INST_ALU_XOR 4'b1110
|
||||
`define INST_ALU_SLL 4'b1111
|
||||
|
||||
`define ALU_TYPE_BITS 2
|
||||
`define ALU_TYPE_ARITH 0
|
||||
`define ALU_TYPE_BRANCH 1
|
||||
`define ALU_TYPE_MULDIV 2
|
||||
`define ALU_TYPE_OTHER 3
|
||||
|
||||
`define INST_ALU_BITS 4
|
||||
`define INST_ALU_CLASS(op) op[3:2]
|
||||
`define INST_ALU_SIGNED(op) op[0]
|
||||
`define INST_ALU_IS_SUB(op) op[1]
|
||||
`define INST_ALU_IS_CZERO(op) (op[3:1] == 3'b101)
|
||||
|
||||
`define INST_BR_EQ 4'b0000
|
||||
`define INST_BR_NE 4'b0010
|
||||
`define INST_BR_LTU 4'b0100
|
||||
`define INST_BR_GEU 4'b0110
|
||||
`define INST_BR_LT 4'b0101
|
||||
`define INST_BR_GE 4'b0111
|
||||
`define INST_BR_JAL 4'b1000
|
||||
`define INST_BR_JALR 4'b1001
|
||||
`define INST_BR_ECALL 4'b1010
|
||||
`define INST_BR_EBREAK 4'b1011
|
||||
`define INST_BR_URET 4'b1100
|
||||
`define INST_BR_SRET 4'b1101
|
||||
`define INST_BR_MRET 4'b1110
|
||||
`define INST_BR_OTHER 4'b1111
|
||||
`define INST_BR_BITS 4
|
||||
`define INST_BR_CLASS(op) {1'b0, ~op[3]}
|
||||
`define INST_BR_IS_NEG(op) op[1]
|
||||
`define INST_BR_IS_LESS(op) op[2]
|
||||
`define INST_BR_IS_STATIC(op) op[3]
|
||||
|
||||
`define INST_M_MUL 3'b000
|
||||
`define INST_M_MULHU 3'b001
|
||||
`define INST_M_MULH 3'b010
|
||||
`define INST_M_MULHSU 3'b011
|
||||
`define INST_M_DIV 3'b100
|
||||
`define INST_M_DIVU 3'b101
|
||||
`define INST_M_REM 3'b110
|
||||
`define INST_M_REMU 3'b111
|
||||
`define INST_M_BITS 3
|
||||
`define INST_M_SIGNED(op) (~op[0])
|
||||
`define INST_M_IS_MULX(op) (~op[2])
|
||||
`define INST_M_IS_MULH(op) (op[1:0] != 0)
|
||||
`define INST_M_SIGNED_A(op) (op[1:0] != 1)
|
||||
`define INST_M_IS_REM(op) op[1]
|
||||
|
||||
`define INST_FMT_B 3'b000
|
||||
`define INST_FMT_H 3'b001
|
||||
`define INST_FMT_W 3'b010
|
||||
`define INST_FMT_D 3'b011
|
||||
`define INST_FMT_BU 3'b100
|
||||
`define INST_FMT_HU 3'b101
|
||||
`define INST_FMT_WU 3'b110
|
||||
|
||||
`define INST_LSU_LB 4'b0000
|
||||
`define INST_LSU_LH 4'b0001
|
||||
`define INST_LSU_LW 4'b0010
|
||||
`define INST_LSU_LD 4'b0011 // new for RV64I LD
|
||||
`define INST_LSU_LBU 4'b0100
|
||||
`define INST_LSU_LHU 4'b0101
|
||||
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
|
||||
`define INST_LSU_SB 4'b1000
|
||||
`define INST_LSU_SH 4'b1001
|
||||
`define INST_LSU_SW 4'b1010
|
||||
`define INST_LSU_SD 4'b1011 // new for RV64I SD
|
||||
`define INST_LSU_FENCE 4'b1111
|
||||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(op) op[2:0]
|
||||
`define INST_LSU_WSIZE(op) op[1:0]
|
||||
`define INST_LSU_IS_FENCE(op) (op[3:2] == 3)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
`define INST_FENCE_I 1'h1
|
||||
|
||||
`define INST_FPU_ADD 4'b0000 // SUB=fmt[1]
|
||||
`define INST_FPU_MUL 4'b0001
|
||||
`define INST_FPU_MADD 4'b0010 // SUB=fmt[1]
|
||||
`define INST_FPU_NMADD 4'b0011 // SUB=fmt[1]
|
||||
`define INST_FPU_DIV 4'b0100
|
||||
`define INST_FPU_SQRT 4'b0101
|
||||
`define INST_FPU_F2I 4'b1000 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_F2U 4'b1001 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_I2F 4'b1010 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_U2F 4'b1011 // fmt[0]: F32=0, F64=1, fmt[1]: I32=0, I64=1
|
||||
`define INST_FPU_CMP 4'b1100 // frm: LE=0, LT=1, EQ=2
|
||||
`define INST_FPU_F2F 4'b1101 // fmt[0]: F32=0, F64=1
|
||||
`define INST_FPU_MISC 4'b1110 // frm: SGNJ=0, SGNJN=1, SGNJX=2, CLASS=3, MVXW=4, MVWX=5, FMIN=6, FMAX=7
|
||||
`define INST_FPU_BITS 4
|
||||
`define INST_FPU_IS_CLASS(op, frm) (op == `INST_FPU_MISC && frm == 3)
|
||||
`define INST_FPU_IS_MVXW(op, frm) (op == `INST_FPU_MISC && frm == 4)
|
||||
|
||||
`define INST_SFU_TMC 4'h0
|
||||
`define INST_SFU_WSPAWN 4'h1
|
||||
`define INST_SFU_SPLIT 4'h2
|
||||
`define INST_SFU_JOIN 4'h3
|
||||
`define INST_SFU_BAR 4'h4
|
||||
`define INST_SFU_PRED 4'h5
|
||||
`define INST_SFU_CSRRW 4'h6
|
||||
`define INST_SFU_CSRRS 4'h7
|
||||
`define INST_SFU_CSRRC 4'h8
|
||||
`define INST_SFU_BITS 4
|
||||
`define INST_SFU_CSR(f3) (4'h6 + 4'(f3) - 4'h1)
|
||||
`define INST_SFU_IS_WCTL(op) (op <= 5)
|
||||
`define INST_SFU_IS_CSR(op) (op >= 6 && op <= 8)
|
||||
`define REG_EXT_VAL(ext, type) 32'h1
|
||||
//32'((1 << ((type == 1) ? ext[2:0] : ext[1:0]))-1)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -306,30 +71,6 @@
|
|||
`define CACHE_CLUSTER_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, tag_width, num_inputs, num_caches, uuid_width) \
|
||||
`CACHE_CLUSTER_MEM_ARB_TAG(`CACHE_NC_MEM_TAG_WIDTH(mshr_size, num_banks, num_reqs, mem_ports, line_size, word_size, `CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches), uuid_width), num_caches)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`ifdef ICACHE_ENABLE
|
||||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`ifdef DCACHE_ENABLE
|
||||
`define L1_ENABLE
|
||||
`endif
|
||||
|
||||
`define MEM_REQ_FLAG_FLUSH 0
|
||||
`define MEM_REQ_FLAG_IO 1
|
||||
`define MEM_REQ_FLAG_LOCAL 2 // should be last since optional
|
||||
`define MEM_REQ_FLAGS_WIDTH (`MEM_REQ_FLAG_LOCAL + `LMEM_ENABLED)
|
||||
|
||||
`define VX_MEM_PORTS `L3_MEM_PORTS
|
||||
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
|
||||
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
|
||||
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
|
||||
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH
|
||||
|
||||
`define VX_DCR_ADDR_WIDTH `VX_DCR_ADDR_BITS
|
||||
`define VX_DCR_DATA_WIDTH 32
|
||||
|
||||
`define TO_FULL_ADDR(x) {x, (`MEM_ADDR_WIDTH-$bits(x))'(0)}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
@ -371,6 +112,17 @@
|
|||
|
||||
`define POP_COUNT(out, in) `POP_COUNT_EX(out, in, 1)
|
||||
|
||||
// CONCAT(out, left_in, right_in, L, R)
// Generate-time helper that drives 'out' from one or both inputs:
//   - L == 0          : out = right_in, and left_in is passed to UNUSED_VAR
//   - R == 0 (L != 0) : out = left_in, and right_in is passed to UNUSED_VAR
//   - otherwise       : out = {left_in, right_in}
// L is tested first, so when both L and R are 0 the right_in branch is taken.
// L and R are presumably the bit widths of left_in and right_in - TODO confirm.
// Each branch label has the __LINE__ directive appended, presumably so that
// labels from different use sites do not collide.
`define CONCAT(out, left_in, right_in, L, R) \
|
||||
if (L == 0) begin : g_right`__LINE__ \
|
||||
`UNUSED_VAR (left_in) \
|
||||
assign out = right_in; \
|
||||
end else if (R == 0) begin : g_left`__LINE__ \
|
||||
`UNUSED_VAR (right_in) \
|
||||
assign out = left_in; \
|
||||
end else begin : g_concat`__LINE__ \
|
||||
assign out = {left_in, right_in}; \
|
||||
end
|
||||
|
||||
`define ASSIGN_VX_IF(dst, src) \
|
||||
assign dst.valid = src.valid; \
|
||||
assign dst.data = src.data; \
|
||||
|
@ -405,47 +157,43 @@
|
|||
assign dst.req_data.data = src.req_data.data; \
|
||||
assign dst.req_data.byteen = src.req_data.byteen; \
|
||||
assign dst.req_data.flags = src.req_data.flags; \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (TD != TS) begin \
|
||||
if (UUID != 0) begin \
|
||||
if (TD > TS) begin \
|
||||
if (TD != TS) begin : g_`__LINE__ \
|
||||
if (UUID != 0) begin : g_`__LINE__ \
|
||||
if (TD > TS) begin : g_`__LINE__ \
|
||||
assign dst.req_data.tag = {src.req_data.tag.uuid, {(TD-TS){1'b0}}, src.req_data.tag.value}; \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst.req_data.tag = {src.req_data.tag.uuid, src.req_data.tag.value[TD-UUID-1:0]}; \
|
||||
end \
|
||||
end else begin \
|
||||
if (TD > TS) begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
if (TD > TS) begin : g_`__LINE__ \
|
||||
assign dst.req_data.tag = {{(TD-TS){1'b0}}, src.req_data.tag}; \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst.req_data.tag = src.req_data.tag[TD-1:0]; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst.req_data.tag = src.req_data.tag; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
assign src.req_ready = dst.req_ready; \
|
||||
assign src.rsp_valid = dst.rsp_valid; \
|
||||
assign src.rsp_data.data = dst.rsp_data.data; \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (TD != TS) begin \
|
||||
if (UUID != 0) begin \
|
||||
if (TD > TS) begin \
|
||||
if (TD != TS) begin : g_`__LINE__ \
|
||||
if (UUID != 0) begin : g_`__LINE__ \
|
||||
if (TD > TS) begin : g_`__LINE__ \
|
||||
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, dst.rsp_data.tag.value[TS-UUID-1:0]}; \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign src.rsp_data.tag = {dst.rsp_data.tag.uuid, {(TS-TD){1'b0}}, dst.rsp_data.tag.value}; \
|
||||
end \
|
||||
end else begin \
|
||||
if (TD > TS) begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
if (TD > TS) begin : g_`__LINE__ \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag[TS-1:0]; \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign src.rsp_data.tag = {{(TS-TD){1'b0}}, dst.rsp_data.tag}; \
|
||||
end \
|
||||
end \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign src.rsp_data.tag = dst.rsp_data.tag; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */ \
|
||||
assign dst.rsp_ready = src.rsp_ready
|
||||
|
||||
`define INIT_VX_MEM_BUS_IF(itf) \
|
||||
|
@ -464,13 +212,11 @@
|
|||
assign itf.rsp_data = '0; \
|
||||
`UNUSED_VAR (itf.rsp_ready)
|
||||
|
||||
|
||||
`define BUFFER_DCR_BUS_IF(dst, src, ena, latency) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (latency != 0) begin \
|
||||
if (latency != 0) begin : g_`__LINE__ \
|
||||
VX_pipe_register #( \
|
||||
.DATAW (1 + `VX_DCR_ADDR_WIDTH + `VX_DCR_DATA_WIDTH), \
|
||||
.DEPTH (latency) \
|
||||
.DATAW (1 + VX_DCR_ADDR_WIDTH + VX_DCR_DATA_WIDTH), \
|
||||
.DEPTH (latency) \
|
||||
) pipe_reg ( \
|
||||
.clk (clk), \
|
||||
.reset (1'b0), \
|
||||
|
@ -478,24 +224,22 @@
|
|||
.data_in ({src.write_valid && ena, src.write_addr, src.write_data}), \
|
||||
.data_out ({dst.write_valid, dst.write_addr, dst.write_data}) \
|
||||
); \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign {dst.write_valid, dst.write_addr, dst.write_data} = {src.write_valid && ena, src.write_addr, src.write_data}; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
`define PERF_COUNTER_ADD(dst, src, field, width, count, reg_enable) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (count > 1) begin \
|
||||
if (count > 1) begin : g_`__LINE__ \
|
||||
wire [count-1:0][width-1:0] __reduce_add_i_field; \
|
||||
wire [width-1:0] __reduce_add_o_field; \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin \
|
||||
for (genvar __i = 0; __i < count; ++__i) begin : g_`__LINE__ \
|
||||
assign __reduce_add_i_field[__i] = src[__i].``field; \
|
||||
end \
|
||||
VX_reduce_tree #(.DATAW_IN(width), .N(count), .OP("+")) __reduce_add_field ( \
|
||||
__reduce_add_i_field, \
|
||||
__reduce_add_o_field \
|
||||
); \
|
||||
if (reg_enable) begin \
|
||||
if (reg_enable) begin : g_`__LINE__ \
|
||||
reg [width-1:0] __reduce_add_r_field; \
|
||||
always @(posedge clk) begin \
|
||||
if (reset) begin \
|
||||
|
@ -505,25 +249,130 @@
|
|||
end \
|
||||
end \
|
||||
assign dst.``field = __reduce_add_r_field; \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst.``field = __reduce_add_o_field; \
|
||||
end \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst.``field = src[0].``field; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
`define ASSIGN_BLOCKED_WID(dst, src, block_idx, block_size) \
|
||||
/* verilator lint_off GENUNNAMED */ \
|
||||
if (block_size != 1) begin \
|
||||
if (block_size != `NUM_WARPS) begin \
|
||||
assign dst = {src[`NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||
end else begin \
|
||||
assign dst = `NW_WIDTH'(block_idx); \
|
||||
if (block_size != 1) begin : g_`__LINE__ \
|
||||
if (block_size != `NUM_WARPS) begin : g_`__LINE__ \
|
||||
assign dst = {src[NW_WIDTH-1:`CLOG2(block_size)], `CLOG2(block_size)'(block_idx)}; \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst = NW_WIDTH'(block_idx); \
|
||||
end \
|
||||
end else begin \
|
||||
end else begin : g_`__LINE__ \
|
||||
assign dst = src; \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
// ITF_TO_AOS: unpack an array of valid/data/ready interface instances into
// packed per-signal vectors.
//   itf    - interface array, indexed 0 .. count-1
//   prefix - name stem for the wires declared by this macro
//   count  - number of interface instances
//   dataw  - width of each data element
// Declares prefix_valid, prefix_data and prefix_ready in the enclosing scope.
// valid and data are driven from itf[i]; itf[i].ready is driven from
// prefix_ready[i]. The loop index is the genvar i, so any argument text that
// mentions an identifier i will bind to it.
`define ITF_TO_AOS(itf, prefix, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_data; \
    wire [count-1:0] prefix``_valid; \
    wire [count-1:0] prefix``_ready; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign itf[i].ready = prefix``_ready[i]; \
        assign prefix``_data[i] = itf[i].data; \
        assign prefix``_valid[i] = itf[i].valid; \
    end
|
||||
|
||||
// AOS_TO_ITF: drive an array of valid/data/ready interface instances from
// packed per-signal vectors.
//   prefix - name stem for the wires declared by this macro
//   itf    - interface array, indexed 0 .. count-1
//   count  - number of interface instances
//   dataw  - width of each data element
// Declares prefix_valid, prefix_data and prefix_ready in the enclosing scope.
// itf[i].valid and itf[i].data are driven from the vectors; prefix_ready[i]
// is driven from itf[i].ready. The loop index is the genvar i.
`define AOS_TO_ITF(prefix, itf, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_data; \
    wire [count-1:0] prefix``_valid; \
    wire [count-1:0] prefix``_ready; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign prefix``_ready[i] = itf[i].ready; \
        assign itf[i].data = prefix``_data[i]; \
        assign itf[i].valid = prefix``_valid[i]; \
    end
|
||||
|
||||
// ITF_TO_AOS_V: valid/data-only variant of ITF_TO_AOS (no ready signal).
//   itf    - interface array, indexed 0 .. count-1
//   prefix - name stem for the wires declared by this macro
//   count  - number of interface instances
//   dataw  - width of each data element
// Declares prefix_valid and prefix_data in the enclosing scope and drives
// element i of each from itf[i]. The loop index is the genvar i.
`define ITF_TO_AOS_V(itf, prefix, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_data; \
    wire [count-1:0] prefix``_valid; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign prefix``_data[i] = itf[i].data; \
        assign prefix``_valid[i] = itf[i].valid; \
    end
|
||||
|
||||
// AOS_TO_ITF_V: valid/data-only variant of AOS_TO_ITF (no ready signal).
//   prefix - name stem for the wires declared by this macro
//   itf    - interface array, indexed 0 .. count-1
//   count  - number of interface instances
//   dataw  - width of each data element
// Declares prefix_valid and prefix_data in the enclosing scope and drives
// itf[i].valid / itf[i].data from element i of each. The loop index is the
// genvar i.
`define AOS_TO_ITF_V(prefix, itf, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_data; \
    wire [count-1:0] prefix``_valid; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign itf[i].data = prefix``_data[i]; \
        assign itf[i].valid = prefix``_valid[i]; \
    end
|
||||
|
||||
// ITF_TO_AOS_REQ: unpack the request channel (req_valid/req_data/req_ready)
// of an interface array into packed per-signal vectors.
//   itf    - interface array, indexed 0 .. count-1
//   prefix - name stem for the wires declared by this macro
//   count  - number of interface instances
//   dataw  - width of each req_data element
// Declares prefix_req_valid, prefix_req_data and prefix_req_ready in the
// enclosing scope. req_valid and req_data are driven from itf[i];
// itf[i].req_ready is driven from prefix_req_ready[i]. The loop index is the
// genvar i.
`define ITF_TO_AOS_REQ(itf, prefix, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_req_data; \
    wire [count-1:0] prefix``_req_valid; \
    wire [count-1:0] prefix``_req_ready; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign itf[i].req_ready = prefix``_req_ready[i]; \
        assign prefix``_req_data[i] = itf[i].req_data; \
        assign prefix``_req_valid[i] = itf[i].req_valid; \
    end
|
||||
|
||||
// AOS_TO_ITF_REQ: drive the request channel (req_valid/req_data/req_ready)
// of an interface array from packed per-signal vectors.
//   prefix - name stem for the wires declared by this macro
//   itf    - interface array, indexed 0 .. count-1
//   count  - number of interface instances
//   dataw  - width of each req_data element
// Declares prefix_req_valid, prefix_req_data and prefix_req_ready in the
// enclosing scope. itf[i].req_valid and itf[i].req_data are driven from the
// vectors; prefix_req_ready[i] is driven from itf[i].req_ready. The loop
// index is the genvar i.
`define AOS_TO_ITF_REQ(prefix, itf, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_req_data; \
    wire [count-1:0] prefix``_req_valid; \
    wire [count-1:0] prefix``_req_ready; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign prefix``_req_ready[i] = itf[i].req_ready; \
        assign itf[i].req_data = prefix``_req_data[i]; \
        assign itf[i].req_valid = prefix``_req_valid[i]; \
    end
|
||||
|
||||
// ITF_TO_AOS_REQ_V: req_valid/req_data-only variant of ITF_TO_AOS_REQ
// (no req_ready signal).
//   itf    - interface array, indexed 0 .. count-1
//   prefix - name stem for the wires declared by this macro
//   count  - number of interface instances
//   dataw  - width of each req_data element
// Declares prefix_req_valid and prefix_req_data in the enclosing scope and
// drives element i of each from itf[i]. The loop index is the genvar i.
`define ITF_TO_AOS_REQ_V(itf, prefix, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_req_data; \
    wire [count-1:0] prefix``_req_valid; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign prefix``_req_data[i] = itf[i].req_data; \
        assign prefix``_req_valid[i] = itf[i].req_valid; \
    end
|
||||
|
||||
// AOS_TO_ITF_REQ_V: req_valid/req_data-only variant of AOS_TO_ITF_REQ
// (no req_ready signal).
//   prefix - name stem for the wires declared by this macro
//   itf    - interface array, indexed 0 .. count-1
//   count  - number of interface instances
//   dataw  - width of each req_data element
// Declares prefix_req_valid and prefix_req_data in the enclosing scope and
// drives itf[i].req_valid / itf[i].req_data from element i of each. The loop
// index is the genvar i.
`define AOS_TO_ITF_REQ_V(prefix, itf, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_req_data; \
    wire [count-1:0] prefix``_req_valid; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign itf[i].req_data = prefix``_req_data[i]; \
        assign itf[i].req_valid = prefix``_req_valid[i]; \
    end
|
||||
|
||||
// ITF_TO_AOS_RSP: unpack the response channel (rsp_valid/rsp_data/rsp_ready)
// of an interface array into packed per-signal vectors.
//   itf    - interface array, indexed 0 .. count-1
//   prefix - name stem for the wires declared by this macro
//   count  - number of interface instances
//   dataw  - width of each rsp_data element
// Declares prefix_rsp_valid, prefix_rsp_data and prefix_rsp_ready in the
// enclosing scope. rsp_valid and rsp_data are driven from itf[i];
// itf[i].rsp_ready is driven from prefix_rsp_ready[i]. The loop index is the
// genvar i.
`define ITF_TO_AOS_RSP(itf, prefix, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_rsp_data; \
    wire [count-1:0] prefix``_rsp_valid; \
    wire [count-1:0] prefix``_rsp_ready; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign itf[i].rsp_ready = prefix``_rsp_ready[i]; \
        assign prefix``_rsp_data[i] = itf[i].rsp_data; \
        assign prefix``_rsp_valid[i] = itf[i].rsp_valid; \
    end
|
||||
|
||||
// AOS_TO_ITF_RSP: drive the response channel (rsp_valid/rsp_data/rsp_ready)
// of an interface array from packed per-signal vectors.
//   prefix - name stem for the wires declared by this macro
//   itf    - interface array, indexed 0 .. count-1
//   count  - number of interface instances
//   dataw  - width of each rsp_data element
// Declares prefix_rsp_valid, prefix_rsp_data and prefix_rsp_ready in the
// enclosing scope. itf[i].rsp_valid and itf[i].rsp_data are driven from the
// vectors; prefix_rsp_ready[i] is driven from itf[i].rsp_ready.
// The ready vector must be declared as prefix_rsp_ready: the generate loop
// assigns its individual bits, so the declared name has to match the name
// used in the assignment. The loop index is the genvar i.
`define AOS_TO_ITF_RSP(prefix, itf, count, dataw) \
    wire [count-1:0] prefix``_rsp_valid; \
    wire [count-1:0][dataw-1:0] prefix``_rsp_data; \
    wire [count-1:0] prefix``_rsp_ready; \
    for (genvar i = 0; i < count; ++i) begin : g_`__LINE__ \
        assign itf[i].rsp_valid = prefix``_rsp_valid[i]; \
        assign itf[i].rsp_data = prefix``_rsp_data[i]; \
        assign prefix``_rsp_ready[i] = itf[i].rsp_ready; \
    end
|
||||
|
||||
// ITF_TO_AOS_RSP_V: rsp_valid/rsp_data-only variant of ITF_TO_AOS_RSP
// (no rsp_ready signal).
//   itf    - interface array, indexed 0 .. count-1
//   prefix - name stem for the wires declared by this macro
//   count  - number of interface instances
//   dataw  - width of each rsp_data element
// Declares prefix_rsp_valid and prefix_rsp_data in the enclosing scope and
// drives element i of each from itf[i]. The loop index is the genvar i.
`define ITF_TO_AOS_RSP_V(itf, prefix, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_rsp_data; \
    wire [count-1:0] prefix``_rsp_valid; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign prefix``_rsp_data[i] = itf[i].rsp_data; \
        assign prefix``_rsp_valid[i] = itf[i].rsp_valid; \
    end
|
||||
|
||||
// AOS_TO_ITF_RSP_V: rsp_valid/rsp_data-only variant of AOS_TO_ITF_RSP
// (no rsp_ready signal).
//   prefix - name stem for the wires declared by this macro
//   itf    - interface array, indexed 0 .. count-1
//   count  - number of interface instances
//   dataw  - width of each rsp_data element
// Declares prefix_rsp_valid and prefix_rsp_data in the enclosing scope and
// drives itf[i].rsp_valid / itf[i].rsp_data from element i of each. The loop
// index is the genvar i.
`define AOS_TO_ITF_RSP_V(prefix, itf, count, dataw) \
    wire [count-1:0][dataw-1:0] prefix``_rsp_data; \
    wire [count-1:0] prefix``_rsp_valid; \
    for (genvar i = 0; i < count; i++) begin : g_`__LINE__ \
        assign itf[i].rsp_data = prefix``_rsp_data[i]; \
        assign itf[i].rsp_valid = prefix``_rsp_valid[i]; \
    end
|
||||
|
||||
`endif // VX_DEFINE_VH
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -98,13 +98,11 @@
|
|||
localparam `STRING __``x = x; \
|
||||
/* verilator lint_on UNUSED */
|
||||
|
||||
`define UNUSED_VAR(x) /* verilator lint_off GENUNNAMED */ \
|
||||
if (1) begin \
|
||||
`define UNUSED_VAR(x) if (1) begin : g_`__LINE__ \
|
||||
/* verilator lint_off UNUSED */ \
|
||||
wire [$bits(x)-1:0] __unused = x; \
|
||||
/* verilator lint_on UNUSED */ \
|
||||
end \
|
||||
/* verilator lint_on GENUNNAMED */
|
||||
end
|
||||
|
||||
`define UNUSED_PIN(x) /* verilator lint_off PINCONNECTEMPTY */ \
|
||||
. x () \
|
||||
|
|
|
@ -103,8 +103,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MRSQ_SIZE (`ICACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`ICACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (ICACHE_TAG_WIDTH),
|
||||
.FLAGS_WIDTH (0),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.WRITE_ENABLE (0),
|
||||
.REPL_POLICY (`ICACHE_REPL_POLICY),
|
||||
.NC_ENABLE (0),
|
||||
|
@ -151,8 +149,6 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.MRSQ_SIZE (`DCACHE_MRSQ_SIZE),
|
||||
.MREQ_SIZE (`DCACHE_WRITEBACK ? `DCACHE_MSHR_SIZE : `DCACHE_MREQ_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.WRITE_ENABLE (1),
|
||||
.WRITEBACK (`DCACHE_WRITEBACK),
|
||||
.DIRTY_BYTES (`DCACHE_DIRTYBYTES),
|
||||
|
@ -184,8 +180,8 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if[1]();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[0], icache_mem_bus_if[0], L1_MEM_TAG_WIDTH, ICACHE_MEM_TAG_WIDTH, UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_bus_if[1], dcache_mem_bus_if[0], L1_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, UUID_WIDTH);
|
||||
|
||||
VX_mem_arb #(
|
||||
.NUM_INPUTS (2),
|
||||
|
@ -210,7 +206,7 @@ module VX_socket import VX_gpu_pkg::*; #(
|
|||
.TAG_WIDTH (L1_MEM_ARB_TAG_WIDTH)
|
||||
) l1_mem_arb_bus_if();
|
||||
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, `UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF_EX (l1_mem_arb_bus_if, dcache_mem_bus_if[i], L1_MEM_ARB_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH, UUID_WIDTH);
|
||||
`ASSIGN_VX_MEM_BUS_IF (mem_bus_if[i], l1_mem_arb_bus_if);
|
||||
end
|
||||
end
|
||||
|
|
|
@ -21,24 +21,24 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
input wire reset,
|
||||
|
||||
// Memory request
|
||||
output wire mem_req_valid [`VX_MEM_PORTS],
|
||||
output wire mem_req_rw [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS],
|
||||
output wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS],
|
||||
input wire mem_req_ready [`VX_MEM_PORTS],
|
||||
output wire mem_req_valid [VX_MEM_PORTS],
|
||||
output wire mem_req_rw [VX_MEM_PORTS],
|
||||
output wire [VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [VX_MEM_PORTS],
|
||||
output wire [VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [VX_MEM_PORTS],
|
||||
output wire [VX_MEM_DATA_WIDTH-1:0] mem_req_data [VX_MEM_PORTS],
|
||||
output wire [VX_MEM_TAG_WIDTH-1:0] mem_req_tag [VX_MEM_PORTS],
|
||||
input wire mem_req_ready [VX_MEM_PORTS],
|
||||
|
||||
// Memory response
|
||||
input wire mem_rsp_valid [`VX_MEM_PORTS],
|
||||
input wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS],
|
||||
input wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS],
|
||||
output wire mem_rsp_ready [`VX_MEM_PORTS],
|
||||
input wire mem_rsp_valid [VX_MEM_PORTS],
|
||||
input wire [VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [VX_MEM_PORTS],
|
||||
input wire [VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [VX_MEM_PORTS],
|
||||
output wire mem_rsp_ready [VX_MEM_PORTS],
|
||||
|
||||
// DCR write request
|
||||
input wire dcr_wr_valid,
|
||||
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
|
||||
input wire [VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
input wire [VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
|
@ -90,8 +90,6 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
.WRITEBACK (`L3_WRITEBACK),
|
||||
.DIRTY_BYTES (`L3_DIRTYBYTES),
|
||||
.REPL_POLICY (`L3_REPL_POLICY),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.CORE_OUT_BUF (3),
|
||||
.MEM_OUT_BUF (3),
|
||||
.NC_ENABLE (1),
|
||||
|
@ -164,12 +162,12 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
|
||||
`ifdef PERF_ENABLE
|
||||
|
||||
localparam MEM_PORTS_CTR_W = `CLOG2(`VX_MEM_PORTS+1);
|
||||
localparam MEM_PORTS_CTR_W = `CLOG2(VX_MEM_PORTS+1);
|
||||
|
||||
wire [`VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
|
||||
wire [`VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
|
||||
wire [VX_MEM_PORTS-1:0] mem_req_fire, mem_rsp_fire;
|
||||
wire [VX_MEM_PORTS-1:0] mem_rd_req_fire, mem_wr_req_fire;
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_perf_ctrs
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; ++i) begin : g_perf_ctrs
|
||||
assign mem_req_fire[i] = mem_req_valid[i] & mem_req_ready[i];
|
||||
assign mem_rsp_fire[i] = mem_rsp_valid[i] & mem_rsp_ready[i];
|
||||
assign mem_rd_req_fire[i] = mem_req_fire[i] & ~mem_req_rw[i];
|
||||
|
@ -184,14 +182,14 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
`POP_COUNT(perf_mem_writes_per_cycle, mem_wr_req_fire);
|
||||
`POP_COUNT(perf_mem_rsps_per_cycle, mem_rsp_fire);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_mem_pending_reads;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_mem_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_mem_pending_reads <= $signed(perf_mem_pending_reads) +
|
||||
`PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
|
||||
PERF_CTR_BITS'($signed((MEM_PORTS_CTR_W+1)'(perf_mem_reads_per_cycle) - (MEM_PORTS_CTR_W+1)'(perf_mem_rsps_per_cycle)));
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -199,8 +197,8 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
if (reset) begin
|
||||
mem_perf <= '0;
|
||||
end else begin
|
||||
mem_perf.reads <= mem_perf.reads + `PERF_CTR_BITS'(perf_mem_reads_per_cycle);
|
||||
mem_perf.writes <= mem_perf.writes + `PERF_CTR_BITS'(perf_mem_writes_per_cycle);
|
||||
mem_perf.reads <= mem_perf.reads + PERF_CTR_BITS'(perf_mem_reads_per_cycle);
|
||||
mem_perf.writes <= mem_perf.writes + PERF_CTR_BITS'(perf_mem_writes_per_cycle);
|
||||
mem_perf.latency <= mem_perf.latency + perf_mem_pending_reads;
|
||||
end
|
||||
end
|
||||
|
@ -214,7 +212,7 @@ module Vortex import VX_gpu_pkg::*; (
|
|||
end
|
||||
|
||||
`ifdef DBG_TRACE_MEM
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_trace
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; ++i) begin : g_trace
|
||||
always @(posedge clk) begin
|
||||
if (mem_bus_if[i].req_valid && mem_bus_if[i].req_ready) begin
|
||||
if (mem_bus_if[i].req_data.rw) begin
|
||||
|
|
|
@ -14,9 +14,9 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module Vortex_axi import VX_gpu_pkg::*; #(
|
||||
parameter AXI_DATA_WIDTH = `VX_MEM_DATA_WIDTH,
|
||||
parameter AXI_DATA_WIDTH = VX_MEM_DATA_WIDTH,
|
||||
parameter AXI_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter AXI_TID_WIDTH = `VX_MEM_TAG_WIDTH,
|
||||
parameter AXI_TID_WIDTH = VX_MEM_TAG_WIDTH,
|
||||
parameter AXI_NUM_BANKS = 1
|
||||
)(
|
||||
`SCOPE_IO_DECL
|
||||
|
@ -76,30 +76,30 @@ module Vortex_axi import VX_gpu_pkg::*; #(
|
|||
|
||||
// DCR write request
|
||||
input wire dcr_wr_valid,
|
||||
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
|
||||
input wire [VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
input wire [VX_DCR_DATA_WIDTH-1:0] dcr_wr_data,
|
||||
|
||||
// Status
|
||||
output wire busy
|
||||
);
|
||||
localparam DST_LDATAW = `CLOG2(AXI_DATA_WIDTH);
|
||||
localparam SRC_LDATAW = `CLOG2(`VX_MEM_DATA_WIDTH);
|
||||
localparam SRC_LDATAW = `CLOG2(VX_MEM_DATA_WIDTH);
|
||||
localparam SUB_LDATAW = DST_LDATAW - SRC_LDATAW;
|
||||
localparam VX_MEM_TAG_A_WIDTH = `VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0);
|
||||
localparam VX_MEM_ADDR_A_WIDTH = `VX_MEM_ADDR_WIDTH - SUB_LDATAW;
|
||||
localparam VX_MEM_TAG_A_WIDTH = VX_MEM_TAG_WIDTH + `MAX(SUB_LDATAW, 0);
|
||||
localparam VX_MEM_ADDR_A_WIDTH = VX_MEM_ADDR_WIDTH - SUB_LDATAW;
|
||||
|
||||
wire mem_req_valid [`VX_MEM_PORTS];
|
||||
wire mem_req_rw [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS];
|
||||
wire mem_req_ready [`VX_MEM_PORTS];
|
||||
wire mem_req_valid [VX_MEM_PORTS];
|
||||
wire mem_req_rw [VX_MEM_PORTS];
|
||||
wire [VX_MEM_BYTEEN_WIDTH-1:0] mem_req_byteen [VX_MEM_PORTS];
|
||||
wire [VX_MEM_ADDR_WIDTH-1:0] mem_req_addr [VX_MEM_PORTS];
|
||||
wire [VX_MEM_DATA_WIDTH-1:0] mem_req_data [VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_WIDTH-1:0] mem_req_tag [VX_MEM_PORTS];
|
||||
wire mem_req_ready [VX_MEM_PORTS];
|
||||
|
||||
wire mem_rsp_valid [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS];
|
||||
wire mem_rsp_ready [`VX_MEM_PORTS];
|
||||
wire mem_rsp_valid [VX_MEM_PORTS];
|
||||
wire [VX_MEM_DATA_WIDTH-1:0] mem_rsp_data [VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_WIDTH-1:0] mem_rsp_tag [VX_MEM_PORTS];
|
||||
wire mem_rsp_ready [VX_MEM_PORTS];
|
||||
|
||||
`SCOPE_IO_SWITCH (1);
|
||||
|
||||
|
@ -129,27 +129,27 @@ module Vortex_axi import VX_gpu_pkg::*; #(
|
|||
.busy (busy)
|
||||
);
|
||||
|
||||
wire mem_req_valid_a [`VX_MEM_PORTS];
|
||||
wire mem_req_rw_a [`VX_MEM_PORTS];
|
||||
wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a [`VX_MEM_PORTS];
|
||||
wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [`VX_MEM_PORTS];
|
||||
wire [AXI_DATA_WIDTH-1:0] mem_req_data_a [`VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [`VX_MEM_PORTS];
|
||||
wire mem_req_ready_a [`VX_MEM_PORTS];
|
||||
wire mem_req_valid_a [VX_MEM_PORTS];
|
||||
wire mem_req_rw_a [VX_MEM_PORTS];
|
||||
wire [(AXI_DATA_WIDTH/8)-1:0] mem_req_byteen_a [VX_MEM_PORTS];
|
||||
wire [VX_MEM_ADDR_A_WIDTH-1:0] mem_req_addr_a [VX_MEM_PORTS];
|
||||
wire [AXI_DATA_WIDTH-1:0] mem_req_data_a [VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_req_tag_a [VX_MEM_PORTS];
|
||||
wire mem_req_ready_a [VX_MEM_PORTS];
|
||||
|
||||
wire mem_rsp_valid_a [`VX_MEM_PORTS];
|
||||
wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a [`VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [`VX_MEM_PORTS];
|
||||
wire mem_rsp_ready_a [`VX_MEM_PORTS];
|
||||
wire mem_rsp_valid_a [VX_MEM_PORTS];
|
||||
wire [AXI_DATA_WIDTH-1:0] mem_rsp_data_a [VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_A_WIDTH-1:0] mem_rsp_tag_a [VX_MEM_PORTS];
|
||||
wire mem_rsp_ready_a [VX_MEM_PORTS];
|
||||
|
||||
// Adjust memory data width to match AXI interface
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; i++) begin : g_mem_adapter
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; i++) begin : g_mem_adapter
|
||||
VX_mem_data_adapter #(
|
||||
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
|
||||
.SRC_DATA_WIDTH (VX_MEM_DATA_WIDTH),
|
||||
.DST_DATA_WIDTH (AXI_DATA_WIDTH),
|
||||
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
|
||||
.SRC_ADDR_WIDTH (VX_MEM_ADDR_WIDTH),
|
||||
.DST_ADDR_WIDTH (VX_MEM_ADDR_A_WIDTH),
|
||||
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.SRC_TAG_WIDTH (VX_MEM_TAG_WIDTH),
|
||||
.DST_TAG_WIDTH (VX_MEM_TAG_A_WIDTH),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (0)
|
||||
|
@ -191,11 +191,11 @@ module Vortex_axi import VX_gpu_pkg::*; #(
|
|||
.ADDR_WIDTH_OUT (AXI_ADDR_WIDTH),
|
||||
.TAG_WIDTH_IN (VX_MEM_TAG_A_WIDTH),
|
||||
.TAG_WIDTH_OUT (AXI_TID_WIDTH),
|
||||
.NUM_PORTS_IN (`VX_MEM_PORTS),
|
||||
.NUM_PORTS_IN (VX_MEM_PORTS),
|
||||
.NUM_BANKS_OUT (AXI_NUM_BANKS),
|
||||
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
|
||||
.REQ_OUT_BUF ((`VX_MEM_PORTS > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
|
||||
.REQ_OUT_BUF ((VX_MEM_PORTS > 1) ? 2 : 0),
|
||||
.RSP_OUT_BUF ((VX_MEM_PORTS > 1 || AXI_NUM_BANKS > 1) ? 2 : 0)
|
||||
) axi_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
|
|
@ -46,23 +46,23 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
localparam LMEM_ADDR_WIDTH = $bits(t_local_mem_addr);
|
||||
|
||||
localparam LMEM_BYTE_ADDR_WIDTH = LMEM_ADDR_WIDTH + $clog2(LMEM_DATA_SIZE);
|
||||
localparam CCI_VX_ADDR_WIDTH = `VX_MEM_ADDR_WIDTH + ($clog2(`VX_MEM_DATA_WIDTH) - $clog2(LMEM_DATA_WIDTH));
|
||||
localparam CCI_VX_ADDR_WIDTH = VX_MEM_ADDR_WIDTH + ($clog2(VX_MEM_DATA_WIDTH) - $clog2(LMEM_DATA_WIDTH));
|
||||
|
||||
localparam LMEM_BURST_CTRW = $bits(t_local_mem_burst_cnt);
|
||||
|
||||
localparam MEM_PORTS_BITS = `CLOG2(`VX_MEM_PORTS);
|
||||
localparam MEM_PORTS_BITS = `CLOG2(VX_MEM_PORTS);
|
||||
localparam MEM_PORTS_WIDTH = `UP(MEM_PORTS_BITS);
|
||||
|
||||
localparam CCI_DATA_WIDTH = $bits(t_ccip_clData);
|
||||
localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
|
||||
localparam CCI_ADDR_WIDTH = $bits(t_ccip_clAddr);
|
||||
|
||||
localparam RESET_CTR_WIDTH = `CLOG2(`RESET_DELAY+1);
|
||||
localparam RESET_CTR_WIDTH = `CLOG2(RESET_DELAY+1);
|
||||
|
||||
localparam AVS_RD_QUEUE_SIZE = 32;
|
||||
localparam VX_AVS_REQ_TAGW = `VX_MEM_TAG_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(`VX_MEM_DATA_WIDTH);
|
||||
localparam VX_AVS_REQ_TAGW = VX_MEM_TAG_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(VX_MEM_DATA_WIDTH);
|
||||
localparam CCI_AVS_REQ_TAGW = CCI_ADDR_WIDTH + `CLOG2(LMEM_DATA_WIDTH) - `CLOG2(CCI_DATA_WIDTH);
|
||||
localparam VX_AVS_REQ_TAGW2 = `MAX(`VX_MEM_TAG_WIDTH, VX_AVS_REQ_TAGW);
|
||||
localparam VX_AVS_REQ_TAGW2 = `MAX(VX_MEM_TAG_WIDTH, VX_AVS_REQ_TAGW);
|
||||
localparam CCI_AVS_REQ_TAGW2 = `MAX(CCI_ADDR_WIDTH, CCI_AVS_REQ_TAGW);
|
||||
localparam CCI_VX_TAG_WIDTH = `MAX(VX_AVS_REQ_TAGW2, CCI_AVS_REQ_TAGW2);
|
||||
localparam AVS_TAG_WIDTH = CCI_VX_TAG_WIDTH + 1; // adding the arbiter bit
|
||||
|
@ -86,7 +86,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
localparam MMIO_CMD_ARG2 = `AFU_IMAGE_MMIO_CMD_ARG2;
|
||||
localparam MMIO_STATUS = `AFU_IMAGE_MMIO_STATUS;
|
||||
|
||||
localparam COUT_TID_WIDTH = `CLOG2(`VX_MEM_BYTEEN_WIDTH);
|
||||
localparam COUT_TID_WIDTH = `CLOG2(VX_MEM_BYTEEN_WIDTH);
|
||||
localparam COUT_QUEUE_DATAW = COUT_TID_WIDTH + 8;
|
||||
localparam COUT_QUEUE_SIZE = 1024;
|
||||
|
||||
|
@ -109,7 +109,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
wire [63:0] dev_caps = {8'b0,
|
||||
5'(LMEM_BYTE_ADDR_WIDTH-20),
|
||||
3'(`CLOG2(NUM_LOCAL_MEM_BANKS)),
|
||||
8'(`LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
8'(LMEM_ENABLED ? `LMEM_LOG_SIZE : 0),
|
||||
16'(`NUM_CORES * `NUM_CLUSTERS),
|
||||
8'(`NUM_WARPS),
|
||||
8'(`NUM_THREADS),
|
||||
|
@ -123,18 +123,18 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
// Vortex ports ///////////////////////////////////////////////////////////
|
||||
|
||||
wire vx_mem_req_valid [`VX_MEM_PORTS];
|
||||
wire vx_mem_req_rw [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_req_data [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag [`VX_MEM_PORTS];
|
||||
wire vx_mem_req_ready [`VX_MEM_PORTS];
|
||||
wire vx_mem_req_valid [VX_MEM_PORTS];
|
||||
wire vx_mem_req_rw [VX_MEM_PORTS];
|
||||
wire [VX_MEM_BYTEEN_WIDTH-1:0] vx_mem_req_byteen [VX_MEM_PORTS];
|
||||
wire [VX_MEM_ADDR_WIDTH-1:0] vx_mem_req_addr [VX_MEM_PORTS];
|
||||
wire [VX_MEM_DATA_WIDTH-1:0] vx_mem_req_data [VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_WIDTH-1:0] vx_mem_req_tag [VX_MEM_PORTS];
|
||||
wire vx_mem_req_ready [VX_MEM_PORTS];
|
||||
|
||||
wire vx_mem_rsp_valid [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_DATA_WIDTH-1:0] vx_mem_rsp_data [`VX_MEM_PORTS];
|
||||
wire [`VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag [`VX_MEM_PORTS];
|
||||
wire vx_mem_rsp_ready [`VX_MEM_PORTS];
|
||||
wire vx_mem_rsp_valid [VX_MEM_PORTS];
|
||||
wire [VX_MEM_DATA_WIDTH-1:0] vx_mem_rsp_data [VX_MEM_PORTS];
|
||||
wire [VX_MEM_TAG_WIDTH-1:0] vx_mem_rsp_tag [VX_MEM_PORTS];
|
||||
wire vx_mem_rsp_ready [VX_MEM_PORTS];
|
||||
|
||||
// CMD variables //////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -146,8 +146,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
wire [CCI_ADDR_WIDTH-1:0] cmd_mem_addr = CCI_ADDR_WIDTH'(cmd_args[1]);
|
||||
wire [CCI_ADDR_WIDTH-1:0] cmd_data_size = CCI_ADDR_WIDTH'(cmd_args[2]);
|
||||
|
||||
wire [`VX_DCR_ADDR_WIDTH-1:0] cmd_dcr_addr = `VX_DCR_ADDR_WIDTH'(cmd_args[0]);
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] cmd_dcr_data = `VX_DCR_DATA_WIDTH'(cmd_args[1]);
|
||||
wire [VX_DCR_ADDR_WIDTH-1:0] cmd_dcr_addr = VX_DCR_ADDR_WIDTH'(cmd_args[0]);
|
||||
wire [VX_DCR_DATA_WIDTH-1:0] cmd_dcr_data = VX_DCR_DATA_WIDTH'(cmd_args[1]);
|
||||
|
||||
// MMIO controller ////////////////////////////////////////////////////////
|
||||
|
||||
|
@ -217,8 +217,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
// Console output queue read //////////////////////////////////////////////
|
||||
|
||||
wire [`VX_MEM_PORTS-1:0][COUT_QUEUE_DATAW-1:0] cout_q_dout;
|
||||
wire [`VX_MEM_PORTS-1:0] cout_q_full, cout_q_empty, cout_q_pop;
|
||||
wire [VX_MEM_PORTS-1:0][COUT_QUEUE_DATAW-1:0] cout_q_dout;
|
||||
wire [VX_MEM_PORTS-1:0] cout_q_full, cout_q_empty, cout_q_pop;
|
||||
|
||||
reg [MEM_PORTS_WIDTH-1:0] cout_q_id;
|
||||
|
||||
|
@ -232,7 +232,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_cout_q_pop
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; ++i) begin : g_cout_q_pop
|
||||
assign cout_q_pop[i] = (cp2af_sRxPort.c0.mmioRdValid && mmio_req_hdr.address == MMIO_STATUS)
|
||||
&& (cout_q_id == i)
|
||||
&& ~cout_q_empty[i];
|
||||
|
@ -244,7 +244,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
`ifdef SIMULATION
|
||||
`ifndef VERILATOR
|
||||
// disable assertions until full reset
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
|
||||
reg [`CLOG2(RESET_DELAY+1)-1:0] assert_delay_ctr;
|
||||
initial begin
|
||||
$assertoff;
|
||||
end
|
||||
|
@ -253,7 +253,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
assert_delay_ctr <= '0;
|
||||
end else begin
|
||||
assert_delay_ctr <= assert_delay_ctr + $bits(assert_delay_ctr)'(1);
|
||||
if (assert_delay_ctr == (`RESET_DELAY-1)) begin
|
||||
if (assert_delay_ctr == (RESET_DELAY-1)) begin
|
||||
$asserton; // enable assertions
|
||||
end
|
||||
end
|
||||
|
@ -418,7 +418,7 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
`TRACE(2, ("%t: AFU: Goto STATE RUN\n", $time))
|
||||
`endif
|
||||
state <= STATE_RUN;
|
||||
vx_reset_ctr <= RESET_CTR_WIDTH'(`RESET_DELAY-1);
|
||||
vx_reset_ctr <= RESET_CTR_WIDTH'(RESET_DELAY-1);
|
||||
vx_reset <= 1;
|
||||
end
|
||||
default: begin
|
||||
|
@ -507,18 +507,18 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
.DATA_SIZE (LMEM_DATA_SIZE),
|
||||
.ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
|
||||
.TAG_WIDTH (CCI_VX_TAG_WIDTH)
|
||||
) vx_mem_bus_if[`VX_MEM_PORTS]();
|
||||
) vx_mem_bus_if[VX_MEM_PORTS]();
|
||||
|
||||
wire [`VX_MEM_PORTS-1:0] vx_mem_req_valid_qual;
|
||||
wire [`VX_MEM_PORTS-1:0] vx_mem_req_ready_qual;
|
||||
wire [VX_MEM_PORTS-1:0] vx_mem_req_valid_qual;
|
||||
wire [VX_MEM_PORTS-1:0] vx_mem_req_ready_qual;
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_vx_mem_adapter
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; ++i) begin : g_vx_mem_adapter
|
||||
VX_mem_data_adapter #(
|
||||
.SRC_DATA_WIDTH (`VX_MEM_DATA_WIDTH),
|
||||
.SRC_DATA_WIDTH (VX_MEM_DATA_WIDTH),
|
||||
.DST_DATA_WIDTH (LMEM_DATA_WIDTH),
|
||||
.SRC_ADDR_WIDTH (`VX_MEM_ADDR_WIDTH),
|
||||
.SRC_ADDR_WIDTH (VX_MEM_ADDR_WIDTH),
|
||||
.DST_ADDR_WIDTH (CCI_VX_ADDR_WIDTH),
|
||||
.SRC_TAG_WIDTH (`VX_MEM_TAG_WIDTH),
|
||||
.SRC_TAG_WIDTH (VX_MEM_TAG_WIDTH),
|
||||
.DST_TAG_WIDTH (CCI_VX_TAG_WIDTH),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (2)
|
||||
|
@ -632,20 +632,20 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
`UNUSED_VAR (cci_vx_mem_arb_out_if[0].req_data.flags)
|
||||
|
||||
// final merged memory interface
|
||||
wire mem_req_valid [`VX_MEM_PORTS];
|
||||
wire mem_req_rw [`VX_MEM_PORTS];
|
||||
wire [CCI_VX_ADDR_WIDTH-1:0] mem_req_addr [`VX_MEM_PORTS];
|
||||
wire [LMEM_DATA_SIZE-1:0] mem_req_byteen [`VX_MEM_PORTS];
|
||||
wire [LMEM_DATA_WIDTH-1:0] mem_req_data [`VX_MEM_PORTS];
|
||||
wire [AVS_TAG_WIDTH-1:0] mem_req_tag [`VX_MEM_PORTS];
|
||||
wire mem_req_ready [`VX_MEM_PORTS];
|
||||
wire mem_req_valid [VX_MEM_PORTS];
|
||||
wire mem_req_rw [VX_MEM_PORTS];
|
||||
wire [CCI_VX_ADDR_WIDTH-1:0] mem_req_addr [VX_MEM_PORTS];
|
||||
wire [LMEM_DATA_SIZE-1:0] mem_req_byteen [VX_MEM_PORTS];
|
||||
wire [LMEM_DATA_WIDTH-1:0] mem_req_data [VX_MEM_PORTS];
|
||||
wire [AVS_TAG_WIDTH-1:0] mem_req_tag [VX_MEM_PORTS];
|
||||
wire mem_req_ready [VX_MEM_PORTS];
|
||||
|
||||
wire mem_rsp_valid [`VX_MEM_PORTS];
|
||||
wire [LMEM_DATA_WIDTH-1:0] mem_rsp_data [`VX_MEM_PORTS];
|
||||
wire [AVS_TAG_WIDTH-1:0] mem_rsp_tag [`VX_MEM_PORTS];
|
||||
wire mem_rsp_ready [`VX_MEM_PORTS];
|
||||
wire mem_rsp_valid [VX_MEM_PORTS];
|
||||
wire [LMEM_DATA_WIDTH-1:0] mem_rsp_data [VX_MEM_PORTS];
|
||||
wire [AVS_TAG_WIDTH-1:0] mem_rsp_tag [VX_MEM_PORTS];
|
||||
wire mem_rsp_ready [VX_MEM_PORTS];
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; ++i) begin : g_mem_bus_if
|
||||
if (i == 0) begin : g_i0
|
||||
// assign port0 to CCI/VX arbiter
|
||||
assign mem_req_valid[i] = cci_vx_mem_arb_out_if[i].req_valid;
|
||||
|
@ -683,13 +683,13 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
.ADDR_WIDTH_IN (CCI_VX_ADDR_WIDTH),
|
||||
.ADDR_WIDTH_OUT(LMEM_ADDR_WIDTH),
|
||||
.BURST_WIDTH (LMEM_BURST_CTRW),
|
||||
.NUM_PORTS_IN (`VX_MEM_PORTS),
|
||||
.NUM_PORTS_IN (VX_MEM_PORTS),
|
||||
.NUM_BANKS_OUT (NUM_LOCAL_MEM_BANKS),
|
||||
.TAG_WIDTH (AVS_TAG_WIDTH),
|
||||
.RD_QUEUE_SIZE (AVS_RD_QUEUE_SIZE),
|
||||
.INTERLEAVE (`PLATFORM_MEMORY_INTERLEAVE),
|
||||
.REQ_OUT_BUF (2), // always needed due to CCI/VX arbiter
|
||||
.RSP_OUT_BUF ((`VX_MEM_PORTS > 1 || NUM_LOCAL_MEM_BANKS > 1) ? 2 : 0)
|
||||
.RSP_OUT_BUF ((VX_MEM_PORTS > 1 || NUM_LOCAL_MEM_BANKS > 1) ? 2 : 0)
|
||||
) avs_adapter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -1003,8 +1003,8 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
// Vortex /////////////////////////////////////////////////////////////////
|
||||
|
||||
wire vx_dcr_wr_valid = (STATE_DCR_WRITE == state);
|
||||
wire [`VX_DCR_ADDR_WIDTH-1:0] vx_dcr_wr_addr = cmd_dcr_addr;
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] vx_dcr_wr_data = cmd_dcr_data;
|
||||
wire [VX_DCR_ADDR_WIDTH-1:0] vx_dcr_wr_addr = cmd_dcr_addr;
|
||||
wire [VX_DCR_DATA_WIDTH-1:0] vx_dcr_wr_data = cmd_dcr_data;
|
||||
|
||||
`SCOPE_IO_SWITCH (2);
|
||||
|
||||
|
@ -1040,23 +1040,23 @@ module vortex_afu import ccip_if_pkg::*; import local_mem_cfg_pkg::*; import VX_
|
|||
|
||||
// COUT HANDLING //////////////////////////////////////////////////////////
|
||||
|
||||
for (genvar i = 0; i < `VX_MEM_PORTS; ++i) begin : g_cout
|
||||
for (genvar i = 0; i < VX_MEM_PORTS; ++i) begin : g_cout
|
||||
|
||||
wire [COUT_TID_WIDTH-1:0] cout_tid;
|
||||
|
||||
VX_onehot_encoder #(
|
||||
.N (`VX_MEM_BYTEEN_WIDTH)
|
||||
.N (VX_MEM_BYTEEN_WIDTH)
|
||||
) cout_tid_enc (
|
||||
.data_in (vx_mem_req_byteen[i]),
|
||||
.data_out (cout_tid),
|
||||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire [`VX_MEM_BYTEEN_WIDTH-1:0][7:0] vx_mem_req_data_m = vx_mem_req_data[i];
|
||||
wire [VX_MEM_BYTEEN_WIDTH-1:0][7:0] vx_mem_req_data_m = vx_mem_req_data[i];
|
||||
|
||||
wire [7:0] cout_char = vx_mem_req_data_m[cout_tid];
|
||||
|
||||
wire [`VX_MEM_ADDR_WIDTH-1:0] io_cout_addr_b = `VX_MEM_ADDR_WIDTH'(`IO_COUT_ADDR >> `CLOG2(`MEM_BLOCK_SIZE));
|
||||
wire [VX_MEM_ADDR_WIDTH-1:0] io_cout_addr_b = VX_MEM_ADDR_WIDTH'(`IO_COUT_ADDR >> `CLOG2(`MEM_BLOCK_SIZE));
|
||||
|
||||
wire vx_mem_is_cout = (vx_mem_req_addr[i] == io_cout_addr_b);
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_ctrl #(
|
||||
module VX_afu_ctrl import VX_gpu_pkg::*; #(
|
||||
parameter S_AXI_ADDR_WIDTH = 8,
|
||||
parameter S_AXI_DATA_WIDTH = 32
|
||||
) (
|
||||
|
@ -58,8 +58,8 @@ module VX_afu_ctrl #(
|
|||
`endif
|
||||
|
||||
output wire dcr_wr_valid,
|
||||
output wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
output wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
|
||||
output wire [VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr,
|
||||
output wire [VX_DCR_DATA_WIDTH-1:0] dcr_wr_data
|
||||
);
|
||||
|
||||
// Address Info
|
||||
|
@ -437,7 +437,7 @@ module VX_afu_ctrl #(
|
|||
assign ap_ctrl_read = s_axi_r_fire && (raddr == ADDR_AP_CTRL);
|
||||
|
||||
assign dcr_wr_valid = dcr_wr_valid_r;
|
||||
assign dcr_wr_addr = `VX_DCR_ADDR_WIDTH'(dcra_r);
|
||||
assign dcr_wr_data = `VX_DCR_DATA_WIDTH'(dcrv_r);
|
||||
assign dcr_wr_addr = VX_DCR_ADDR_WIDTH'(dcra_r);
|
||||
assign dcr_wr_data = VX_DCR_DATA_WIDTH'(dcrv_r);
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
`include "vortex_afu.vh"
|
||||
|
||||
module VX_afu_wrap #(
|
||||
module VX_afu_wrap import VX_gpu_pkg::*; #(
|
||||
parameter C_S_AXI_CTRL_ADDR_WIDTH = 8,
|
||||
parameter C_S_AXI_CTRL_DATA_WIDTH = 32,
|
||||
parameter C_M_AXI_MEM_ID_WIDTH = `PLATFORM_MEMORY_ID_WIDTH,
|
||||
|
@ -111,14 +111,14 @@ module VX_afu_wrap #(
|
|||
`REPEAT (`PLATFORM_MEMORY_NUM_BANKS, AXI_MEM_TO_ARRAY, REPEAT_SEMICOLON);
|
||||
`endif
|
||||
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] vx_reset_ctr;
|
||||
reg [`CLOG2(RESET_DELAY+1)-1:0] vx_reset_ctr;
|
||||
reg [PENDING_WR_SIZEW-1:0] vx_pending_writes;
|
||||
reg vx_reset = 1; // asserted at initialization
|
||||
wire vx_busy;
|
||||
|
||||
wire dcr_wr_valid;
|
||||
wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
|
||||
wire [VX_DCR_ADDR_WIDTH-1:0] dcr_wr_addr;
|
||||
wire [VX_DCR_DATA_WIDTH-1:0] dcr_wr_data;
|
||||
|
||||
state_e state;
|
||||
|
||||
|
@ -149,7 +149,7 @@ module VX_afu_wrap #(
|
|||
`TRACE(2, ("%t: AFU: Begin initialization\n", $time))
|
||||
`endif
|
||||
state <= STATE_INIT;
|
||||
vx_reset_ctr <= (`RESET_DELAY-1);
|
||||
vx_reset_ctr <= (RESET_DELAY-1);
|
||||
vx_reset <= 1;
|
||||
end
|
||||
end
|
||||
|
@ -435,7 +435,7 @@ module VX_afu_wrap #(
|
|||
`ifdef SIMULATION
|
||||
`ifndef VERILATOR
|
||||
// disable assertions until full reset
|
||||
reg [`CLOG2(`RESET_DELAY+1)-1:0] assert_delay_ctr;
|
||||
reg [`CLOG2(RESET_DELAY+1)-1:0] assert_delay_ctr;
|
||||
reg assert_enabled;
|
||||
initial begin
|
||||
$assertoff(0, vortex_axi);
|
||||
|
@ -446,7 +446,7 @@ module VX_afu_wrap #(
|
|||
assert_enabled <= 0;
|
||||
end else begin
|
||||
if (~assert_enabled) begin
|
||||
if (assert_delay_ctr == (`RESET_DELAY-1)) begin
|
||||
if (assert_delay_ctr == (RESET_DELAY-1)) begin
|
||||
assert_enabled <= 1;
|
||||
$asserton(0, vortex_axi); // enable assertions
|
||||
end else begin
|
||||
|
|
2
hw/rtl/cache/VX_bank_flush.sv
vendored
2
hw/rtl/cache/VX_bank_flush.sv
vendored
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_bank_flush #(
|
||||
module VX_bank_flush import VX_gpu_pkg::*; #(
|
||||
parameter BANK_ID = 0,
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
|
|
61
hw/rtl/cache/VX_cache.sv
vendored
61
hw/rtl/cache/VX_cache.sv
vendored
|
@ -54,15 +54,9 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_BUF = 3,
|
||||
|
||||
|
@ -96,10 +90,10 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
localparam LINE_ADDR_WIDTH = (`CS_WORD_ADDR_WIDTH - BANK_SEL_BITS - WORD_SEL_BITS);
|
||||
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(FLAGS_WIDTH);
|
||||
localparam CORE_REQ_DATAW = LINE_ADDR_WIDTH + 1 + WORD_SEL_WIDTH + WORD_SIZE + WORD_WIDTH + TAG_WIDTH + `UP(MEM_FLAGS_WIDTH);
|
||||
localparam CORE_RSP_DATAW = WORD_WIDTH + TAG_WIDTH;
|
||||
localparam BANK_MEM_TAG_WIDTH = UUID_WIDTH + MSHR_ADDR_WIDTH;
|
||||
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(FLAGS_WIDTH));
|
||||
localparam MEM_REQ_DATAW = (`CS_LINE_ADDR_WIDTH + 1 + LINE_SIZE + `CS_LINE_WIDTH + BANK_MEM_TAG_WIDTH + `UP(MEM_FLAGS_WIDTH));
|
||||
localparam MEM_RSP_DATAW = `CS_LINE_WIDTH + MEM_TAG_WIDTH;
|
||||
localparam MEM_PORTS_SEL_BITS = `CLOG2(MEM_PORTS);
|
||||
localparam MEM_PORTS_SEL_WIDTH = `UP(MEM_PORTS_SEL_BITS);
|
||||
|
@ -130,7 +124,6 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
VX_cache_flush #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.NUM_BANKS (NUM_BANKS),
|
||||
.UUID_WIDTH(UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.BANK_SEL_LATENCY (`TO_OUT_BUF_REG(REQ_XBAR_BUF)) // request xbar latency
|
||||
) flush_unit (
|
||||
|
@ -244,7 +237,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0][`CS_WORD_WIDTH-1:0] per_bank_core_req_data;
|
||||
wire [NUM_BANKS-1:0][TAG_WIDTH-1:0] per_bank_core_req_tag;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] per_bank_core_req_idx;
|
||||
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_core_req_flags;
|
||||
wire [NUM_BANKS-1:0][`UP(MEM_FLAGS_WIDTH)-1:0] per_bank_core_req_flags;
|
||||
wire [NUM_BANKS-1:0] per_bank_core_req_ready;
|
||||
|
||||
wire [NUM_BANKS-1:0] per_bank_core_rsp_valid;
|
||||
|
@ -259,7 +252,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_BANKS-1:0][LINE_SIZE-1:0] per_bank_mem_req_byteen;
|
||||
wire [NUM_BANKS-1:0][`CS_LINE_WIDTH-1:0] per_bank_mem_req_data;
|
||||
wire [NUM_BANKS-1:0][BANK_MEM_TAG_WIDTH-1:0] per_bank_mem_req_tag;
|
||||
wire [NUM_BANKS-1:0][`UP(FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
|
||||
wire [NUM_BANKS-1:0][`UP(MEM_FLAGS_WIDTH)-1:0] per_bank_mem_req_flags;
|
||||
wire [NUM_BANKS-1:0] per_bank_mem_req_ready;
|
||||
|
||||
wire [NUM_REQS-1:0] core_req_valid;
|
||||
|
@ -268,7 +261,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [NUM_REQS-1:0][WORD_SIZE-1:0] core_req_byteen;
|
||||
wire [NUM_REQS-1:0][`CS_WORD_WIDTH-1:0] core_req_data;
|
||||
wire [NUM_REQS-1:0][TAG_WIDTH-1:0] core_req_tag;
|
||||
wire [NUM_REQS-1:0][`UP(FLAGS_WIDTH)-1:0] core_req_flags;
|
||||
wire [NUM_REQS-1:0][`UP(MEM_FLAGS_WIDTH)-1:0] core_req_flags;
|
||||
wire [NUM_REQS-1:0] core_req_ready;
|
||||
|
||||
wire [NUM_REQS-1:0][LINE_ADDR_WIDTH-1:0] core_req_line_addr;
|
||||
|
@ -285,7 +278,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
assign core_req_addr[i] = core_bus2_if[i].req_data.addr;
|
||||
assign core_req_data[i] = core_bus2_if[i].req_data.data;
|
||||
assign core_req_tag[i] = core_bus2_if[i].req_data.tag;
|
||||
assign core_req_flags[i] = `UP(FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags);
|
||||
assign core_req_flags[i] = `UP(MEM_FLAGS_WIDTH)'(core_bus2_if[i].req_data.flags);
|
||||
assign core_bus2_if[i].req_ready = core_req_ready[i];
|
||||
end
|
||||
|
||||
|
@ -324,14 +317,14 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
assign per_bank_core_req_fire = per_bank_core_req_valid & per_bank_mem_req_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||
wire [PERF_CTR_BITS-1:0] perf_collisions;
|
||||
`endif
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (CORE_REQ_DATAW),
|
||||
.PERF_CTR_BITS (`PERF_CTR_BITS),
|
||||
.PERF_CTR_BITS (PERF_CTR_BITS),
|
||||
.ARBITER ("R"),
|
||||
.OUT_BUF (REQ_XBAR_BUF)
|
||||
) req_xbar (
|
||||
|
@ -383,9 +376,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.CRSQ_SIZE (CRSQ_SIZE),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.CORE_OUT_REG (CORE_RSP_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(CORE_OUT_BUF)),
|
||||
.MEM_OUT_REG (MEM_REQ_BUF_ENABLE ? 0 : `TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) bank (
|
||||
|
@ -535,7 +526,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
wire [`CS_LINE_ADDR_WIDTH-1:0] mem_req_addr;
|
||||
wire [`CS_LINE_WIDTH-1:0] mem_req_data;
|
||||
wire [LINE_SIZE-1:0] mem_req_byteen;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags;
|
||||
wire [`UP(MEM_FLAGS_WIDTH)-1:0] mem_req_flags;
|
||||
wire [BANK_MEM_TAG_WIDTH-1:0] mem_req_tag;
|
||||
|
||||
assign {
|
||||
|
@ -549,7 +540,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [`CS_MEM_ADDR_WIDTH-1:0] mem_req_addr_w;
|
||||
wire [MEM_TAG_WIDTH-1:0] mem_req_tag_w;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags_w;
|
||||
wire [`UP(MEM_FLAGS_WIDTH)-1:0] mem_req_flags_w;
|
||||
|
||||
if (NUM_BANKS > 1) begin : g_mem_req_tag_multibanks
|
||||
if (NUM_BANKS != MEM_PORTS) begin : g_arb_sel
|
||||
|
@ -576,7 +567,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.DATAW (1 + LINE_SIZE + `CS_MEM_ADDR_WIDTH + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(MEM_FLAGS_WIDTH)),
|
||||
.SIZE (MEM_REQ_BUF_ENABLE ? `TO_OUT_BUF_SIZE(MEM_OUT_BUF) : 0),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(MEM_OUT_BUF))
|
||||
) mem_req_buf (
|
||||
|
@ -590,7 +581,7 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
.ready_out (mem_bus_tmp_if[i].req_ready)
|
||||
);
|
||||
|
||||
if (FLAGS_WIDTH != 0) begin : g_mem_req_flags
|
||||
if (MEM_FLAGS_WIDTH != 0) begin : g_mem_req_flags
|
||||
assign mem_bus_tmp_if[i].req_data.flags = mem_req_flags_w;
|
||||
end else begin : g_no_mem_req_flags
|
||||
assign mem_bus_tmp_if[i].req_data.flags = '0;
|
||||
|
@ -638,13 +629,13 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_read_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_write_misses;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [PERF_CTR_BITS-1:0] perf_read_misses;
|
||||
reg [PERF_CTR_BITS-1:0] perf_write_misses;
|
||||
reg [PERF_CTR_BITS-1:0] perf_mshr_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -656,13 +647,13 @@ module VX_cache import VX_gpu_pkg::*; #(
|
|||
perf_mem_stalls <= '0;
|
||||
perf_crsp_stalls <= '0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + `PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses + `PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + `PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_read_misses <= perf_read_misses + PERF_CTR_BITS'(perf_read_miss_per_cycle);
|
||||
perf_write_misses <= perf_write_misses + PERF_CTR_BITS'(perf_write_miss_per_cycle);
|
||||
perf_mshr_stalls <= perf_mshr_stalls + PERF_CTR_BITS'(perf_mshr_stall_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
23
hw/rtl/cache/VX_cache_bank.sv
vendored
23
hw/rtl/cache/VX_cache_bank.sv
vendored
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_bank #(
|
||||
module VX_cache_bank import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID= "",
|
||||
parameter BANK_ID = 0,
|
||||
|
||||
|
@ -50,15 +50,9 @@ module VX_cache_bank #(
|
|||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// Core response output register
|
||||
parameter CORE_OUT_REG = 0,
|
||||
|
||||
|
@ -88,7 +82,7 @@ module VX_cache_bank #(
|
|||
input wire [`CS_WORD_WIDTH-1:0] core_req_data, // data to be written
|
||||
input wire [TAG_WIDTH-1:0] core_req_tag, // identifier of the request (request id)
|
||||
input wire [REQ_SEL_WIDTH-1:0] core_req_idx, // index of the request in the core request array
|
||||
input wire [`UP(FLAGS_WIDTH)-1:0] core_req_flags,
|
||||
input wire [`UP(MEM_FLAGS_WIDTH)-1:0] core_req_flags,
|
||||
output wire core_req_ready,
|
||||
|
||||
// Core Response
|
||||
|
@ -105,7 +99,7 @@ module VX_cache_bank #(
|
|||
output wire [LINE_SIZE-1:0] mem_req_byteen,
|
||||
output wire [`CS_LINE_WIDTH-1:0] mem_req_data,
|
||||
output wire [MEM_TAG_WIDTH-1:0] mem_req_tag,
|
||||
output wire [`UP(FLAGS_WIDTH)-1:0] mem_req_flags,
|
||||
output wire [`UP(MEM_FLAGS_WIDTH)-1:0] mem_req_flags,
|
||||
input wire mem_req_ready,
|
||||
|
||||
// Memory response
|
||||
|
@ -169,7 +163,7 @@ module VX_cache_bank #(
|
|||
wire is_dirty_st0, is_dirty_st1;
|
||||
wire is_replay_st0, is_replay_st1;
|
||||
wire is_hit_st0, is_hit_st1;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1;
|
||||
wire [`UP(MEM_FLAGS_WIDTH)-1:0] flags_sel, flags_st0, flags_st1;
|
||||
wire mshr_pending_st0, mshr_pending_st1;
|
||||
wire [MSHR_ADDR_WIDTH-1:0] mshr_previd_st0, mshr_previd_st1;
|
||||
wire mshr_empty;
|
||||
|
@ -301,7 +295,7 @@ module VX_cache_bank #(
|
|||
wire is_replay_sel = replay_enable;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_LINE_ADDR_WIDTH + `CS_LINE_WIDTH + 1 + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH),
|
||||
.RESETW (1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
|
@ -406,7 +400,7 @@ module VX_cache_bank #(
|
|||
assign mshr_id_st0 = is_replay_st0 ? replay_id_st0 : mshr_alloc_id_st0;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1),
|
||||
.DATAW (1 + 1 + 1 + 1 + 1 + 1 + 1 + 1 + `UP(MEM_FLAGS_WIDTH) + `CS_WAY_SEL_WIDTH + `CS_TAG_SEL_BITS + `CS_TAG_SEL_BITS + `CS_LINE_SEL_BITS + `CS_LINE_WIDTH + WORD_SIZE + WORD_SEL_WIDTH + REQ_SEL_WIDTH + TAG_WIDTH + MSHR_ADDR_WIDTH + MSHR_ADDR_WIDTH + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
|
@ -506,7 +500,6 @@ module VX_cache_bank #(
|
|||
.NUM_BANKS (NUM_BANKS),
|
||||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.DATA_WIDTH (WORD_SEL_WIDTH + WORD_SIZE + `CS_WORD_WIDTH + TAG_WIDTH + REQ_SEL_WIDTH)
|
||||
) cache_mshr (
|
||||
.clk (clk),
|
||||
|
@ -584,7 +577,7 @@ module VX_cache_bank #(
|
|||
wire [`CS_LINE_ADDR_WIDTH-1:0] mreq_queue_addr;
|
||||
wire [MEM_TAG_WIDTH-1:0] mreq_queue_tag;
|
||||
wire mreq_queue_rw;
|
||||
wire [`UP(FLAGS_WIDTH)-1:0] mreq_queue_flags;
|
||||
wire [`UP(MEM_FLAGS_WIDTH)-1:0] mreq_queue_flags;
|
||||
|
||||
wire is_fill_or_flush_st1 = is_fill_st1 || (is_flush_st1 && WRITEBACK);
|
||||
wire do_fill_or_flush_st1 = valid_st1 && is_fill_or_flush_st1;
|
||||
|
@ -658,7 +651,7 @@ module VX_cache_bank #(
|
|||
assign mreq_queue_flags = flags_st1;
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(FLAGS_WIDTH)),
|
||||
.DATAW (1 + `CS_LINE_ADDR_WIDTH + LINE_SIZE + `CS_LINE_WIDTH + MEM_TAG_WIDTH + `UP(MEM_FLAGS_WIDTH)),
|
||||
.DEPTH (MREQ_SIZE),
|
||||
.ALM_FULL (MREQ_SIZE - PIPELINE_STAGES),
|
||||
.OUT_REG (MEM_OUT_REG)
|
||||
|
|
8
hw/rtl/cache/VX_cache_bypass.sv
vendored
8
hw/rtl/cache/VX_cache_bypass.sv
vendored
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_bypass #(
|
||||
module VX_cache_bypass import VX_gpu_pkg::*; #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter MEM_PORTS = 1,
|
||||
parameter TAG_SEL_IDX = 0,
|
||||
|
@ -30,8 +30,6 @@ module VX_cache_bypass #(
|
|||
parameter MEM_ADDR_WIDTH = 1,
|
||||
parameter MEM_TAG_IN_WIDTH = 1,
|
||||
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
parameter CORE_OUT_BUF = 0,
|
||||
parameter MEM_OUT_BUF = 0
|
||||
) (
|
||||
|
@ -74,7 +72,7 @@ module VX_cache_bypass #(
|
|||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_core_req_is_nc
|
||||
if (CACHE_ENABLE) begin : g_cache
|
||||
assign core_req_nc_sel[i] = ~core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_IO];
|
||||
assign core_req_nc_sel[i] = ~core_bus_in_if[i].req_data.flags[MEM_REQ_FLAG_IO];
|
||||
end else begin : g_no_cache
|
||||
assign core_req_nc_sel[i] = 1'b0;
|
||||
end
|
||||
|
@ -156,7 +154,7 @@ module VX_cache_bypass #(
|
|||
wire core_req_nc_arb_rw;
|
||||
wire [WORD_SIZE-1:0] core_req_nc_arb_byteen;
|
||||
wire [CORE_ADDR_WIDTH-1:0] core_req_nc_arb_addr;
|
||||
wire [`MEM_REQ_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
|
||||
wire [MEM_FLAGS_WIDTH-1:0] core_req_nc_arb_flags;
|
||||
wire [CORE_DATA_WIDTH-1:0] core_req_nc_arb_data;
|
||||
wire [MEM_TAG_NC1_WIDTH-1:0] core_req_nc_arb_tag;
|
||||
|
||||
|
|
8
hw/rtl/cache/VX_cache_cluster.sv
vendored
8
hw/rtl/cache/VX_cache_cluster.sv
vendored
|
@ -58,15 +58,9 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
|
@ -167,9 +161,7 @@ module VX_cache_cluster import VX_gpu_pkg::*; #(
|
|||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (ARB_TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.TAG_SEL_IDX (TAG_SEL_IDX),
|
||||
.CORE_OUT_BUF ((NUM_INPUTS != NUM_CACHES) ? 2 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF ((NUM_CACHES > 1) ? 2 : MEM_OUT_BUF),
|
||||
|
|
2
hw/rtl/cache/VX_cache_data.sv
vendored
2
hw/rtl/cache/VX_cache_data.sv
vendored
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_data #(
|
||||
module VX_cache_data import VX_gpu_pkg::*; #(
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
|
|
16
hw/rtl/cache/VX_cache_define.vh
vendored
16
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -61,14 +61,14 @@
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define PERF_CACHE_ADD(dst, src, count) \
|
||||
`PERF_COUNTER_ADD (dst, src, reads, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, writes, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, read_misses, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, write_misses, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, bank_stalls, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mshr_stalls, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mem_stalls, `PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, crsp_stalls, `PERF_CTR_BITS, count, (count > 1))
|
||||
`PERF_COUNTER_ADD (dst, src, reads, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, writes, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, read_misses, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, write_misses, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, bank_stalls, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mshr_stalls, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, mem_stalls, PERF_CTR_BITS, count, (count > 1)) \
|
||||
`PERF_COUNTER_ADD (dst, src, crsp_stalls, PERF_CTR_BITS, count, (count > 1))
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
|
|
6
hw/rtl/cache/VX_cache_flush.sv
vendored
6
hw/rtl/cache/VX_cache_flush.sv
vendored
|
@ -13,13 +13,11 @@
|
|||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_flush #(
|
||||
module VX_cache_flush import VX_gpu_pkg::*; #(
|
||||
// Number of Word requests per cycle
|
||||
parameter NUM_REQS = 4,
|
||||
// Number of banks
|
||||
parameter NUM_BANKS = 1,
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
// Bank select latency
|
||||
|
@ -90,7 +88,7 @@ module VX_cache_flush #(
|
|||
|
||||
wire [NUM_REQS-1:0] flush_req_mask;
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_flush_req_mask
|
||||
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[`MEM_REQ_FLAG_FLUSH];
|
||||
assign flush_req_mask[i] = core_bus_in_if[i].req_valid && core_bus_in_if[i].req_data.flags[MEM_REQ_FLAG_FLUSH];
|
||||
end
|
||||
wire flush_req_enable = (| flush_req_mask);
|
||||
|
||||
|
|
4
hw/rtl/cache/VX_cache_mshr.sv
vendored
4
hw/rtl/cache/VX_cache_mshr.sv
vendored
|
@ -42,7 +42,7 @@
|
|||
// and as such changes to either module requires careful evaluation.
|
||||
//
|
||||
|
||||
module VX_cache_mshr #(
|
||||
module VX_cache_mshr import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID= "",
|
||||
parameter BANK_ID = 0,
|
||||
// Size of line inside a bank in bytes
|
||||
|
@ -51,8 +51,6 @@ module VX_cache_mshr #(
|
|||
parameter NUM_BANKS = 1,
|
||||
// Miss Reserv Queue Knob
|
||||
parameter MSHR_SIZE = 4,
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
// MSHR parameters
|
||||
parameter DATA_WIDTH = 1,
|
||||
// Enable cache writeback
|
||||
|
|
2
hw/rtl/cache/VX_cache_repl.sv
vendored
2
hw/rtl/cache/VX_cache_repl.sv
vendored
|
@ -16,7 +16,7 @@
|
|||
// Fast PLRU encoder and decoder utility
|
||||
// Adapted from BaseJump STL: http://bjump.org/data_out.html
|
||||
|
||||
module plru_decoder #(
|
||||
module plru_decoder import VX_gpu_pkg::*; #(
|
||||
parameter NUM_WAYS = 1,
|
||||
parameter WAY_IDX_BITS = $clog2(NUM_WAYS),
|
||||
parameter WAY_IDX_WIDTH = `UP(WAY_IDX_BITS)
|
||||
|
|
2
hw/rtl/cache/VX_cache_tags.sv
vendored
2
hw/rtl/cache/VX_cache_tags.sv
vendored
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_cache_define.vh"
|
||||
|
||||
module VX_cache_tags #(
|
||||
module VX_cache_tags import VX_gpu_pkg::*; #(
|
||||
// Size of cache in bytes
|
||||
parameter CACHE_SIZE = 1024,
|
||||
// Size of line inside a bank in bytes
|
||||
|
|
6
hw/rtl/cache/VX_cache_top.sv
vendored
6
hw/rtl/cache/VX_cache_top.sv
vendored
|
@ -51,9 +51,6 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
// Enable dirty bytes on writeback
|
||||
parameter DIRTY_BYTES = 1,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = 32,
|
||||
|
||||
|
@ -78,7 +75,7 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
input wire core_req_rw [NUM_REQS],
|
||||
input wire[WORD_SIZE-1:0] core_req_byteen [NUM_REQS],
|
||||
input wire[`CS_WORD_ADDR_WIDTH-1:0] core_req_addr [NUM_REQS],
|
||||
input wire[`MEM_REQ_FLAGS_WIDTH-1:0] core_req_flags [NUM_REQS],
|
||||
input wire[MEM_FLAGS_WIDTH-1:0] core_req_flags [NUM_REQS],
|
||||
input wire[`CS_WORD_WIDTH-1:0] core_req_data [NUM_REQS],
|
||||
input wire[TAG_WIDTH-1:0] core_req_tag [NUM_REQS],
|
||||
output wire core_req_ready [NUM_REQS],
|
||||
|
@ -167,7 +164,6 @@ module VX_cache_top import VX_gpu_pkg::*; #(
|
|||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.WRITE_ENABLE (WRITE_ENABLE),
|
||||
.WRITEBACK (WRITEBACK),
|
||||
.DIRTY_BYTES (DIRTY_BYTES),
|
||||
|
|
26
hw/rtl/cache/VX_cache_wrap.sv
vendored
26
hw/rtl/cache/VX_cache_wrap.sv
vendored
|
@ -56,15 +56,9 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
// Replacement policy
|
||||
parameter REPL_POLICY = `CS_REPL_FIFO,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// core request tag size
|
||||
parameter TAG_WIDTH = UUID_WIDTH + 1,
|
||||
|
||||
// core request flags
|
||||
parameter FLAGS_WIDTH = 0,
|
||||
|
||||
// enable bypass for non-cacheable addresses
|
||||
parameter NC_ENABLE = 0,
|
||||
|
||||
|
@ -131,8 +125,6 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.MEM_ADDR_WIDTH (`CS_MEM_ADDR_WIDTH),
|
||||
.MEM_TAG_IN_WIDTH (CACHE_MEM_TAG_WIDTH),
|
||||
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
|
||||
.CORE_OUT_BUF (CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (MEM_OUT_BUF)
|
||||
) cache_bypass (
|
||||
|
@ -184,9 +176,7 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
.MSHR_SIZE (MSHR_SIZE),
|
||||
.MRSQ_SIZE (MRSQ_SIZE),
|
||||
.MREQ_SIZE (MREQ_SIZE),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.FLAGS_WIDTH (FLAGS_WIDTH),
|
||||
.CORE_OUT_BUF (BYPASS_ENABLE ? 1 : CORE_OUT_BUF),
|
||||
.MEM_OUT_BUF (BYPASS_ENABLE ? 1 : MEM_OUT_BUF)
|
||||
) cache (
|
||||
|
@ -236,10 +226,10 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
`POP_COUNT(perf_mem_stall_per_cycle, perf_mem_stall_per_port);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_core_reads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_core_writes;
|
||||
reg [PERF_CTR_BITS-1:0] perf_mem_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -248,10 +238,10 @@ module VX_cache_wrap import VX_gpu_pkg::*; #(
|
|||
perf_mem_stalls <= '0;
|
||||
perf_crsp_stalls <= '0;
|
||||
end else begin
|
||||
perf_core_reads <= perf_core_reads + `PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + `PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + `PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_core_reads <= perf_core_reads + PERF_CTR_BITS'(perf_core_reads_per_cycle);
|
||||
perf_core_writes <= perf_core_writes + PERF_CTR_BITS'(perf_core_writes_per_cycle);
|
||||
perf_mem_stalls <= perf_mem_stalls + PERF_CTR_BITS'(perf_mem_stall_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_alu_int #(
|
||||
module VX_alu_int import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter BLOCK_IDX = 0,
|
||||
parameter NUM_LANES = 1
|
||||
|
@ -32,7 +32,7 @@ module VX_alu_int #(
|
|||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam LANE_BITS = `CLOG2(NUM_LANES);
|
||||
localparam LANE_WIDTH = `UP(LANE_BITS);
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam SHIFT_IMM_BITS = `CLOG2(`XLEN);
|
||||
|
||||
|
@ -57,12 +57,12 @@ module VX_alu_int #(
|
|||
wire is_alu_w = 0;
|
||||
`endif
|
||||
|
||||
wire [`INST_ALU_BITS-1:0] alu_op = `INST_ALU_BITS'(execute_if.data.op_type);
|
||||
wire [`INST_BR_BITS-1:0] br_op = `INST_BR_BITS'(execute_if.data.op_type);
|
||||
wire is_br_op = (execute_if.data.op_args.alu.xtype == `ALU_TYPE_BRANCH);
|
||||
wire is_sub_op = `INST_ALU_IS_SUB(alu_op);
|
||||
wire is_signed = `INST_ALU_SIGNED(alu_op);
|
||||
wire [1:0] op_class = is_br_op ? `INST_BR_CLASS(alu_op) : `INST_ALU_CLASS(alu_op);
|
||||
wire [INST_ALU_BITS-1:0] alu_op = INST_ALU_BITS'(execute_if.data.op_type);
|
||||
wire [INST_BR_BITS-1:0] br_op = INST_BR_BITS'(execute_if.data.op_type);
|
||||
wire is_br_op = (execute_if.data.op_args.alu.xtype == ALU_TYPE_BRANCH);
|
||||
wire is_sub_op = inst_alu_is_sub(alu_op);
|
||||
wire is_signed = inst_alu_signed(alu_op);
|
||||
wire [1:0] op_class = is_br_op ? inst_br_class(alu_op) : inst_alu_class(alu_op);
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in1 = execute_if.data.rs1_data;
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] alu_in2 = execute_if.data.rs2_data;
|
||||
|
@ -133,13 +133,13 @@ module VX_alu_int #(
|
|||
|
||||
// branch
|
||||
|
||||
wire [`PC_BITS-1:0] PC_r;
|
||||
wire [`INST_BR_BITS-1:0] br_op_r;
|
||||
wire [`PC_BITS-1:0] cbr_dest, cbr_dest_r;
|
||||
wire [PC_BITS-1:0] PC_r;
|
||||
wire [INST_BR_BITS-1:0] br_op_r;
|
||||
wire [PC_BITS-1:0] cbr_dest, cbr_dest_r;
|
||||
wire [LANE_WIDTH-1:0] tid, tid_r;
|
||||
wire is_br_op_r;
|
||||
|
||||
assign cbr_dest = add_result[0][1 +: `PC_BITS];
|
||||
assign cbr_dest = add_result[0][1 +: PC_BITS];
|
||||
|
||||
if (LANE_BITS != 0) begin : g_tid
|
||||
assign tid = execute_if.data.tid[0 +: LANE_BITS];
|
||||
|
@ -148,7 +148,7 @@ module VX_alu_int #(
|
|||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + `PC_BITS + `PC_BITS + 1 + `INST_BR_BITS + LANE_WIDTH)
|
||||
.DATAW (UUID_WIDTH + NW_WIDTH + NUM_LANES + NR_BITS + 1 + PID_WIDTH + 1 + 1 + (NUM_LANES * `XLEN) + PC_BITS + PC_BITS + 1 + INST_BR_BITS + LANE_WIDTH)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -161,9 +161,9 @@ module VX_alu_int #(
|
|||
);
|
||||
|
||||
`UNUSED_VAR (br_op_r)
|
||||
wire is_br_neg = `INST_BR_IS_NEG(br_op_r);
|
||||
wire is_br_less = `INST_BR_IS_LESS(br_op_r);
|
||||
wire is_br_static = `INST_BR_IS_STATIC(br_op_r);
|
||||
wire is_br_neg = inst_br_is_neg(br_op_r);
|
||||
wire is_br_less = inst_br_is_less(br_op_r);
|
||||
wire is_br_static = inst_br_is_static(br_op_r);
|
||||
|
||||
wire [`XLEN-1:0] br_result = alu_result_r[tid_r];
|
||||
wire is_less = br_result[0];
|
||||
|
@ -171,12 +171,12 @@ module VX_alu_int #(
|
|||
|
||||
wire br_enable = is_br_op_r && commit_if.valid && commit_if.ready && commit_if.data.eop;
|
||||
wire br_taken = ((is_br_less ? is_less : is_equal) ^ is_br_neg) | is_br_static;
|
||||
wire [`PC_BITS-1:0] br_dest = is_br_static ? br_result[1 +: `PC_BITS] : cbr_dest_r;
|
||||
wire [`NW_WIDTH-1:0] br_wid;
|
||||
wire [PC_BITS-1:0] br_dest = is_br_static ? br_result[1 +: PC_BITS] : cbr_dest_r;
|
||||
wire [NW_WIDTH-1:0] br_wid;
|
||||
`ASSIGN_BLOCKED_WID (br_wid, commit_if.data.wid, BLOCK_IDX, `NUM_ALU_BLOCKS)
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_WIDTH + 1 + `PC_BITS)
|
||||
.DATAW (1 + NW_WIDTH + 1 + PC_BITS)
|
||||
) branch_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -186,7 +186,7 @@ module VX_alu_int #(
|
|||
);
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_commit
|
||||
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + `PC_BITS'(2)), 1'd0} : alu_result_r[i];
|
||||
assign commit_if.data.data[i] = (is_br_op_r && is_br_static) ? {(PC_r + PC_BITS'(2)), 1'd0} : alu_result_r[i];
|
||||
end
|
||||
|
||||
assign commit_if.data.PC = PC_r;
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_alu_muldiv #(
|
||||
module VX_alu_muldiv import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter NUM_LANES = 1
|
||||
) (
|
||||
|
@ -27,16 +27,16 @@ module VX_alu_muldiv #(
|
|||
VX_commit_if.master commit_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + PID_WIDTH + 1 + 1;
|
||||
localparam TAG_WIDTH = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + 1 + PID_WIDTH + 1 + 1;
|
||||
|
||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||
|
||||
wire [`INST_M_BITS-1:0] muldiv_op = `INST_M_BITS'(execute_if.data.op_type);
|
||||
wire [INST_M_BITS-1:0] muldiv_op = INST_M_BITS'(execute_if.data.op_type);
|
||||
|
||||
wire is_mulx_op = `INST_M_IS_MULX(muldiv_op);
|
||||
wire is_signed_op = `INST_M_SIGNED(muldiv_op);
|
||||
wire is_mulx_op = inst_m_is_mulx(muldiv_op);
|
||||
wire is_signed_op = inst_m_signed(muldiv_op);
|
||||
`ifdef XLEN_64
|
||||
wire is_alu_w = execute_if.data.op_args.alu.is_w;
|
||||
`else
|
||||
|
@ -44,11 +44,11 @@ module VX_alu_muldiv #(
|
|||
`endif
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] mul_result_out;
|
||||
wire [`UUID_WIDTH-1:0] mul_uuid_out;
|
||||
wire [`NW_WIDTH-1:0] mul_wid_out;
|
||||
wire [UUID_WIDTH-1:0] mul_uuid_out;
|
||||
wire [NW_WIDTH-1:0] mul_wid_out;
|
||||
wire [NUM_LANES-1:0] mul_tmask_out;
|
||||
wire [`PC_BITS-1:0] mul_PC_out;
|
||||
wire [`NR_BITS-1:0] mul_rd_out;
|
||||
wire [PC_BITS-1:0] mul_PC_out;
|
||||
wire [NR_BITS-1:0] mul_rd_out;
|
||||
wire mul_wb_out;
|
||||
wire [PID_WIDTH-1:0] mul_pid_out;
|
||||
wire mul_sop_out, mul_eop_out;
|
||||
|
@ -58,8 +58,8 @@ module VX_alu_muldiv #(
|
|||
wire mul_valid_out;
|
||||
wire mul_ready_out;
|
||||
|
||||
wire is_mulh_in = `INST_M_IS_MULH(muldiv_op);
|
||||
wire is_signed_mul_a = `INST_M_SIGNED_A(muldiv_op);
|
||||
wire is_mulh_in = inst_m_is_mulh(muldiv_op);
|
||||
wire is_signed_mul_a = inst_m_signed_a(muldiv_op);
|
||||
wire is_signed_mul_b = is_signed_op;
|
||||
|
||||
`ifdef IMUL_DPI
|
||||
|
@ -200,16 +200,16 @@ module VX_alu_muldiv #(
|
|||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_LANES-1:0][`XLEN-1:0] div_result_out;
|
||||
wire [`UUID_WIDTH-1:0] div_uuid_out;
|
||||
wire [`NW_WIDTH-1:0] div_wid_out;
|
||||
wire [UUID_WIDTH-1:0] div_uuid_out;
|
||||
wire [NW_WIDTH-1:0] div_wid_out;
|
||||
wire [NUM_LANES-1:0] div_tmask_out;
|
||||
wire [`PC_BITS-1:0] div_PC_out;
|
||||
wire [`NR_BITS-1:0] div_rd_out;
|
||||
wire [PC_BITS-1:0] div_PC_out;
|
||||
wire [NR_BITS-1:0] div_rd_out;
|
||||
wire div_wb_out;
|
||||
wire [PID_WIDTH-1:0] div_pid_out;
|
||||
wire div_sop_out, div_eop_out;
|
||||
|
||||
wire is_rem_op = `INST_M_IS_REM(muldiv_op);
|
||||
wire is_rem_op = inst_m_is_rem(muldiv_op);
|
||||
|
||||
wire div_valid_in = execute_if.valid && ~is_mulx_op;
|
||||
wire div_ready_in;
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_alu_unit #(
|
||||
module VX_alu_unit import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -30,7 +30,7 @@ module VX_alu_unit #(
|
|||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam BLOCK_SIZE = `NUM_ALU_BLOCKS;
|
||||
localparam NUM_LANES = `NUM_ALU_LANES;
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `SIMD_WIDTH);
|
||||
localparam PE_COUNT = 1 + `EXT_M_ENABLED;
|
||||
localparam PE_SEL_BITS = `CLOG2(PE_COUNT);
|
||||
localparam PE_IDX_INT = 0;
|
||||
|
@ -68,7 +68,7 @@ module VX_alu_unit #(
|
|||
reg [`UP(PE_SEL_BITS)-1:0] pe_select;
|
||||
always @(*) begin
|
||||
pe_select = PE_IDX_INT;
|
||||
if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == `ALU_TYPE_MULDIV))
|
||||
if (`EXT_M_ENABLED && (per_block_execute_if[block_idx].data.op_args.alu.xtype == ALU_TYPE_MULDIV))
|
||||
pe_select = PE_IDX_MDV;
|
||||
end
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
// inputs
|
||||
VX_commit_if.slave commit_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
VX_commit_if.slave commit_if [NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
|
||||
// outputs
|
||||
VX_writeback_if.master writeback_if [`ISSUE_WIDTH],
|
||||
|
@ -28,8 +28,8 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
VX_commit_sched_if.master commit_sched_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `NR_BITS + `NUM_THREADS * `XLEN + 1 + 1 + 1;
|
||||
localparam COMMIT_SIZEW = `CLOG2(`NUM_THREADS + 1);
|
||||
localparam DATAW = UUID_WIDTH + NW_WIDTH + `SIMD_WIDTH + PC_BITS + 1 + NR_BITS + `SIMD_WIDTH * `XLEN + 1 + 1 + 1;
|
||||
localparam COMMIT_SIZEW = `CLOG2(`SIMD_WIDTH + 1);
|
||||
localparam COMMIT_ALL_SIZEW = COMMIT_SIZEW + `ISSUE_WIDTH - 1;
|
||||
|
||||
// commit arbitration
|
||||
|
@ -37,24 +37,24 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
VX_commit_if commit_arb_if[`ISSUE_WIDTH]();
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] per_issue_commit_fire;
|
||||
wire [`ISSUE_WIDTH-1:0][`NW_WIDTH-1:0] per_issue_commit_wid;
|
||||
wire [`ISSUE_WIDTH-1:0][`NUM_THREADS-1:0] per_issue_commit_tmask;
|
||||
wire [`ISSUE_WIDTH-1:0][NW_WIDTH-1:0] per_issue_commit_wid;
|
||||
wire [`ISSUE_WIDTH-1:0][`SIMD_WIDTH-1:0] per_issue_commit_tmask;
|
||||
wire [`ISSUE_WIDTH-1:0] per_issue_commit_eop;
|
||||
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_commit_arbs
|
||||
|
||||
wire [`NUM_EX_UNITS-1:0] valid_in;
|
||||
wire [`NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
|
||||
wire [`NUM_EX_UNITS-1:0] ready_in;
|
||||
wire [NUM_EX_UNITS-1:0] valid_in;
|
||||
wire [NUM_EX_UNITS-1:0][DATAW-1:0] data_in;
|
||||
wire [NUM_EX_UNITS-1:0] ready_in;
|
||||
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_data_in
|
||||
for (genvar j = 0; j < NUM_EX_UNITS; ++j) begin : g_data_in
|
||||
assign valid_in[j] = commit_if[j * `ISSUE_WIDTH + i].valid;
|
||||
assign data_in[j] = commit_if[j * `ISSUE_WIDTH + i].data;
|
||||
assign commit_if[j * `ISSUE_WIDTH + i].ready = ready_in[j];
|
||||
end
|
||||
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (`NUM_EX_UNITS),
|
||||
.NUM_INPUTS (NUM_EX_UNITS),
|
||||
.DATAW (DATAW),
|
||||
.ARBITER ("P"),
|
||||
.OUT_BUF (1)
|
||||
|
@ -71,7 +71,7 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
assign per_issue_commit_fire[i] = commit_arb_if[i].valid && commit_arb_if[i].ready;
|
||||
assign per_issue_commit_tmask[i]= {`NUM_THREADS{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
|
||||
assign per_issue_commit_tmask[i]= {`SIMD_WIDTH{per_issue_commit_fire[i]}} & commit_arb_if[i].data.tmask;
|
||||
assign per_issue_commit_wid[i] = commit_arb_if[i].data.wid;
|
||||
assign per_issue_commit_eop[i] = commit_arb_if[i].data.eop;
|
||||
end
|
||||
|
@ -122,13 +122,13 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
.data_out ({commit_fire_any_rr, commit_size_all_rr})
|
||||
);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] instret;
|
||||
reg [PERF_CTR_BITS-1:0] instret;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
instret <= '0;
|
||||
end else begin
|
||||
if (commit_fire_any_rr) begin
|
||||
instret <= instret + `PERF_CTR_BITS'(commit_size_all_rr);
|
||||
instret <= instret + PERF_CTR_BITS'(commit_size_all_rr);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -175,13 +175,13 @@ module VX_commit import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef DBG_TRACE_PIPELINE
|
||||
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin : g_trace
|
||||
for (genvar j = 0; j < `NUM_EX_UNITS; ++j) begin : g_j
|
||||
for (genvar j = 0; j < NUM_EX_UNITS; ++j) begin : g_j
|
||||
always @(posedge clk) begin
|
||||
if (commit_if[j * `ISSUE_WIDTH + i].valid && commit_if[j * `ISSUE_WIDTH + i].ready) begin
|
||||
`TRACE(1, ("%t: %s: wid=%0d, PC=0x%0h, ex=", $time, INSTANCE_ID, commit_if[j * `ISSUE_WIDTH + i].data.wid, {commit_if[j * `ISSUE_WIDTH + i].data.PC, 1'b0}))
|
||||
trace_ex_type(1, j);
|
||||
`TRACE(1, (", tmask=%b, wb=%0d, rd=%0d, sop=%b, eop=%b, data=", commit_if[j * `ISSUE_WIDTH + i].data.tmask, commit_if[j * `ISSUE_WIDTH + i].data.wb, commit_if[j * `ISSUE_WIDTH + i].data.rd, commit_if[j * `ISSUE_WIDTH + i].data.sop, commit_if[j * `ISSUE_WIDTH + i].data.eop))
|
||||
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `NUM_THREADS)
|
||||
`TRACE_ARRAY1D(1, "0x%0h", commit_if[j * `ISSUE_WIDTH + i].data.data, `SIMD_WIDTH)
|
||||
`TRACE(1, (" (#%0d)\n", commit_if[j * `ISSUE_WIDTH + i].data.uuid))
|
||||
end
|
||||
end
|
||||
|
|
|
@ -54,8 +54,8 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
VX_branch_ctl_if branch_ctl_if[`NUM_ALU_BLOCKS]();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
||||
VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_commit_if commit_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_dispatch_if dispatch_if[NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_commit_if commit_if[NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
VX_lsu_mem_if #(
|
||||
|
@ -217,12 +217,12 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
wire [1:0] perf_icache_pending_read_cycle;
|
||||
wire [`CLOG2(LSU_NUM_REQS+1)+1-1:0] perf_dcache_pending_read_cycle;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_icache_pending_reads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_dcache_pending_reads;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ifetches;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_stores;
|
||||
reg [PERF_CTR_BITS-1:0] perf_ifetches;
|
||||
reg [PERF_CTR_BITS-1:0] perf_loads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_stores;
|
||||
|
||||
wire perf_icache_req_fire = icache_bus_if.req_valid && icache_bus_if.req_ready;
|
||||
wire perf_icache_rsp_fire = icache_bus_if.rsp_valid && icache_bus_if.rsp_ready;
|
||||
|
@ -254,13 +254,13 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
perf_icache_pending_reads <= '0;
|
||||
perf_dcache_pending_reads <= '0;
|
||||
end else begin
|
||||
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + `PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + `PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
perf_icache_pending_reads <= $signed(perf_icache_pending_reads) + PERF_CTR_BITS'($signed(perf_icache_pending_read_cycle));
|
||||
perf_dcache_pending_reads <= $signed(perf_dcache_pending_reads) + PERF_CTR_BITS'($signed(perf_dcache_pending_read_cycle));
|
||||
end
|
||||
end
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
reg [PERF_CTR_BITS-1:0] perf_icache_lat;
|
||||
reg [PERF_CTR_BITS-1:0] perf_dcache_lat;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -270,9 +270,9 @@ module VX_core import VX_gpu_pkg::*; #(
|
|||
perf_icache_lat <= '0;
|
||||
perf_dcache_lat <= '0;
|
||||
end else begin
|
||||
perf_ifetches <= perf_ifetches + `PERF_CTR_BITS'(perf_icache_req_fire);
|
||||
perf_loads <= perf_loads + `PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||
perf_stores <= perf_stores + `PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||
perf_ifetches <= perf_ifetches + PERF_CTR_BITS'(perf_icache_req_fire);
|
||||
perf_loads <= perf_loads + PERF_CTR_BITS'(perf_dcache_rd_req_per_cycle);
|
||||
perf_stores <= perf_stores + PERF_CTR_BITS'(perf_dcache_wr_req_per_cycle);
|
||||
perf_icache_lat <= perf_icache_lat + perf_icache_pending_reads;
|
||||
perf_dcache_lat <= perf_dcache_lat + perf_dcache_pending_reads;
|
||||
end
|
||||
|
|
|
@ -25,14 +25,14 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
input wire dcr_write_valid,
|
||||
input wire [`VX_DCR_ADDR_WIDTH-1:0] dcr_write_addr,
|
||||
input wire [`VX_DCR_DATA_WIDTH-1:0] dcr_write_data,
|
||||
input wire [VX_DCR_ADDR_WIDTH-1:0] dcr_write_addr,
|
||||
input wire [VX_DCR_DATA_WIDTH-1:0] dcr_write_data,
|
||||
|
||||
output wire [DCACHE_NUM_REQS-1:0] dcache_req_valid,
|
||||
output wire [DCACHE_NUM_REQS-1:0] dcache_req_rw,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] dcache_req_byteen,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] dcache_req_addr,
|
||||
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] dcache_req_flags,
|
||||
output wire [DCACHE_NUM_REQS-1:0][MEM_FLAGS_WIDTH-1:0] dcache_req_flags,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] dcache_req_data,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] dcache_req_tag,
|
||||
input wire [DCACHE_NUM_REQS-1:0] dcache_req_ready,
|
||||
|
@ -57,12 +57,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
|
|||
|
||||
`ifdef GBAR_ENABLE
|
||||
output wire gbar_req_valid,
|
||||
output wire [`NB_WIDTH-1:0] gbar_req_id,
|
||||
output wire [`NC_WIDTH-1:0] gbar_req_size_m1,
|
||||
output wire [`NC_WIDTH-1:0] gbar_req_core_id,
|
||||
output wire [NB_WIDTH-1:0] gbar_req_id,
|
||||
output wire [NC_WIDTH-1:0] gbar_req_size_m1,
|
||||
output wire [NC_WIDTH-1:0] gbar_req_core_id,
|
||||
input wire gbar_req_ready,
|
||||
input wire gbar_rsp_valid,
|
||||
input wire [`NB_WIDTH-1:0] gbar_rsp_id,
|
||||
input wire [NB_WIDTH-1:0] gbar_rsp_id,
|
||||
`endif
|
||||
// Status
|
||||
output wire busy
|
||||
|
|
|
@ -51,20 +51,20 @@ import VX_fpu_pkg::*;
|
|||
VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
|
||||
`endif
|
||||
|
||||
input wire [`PERF_CTR_BITS-1:0] cycles,
|
||||
input wire [PERF_CTR_BITS-1:0] cycles,
|
||||
input wire [`NUM_WARPS-1:0] active_warps,
|
||||
input wire [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks,
|
||||
|
||||
input wire read_enable,
|
||||
input wire [`UUID_WIDTH-1:0] read_uuid,
|
||||
input wire [`NW_WIDTH-1:0] read_wid,
|
||||
input wire [UUID_WIDTH-1:0] read_uuid,
|
||||
input wire [NW_WIDTH-1:0] read_wid,
|
||||
input wire [`VX_CSR_ADDR_BITS-1:0] read_addr,
|
||||
output wire [`XLEN-1:0] read_data_ro,
|
||||
output wire [`XLEN-1:0] read_data_rw,
|
||||
|
||||
input wire write_enable,
|
||||
input wire [`UUID_WIDTH-1:0] write_uuid,
|
||||
input wire [`NW_WIDTH-1:0] write_wid,
|
||||
input wire [UUID_WIDTH-1:0] write_uuid,
|
||||
input wire [NW_WIDTH-1:0] write_wid,
|
||||
input wire [`VX_CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire [`XLEN-1:0] write_data
|
||||
);
|
||||
|
@ -78,9 +78,9 @@ import VX_fpu_pkg::*;
|
|||
reg [`XLEN-1:0] mscratch;
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
reg [`NUM_WARPS-1:0][`INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
|
||||
reg [`NUM_WARPS-1:0][INST_FRM_BITS+`FP_FLAGS_BITS-1:0] fcsr, fcsr_n;
|
||||
wire [`NUM_FPU_BLOCKS-1:0] fpu_write_enable;
|
||||
wire [`NUM_FPU_BLOCKS-1:0][`NW_WIDTH-1:0] fpu_write_wid;
|
||||
wire [`NUM_FPU_BLOCKS-1:0][NW_WIDTH-1:0] fpu_write_wid;
|
||||
fflags_t [`NUM_FPU_BLOCKS-1:0] fpu_write_fflags;
|
||||
|
||||
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_write
|
||||
|
@ -100,15 +100,15 @@ import VX_fpu_pkg::*;
|
|||
if (write_enable) begin
|
||||
case (write_addr)
|
||||
`VX_CSR_FFLAGS: fcsr_n[write_wid][`FP_FLAGS_BITS-1:0] = write_data[`FP_FLAGS_BITS-1:0];
|
||||
`VX_CSR_FRM: fcsr_n[write_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS] = write_data[`INST_FRM_BITS-1:0];
|
||||
`VX_CSR_FCSR: fcsr_n[write_wid] = write_data[`FP_FLAGS_BITS+`INST_FRM_BITS-1:0];
|
||||
`VX_CSR_FRM: fcsr_n[write_wid][INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS] = write_data[INST_FRM_BITS-1:0];
|
||||
`VX_CSR_FCSR: fcsr_n[write_wid] = write_data[`FP_FLAGS_BITS+INST_FRM_BITS-1:0];
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_FPU_BLOCKS; ++i) begin : g_fpu_csr_read_frm
|
||||
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
|
||||
assign fpu_csr_if[i].read_frm = fcsr[fpu_csr_if[i].read_wid][INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS];
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
|
@ -170,7 +170,7 @@ import VX_fpu_pkg::*;
|
|||
`VX_CSR_MISA : read_data_ro_w = `XLEN'({2'(`CLOG2(`XLEN/16)), 30'(`MISA_STD)});
|
||||
`ifdef EXT_F_ENABLE
|
||||
`VX_CSR_FFLAGS : read_data_rw_w = `XLEN'(fcsr[read_wid][`FP_FLAGS_BITS-1:0]);
|
||||
`VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][`INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
|
||||
`VX_CSR_FRM : read_data_rw_w = `XLEN'(fcsr[read_wid][INST_FRM_BITS+`FP_FLAGS_BITS-1:`FP_FLAGS_BITS]);
|
||||
`VX_CSR_FCSR : read_data_rw_w = `XLEN'(fcsr[read_wid]);
|
||||
`endif
|
||||
`VX_CSR_MSCRATCH : read_data_rw_w = mscratch;
|
||||
|
@ -217,16 +217,16 @@ import VX_fpu_pkg::*;
|
|||
`CSR_READ_64(`VX_CSR_MPM_IBUF_ST, read_data_ro_w, pipeline_perf.issue.ibf_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ST, read_data_ro_w, pipeline_perf.issue.scb_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_OPDS_ST, read_data_ro_w, pipeline_perf.issue.opd_stalls);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_ALU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_ALU, read_data_ro_w, pipeline_perf.issue.units_uses[EX_ALU]);
|
||||
`ifdef EXT_F_ENABLE
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_FPU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, pipeline_perf.issue.units_uses[EX_FPU]);
|
||||
`else
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, `PERF_CTR_BITS'(0));
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_FPU, read_data_ro_w, PERF_CTR_BITS'(0));
|
||||
`endif
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[`EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[`SFU_WCTL]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_LSU, read_data_ro_w, pipeline_perf.issue.units_uses[EX_LSU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_SFU, read_data_ro_w, pipeline_perf.issue.units_uses[EX_SFU]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_CSRS, read_data_ro_w, pipeline_perf.issue.sfu_uses[SFU_CSRS]);
|
||||
`CSR_READ_64(`VX_CSR_MPM_SCRB_WCTL, read_data_ro_w, pipeline_perf.issue.sfu_uses[SFU_WCTL]);
|
||||
// PERF: memory
|
||||
`CSR_READ_64(`VX_CSR_MPM_IFETCHES, read_data_ro_w, pipeline_perf.ifetches);
|
||||
`CSR_READ_64(`VX_CSR_MPM_LOADS, read_data_ro_w, pipeline_perf.loads);
|
||||
|
|
|
@ -38,9 +38,9 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
VX_commit_if.master commit_if
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam DATAW = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
|
||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||
|
||||
|
@ -53,7 +53,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
wire csr_req_ready;
|
||||
|
||||
wire [`VX_CSR_ADDR_BITS-1:0] csr_addr = execute_if.data.op_args.csr.addr;
|
||||
wire [`NRI_BITS-1:0] csr_imm = execute_if.data.op_args.csr.imm;
|
||||
wire [RV_REGS_BITS-1:0] csr_imm = execute_if.data.op_args.csr.imm;
|
||||
|
||||
wire is_fpu_csr = (csr_addr <= `VX_CSR_FCSR);
|
||||
|
||||
|
@ -70,7 +70,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
assign rs1_data[i] = execute_if.data.rs1_data[i];
|
||||
end
|
||||
|
||||
wire csr_write_enable = (execute_if.data.op_type == `INST_SFU_CSRRW);
|
||||
wire csr_write_enable = (execute_if.data.op_type == INST_SFU_CSRRW);
|
||||
|
||||
VX_csr_data #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
|
@ -122,7 +122,7 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_gtid
|
||||
assign gtid[i] = (`XLEN'(CORE_ID) << (`NW_BITS + `NT_BITS)) + (`XLEN'(execute_if.data.wid) << `NT_BITS) + wtid[i];
|
||||
assign gtid[i] = (`XLEN'(CORE_ID) << (NW_BITS + NT_BITS)) + (`XLEN'(execute_if.data.wid) << NT_BITS) + wtid[i];
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
|
@ -144,13 +144,13 @@ module VX_csr_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
always @(*) begin
|
||||
case (execute_if.data.op_type)
|
||||
`INST_SFU_CSRRW: begin
|
||||
INST_SFU_CSRRW: begin
|
||||
csr_write_data = csr_req_data;
|
||||
end
|
||||
`INST_SFU_CSRRS: begin
|
||||
INST_SFU_CSRRS: begin
|
||||
csr_write_data = csr_read_data_rw | csr_req_data;
|
||||
end
|
||||
//`INST_SFU_CSRRC
|
||||
//INST_SFU_CSRRC
|
||||
default: begin
|
||||
csr_write_data = csr_read_data_rw & ~csr_req_data;
|
||||
end
|
||||
|
|
|
@ -13,22 +13,17 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define USED_IREG(x) \
|
||||
x``_v = {1'b0, ``x}; \
|
||||
x``_ext = 1; \
|
||||
use_``x = 1
|
||||
`define USED_IREG(x) \
|
||||
x``_v.id = ``x; \
|
||||
x``_v.rtype = 0; \
|
||||
x``_v.ext = 0; \
|
||||
use_``x = 1
|
||||
|
||||
`define USED_FREG(x) \
|
||||
x``_v = {1'b1, ``x}; \
|
||||
x``_ext = 1; \
|
||||
use_``x = 1
|
||||
`else
|
||||
`define USED_IREG(x) \
|
||||
x``_v = ``x; \
|
||||
x``_ext = 1; \
|
||||
use_``x = 1
|
||||
`endif
|
||||
`define USED_FREG(x) \
|
||||
x``_v.id = ``x; \
|
||||
x``_v.rtype = 1; \
|
||||
x``_v.ext = 0; \
|
||||
use_``x = 1
|
||||
|
||||
module VX_decode import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
|
@ -44,19 +39,19 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
VX_decode_sched_if.master decode_sched_if
|
||||
);
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + (`NR_BITS * 4) + (`REG_EXT_BITS * 4);
|
||||
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
reg [`EX_BITS-1:0] ex_type;
|
||||
reg [`INST_OP_BITS-1:0] op_type;
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + INST_ARGS_BITS + NUM_OPDS + (REG_IDX_BITS * NUM_OPDS);
|
||||
|
||||
reg [EX_BITS-1:0] ex_type;
|
||||
reg [INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
reg [`NR_BITS-1:0] rd_v, rs1_v, rs2_v, rs3_v;
|
||||
reg_idx_t rd_v, rs1_v, rs2_v, rs3_v;
|
||||
reg use_rd, use_rs1, use_rs2, use_rs3;
|
||||
reg is_wstall;
|
||||
reg [`REG_EXT_BITS-1:0] rd_ext, rs1_ext, rs2_ext, rs3_ext;
|
||||
|
||||
wire [31:0] instr = fetch_if.data.instr;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
|
@ -93,57 +88,57 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
|
||||
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
|
||||
|
||||
reg [`INST_ALU_BITS-1:0] r_type;
|
||||
reg [INST_ALU_BITS-1:0] r_type;
|
||||
always @(*) begin
|
||||
case (func3)
|
||||
3'h0: r_type = (opcode[5] && func7[5]) ? `INST_ALU_SUB : `INST_ALU_ADD;
|
||||
3'h1: r_type = `INST_ALU_SLL;
|
||||
3'h2: r_type = `INST_ALU_SLT;
|
||||
3'h3: r_type = `INST_ALU_SLTU;
|
||||
3'h4: r_type = `INST_ALU_XOR;
|
||||
3'h5: r_type = func7[5] ? `INST_ALU_SRA : `INST_ALU_SRL;
|
||||
3'h6: r_type = `INST_ALU_OR;
|
||||
3'h7: r_type = `INST_ALU_AND;
|
||||
3'h0: r_type = (opcode[5] && func7[5]) ? INST_ALU_SUB : INST_ALU_ADD;
|
||||
3'h1: r_type = INST_ALU_SLL;
|
||||
3'h2: r_type = INST_ALU_SLT;
|
||||
3'h3: r_type = INST_ALU_SLTU;
|
||||
3'h4: r_type = INST_ALU_XOR;
|
||||
3'h5: r_type = func7[5] ? INST_ALU_SRA : INST_ALU_SRL;
|
||||
3'h6: r_type = INST_ALU_OR;
|
||||
3'h7: r_type = INST_ALU_AND;
|
||||
endcase
|
||||
end
|
||||
|
||||
reg [`INST_BR_BITS-1:0] b_type;
|
||||
reg [INST_BR_BITS-1:0] b_type;
|
||||
always @(*) begin
|
||||
case (func3)
|
||||
3'h0: b_type = `INST_BR_EQ;
|
||||
3'h1: b_type = `INST_BR_NE;
|
||||
3'h4: b_type = `INST_BR_LT;
|
||||
3'h5: b_type = `INST_BR_GE;
|
||||
3'h6: b_type = `INST_BR_LTU;
|
||||
3'h7: b_type = `INST_BR_GEU;
|
||||
3'h0: b_type = INST_BR_EQ;
|
||||
3'h1: b_type = INST_BR_NE;
|
||||
3'h4: b_type = INST_BR_LT;
|
||||
3'h5: b_type = INST_BR_GE;
|
||||
3'h6: b_type = INST_BR_LTU;
|
||||
3'h7: b_type = INST_BR_GEU;
|
||||
default: b_type = 'x;
|
||||
endcase
|
||||
end
|
||||
|
||||
reg [`INST_BR_BITS-1:0] s_type;
|
||||
reg [INST_BR_BITS-1:0] s_type;
|
||||
always @(*) begin
|
||||
case (u_12)
|
||||
12'h000: s_type = `INST_OP_BITS'(`INST_BR_ECALL);
|
||||
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
|
||||
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
|
||||
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
|
||||
12'h302: s_type = `INST_OP_BITS'(`INST_BR_MRET);
|
||||
12'h000: s_type = INST_OP_BITS'(INST_BR_ECALL);
|
||||
12'h001: s_type = INST_OP_BITS'(INST_BR_EBREAK);
|
||||
12'h002: s_type = INST_OP_BITS'(INST_BR_URET);
|
||||
12'h102: s_type = INST_OP_BITS'(INST_BR_SRET);
|
||||
12'h302: s_type = INST_OP_BITS'(INST_BR_MRET);
|
||||
default: s_type = 'x;
|
||||
endcase
|
||||
end
|
||||
|
||||
`ifdef EXT_M_ENABLE
|
||||
reg [`INST_M_BITS-1:0] m_type;
|
||||
reg [INST_M_BITS-1:0] m_type;
|
||||
always @(*) begin
|
||||
case (func3)
|
||||
3'h0: m_type = `INST_M_MUL;
|
||||
3'h1: m_type = `INST_M_MULH;
|
||||
3'h2: m_type = `INST_M_MULHSU;
|
||||
3'h3: m_type = `INST_M_MULHU;
|
||||
3'h4: m_type = `INST_M_DIV;
|
||||
3'h5: m_type = `INST_M_DIVU;
|
||||
3'h6: m_type = `INST_M_REM;
|
||||
3'h7: m_type = `INST_M_REMU;
|
||||
3'h0: m_type = INST_M_MUL;
|
||||
3'h1: m_type = INST_M_MULH;
|
||||
3'h2: m_type = INST_M_MULHSU;
|
||||
3'h3: m_type = INST_M_MULHU;
|
||||
3'h4: m_type = INST_M_DIV;
|
||||
3'h5: m_type = INST_M_DIVU;
|
||||
3'h6: m_type = INST_M_REM;
|
||||
3'h7: m_type = INST_M_REMU;
|
||||
endcase
|
||||
end
|
||||
`endif
|
||||
|
@ -159,14 +154,10 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
ex_type = 'x;
|
||||
op_type = 'x;
|
||||
op_args = 'x;
|
||||
rd_ext = 'x;
|
||||
rs1_ext = 'x;
|
||||
rs2_ext = 'x;
|
||||
rs3_ext = 'x;
|
||||
rd_v = '0;
|
||||
rs1_v = '0;
|
||||
rs2_v = '0;
|
||||
rs3_v = '0;
|
||||
rd_v = 'x;
|
||||
rs1_v = 'x;
|
||||
rs2_v = 'x;
|
||||
rs3_v = 'x;
|
||||
use_rd = 0;
|
||||
use_rs1 = 0;
|
||||
use_rs2 = 0;
|
||||
|
@ -174,20 +165,20 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
is_wstall = 0;
|
||||
|
||||
case (opcode)
|
||||
`INST_I: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
INST_I: begin
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, i_imm);
|
||||
op_args.alu.imm = `SEXT(`XLEN, i_imm);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_R: begin
|
||||
ex_type = `EX_ALU;
|
||||
INST_R: begin
|
||||
ex_type = EX_ALU;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 0;
|
||||
|
@ -197,41 +188,41 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_IREG (rs2);
|
||||
case (func7)
|
||||
`ifdef EXT_M_ENABLE
|
||||
`INST_R_F7_MUL: begin
|
||||
INST_R_F7_MUL: begin
|
||||
// MUL, MULH, MULHSU, MULHU
|
||||
op_type = `INST_OP_BITS'(m_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_MULDIV;
|
||||
op_type = INST_OP_BITS'(m_type);
|
||||
op_args.alu.xtype = ALU_TYPE_MULDIV;
|
||||
end
|
||||
`endif
|
||||
`ifdef EXT_ZICOND_ENABLE
|
||||
`INST_R_F7_ZICOND: begin
|
||||
INST_R_F7_ZICOND: begin
|
||||
// CZERO-EQZ, CZERO-NEZ
|
||||
op_type = func3[1] ? `INST_OP_BITS'(`INST_ALU_CZNE) : `INST_OP_BITS'(`INST_ALU_CZEQ);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_type = func3[1] ? INST_OP_BITS'(INST_ALU_CZNE) : INST_OP_BITS'(INST_ALU_CZEQ);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
end
|
||||
`endif
|
||||
default: begin
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_type = INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
`ifdef XLEN_64
|
||||
`INST_I_W: begin
|
||||
INST_I_W: begin
|
||||
// ADDIW, SLLIW, SRLIW, SRAIW
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 1;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, iw_imm);
|
||||
op_args.alu.imm = `SEXT(`XLEN, iw_imm);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_R_W: begin
|
||||
ex_type = `EX_ALU;
|
||||
INST_R_W: begin
|
||||
ex_type = EX_ALU;
|
||||
op_args.alu.is_w = 1;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 0;
|
||||
|
@ -241,90 +232,90 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_IREG (rs2);
|
||||
case (func7)
|
||||
`ifdef EXT_M_ENABLE
|
||||
`INST_R_F7_MUL: begin
|
||||
INST_R_F7_MUL: begin
|
||||
// MULW, DIVW, DIVUW, REMW, REMUW
|
||||
op_type = `INST_OP_BITS'(m_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_MULDIV;
|
||||
op_type = INST_OP_BITS'(m_type);
|
||||
op_args.alu.xtype = ALU_TYPE_MULDIV;
|
||||
end
|
||||
`endif
|
||||
default: begin
|
||||
// ADDW, SUBW, SLLW, SRLW, SRAW
|
||||
op_type = `INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
op_type = INST_OP_BITS'(r_type);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
end
|
||||
endcase
|
||||
end
|
||||
`endif
|
||||
`INST_LUI: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
INST_LUI: begin
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(INST_ALU_LUI);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
op_args.alu.imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_AUIPC: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
|
||||
op_args.alu.xtype = `ALU_TYPE_ARITH;
|
||||
INST_AUIPC: begin
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(INST_ALU_AUIPC);
|
||||
op_args.alu.xtype = ALU_TYPE_ARITH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = {{`IMM_BITS-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
op_args.alu.imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
|
||||
use_rd = 1;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_JAL: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_BR_JAL);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
INST_JAL: begin
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(INST_BR_JAL);
|
||||
op_args.alu.xtype = ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, jal_imm);
|
||||
op_args.alu.imm = `SEXT(`XLEN, jal_imm);
|
||||
use_rd = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
`INST_JALR: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(`INST_BR_JALR);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
INST_JALR: begin
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(INST_BR_JALR);
|
||||
op_args.alu.xtype = ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, u_12);
|
||||
op_args.alu.imm = `SEXT(`XLEN, u_12);
|
||||
use_rd = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
`INST_B: begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(b_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
INST_B: begin
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(b_type);
|
||||
op_args.alu.xtype = ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.imm = `SEXT(`IMM_BITS, b_imm);
|
||||
op_args.alu.imm = `SEXT(`XLEN, b_imm);
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`INST_FENCE: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_LSU_FENCE;
|
||||
INST_FENCE: begin
|
||||
ex_type = EX_LSU;
|
||||
op_type = INST_LSU_FENCE;
|
||||
op_args.lsu.is_store = 0;
|
||||
op_args.lsu.is_float = 0;
|
||||
op_args.lsu.offset = 0;
|
||||
end
|
||||
`INST_SYS : begin
|
||||
INST_SYS : begin
|
||||
if (func3[1:0] != 0) begin
|
||||
ex_type = `EX_SFU;
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
|
||||
ex_type = EX_SFU;
|
||||
op_type = INST_OP_BITS'(inst_sfu_csr(func3));
|
||||
op_args.csr.addr = u_12;
|
||||
op_args.csr.use_imm = func3[2];
|
||||
use_rd = 1;
|
||||
|
@ -336,24 +327,24 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_IREG (rs1);
|
||||
end
|
||||
end else begin
|
||||
ex_type = `EX_ALU;
|
||||
op_type = `INST_OP_BITS'(s_type);
|
||||
op_args.alu.xtype = `ALU_TYPE_BRANCH;
|
||||
ex_type = EX_ALU;
|
||||
op_type = INST_OP_BITS'(s_type);
|
||||
op_args.alu.xtype = ALU_TYPE_BRANCH;
|
||||
op_args.alu.is_w = 0;
|
||||
op_args.alu.use_imm = 1;
|
||||
op_args.alu.use_PC = 1;
|
||||
op_args.alu.imm = `IMM_BITS'd4;
|
||||
op_args.alu.imm = `XLEN'd4;
|
||||
use_rd = 1;
|
||||
is_wstall = 1;
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FL,
|
||||
INST_FL,
|
||||
`endif
|
||||
`INST_L: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b0, func3});
|
||||
INST_L: begin
|
||||
ex_type = EX_LSU;
|
||||
op_type = INST_OP_BITS'({1'b0, func3});
|
||||
op_args.lsu.is_store = 0;
|
||||
op_args.lsu.is_float = opcode[2];
|
||||
op_args.lsu.offset = u_12;
|
||||
|
@ -367,11 +358,11 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_IREG (rs1);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FS,
|
||||
INST_FS,
|
||||
`endif
|
||||
`INST_S: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'({1'b1, func3});
|
||||
INST_S: begin
|
||||
ex_type = EX_LSU;
|
||||
op_type = INST_OP_BITS'({1'b1, func3});
|
||||
op_args.lsu.is_store = 1;
|
||||
op_args.lsu.is_float = opcode[2];
|
||||
op_args.lsu.offset = s_imm;
|
||||
|
@ -384,13 +375,13 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_IREG (rs2);
|
||||
end
|
||||
`ifdef EXT_F_ENABLE
|
||||
`INST_FMADD, // 7'b1000011
|
||||
`INST_FMSUB, // 7'b1000111
|
||||
`INST_FNMSUB, // 7'b1001011
|
||||
`INST_FNMADD: // 7'b1001111
|
||||
INST_FMADD, // 7'b1000011
|
||||
INST_FMSUB, // 7'b1000111
|
||||
INST_FNMSUB, // 7'b1001011
|
||||
INST_FNMADD: // 7'b1001111
|
||||
begin
|
||||
ex_type = `EX_FPU;
|
||||
op_type = `INST_OP_BITS'({2'b00, 1'b1, opcode[3]});
|
||||
ex_type = EX_FPU;
|
||||
op_type = INST_OP_BITS'({2'b00, 1'b1, opcode[3]});
|
||||
op_args.fpu.frm = func3;
|
||||
op_args.fpu.fmt[0] = func2[0]; // float / double
|
||||
op_args.fpu.fmt[1] = opcode[3] ^ opcode[2]; // SUB
|
||||
|
@ -400,8 +391,8 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`USED_FREG (rs2);
|
||||
`USED_FREG (rs3);
|
||||
end
|
||||
`INST_FCI: begin
|
||||
ex_type = `EX_FPU;
|
||||
INST_FCI: begin
|
||||
ex_type = EX_FPU;
|
||||
op_args.fpu.frm = func3;
|
||||
op_args.fpu.fmt[0] = func2[0]; // float / double
|
||||
op_args.fpu.fmt[1] = rs2[1]; // int32 / int64
|
||||
|
@ -411,7 +402,7 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
5'b00001, // FSUB
|
||||
5'b00010: // FMUL
|
||||
begin
|
||||
op_type = `INST_OP_BITS'({2'b00, 1'b0, func5[1]});
|
||||
op_type = INST_OP_BITS'({2'b00, 1'b0, func5[1]});
|
||||
op_args.fpu.fmt[1] = func5[0]; // SUB
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
|
@ -419,16 +410,16 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
end
|
||||
5'b00100: begin
|
||||
// NCP: FSGNJ=0, FSGNJN=1, FSGNJX=2
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(func3[1:0]);
|
||||
op_type = INST_OP_BITS'(INST_FPU_MISC);
|
||||
op_args.fpu.frm = INST_FRM_BITS'(func3[1:0]);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
5'b00101: begin
|
||||
// NCP: FMIN=6, FMAX=7
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(func3[0] ? 7 : 6);
|
||||
op_type = INST_OP_BITS'(INST_FPU_MISC);
|
||||
op_args.fpu.frm = INST_FRM_BITS'(func3[0] ? 7 : 6);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
|
@ -436,60 +427,60 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
`ifdef FLEN_64
|
||||
5'b01000: begin
|
||||
// FCVT.S.D, FCVT.D.S
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_F2F);
|
||||
op_type = INST_OP_BITS'(INST_FPU_F2F);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
`endif
|
||||
5'b00011: begin
|
||||
// FDIV
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_DIV);
|
||||
op_type = INST_OP_BITS'(INST_FPU_DIV);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
5'b01011: begin
|
||||
// FSQRT
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
|
||||
op_type = INST_OP_BITS'(INST_FPU_SQRT);
|
||||
`USED_FREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
5'b10100: begin
|
||||
// FCMP
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
|
||||
op_type = INST_OP_BITS'(INST_FPU_CMP);
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
`USED_FREG (rs2);
|
||||
end
|
||||
5'b11000: begin
|
||||
// FCVT.W.X, FCVT.WU.X
|
||||
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I);
|
||||
op_type = (rs2[0]) ? INST_OP_BITS'(INST_FPU_F2U) : INST_OP_BITS'(INST_FPU_F2I);
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
5'b11010: begin
|
||||
// FCVT.X.W, FCVT.X.WU
|
||||
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_U2F) : `INST_OP_BITS'(`INST_FPU_I2F);
|
||||
op_type = (rs2[0]) ? INST_OP_BITS'(INST_FPU_U2F) : INST_OP_BITS'(INST_FPU_I2F);
|
||||
`USED_FREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
5'b11100: begin
|
||||
if (func3[0]) begin
|
||||
// NCP: FCLASS=3
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(3);
|
||||
op_type = INST_OP_BITS'(INST_FPU_MISC);
|
||||
op_args.fpu.frm = INST_FRM_BITS'(3);
|
||||
end else begin
|
||||
// NCP: FMV.X.W=4
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(4);
|
||||
op_type = INST_OP_BITS'(INST_FPU_MISC);
|
||||
op_args.fpu.frm = INST_FRM_BITS'(4);
|
||||
end
|
||||
`USED_IREG (rd);
|
||||
`USED_FREG (rs1);
|
||||
end
|
||||
5'b11110: begin
|
||||
// NCP: FMV.W.X=5
|
||||
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
|
||||
op_args.fpu.frm = `INST_FRM_BITS'(5);
|
||||
op_type = INST_OP_BITS'(INST_FPU_MISC);
|
||||
op_args.fpu.frm = INST_FRM_BITS'(5);
|
||||
`USED_FREG (rd);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
|
@ -497,39 +488,39 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
endcase
|
||||
end
|
||||
`endif
|
||||
`INST_EXT1: begin
|
||||
INST_EXT1: begin
|
||||
case (func7)
|
||||
7'h00: begin
|
||||
ex_type = `EX_SFU;
|
||||
ex_type = EX_SFU;
|
||||
is_wstall = 1;
|
||||
case (func3)
|
||||
3'h0: begin // TMC
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_TMC);
|
||||
op_type = INST_OP_BITS'(INST_SFU_TMC);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h1: begin // WSPAWN
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_WSPAWN);
|
||||
op_type = INST_OP_BITS'(INST_SFU_WSPAWN);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h2: begin // SPLIT
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
|
||||
op_type = INST_OP_BITS'(INST_SFU_SPLIT);
|
||||
use_rd = 1;
|
||||
op_args.wctl.is_neg = rs2[0];
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rd);
|
||||
end
|
||||
3'h3: begin // JOIN
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_JOIN);
|
||||
op_type = INST_OP_BITS'(INST_SFU_JOIN);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
3'h4: begin // BAR
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_BAR);
|
||||
op_type = INST_OP_BITS'(INST_SFU_BAR);
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
3'h5: begin // PRED
|
||||
op_type = `INST_OP_BITS'(`INST_SFU_PRED);
|
||||
op_type = INST_OP_BITS'(INST_SFU_PRED);
|
||||
op_args.wctl.is_neg = rd[0];
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
|
@ -547,6 +538,8 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
// disable write to integer register r0
|
||||
wire wb = use_rd && (rd_v != 0);
|
||||
|
||||
wire [2:0] used_rs = {use_rs3, use_rs2, use_rs1};
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (0)
|
||||
|
@ -555,8 +548,8 @@ module VX_decode import VX_gpu_pkg::*; #(
|
|||
.reset (reset),
|
||||
.valid_in (fetch_if.valid),
|
||||
.ready_in (fetch_if.ready),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, rd_v, rs1_v, rs2_v, rs3_v, rd_ext, rs1_ext, rs2_ext, rs3_ext}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3, decode_if.data.rd_ext, decode_if.data.rs1_ext, decode_if.data.rs2_ext, decode_if.data.rs3_ext}),
|
||||
.data_in ({fetch_if.data.uuid, fetch_if.data.wid, fetch_if.data.tmask, fetch_if.data.PC, ex_type, op_type, op_args, wb, used_rs, rd_v, rs1_v, rs2_v, rs3_v}),
|
||||
.data_out ({decode_if.data.uuid, decode_if.data.wid, decode_if.data.tmask, decode_if.data.PC, decode_if.data.ex_type, decode_if.data.op_type, decode_if.data.op_args, decode_if.data.wb, decode_if.data.used_rs, decode_if.data.rd, decode_if.data.rs1, decode_if.data.rs2, decode_if.data.rs3}),
|
||||
.valid_out (decode_if.valid),
|
||||
.ready_out (decode_if.ready)
|
||||
);
|
||||
|
|
|
@ -20,28 +20,28 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_stalls [`NUM_EX_UNITS],
|
||||
output wire [PERF_CTR_BITS-1:0] perf_stalls [NUM_EX_UNITS],
|
||||
`endif
|
||||
// inputs
|
||||
VX_operands_if.slave operands_if,
|
||||
VX_operands_if.slave operands_if [`NUM_OPCS],
|
||||
|
||||
// outputs
|
||||
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
|
||||
VX_dispatch_if.master dispatch_if [NUM_EX_UNITS]
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
|
||||
localparam DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `NR_BITS + (3 * `NUM_THREADS * `XLEN) + `NT_WIDTH;
|
||||
localparam DATAW = UUID_WIDTH + ISSUE_WIS_W + `SIMD_WIDTH + PC_BITS + INST_OP_BITS + INST_ARGS_BITS + 1 + NR_BITS + (NUM_SRC_OPDS * `SIMD_WIDTH * `XLEN) + NT_WIDTH;
|
||||
|
||||
wire [`NUM_THREADS-1:0][`NT_WIDTH-1:0] tids;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_tids
|
||||
assign tids[i] = `NT_WIDTH'(i);
|
||||
wire [`SIMD_WIDTH-1:0][NT_WIDTH-1:0] tids;
|
||||
for (genvar i = 0; i < `SIMD_WIDTH; ++i) begin : g_tids
|
||||
assign tids[i] = NT_WIDTH'(i);
|
||||
end
|
||||
|
||||
wire [`NT_WIDTH-1:0] last_active_tid;
|
||||
wire [NT_WIDTH-1:0] last_active_tid;
|
||||
|
||||
VX_find_first #(
|
||||
.N (`NUM_THREADS),
|
||||
.DATAW (`NT_WIDTH),
|
||||
.N (`SIMD_WIDTH),
|
||||
.DATAW (NT_WIDTH),
|
||||
.REVERSE (1)
|
||||
) last_tid_select (
|
||||
.valid_in (operands_if.data.tmask),
|
||||
|
@ -50,10 +50,10 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
`UNUSED_PIN (valid_out)
|
||||
);
|
||||
|
||||
wire [`NUM_EX_UNITS-1:0] operands_ready_in;
|
||||
wire [NUM_EX_UNITS-1:0] operands_ready_in;
|
||||
assign operands_if.ready = operands_ready_in[operands_if.data.ex_type];
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_buffers
|
||||
for (genvar i = 0; i < NUM_EX_UNITS; ++i) begin : g_buffers
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (2),
|
||||
|
@ -61,7 +61,7 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
) buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (operands_if.valid && (operands_if.data.ex_type == `EX_BITS'(i))),
|
||||
.valid_in (operands_if.valid && (operands_if.data.ex_type == EX_BITS'(i))),
|
||||
.ready_in (operands_ready_in[i]),
|
||||
.data_in ({
|
||||
operands_if.data.uuid,
|
||||
|
@ -84,16 +84,16 @@ module VX_dispatch import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_stalls_r;
|
||||
reg [NUM_EX_UNITS-1:0][PERF_CTR_BITS-1:0] perf_stalls_r;
|
||||
|
||||
wire operands_if_stall = operands_if.valid && ~operands_if.ready;
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_stalls
|
||||
for (genvar i = 0; i < NUM_EX_UNITS; ++i) begin : g_perf_stalls
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_stalls_r[i] <= '0;
|
||||
end else begin
|
||||
perf_stalls_r[i] <= perf_stalls_r[i] + `PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == `EX_BITS'(i));
|
||||
perf_stalls_r[i] <= perf_stalls_r[i] + PERF_CTR_BITS'(operands_if_stall && operands_if.data.ex_type == EX_BITS'(i));
|
||||
end
|
||||
end
|
||||
assign perf_stalls[i] = perf_stalls_r[i];
|
||||
|
|
|
@ -30,19 +30,19 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
);
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(`SIMD_WIDTH, NUM_LANES), ("invalid parameter"))
|
||||
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
|
||||
localparam NUM_PACKETS = `NUM_THREADS / NUM_LANES;
|
||||
localparam NUM_PACKETS = `SIMD_WIDTH / NUM_LANES;
|
||||
localparam PID_BITS = `CLOG2(NUM_PACKETS);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam BATCH_COUNT = `ISSUE_WIDTH / BLOCK_SIZE;
|
||||
localparam BATCH_COUNT_W= `LOG2UP(BATCH_COUNT);
|
||||
localparam ISSUE_W = `LOG2UP(`ISSUE_WIDTH);
|
||||
localparam IN_DATAW = `UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * `NUM_THREADS * `XLEN);
|
||||
localparam OUT_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `INST_OP_BITS + `INST_ARGS_BITS + 1 + `PC_BITS + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam FANOUT_ENABLE= (`NUM_THREADS > (MAX_FANOUT + MAX_FANOUT /2));
|
||||
localparam IN_DATAW = UUID_WIDTH + ISSUE_WIS_W + `SIMD_WIDTH + INST_OP_BITS + INST_ARGS_BITS + 1 + PC_BITS + NR_BITS + NT_WIDTH + (3 * `SIMD_WIDTH * `XLEN);
|
||||
localparam OUT_DATAW = UUID_WIDTH + NW_WIDTH + NUM_LANES + INST_OP_BITS + INST_ARGS_BITS + 1 + PC_BITS + NR_BITS + NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam FANOUT_ENABLE= (`SIMD_WIDTH > (MAX_FANOUT + MAX_FANOUT /2));
|
||||
|
||||
localparam DATA_TMASK_OFF = IN_DATAW - (`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS);
|
||||
localparam DATA_TMASK_OFF = IN_DATAW - (UUID_WIDTH + ISSUE_WIS_W + `SIMD_WIDTH);
|
||||
localparam DATA_REGS_OFF = 0;
|
||||
|
||||
wire [`ISSUE_WIDTH-1:0] dispatch_valid;
|
||||
|
@ -111,7 +111,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
wire [ISSUE_W-1:0] issue_idx = issue_indices[block_idx];
|
||||
wire valid_p, ready_p;
|
||||
|
||||
if (`NUM_THREADS > NUM_LANES) begin : g_partial_threads
|
||||
if (`SIMD_WIDTH > NUM_LANES) begin : g_partial_threads
|
||||
reg [NUM_PACKETS-1:0] sent_mask_p;
|
||||
wire [PID_WIDTH-1:0] start_p_n, start_p, end_p;
|
||||
wire dispatch_valid_r;
|
||||
|
@ -141,10 +141,10 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
wire [NUM_PACKETS-1:0][NUM_LANES-1:0] per_packet_tmask;
|
||||
wire [NUM_PACKETS-1:0][2:0][NUM_LANES-1:0][`XLEN-1:0] per_packet_regs;
|
||||
|
||||
wire [`NUM_THREADS-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
wire [`SIMD_WIDTH-1:0] dispatch_tmask = dispatch_data[issue_idx][DATA_TMASK_OFF +: `SIMD_WIDTH];
|
||||
wire [`SIMD_WIDTH-1:0][`XLEN-1:0] dispatch_rs1_data = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `SIMD_WIDTH * `XLEN +: `SIMD_WIDTH * `XLEN];
|
||||
wire [`SIMD_WIDTH-1:0][`XLEN-1:0] dispatch_rs2_data = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `SIMD_WIDTH * `XLEN +: `SIMD_WIDTH * `XLEN];
|
||||
wire [`SIMD_WIDTH-1:0][`XLEN-1:0] dispatch_rs3_data = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `SIMD_WIDTH * `XLEN +: `SIMD_WIDTH * `XLEN];
|
||||
|
||||
for (genvar i = 0; i < NUM_PACKETS; ++i) begin : g_per_packet_data
|
||||
for (genvar j = 0; j < NUM_LANES; ++j) begin : g_j
|
||||
|
@ -219,10 +219,10 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
assign block_done[block_idx] = fire_eop || ~dispatch_valid[issue_idx];
|
||||
end else begin : g_full_threads
|
||||
assign valid_p = dispatch_valid[issue_idx];
|
||||
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `NUM_THREADS];
|
||||
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `NUM_THREADS * `XLEN +: `NUM_THREADS * `XLEN];
|
||||
assign block_tmask[block_idx] = dispatch_data[issue_idx][DATA_TMASK_OFF +: `SIMD_WIDTH];
|
||||
assign block_regs[block_idx][0] = dispatch_data[issue_idx][DATA_REGS_OFF + 2 * `SIMD_WIDTH * `XLEN +: `SIMD_WIDTH * `XLEN];
|
||||
assign block_regs[block_idx][1] = dispatch_data[issue_idx][DATA_REGS_OFF + 1 * `SIMD_WIDTH * `XLEN +: `SIMD_WIDTH * `XLEN];
|
||||
assign block_regs[block_idx][2] = dispatch_data[issue_idx][DATA_REGS_OFF + 0 * `SIMD_WIDTH * `XLEN +: `SIMD_WIDTH * `XLEN];
|
||||
assign block_pid[block_idx] = '0;
|
||||
assign block_sop[block_idx] = 1'b1;
|
||||
assign block_eop[block_idx] = 1'b1;
|
||||
|
@ -241,7 +241,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
assign isw = block_idx;
|
||||
end
|
||||
|
||||
wire [`NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`NUM_THREADS +: ISSUE_WIS_W], isw);
|
||||
wire [NW_WIDTH-1:0] block_wid = wis_to_wid(dispatch_data[issue_idx][DATA_TMASK_OFF+`SIMD_WIDTH +: ISSUE_WIS_W], isw);
|
||||
|
||||
logic [OUT_DATAW-1:0] execute_data, execute_data_w;
|
||||
|
||||
|
@ -255,10 +255,10 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
.valid_in (valid_p),
|
||||
.ready_in (ready_p),
|
||||
.data_in ({
|
||||
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`NUM_THREADS+ISSUE_WIS_W],
|
||||
dispatch_data[issue_idx][IN_DATAW-1 : DATA_TMASK_OFF+`SIMD_WIDTH+ISSUE_WIS_W],
|
||||
block_wid,
|
||||
block_tmask[block_idx],
|
||||
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `NUM_THREADS * `XLEN],
|
||||
dispatch_data[issue_idx][DATA_TMASK_OFF-1 : DATA_REGS_OFF + 3 * `SIMD_WIDTH * `XLEN],
|
||||
block_regs[block_idx][0],
|
||||
block_regs[block_idx][1],
|
||||
block_regs[block_idx][2],
|
||||
|
@ -270,7 +270,7 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
|
|||
.ready_out (execute_if[block_idx].ready)
|
||||
);
|
||||
|
||||
if (`NUM_THREADS != NUM_LANES) begin : g_execute_data_w_partial
|
||||
if (`SIMD_WIDTH != NUM_LANES) begin : g_execute_data_w_partial
|
||||
assign execute_data_w = execute_data;
|
||||
end else begin : g_execute_data_w_full
|
||||
always @(*) begin
|
||||
|
|
|
@ -33,10 +33,10 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
VX_lsu_mem_if.master lsu_mem_if [`NUM_LSU_BLOCKS],
|
||||
|
||||
// dispatch interface
|
||||
VX_dispatch_if.slave dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
VX_dispatch_if.slave dispatch_if [NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
|
||||
// commit interface
|
||||
VX_commit_if.master commit_if [`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
VX_commit_if.master commit_if [NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
|
||||
// scheduler interfaces
|
||||
VX_sched_csr_if.slave sched_csr_if,
|
||||
|
@ -56,8 +56,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.dispatch_if (dispatch_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.dispatch_if (dispatch_if[EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[EX_ALU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.branch_ctl_if (branch_ctl_if)
|
||||
);
|
||||
|
||||
|
@ -69,8 +69,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
`SCOPE_IO_BIND (0)
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.dispatch_if (dispatch_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.dispatch_if (dispatch_if[EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[EX_LSU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.lsu_mem_if (lsu_mem_if)
|
||||
);
|
||||
|
||||
|
@ -80,8 +80,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
) fpu_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.dispatch_if (dispatch_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.dispatch_if (dispatch_if[EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[EX_FPU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.fpu_csr_if (fpu_csr_if)
|
||||
);
|
||||
`endif
|
||||
|
@ -97,8 +97,8 @@ module VX_execute import VX_gpu_pkg::*; #(
|
|||
.pipeline_perf (pipeline_perf),
|
||||
`endif
|
||||
.base_dcrs (base_dcrs),
|
||||
.dispatch_if (dispatch_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[`EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.dispatch_if (dispatch_if[EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
.commit_if (commit_if[EX_SFU * `ISSUE_WIDTH +: `ISSUE_WIDTH]),
|
||||
`ifdef EXT_F_ENABLE
|
||||
.fpu_csr_if (fpu_csr_if),
|
||||
`endif
|
||||
|
|
|
@ -38,8 +38,8 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
wire [ICACHE_TAG_WIDTH-1:0] icache_req_tag;
|
||||
wire icache_req_ready;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] req_tag, rsp_tag;
|
||||
wire [UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [NW_WIDTH-1:0] req_tag, rsp_tag;
|
||||
|
||||
wire icache_req_fire = icache_req_valid && icache_req_ready;
|
||||
|
||||
|
@ -47,11 +47,11 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
|
||||
assign {rsp_uuid, rsp_tag} = icache_bus_if.rsp_data.tag;
|
||||
|
||||
wire [`PC_BITS-1:0] rsp_PC;
|
||||
wire [PC_BITS-1:0] rsp_PC;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (`PC_BITS + `NUM_THREADS),
|
||||
.DATAW (PC_BITS + `NUM_THREADS),
|
||||
.SIZE (`NUM_WARPS),
|
||||
.RDW_MODE ("R"),
|
||||
.LUTRAM (1)
|
||||
|
@ -141,9 +141,9 @@ module VX_fetch import VX_gpu_pkg::*; #(
|
|||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 1, 6, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS +
|
||||
`UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
|
||||
`UUID_WIDTH + (ICACHE_WORD_SIZE * 8)
|
||||
UUID_WIDTH + NW_WIDTH + `NUM_THREADS + PC_BITS +
|
||||
UUID_WIDTH + ICACHE_WORD_SIZE + ICACHE_ADDR_WIDTH +
|
||||
UUID_WIDTH + (ICACHE_WORD_SIZE * 8)
|
||||
), {
|
||||
schedule_if.valid,
|
||||
schedule_if.ready,
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
module VX_fpu_unit import VX_fpu_pkg::*; #(
|
||||
module VX_fpu_unit import VX_gpu_pkg::*, VX_fpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -29,10 +29,10 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam BLOCK_SIZE = `NUM_FPU_BLOCKS;
|
||||
localparam NUM_LANES = `NUM_FPU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam TAG_WIDTH = `LOG2UP(`FPUQ_SIZE);
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `NUM_THREADS);
|
||||
localparam PARTIAL_BW = (BLOCK_SIZE != `ISSUE_WIDTH) || (NUM_LANES != `SIMD_WIDTH);
|
||||
|
||||
VX_execute_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
|
@ -64,11 +64,11 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
fflags_t fpu_rsp_fflags;
|
||||
wire fpu_rsp_has_fflags;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] fpu_rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] fpu_rsp_wid;
|
||||
wire [UUID_WIDTH-1:0] fpu_rsp_uuid;
|
||||
wire [NW_WIDTH-1:0] fpu_rsp_wid;
|
||||
wire [NUM_LANES-1:0] fpu_rsp_tmask;
|
||||
wire [`PC_BITS-1:0] fpu_rsp_PC;
|
||||
wire [`NR_BITS-1:0] fpu_rsp_rd;
|
||||
wire [PC_BITS-1:0] fpu_rsp_PC;
|
||||
wire [NR_BITS-1:0] fpu_rsp_rd;
|
||||
wire [PID_WIDTH-1:0] fpu_rsp_pid, fpu_rsp_pid_u;
|
||||
wire fpu_rsp_sop, fpu_rsp_sop_u;
|
||||
wire fpu_rsp_eop, fpu_rsp_eop_u;
|
||||
|
@ -76,14 +76,14 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
wire [TAG_WIDTH-1:0] fpu_req_tag, fpu_rsp_tag;
|
||||
wire mdata_full;
|
||||
|
||||
wire [`INST_FMT_BITS-1:0] fpu_fmt = per_block_execute_if[block_idx].data.op_args.fpu.fmt;
|
||||
wire [`INST_FRM_BITS-1:0] fpu_frm = per_block_execute_if[block_idx].data.op_args.fpu.frm;
|
||||
wire [INST_FMT_BITS-1:0] fpu_fmt = per_block_execute_if[block_idx].data.op_args.fpu.fmt;
|
||||
wire [INST_FRM_BITS-1:0] fpu_frm = per_block_execute_if[block_idx].data.op_args.fpu.frm;
|
||||
|
||||
wire execute_fire = per_block_execute_if[block_idx].valid && per_block_execute_if[block_idx].ready;
|
||||
wire fpu_rsp_fire = fpu_rsp_valid && fpu_rsp_ready;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + PID_WIDTH + 1 + 1),
|
||||
.DATAW (UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (`FPUQ_SIZE)
|
||||
) tag_store (
|
||||
.clk (clk),
|
||||
|
@ -112,10 +112,10 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
end
|
||||
|
||||
// resolve dynamic FRM from CSR
|
||||
wire [`INST_FRM_BITS-1:0] fpu_req_frm;
|
||||
wire [INST_FRM_BITS-1:0] fpu_req_frm;
|
||||
`ASSIGN_BLOCKED_WID (fpu_csr_if[block_idx].read_wid, per_block_execute_if[block_idx].data.wid, block_idx, `NUM_FPU_BLOCKS)
|
||||
assign fpu_req_frm = (per_block_execute_if[block_idx].data.op_type != `INST_FPU_MISC
|
||||
&& fpu_frm == `INST_FRM_DYN) ? fpu_csr_if[block_idx].read_frm : fpu_frm;
|
||||
assign fpu_req_frm = (per_block_execute_if[block_idx].data.op_type != INST_FPU_MISC
|
||||
&& fpu_frm == INST_FRM_DYN) ? fpu_csr_if[block_idx].read_frm : fpu_frm;
|
||||
|
||||
// submit FPU request
|
||||
|
||||
|
@ -234,7 +234,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
assign fpu_csr_tmp_if.write_fflags = fpu_rsp_fflags_q;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_WIDTH + $bits(fflags_t)),
|
||||
.DATAW (1 + NW_WIDTH + $bits(fflags_t)),
|
||||
.RESETW (1)
|
||||
) fpu_csr_reg (
|
||||
.clk (clk),
|
||||
|
@ -247,7 +247,7 @@ module VX_fpu_unit import VX_fpu_pkg::*; #(
|
|||
// send response
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.DATAW (UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.SIZE (0)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
|
|
|
@ -29,12 +29,12 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
);
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(`ISSUE_WIDTH, BLOCK_SIZE), ("invalid parameter"))
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(`NUM_THREADS, NUM_LANES), ("invalid parameter"))
|
||||
`STATIC_ASSERT (`IS_DIVISBLE(`SIMD_WIDTH, NUM_LANES), ("invalid parameter"))
|
||||
localparam BLOCK_SIZE_W = `LOG2UP(BLOCK_SIZE);
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam DATA_WIS_OFF = DATAW - (`UUID_WIDTH + `NW_WIDTH);
|
||||
localparam DATAW = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + 1 + NR_BITS + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam DATA_WIS_OFF = DATAW - (UUID_WIDTH + NW_WIDTH);
|
||||
|
||||
wire [BLOCK_SIZE-1:0] commit_in_valid;
|
||||
wire [BLOCK_SIZE-1:0][DATAW-1:0] commit_in_data;
|
||||
|
@ -95,8 +95,8 @@ module VX_gather_unit import VX_gpu_pkg::*; #(
|
|||
.ready_out (commit_tmp_if.ready)
|
||||
);
|
||||
|
||||
logic [`NUM_THREADS-1:0] commit_tmask_w;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] commit_data_w;
|
||||
logic [`SIMD_WIDTH-1:0] commit_tmask_w;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] commit_data_w;
|
||||
if (PID_BITS != 0) begin : g_commit_data_with_pid
|
||||
always @(*) begin
|
||||
commit_tmask_w = '0;
|
||||
|
|
224
hw/rtl/core/VX_gpr_unit.sv
Normal file
224
hw/rtl/core/VX_gpr_unit.sv
Normal file
|
@ -0,0 +1,224 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// reset all GPRs in debug mode
|
||||
`ifdef SIMULATION
|
||||
`ifndef NDEBUG
|
||||
`define GPR_RESET
|
||||
`endif
|
||||
`endif
|
||||
|
||||
module VX_gpr_unit import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter NUM_REQS = 1,
|
||||
parameter NUM_BANKS = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire [PERF_CTR_BITS-1:0] perf_stalls,
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if,
|
||||
VX_opc_if.slave opc_if [NUM_REQS]
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
|
||||
// ---- Derived sizing constants for the banked register file ----
// Bits needed to index one of the NUM_REQS requesters.
localparam REQ_SEL_BITS = `CLOG2(NUM_REQS);
|
||||
// Vector-sizing form of REQ_SEL_BITS (presumably UP clamps to at least 1 -- confirm in VX_define.vh).
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
|
||||
// Bits needed to index one of the NUM_BANKS banks.
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
// One bank entry holds XLEN bits for each of the SIMD_WIDTH lanes.
localparam GPR_BANK_DATAW = `XLEN * `SIMD_WIDTH;
|
||||
// Total entries (warps x registers x SIMD groups) split evenly across the banks.
localparam GPR_BANK_SIZE = (PER_ISSUE_WARPS * NUM_REGS * SIMD_COUNT) / NUM_BANKS;
|
||||
localparam GPR_BANK_ADDRW = `CLOG2(GPR_BANK_SIZE);
|
||||
// Bank-select bits taken from the warp index: exactly one, and only when more
// than one bank-select bit exists and the warp index is non-empty.
localparam BANKID_WIS_BITS = (BANK_SEL_BITS > 1 && ISSUE_WIS_BITS != 0) ? 1 : 0;
|
||||
// Remaining bank-select bits are taken from the register index.
localparam BANKID_REG_BITS = BANK_SEL_BITS - BANKID_WIS_BITS;
|
||||
// Warp-index / register-index bits left over for addressing inside a bank.
localparam PER_BANK_WIS_BITS = ISSUE_WIS_BITS - BANKID_WIS_BITS;
|
||||
localparam PER_BANK_REG_BITS = NR_BITS - BANKID_REG_BITS;
|
||||
localparam PER_BANK_WIS_WIDTH = `UP(PER_BANK_WIS_BITS);
|
||||
localparam PER_BANK_REG_WIDTH = `UP(PER_BANK_REG_BITS);
|
||||
// Request payload: 2-bit operand id + SIMD index + per-bank warp bits + per-bank register bits.
// NOTE(review): this sum uses the *_BITS values while the unpacked fields are sized with the
// *_WIDTH values; the two differ if either *_BITS is 0 -- confirm that case cannot occur.
localparam OPC_REQ_DATAW = 2 + SIMD_IDX_W + PER_BANK_WIS_BITS + PER_BANK_REG_BITS;
|
||||
// Response payload: 2-bit operand id + one XLEN word per SIMD lane.
localparam OPC_RSP_DATAW = 2 + `SIMD_WIDTH * `XLEN;
|
||||
// Write-enable width: XLENB enable bits per SIMD lane.
localparam BYTEENW = `SIMD_WIDTH * XLENB;
|
||||
|
||||
wire [NUM_REQS-1:0] opc_req_valid, opc_req_ready;
|
||||
wire [NUM_REQS-1:0][OPC_REQ_DATAW-1:0] opc_req_data;
|
||||
wire [NUM_REQS-1:0][BANK_SEL_WIDTH-1:0] opc_req_bank_idx;
|
||||
|
||||
wire [NUM_BANKS-1:0] gpr_req_valid, gpr_req_ready;
|
||||
wire [NUM_BANKS-1:0][OPC_REQ_DATAW-1:0] gpr_req_data;
|
||||
wire [NUM_BANKS-1:0][1:0] gpr_req_opd_id;
|
||||
wire [NUM_BANKS-1:0][SIMD_IDX_W-1:0] gpr_req_sid;
|
||||
wire [NUM_BANKS-1:0][PER_BANK_WIS_WIDTH-1:0] gpr_req_wis;
|
||||
wire [NUM_BANKS-1:0][PER_BANK_REG_WIDTH-1:0] gpr_reg_id;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_req_idx;
|
||||
|
||||
wire [NUM_BANKS-1:0][`SIMD_WIDTH-1:0][`XLEN-1:0] gpr_rd_data;
|
||||
|
||||
wire [NUM_BANKS-1:0] gpr_rsp_valid;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rsp_idx;
|
||||
wire [NUM_BANKS-1:0][1:0] gpr_rsp_opd_id;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [PERF_CTR_BITS-1:0] collisions;
|
||||
`endif
|
||||
|
||||
// Flatten each requester's interface into the plain vectors consumed by the request crossbar.
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_opc_req
|
||||
assign opc_req_valid[i] = opc_if[i].req_valid;
|
||||
// Payload keeps only the address bits used inside a bank; the low bits that
// select the bank are stripped here and sent separately as opc_req_bank_idx.
assign opc_req_data[i] = {
|
||||
opc_if[i].req_data.opd_id,
|
||||
opc_if[i].req_data.sid,
|
||||
opc_if[i].req_data.wis[ISSUE_WIS_W-1:BANKID_WIS_BITS],
|
||||
opc_if[i].req_data.reg_id[NR_BITS-1:BANKID_REG_BITS]
|
||||
};
|
||||
// Bank index = {low warp-index bits, low register-index bits}.
// NOTE(review): CONCAT is defined outside this file; presumably it drops a zero-width
// operand -- confirm what it emits when both widths are 0 (NUM_BANKS == 1).
`CONCAT(opc_req_bank_idx[i], opc_if[i].req_data.wis[BANKID_WIS_BITS-1:0], opc_if[i].req_data.reg_id[BANKID_REG_BITS-1:0], BANKID_WIS_BITS, BANKID_REG_BITS)
|
||||
assign opc_if[i].req_ready = opc_req_ready[i];
|
||||
end
|
||||
|
||||
// Request crossbar: routes each of the NUM_REQS read requests to the bank named by
// its sel_in value. sel_out reports, per bank, which requester was granted; that index
// is later used to steer the response back.
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (OPC_REQ_DATAW),
|
||||
.ARBITER ("P"),
|
||||
.OUT_BUF (1),
|
||||
.PERF_CTR_BITS (PERF_CTR_BITS)
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
// Collision count reported by the crossbar; accumulated into perf_stalls below.
.collisions(collisions),
|
||||
`endif
|
||||
.valid_in (opc_req_valid),
|
||||
.data_in (opc_req_data),
|
||||
.sel_in (opc_req_bank_idx),
|
||||
.ready_in (opc_req_ready),
|
||||
.valid_out (gpr_req_valid),
|
||||
.data_out (gpr_req_data),
|
||||
.sel_out (gpr_req_idx),
|
||||
// Banks never back-pressure: every granted request is accepted immediately.
.ready_out ('1)
|
||||
);
|
||||
|
||||
// Write address inside a bank, shared by all banks (the bank itself is chosen by gpr_wr_bank_idx).
wire [GPR_BANK_ADDRW-1:0] gpr_wr_addr;
|
||||
// When a SIMD index and/or per-bank warp bits exist, they form the upper address bits
// above the per-bank register index; otherwise the register index alone is the address.
if (SIMD_IDX_BITS != 0 || PER_BANK_WIS_BITS != 0) begin : g_gpr_wr_addr
|
||||
wire [SIMD_IDX_BITS + PER_BANK_WIS_BITS-1:0] tmp;
|
||||
// tmp = {sid, upper warp-index bits}; CONCAT presumably omits a zero-width operand -- confirm.
`CONCAT(tmp, writeback_if.data.sid, writeback_if.data.wis[ISSUE_WIS_W-1:BANKID_WIS_BITS], SIMD_IDX_BITS, PER_BANK_WIS_BITS);
|
||||
assign gpr_wr_addr = {tmp, writeback_if.data.rd[NR_BITS-1:BANKID_REG_BITS]};
|
||||
end else begin : g_gpr_wr_addr_reg
|
||||
assign gpr_wr_addr = writeback_if.data.rd[NR_BITS-1:BANKID_REG_BITS];
|
||||
end
|
||||
|
||||
// Bank targeted by the current writeback, built from the same low warp-index and
// register-index bits that the read path uses to form its bank index.
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
|
||||
if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx
|
||||
`CONCAT(gpr_wr_bank_idx, writeback_if.data.wis[BANKID_WIS_BITS-1:0], writeback_if.data.rd[BANKID_REG_BITS-1:0], BANKID_WIS_BITS, BANKID_REG_BITS)
|
||||
end else begin : g_gpr_wr_bank_idx_0
|
||||
// Single bank: the index is a constant zero.
assign gpr_wr_bank_idx = '0;
|
||||
end
|
||||
|
||||
// Write-enable mask for a bank entry: each lane's thread-mask bit is replicated
// across that lane's XLENB enable bits, so inactive lanes keep their old value.
wire [BYTEENW-1:0] gpr_wr_byteen;
|
||||
for (genvar i = 0; i < `SIMD_WIDTH; ++i) begin : g_gpr_wr_byteen
|
||||
assign gpr_wr_byteen[i*XLENB+:XLENB] = {XLENB{writeback_if.data.tmask[i]}};
|
||||
end
|
||||
|
||||
// Unpack each bank's granted request payload into its fields, in the same
// MSB-to-LSB order used when the payload was packed (opd_id, sid, wis, reg_id).
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_req_data
|
||||
assign {gpr_req_opd_id[b], gpr_req_sid[b], gpr_req_wis[b], gpr_reg_id[b]} = gpr_req_data[b];
|
||||
end
|
||||
|
||||
// One dual-port RAM per bank, plus a side pipeline that carries the requester
// index and operand id alongside the read.
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams
|
||||
// A writeback hits this bank only when its bank index equals b
// (always, when there is a single bank).
wire gpr_wr_enabled;
|
||||
if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks
|
||||
assign gpr_wr_enabled = writeback_if.valid && (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
|
||||
end else begin : g_gpr_wr_enabled
|
||||
assign gpr_wr_enabled = writeback_if.valid;
|
||||
end
|
||||
|
||||
// Read address: {sid, per-bank warp bits, per-bank register index}, mirroring
// the layout of the write address.
wire [GPR_BANK_ADDRW-1:0] gpr_rd_addr;
|
||||
if (SIMD_IDX_BITS != 0 || PER_BANK_WIS_BITS != 0) begin : g_gpr_rd_addr
|
||||
wire [(SIMD_IDX_BITS + PER_BANK_WIS_BITS)-1:0] tmp;
|
||||
`CONCAT(tmp, gpr_req_sid[b], gpr_req_wis[b], SIMD_IDX_BITS, PER_BANK_WIS_BITS);
|
||||
assign gpr_rd_addr = {tmp, gpr_reg_id[b]};
|
||||
end else begin : g_gpr_rd_addr_reg
|
||||
assign gpr_rd_addr = gpr_reg_id[b];
|
||||
end
|
||||
|
||||
// Bank storage. All banks see the same write address, data and enable mask;
// only the per-bank write strobe differs.
VX_dp_ram #(
|
||||
.DATAW (GPR_BANK_DATAW),
|
||||
.SIZE (GPR_BANK_SIZE),
|
||||
.WRENW (BYTEENW),
|
||||
`ifdef GPR_RESET
|
||||
// Only set when GPR_RESET is defined (simulation builds without NDEBUG, per the top of this file).
.RESET_RAM (1),
|
||||
`endif
|
||||
// Registered read output; RDW_MODE semantics are defined by VX_dp_ram -- see that module.
.OUT_REG (1),
|
||||
.RDW_MODE ("R")
|
||||
) gpr_ram (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (gpr_req_valid[b]),
|
||||
.wren (gpr_wr_byteen),
|
||||
.write (gpr_wr_enabled),
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if.data.data),
|
||||
.raddr (gpr_rd_addr),
|
||||
.rdata (gpr_rd_data[b])
|
||||
);
|
||||
|
||||
// Carries valid + {requester index, operand id} past the RAM read so the response
// can be steered back; presumably one cycle, matching OUT_REG above -- confirm in VX_pipe_buffer.
VX_pipe_buffer #(
|
||||
.DATAW (REQ_SEL_WIDTH + 2)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpr_req_valid[b]),
|
||||
.data_in ({gpr_req_idx[b], gpr_req_opd_id[b]}),
|
||||
`UNUSED_PIN (ready_in),
|
||||
.valid_out(gpr_rsp_valid[b]),
|
||||
.data_out ({gpr_rsp_idx[b], gpr_rsp_opd_id[b]}),
|
||||
`UNUSED_PIN (ready_out)
|
||||
);
|
||||
end
|
||||
|
||||
// Per-bank response payload handed to the response crosspoint.
wire [NUM_BANKS-1:0][OPC_RSP_DATAW-1:0] gpr_rsp_data;

// Drive the payload from the two signals produced for it in g_gpr_rams: the operand id
// delayed by pipe_reg1 and the bank's read data. The original left this wire undriven,
// so requesters would receive undefined data.
// Width check: 2 (opd_id) + SIMD_WIDTH * XLEN (read data) == OPC_RSP_DATAW.
// NOTE(review): assumes the interface's rsp_data packs opd_id above data, mirroring the
// request layout -- confirm against VX_opc_if.
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rsp_data
    assign gpr_rsp_data[b] = {gpr_rsp_opd_id[b], gpr_rd_data[b]};
end
|
||||
|
||||
`AOS_TO_ITF_RSP_V (opc, opc_if, NUM_REQS, OPC_RSP_DATAW)
|
||||
|
||||
// Response crosspoint: steers each bank's response to the requester recorded in
// gpr_rsp_idx. Both ready pins are left unconnected, so there is no back-pressure
// on this path.
VX_stream_xpoint #(
|
||||
.NUM_INPUTS (NUM_BANKS),
|
||||
.NUM_OUTPUTS (NUM_REQS),
|
||||
.DATAW (OPC_RSP_DATAW),
|
||||
.OUT_BUF (0) // no output buffering
|
||||
) rsp_xpoint (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpr_rsp_valid),
|
||||
.data_in (gpr_rsp_data),
|
||||
.sel_in (gpr_rsp_idx),
|
||||
`UNUSED_PIN (ready_in),
|
||||
// opc_rsp_valid / opc_rsp_data are presumably declared and bound to opc_if by the
// AOS_TO_ITF_RSP_V macro above -- confirm in VX_define.vh.
.valid_out (opc_rsp_valid),
|
||||
.data_out (opc_rsp_data),
|
||||
`UNUSED_PIN (ready_out)
|
||||
);
|
||||
|
||||
// Performance counter: running total of the collision count reported by req_xbar.
`ifdef PERF_ENABLE
|
||||
reg [PERF_CTR_BITS-1:0] collisions_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
collisions_r <= '0;
|
||||
end else begin
|
||||
// Add this cycle's collision count; the counter wraps at PERF_CTR_BITS.
collisions_r <= collisions_r + collisions;
|
||||
end
|
||||
end
|
||||
// Exposed to the parent as the operand-read stall count.
assign perf_stalls = collisions_r;
|
||||
`endif
|
||||
|
||||
endmodule
|
|
@ -20,7 +20,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
|
||||
output wire [PERF_CTR_BITS-1:0] perf_stalls,
|
||||
`endif
|
||||
|
||||
// inputs
|
||||
|
@ -30,7 +30,9 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
VX_ibuffer_if.master ibuffer_if [PER_ISSUE_WARPS]
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + (`REG_EXT_BITS * 4);
|
||||
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = UUID_WIDTH + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + INST_ARGS_BITS + NUM_OPDS + (REG_IDX_BITS * NUM_OPDS);
|
||||
|
||||
wire [PER_ISSUE_WARPS-1:0] ibuf_ready_in;
|
||||
assign decode_if.ready = ibuf_ready_in[decode_if.data.wid];
|
||||
|
@ -52,14 +54,11 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
decode_if.data.op_type,
|
||||
decode_if.data.op_args,
|
||||
decode_if.data.wb,
|
||||
decode_if.data.used_rs,
|
||||
decode_if.data.rd,
|
||||
decode_if.data.rs1,
|
||||
decode_if.data.rs2,
|
||||
decode_if.data.rs3,
|
||||
decode_if.data.rd_ext,
|
||||
decode_if.data.rs1_ext,
|
||||
decode_if.data.rs2_ext,
|
||||
decode_if.data.rs3_ext
|
||||
decode_if.data.rs3
|
||||
}),
|
||||
.ready_in (ibuf_ready_in[w]),
|
||||
.valid_out(ibuffer_if[w].valid),
|
||||
|
@ -72,7 +71,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_ibf_stalls;
|
||||
|
||||
wire decode_if_stall = decode_if.valid && ~decode_if.ready;
|
||||
|
||||
|
@ -80,7 +79,7 @@ module VX_ibuffer import VX_gpu_pkg::*; #(
|
|||
if (reset) begin
|
||||
perf_ibf_stalls <= '0;
|
||||
end else begin
|
||||
perf_ibf_stalls <= perf_ibf_stalls + `PERF_CTR_BITS'(decode_if_stall);
|
||||
perf_ibf_stalls <= perf_ibf_stalls + PERF_CTR_BITS'(decode_if_stall);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -27,20 +27,20 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_decode_if.slave decode_if,
|
||||
VX_writeback_if.slave writeback_if [`ISSUE_WIDTH],
|
||||
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS * `ISSUE_WIDTH]
|
||||
VX_dispatch_if.master dispatch_if [NUM_EX_UNITS * `ISSUE_WIDTH]
|
||||
);
|
||||
`STATIC_ASSERT ((`ISSUE_WIDTH <= `NUM_WARPS), ("invalid parameter"))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
issue_perf_t per_issue_perf [`ISSUE_WIDTH];
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, ibf_stalls, PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, scb_stalls, PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, opd_stalls, PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
for (genvar i = 0; i < NUM_EX_UNITS; ++i) begin : g_issue_perf_units_uses
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, units_uses[i], PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], `PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
for (genvar i = 0; i < NUM_SFU_UNITS; ++i) begin : g_issue_perf_sfu_uses
|
||||
`PERF_COUNTER_ADD (issue_perf, per_issue_perf, sfu_uses[i], PERF_CTR_BITS, `ISSUE_WIDTH, (`ISSUE_WIDTH > 2))
|
||||
end
|
||||
`endif
|
||||
|
||||
|
@ -57,7 +57,7 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
.NUM_WARPS (PER_ISSUE_WARPS)
|
||||
) per_issue_decode_if();
|
||||
|
||||
VX_dispatch_if per_issue_dispatch_if[`NUM_EX_UNITS]();
|
||||
VX_dispatch_if per_issue_dispatch_if[NUM_EX_UNITS]();
|
||||
|
||||
assign per_issue_decode_if.valid = decode_if.valid && (decode_isw == ISSUE_ISW_W'(issue_id));
|
||||
assign per_issue_decode_if.data.uuid = decode_if.data.uuid;
|
||||
|
@ -93,7 +93,7 @@ module VX_issue import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
// Assign transposed dispatch_if
|
||||
for (genvar ex_id = 0; ex_id < `NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if
|
||||
for (genvar ex_id = 0; ex_id < NUM_EX_UNITS; ++ex_id) begin : g_dispatch_if
|
||||
`ASSIGN_VX_IF(dispatch_if[ex_id * `ISSUE_WIDTH + issue_id], per_issue_dispatch_if[ex_id]);
|
||||
end
|
||||
end
|
||||
|
|
|
@ -28,13 +28,13 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
VX_decode_if.slave decode_if,
|
||||
VX_writeback_if.slave writeback_if,
|
||||
VX_dispatch_if.master dispatch_if [`NUM_EX_UNITS]
|
||||
VX_dispatch_if.master dispatch_if [NUM_EX_UNITS]
|
||||
);
|
||||
`UNUSED_PARAM (ISSUE_ID)
|
||||
|
||||
VX_ibuffer_if ibuffer_if [PER_ISSUE_WARPS]();
|
||||
VX_scoreboard_if scoreboard_if();
|
||||
VX_operands_if operands_if();
|
||||
VX_operands_if operands_if[`NUM_OPCS]();
|
||||
|
||||
VX_ibuffer #(
|
||||
.INSTANCE_ID (`SFORMATF(("%s-ibuffer", INSTANCE_ID)))
|
||||
|
@ -96,9 +96,9 @@ module VX_issue_slice import VX_gpu_pkg::*; #(
|
|||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 2, 4, 3, (
|
||||
`UUID_WIDTH + `NW_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS * 4 +
|
||||
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + 1 + `NR_BITS + (3 * `XLEN) +
|
||||
`UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + `NR_BITS + (`NUM_THREADS * `XLEN) + 1
|
||||
UUID_WIDTH + NW_WIDTH + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + 1 + NR_BITS * 4 +
|
||||
UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + 1 + NR_BITS + (3 * `XLEN) +
|
||||
UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + NR_BITS + (`NUM_THREADS * `XLEN) + 1
|
||||
), {
|
||||
decode_if.valid,
|
||||
decode_if.ready,
|
||||
|
|
|
@ -21,48 +21,48 @@ module VX_issue_top import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
input wire decode_valid,
|
||||
input wire [`UUID_WIDTH-1:0] decode_uuid,
|
||||
input wire [`NW_WIDTH-1:0] decode_wid,
|
||||
input wire [UUID_WIDTH-1:0] decode_uuid,
|
||||
input wire [NW_WIDTH-1:0] decode_wid,
|
||||
input wire [`NUM_THREADS-1:0] decode_tmask,
|
||||
input wire [`PC_BITS-1:0] decode_PC,
|
||||
input wire [`EX_BITS-1:0] decode_ex_type,
|
||||
input wire [`INST_OP_BITS-1:0] decode_op_type,
|
||||
input wire [PC_BITS-1:0] decode_PC,
|
||||
input wire [EX_BITS-1:0] decode_ex_type,
|
||||
input wire [INST_OP_BITS-1:0] decode_op_type,
|
||||
input op_args_t decode_op_args,
|
||||
input wire decode_wb,
|
||||
input wire [`NR_BITS-1:0] decode_rd,
|
||||
input wire [`NR_BITS-1:0] decode_rs1,
|
||||
input wire [`NR_BITS-1:0] decode_rs2,
|
||||
input wire [`NR_BITS-1:0] decode_rs3,
|
||||
input wire [NR_BITS-1:0] decode_rd,
|
||||
input wire [NR_BITS-1:0] decode_rs1,
|
||||
input wire [NR_BITS-1:0] decode_rs2,
|
||||
input wire [NR_BITS-1:0] decode_rs3,
|
||||
output wire decode_ready,
|
||||
|
||||
input wire writeback_valid[`ISSUE_WIDTH],
|
||||
input wire [`UUID_WIDTH-1:0] writeback_uuid[`ISSUE_WIDTH],
|
||||
input wire [UUID_WIDTH-1:0] writeback_uuid[`ISSUE_WIDTH],
|
||||
input wire [ISSUE_WIS_W-1:0] writeback_wis[`ISSUE_WIDTH],
|
||||
input wire [`NUM_THREADS-1:0] writeback_tmask[`ISSUE_WIDTH],
|
||||
input wire [`PC_BITS-1:0] writeback_PC[`ISSUE_WIDTH],
|
||||
input wire [`NR_BITS-1:0] writeback_rd[`ISSUE_WIDTH],
|
||||
input wire [PC_BITS-1:0] writeback_PC[`ISSUE_WIDTH],
|
||||
input wire [NR_BITS-1:0] writeback_rd[`ISSUE_WIDTH],
|
||||
input wire [`NUM_THREADS-1:0][`XLEN-1:0] writeback_data[`ISSUE_WIDTH],
|
||||
input wire writeback_sop[`ISSUE_WIDTH],
|
||||
input wire writeback_eop[`ISSUE_WIDTH],
|
||||
|
||||
output wire dispatch_valid[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`UUID_WIDTH-1:0] dispatch_uuid[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [ISSUE_WIS_W-1:0] dispatch_wis[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0] dispatch_tmask[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`PC_BITS-1:0] dispatch_PC[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`INST_ALU_BITS-1:0] dispatch_op_type[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output op_args_t dispatch_op_args[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire dispatch_wb[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NR_BITS-1:0] dispatch_rd[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NT_WIDTH-1:0] dispatch_tid[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data[`NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
input wire dispatch_ready[`NUM_EX_UNITS * `ISSUE_WIDTH]
|
||||
output wire dispatch_valid[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [UUID_WIDTH-1:0] dispatch_uuid[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [ISSUE_WIS_W-1:0] dispatch_wis[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0] dispatch_tmask[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [PC_BITS-1:0] dispatch_PC[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [INST_ALU_BITS-1:0] dispatch_op_type[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output op_args_t dispatch_op_args[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire dispatch_wb[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [NR_BITS-1:0] dispatch_rd[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [NT_WIDTH-1:0] dispatch_tid[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs1_data[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs2_data[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
output wire [`NUM_THREADS-1:0][`XLEN-1:0] dispatch_rs3_data[NUM_EX_UNITS * `ISSUE_WIDTH],
|
||||
input wire dispatch_ready[NUM_EX_UNITS * `ISSUE_WIDTH]
|
||||
);
|
||||
|
||||
VX_decode_if decode_if();
|
||||
VX_dispatch_if dispatch_if[`NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_dispatch_if dispatch_if[NUM_EX_UNITS * `ISSUE_WIDTH]();
|
||||
VX_writeback_if writeback_if[`ISSUE_WIDTH]();
|
||||
|
||||
assign decode_if.valid = decode_valid;
|
||||
|
@ -92,7 +92,7 @@ module VX_issue_top import VX_gpu_pkg::*; #(
|
|||
assign writeback_if[i].data.eop = writeback_eop[i];
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if
|
||||
for (genvar i = 0; i < NUM_EX_UNITS * `ISSUE_WIDTH; ++i) begin : g_dispatch_if
|
||||
assign dispatch_valid[i] = dispatch_if[i].valid;
|
||||
assign dispatch_uuid[i] = dispatch_if[i].data.uuid;
|
||||
assign dispatch_wis[i] = dispatch_if[i].data.wis;
|
||||
|
|
|
@ -29,19 +29,19 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
VX_lsu_mem_if.master lsu_mem_if
|
||||
);
|
||||
localparam NUM_LANES = `NUM_LSU_LANES;
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam RSP_ARB_DATAW= `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_ARB_DATAW= UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam LSUQ_SIZEW = `LOG2UP(`LSUQ_IN_SIZE);
|
||||
localparam REQ_ASHIFT = `CLOG2(LSU_WORD_SIZE);
|
||||
localparam MEM_ASHIFT = `CLOG2(`MEM_BLOCK_SIZE);
|
||||
localparam MEM_ADDRW = `MEM_ADDR_WIDTH - MEM_ASHIFT;
|
||||
|
||||
// tag_id = wid + PC + wb + rd + op_type + align + pid + pkt_addr + fence
|
||||
localparam TAG_ID_WIDTH = `NW_WIDTH + `PC_BITS + 1 + `NR_BITS + `INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW + 1;
|
||||
localparam TAG_ID_WIDTH = NW_WIDTH + PC_BITS + 1 + NR_BITS + INST_LSU_BITS + (NUM_LANES * REQ_ASHIFT) + PID_WIDTH + LSUQ_SIZEW + 1;
|
||||
|
||||
// tag = uuid + tag_id
|
||||
localparam TAG_WIDTH = `UUID_WIDTH + TAG_ID_WIDTH;
|
||||
localparam TAG_WIDTH = UUID_WIDTH + TAG_ID_WIDTH;
|
||||
|
||||
VX_commit_if #(
|
||||
.NUM_LANES (NUM_LANES)
|
||||
|
@ -65,19 +65,19 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
// address type calculation
|
||||
|
||||
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags;
|
||||
wire [NUM_LANES-1:0][MEM_FLAGS_WIDTH-1:0] mem_req_flags;
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_mem_req_flags
|
||||
wire [MEM_ADDRW-1:0] block_addr = full_addr[i][MEM_ASHIFT +: MEM_ADDRW];
|
||||
// is I/O address
|
||||
wire [MEM_ADDRW-1:0] io_addr_start = MEM_ADDRW'(`XLEN'(`IO_BASE_ADDR) >> MEM_ASHIFT);
|
||||
wire [MEM_ADDRW-1:0] io_addr_end = MEM_ADDRW'(`XLEN'(`IO_END_ADDR) >> MEM_ASHIFT);
|
||||
assign mem_req_flags[i][`MEM_REQ_FLAG_FLUSH] = req_is_fence;
|
||||
assign mem_req_flags[i][`MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
|
||||
assign mem_req_flags[i][MEM_REQ_FLAG_FLUSH] = req_is_fence;
|
||||
assign mem_req_flags[i][MEM_REQ_FLAG_IO] = (block_addr >= io_addr_start) && (block_addr < io_addr_end);
|
||||
`ifdef LMEM_ENABLE
|
||||
// is local memory address
|
||||
wire [MEM_ADDRW-1:0] lmem_addr_start = MEM_ADDRW'(`XLEN'(`LMEM_BASE_ADDR) >> MEM_ASHIFT);
|
||||
wire [MEM_ADDRW-1:0] lmem_addr_end = MEM_ADDRW'((`XLEN'(`LMEM_BASE_ADDR) + `XLEN'(1 << `LMEM_LOG_SIZE)) >> MEM_ASHIFT);
|
||||
assign mem_req_flags[i][`MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
|
||||
assign mem_req_flags[i][MEM_REQ_FLAG_LOCAL] = (block_addr >= lmem_addr_start) && (block_addr < lmem_addr_end);
|
||||
`endif
|
||||
end
|
||||
|
||||
|
@ -110,7 +110,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
|
||||
reg fence_lock;
|
||||
|
||||
assign req_is_fence = `INST_LSU_IS_FENCE(execute_if.data.op_type);
|
||||
assign req_is_fence = inst_lsu_is_fence(execute_if.data.op_type);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -159,7 +159,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
reg [LSU_WORD_SIZE-1:0] mem_req_byteen_w;
|
||||
always @(*) begin
|
||||
mem_req_byteen_w = '0;
|
||||
case (`INST_LSU_WSIZE(execute_if.data.op_type))
|
||||
case (inst_lsu_wsize(execute_if.data.op_type))
|
||||
0: begin // 8-bit
|
||||
mem_req_byteen_w[req_align[i]] = 1'b1;
|
||||
end
|
||||
|
@ -185,9 +185,9 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
// memory misalignment not supported!
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_missalign
|
||||
wire lsu_req_fire = execute_if.valid && execute_if.ready;
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << `INST_LSU_WSIZE(execute_if.data.op_type))) == 0),
|
||||
`RUNTIME_ASSERT((~lsu_req_fire || ~execute_if.data.tmask[i] || req_is_fence || (full_addr[i] % (1 << inst_lsu_wsize(execute_if.data.op_type))) == 0),
|
||||
("%t: misaligned memory access, wid=%0d, PC=0x%0h, addr=0x%0h, wsize=%0d! (#%0d)",
|
||||
$time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], `INST_LSU_WSIZE(execute_if.data.op_type), execute_if.data.uuid))
|
||||
$time, execute_if.data.wid, {execute_if.data.PC, 1'b0}, full_addr[i], inst_lsu_wsize(execute_if.data.op_type), execute_if.data.uuid))
|
||||
end
|
||||
|
||||
// store data formatting
|
||||
|
@ -298,7 +298,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [NUM_LANES-1:0] lsu_mem_req_mask;
|
||||
wire [NUM_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_mem_req_byteen;
|
||||
wire [NUM_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_mem_req_addr;
|
||||
wire [NUM_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_mem_req_flags;
|
||||
wire [NUM_LANES-1:0][MEM_FLAGS_WIDTH-1:0] lsu_mem_req_flags;
|
||||
wire [NUM_LANES-1:0][(LSU_WORD_SIZE*8)-1:0] lsu_mem_req_data;
|
||||
wire [LSU_TAG_WIDTH-1:0] lsu_mem_req_tag;
|
||||
wire lsu_mem_req_ready;
|
||||
|
@ -316,11 +316,11 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.LINE_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.FLAGS_WIDTH (MEM_FLAGS_WIDTH),
|
||||
.TAG_WIDTH (TAG_WIDTH),
|
||||
.CORE_QUEUE_SIZE (`LSUQ_IN_SIZE),
|
||||
.MEM_QUEUE_SIZE (`LSUQ_OUT_SIZE),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.RSP_PARTIAL (1),
|
||||
.MEM_OUT_BUF (0),
|
||||
.CORE_OUT_BUF(0)
|
||||
|
@ -385,12 +385,12 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
assign lsu_mem_rsp_tag = lsu_mem_if.rsp_data.tag;
|
||||
assign lsu_mem_if.rsp_ready = lsu_mem_rsp_ready;
|
||||
|
||||
wire [`UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [`NW_WIDTH-1:0] rsp_wid;
|
||||
wire [`PC_BITS-1:0] rsp_pc;
|
||||
wire [UUID_WIDTH-1:0] rsp_uuid;
|
||||
wire [NW_WIDTH-1:0] rsp_wid;
|
||||
wire [PC_BITS-1:0] rsp_pc;
|
||||
wire rsp_wb;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire [`INST_LSU_BITS-1:0] rsp_op_type;
|
||||
wire [NR_BITS-1:0] rsp_rd;
|
||||
wire [INST_LSU_BITS-1:0] rsp_op_type;
|
||||
wire [NUM_LANES-1:0][REQ_ASHIFT-1:0] rsp_align;
|
||||
wire [PID_WIDTH-1:0] rsp_pid;
|
||||
`UNUSED_VAR (rsp_op_type)
|
||||
|
@ -433,17 +433,17 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire [7:0] rsp_data8 = rsp_align[i][0] ? rsp_data16[15:8] : rsp_data16[7:0];
|
||||
|
||||
always @(*) begin
|
||||
case (`INST_LSU_FMT(rsp_op_type))
|
||||
`INST_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
|
||||
`INST_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
|
||||
`INST_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
|
||||
`INST_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
|
||||
case (inst_lsu_fmt(rsp_op_type))
|
||||
LSU_FMT_B: rsp_data[i] = `XLEN'(signed'(rsp_data8));
|
||||
LSU_FMT_H: rsp_data[i] = `XLEN'(signed'(rsp_data16));
|
||||
LSU_FMT_BU: rsp_data[i] = `XLEN'(unsigned'(rsp_data8));
|
||||
LSU_FMT_HU: rsp_data[i] = `XLEN'(unsigned'(rsp_data16));
|
||||
`ifdef XLEN_64
|
||||
`INST_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
|
||||
`INST_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
|
||||
`INST_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
|
||||
LSU_FMT_W: rsp_data[i] = rsp_is_float ? (`XLEN'(rsp_data32) | 64'hffffffff00000000) : `XLEN'(signed'(rsp_data32));
|
||||
LSU_FMT_WU: rsp_data[i] = `XLEN'(unsigned'(rsp_data32));
|
||||
LSU_FMT_D: rsp_data[i] = `XLEN'(signed'(rsp_data64));
|
||||
`else
|
||||
`INST_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
|
||||
LSU_FMT_W: rsp_data[i] = `XLEN'(signed'(rsp_data32));
|
||||
`endif
|
||||
default: rsp_data[i] = 'x;
|
||||
endcase
|
||||
|
@ -453,7 +453,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
// commit
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + 1 + `NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.DATAW (UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + 1 + NR_BITS + (NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1),
|
||||
.SIZE (2)
|
||||
) rsp_buf (
|
||||
.clk (clk),
|
||||
|
@ -467,7 +467,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + PID_WIDTH + 1 + 1),
|
||||
.DATAW (UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + PID_WIDTH + 1 + 1),
|
||||
.SIZE (2)
|
||||
) no_rsp_buf (
|
||||
.clk (clk),
|
||||
|
@ -538,7 +538,7 @@ module VX_lsu_slice import VX_gpu_pkg::*; #(
|
|||
wire reset_negedge;
|
||||
`NEG_EDGE (reset_negedge, reset);
|
||||
`SCOPE_TAP_EX (0, 3, 4, 2, (
|
||||
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + `UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + `UUID_WIDTH
|
||||
1 + NUM_LANES * (`XLEN + LSU_WORD_SIZE + LSU_WORD_SIZE * 8) + UUID_WIDTH + NUM_LANES * LSU_WORD_SIZE * 8 + UUID_WIDTH
|
||||
), {
|
||||
mem_req_valid,
|
||||
mem_req_ready,
|
||||
|
|
|
@ -93,7 +93,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.NUM_LANES (`NUM_LSU_LANES),
|
||||
.DATA_SIZE (LSU_WORD_SIZE),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (LMEM_TAG_WIDTH - `UUID_WIDTH),
|
||||
.TAG_SEL_BITS (LMEM_TAG_WIDTH - UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (3),
|
||||
.RSP_OUT_BUF (0)
|
||||
|
@ -111,7 +111,6 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.NUM_BANKS (`LMEM_NUM_BANKS),
|
||||
.WORD_SIZE (LSU_WORD_SIZE),
|
||||
.ADDR_WIDTH (LMEM_ADDR_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.TAG_WIDTH (LMEM_TAG_WIDTH),
|
||||
.OUT_BUF (3)
|
||||
) local_mem (
|
||||
|
@ -142,11 +141,11 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
) dcache_coalesced_if[`NUM_LSU_BLOCKS]();
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`NUM_LSU_BLOCKS-1:0][`PERF_CTR_BITS-1:0] per_block_coalescer_misses;
|
||||
wire [`PERF_CTR_BITS-1:0] coalescer_misses;
|
||||
wire [`NUM_LSU_BLOCKS-1:0][PERF_CTR_BITS-1:0] per_block_coalescer_misses;
|
||||
wire [PERF_CTR_BITS-1:0] coalescer_misses;
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`PERF_CTR_BITS),
|
||||
.DATAW_OUT (`PERF_CTR_BITS),
|
||||
.DATAW_IN (PERF_CTR_BITS),
|
||||
.DATAW_OUT (PERF_CTR_BITS),
|
||||
.N (`NUM_LSU_BLOCKS),
|
||||
.OP ("+")
|
||||
) coalescer_reduce (
|
||||
|
@ -165,11 +164,11 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.DATA_IN_SIZE (LSU_WORD_SIZE),
|
||||
.DATA_OUT_SIZE (DCACHE_WORD_SIZE),
|
||||
.ADDR_WIDTH (LSU_ADDR_WIDTH),
|
||||
.FLAGS_WIDTH (`MEM_REQ_FLAGS_WIDTH),
|
||||
.FLAGS_WIDTH (MEM_FLAGS_WIDTH),
|
||||
.TAG_WIDTH (LSU_TAG_WIDTH),
|
||||
.UUID_WIDTH (`UUID_WIDTH),
|
||||
.UUID_WIDTH (UUID_WIDTH),
|
||||
.QUEUE_SIZE (`LSUQ_OUT_SIZE),
|
||||
.PERF_CTR_BITS (`PERF_CTR_BITS)
|
||||
.PERF_CTR_BITS (PERF_CTR_BITS)
|
||||
) mem_coalescer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -240,7 +239,7 @@ module VX_mem_unit import VX_gpu_pkg::*; #(
|
|||
.NUM_LANES (DCACHE_CHANNELS),
|
||||
.DATA_SIZE (DCACHE_WORD_SIZE),
|
||||
.TAG_WIDTH (DCACHE_TAG_WIDTH),
|
||||
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - `UUID_WIDTH),
|
||||
.TAG_SEL_BITS (DCACHE_TAG_WIDTH - UUID_WIDTH),
|
||||
.ARBITER ("P"),
|
||||
.REQ_OUT_BUF (0),
|
||||
.RSP_OUT_BUF (0)
|
||||
|
|
|
@ -27,7 +27,7 @@ module VX_mem_unit_top import VX_gpu_pkg::*; #(
|
|||
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0] lsu_req_mask,
|
||||
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_SIZE-1:0] lsu_req_byteen,
|
||||
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_ADDR_WIDTH-1:0] lsu_req_addr,
|
||||
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] lsu_req_flags,
|
||||
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][MEM_FLAGS_WIDTH-1:0] lsu_req_flags,
|
||||
input wire [`NUM_LSU_BLOCKS-1:0][`NUM_LSU_LANES-1:0][LSU_WORD_WIDTH-1:0] lsu_req_data,
|
||||
input wire [`NUM_LSU_BLOCKS-1:0][LSU_TAG_WIDTH-1:0] lsu_req_tag,
|
||||
output wire [`NUM_LSU_BLOCKS-1:0] lsu_req_ready,
|
||||
|
@ -44,7 +44,7 @@ module VX_mem_unit_top import VX_gpu_pkg::*; #(
|
|||
output wire [DCACHE_NUM_REQS-1:0] mem_req_rw,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE-1:0] mem_req_byteen,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_ADDR_WIDTH-1:0] mem_req_addr,
|
||||
output wire [DCACHE_NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags,
|
||||
output wire [DCACHE_NUM_REQS-1:0][MEM_FLAGS_WIDTH-1:0] mem_req_flags,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_WORD_SIZE*8-1:0] mem_req_data,
|
||||
output wire [DCACHE_NUM_REQS-1:0][DCACHE_TAG_WIDTH-1:0] mem_req_tag,
|
||||
input wire [DCACHE_NUM_REQS-1:0] mem_req_ready,
|
||||
|
@ -109,7 +109,7 @@ module VX_mem_unit_top import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
cache_perf_t lmem_perf = '0;
|
||||
VX_cache_pkg::cache_perf_t lmem_perf = '0;
|
||||
`endif
|
||||
|
||||
VX_mem_unit #(
|
||||
|
|
122
hw/rtl/core/VX_opc_unit.sv
Normal file
122
hw/rtl/core/VX_opc_unit.sv
Normal file
|
@ -0,0 +1,122 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// reset all GPRs in debug mode
|
||||
`ifdef SIMULATION
|
||||
`ifndef NDEBUG
|
||||
`define GPR_RESET
|
||||
`endif
|
||||
`endif
|
||||
|
||||
// Operand collector (OPC) unit.
//
// Latches an instruction coming from the scoreboard into a staging buffer and
// runs a three-state FSM (IDLE -> FETCH -> DONE) that tracks, per source
// operand, whether a request still has to be issued (`opds_needed`) and
// whether its response is still outstanding (`opds_busy`).
//
// Parameters:
//   INSTANCE_ID - instance name string (currently unused).
//   OUT_BUF     - output buffering selector (not referenced in this revision).
//
// NOTE(review): the visible code contains no assignments to `wis`,
// `pending_regs_n`, `opc_if`, `operands_if` or `staging_if.ready`, and
// `src_opds` is computed but never consumed -- this looks like work in
// progress; confirm before relying on this module.
module VX_opc_unit import VX_gpu_pkg::*; #(
    parameter `STRING INSTANCE_ID = "",
    parameter OUT_BUF = 3
) (
    input wire clk,
    input wire reset,

    output wire [ISSUE_WIS_W-1:0] wis,
    output wire [NUM_REGS-1:0] pending_regs_n,

    VX_scoreboard_if.slave scoreboard_if,
    VX_opc_if.master opc_if,
    VX_operands_if.master operands_if
);
    `UNUSED_SPARAM (INSTANCE_ID)

    // source operands plus the destination operand
    localparam NUM_OPDS = NUM_SRC_OPDS + 1;
    // width of the scoreboard payload held in the staging buffer
    localparam SCB_DATAW = UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + INST_ARGS_BITS + NUM_OPDS + (REG_IDX_BITS * NUM_OPDS);

    // FSM encodings
    localparam STATE_IDLE = 0;
    localparam STATE_FETCH = 1;
    localparam STATE_DONE = 2;

    VX_scoreboard_if staging_if();

    // one bit per source operand: request not yet issued / response not yet received
    reg [NUM_SRC_OPDS-1:0] opds_needed, opds_needed_n;
    reg [NUM_SRC_OPDS-1:0] opds_busy, opds_busy_n;
    reg [2:0] state, state_n;

    // handshake events
    wire scboard_fire = scoreboard_if.valid && scoreboard_if.ready;
    wire col_req_fire = opc_if.req_valid && opc_if.req_ready;
    wire col_rsp_fire = opc_if.rsp_valid; // responses carry no ready signal here
    wire operands_fire = operands_if.valid && operands_if.ready;

    // holds the instruction being collected
    VX_pipe_buffer #(
        .DATAW (SCB_DATAW)
    ) staging_buf (
        .clk (clk),
        .reset (reset),
        .valid_in (scoreboard_if.valid),
        .data_in (scoreboard_if.data),
        .ready_in (scoreboard_if.ready),
        .valid_out(staging_if.valid),
        .data_out (staging_if.data),
        .ready_out(staging_if.ready)
    );

    // per-operand register number: low bit of the register type placed above the register id
    wire [NR_BITS-1:0] rs1 = {staging_if.data.rs1.rtype[0], staging_if.data.rs1.id};
    wire [NR_BITS-1:0] rs2 = {staging_if.data.rs2.rtype[0], staging_if.data.rs2.id};
    wire [NR_BITS-1:0] rs3 = {staging_if.data.rs3.rtype[0], staging_if.data.rs3.id};
    wire [NUM_SRC_OPDS-1:0][NR_BITS-1:0] src_opds = {rs3, rs2, rs1};

    // next-state logic
    always @(*) begin
        // hold current values unless a branch below overrides them
        state_n = state;
        opds_needed_n = opds_needed;
        opds_busy_n = opds_busy;
        case (state)
        STATE_IDLE: begin
            // new instruction accepted: every used source operand is both needed and busy
            if (scboard_fire) begin
                opds_needed_n = scoreboard_if.data.used_rs;
                opds_busy_n = opds_needed_n;
                if (opds_busy_n == 0) begin
                    // nothing to fetch, skip straight to DONE
                    state_n = STATE_DONE;
                end else begin
                    state_n = STATE_FETCH;
                end
            end
        end
        STATE_FETCH: begin
            // request issued for an operand: no longer needs requesting
            if (col_req_fire) begin
                opds_needed_n[opc_if.req_data.opd_id] = 0;
            end
            // response received for an operand: no longer outstanding
            if (col_rsp_fire) begin
                opds_busy_n[opc_if.rsp_data.opd_id] = 0;
            end
            // all responses in (evaluated on the updated mask)
            if (opds_busy_n == 0) begin
                state_n = STATE_DONE;
            end
        end
        STATE_DONE: begin
            // wait for the downstream stage to take the operands
            if (operands_fire) begin
                state_n = STATE_IDLE;
            end
        end
        // unused encodings of the 3-bit state register: keep the defaults above
        default:;
        endcase
    end

    // state registers
    always @(posedge clk) begin
        if (reset) begin
            state <= STATE_IDLE;
            opds_needed <= '0;
            opds_busy <= '0;
        end else begin
            state <= state_n;
            opds_needed <= opds_needed_n;
            opds_busy <= opds_busy_n;
        end
    end

endmodule
|
|
@ -21,284 +21,73 @@
|
|||
`endif
|
||||
|
||||
module VX_operands import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = "",
|
||||
parameter NUM_BANKS = 4,
|
||||
parameter OUT_BUF = 3
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output wire [`PERF_CTR_BITS-1:0] perf_stalls,
|
||||
output wire [PERF_CTR_BITS-1:0] perf_stalls,
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if,
|
||||
VX_scoreboard_if.slave scoreboard_if,
|
||||
VX_operands_if.master operands_if
|
||||
VX_operands_if.master operands_if [`NUM_OPCS]
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
localparam NUM_SRC_OPDS = 3;
|
||||
localparam REQ_SEL_BITS = `CLOG2(NUM_SRC_OPDS);
|
||||
localparam REQ_SEL_WIDTH = `UP(REQ_SEL_BITS);
|
||||
localparam BANK_SEL_BITS = `CLOG2(NUM_BANKS);
|
||||
localparam BANK_SEL_WIDTH = `UP(BANK_SEL_BITS);
|
||||
localparam PER_BANK_REGS = `NUM_REGS / NUM_BANKS;
|
||||
localparam META_DATAW = ISSUE_WIS_W + `NUM_THREADS + `PC_BITS + 1 + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + `NR_BITS + `UUID_WIDTH;
|
||||
localparam REGS_DATAW = `XLEN * `NUM_THREADS;
|
||||
localparam DATAW = META_DATAW + NUM_SRC_OPDS * REGS_DATAW;
|
||||
localparam RAM_ADDRW = `LOG2UP(`NUM_REGS * PER_ISSUE_WARPS);
|
||||
localparam PER_BANK_ADDRW = RAM_ADDRW - BANK_SEL_BITS;
|
||||
localparam XLEN_SIZE = `XLEN / 8;
|
||||
localparam BYTEENW = `NUM_THREADS * XLEN_SIZE;
|
||||
|
||||
`UNUSED_VAR (writeback_if.data.sop)
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam SB_DATAW = UUID_WIDTH + ISSUE_WIS_W + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + INST_ARGS_BITS + NUM_OPDS + (REG_IDX_BITS * NUM_OPDS);
|
||||
|
||||
wire [NUM_SRC_OPDS-1:0] src_valid;
|
||||
wire [NUM_SRC_OPDS-1:0] req_valid_in, req_ready_in;
|
||||
wire [NUM_SRC_OPDS-1:0][PER_BANK_ADDRW-1:0] req_data_in;
|
||||
wire [NUM_SRC_OPDS-1:0][BANK_SEL_WIDTH-1:0] req_bank_idx;
|
||||
VX_opc_if opc_if[`NUM_OPCS]();
|
||||
VX_scoreboard_if per_opc_scoreboard_if[`NUM_OPCS]();
|
||||
wire [ISSUE_WIS_W-1:0] per_opc_wis[`NUM_OPCS];
|
||||
wire [NUM_REGS-1:0] per_opc_pending_regs_n[`NUM_OPCS];
|
||||
|
||||
wire [NUM_BANKS-1:0] gpr_rd_valid, gpr_rd_ready;
|
||||
wire [NUM_BANKS-1:0] gpr_rd_valid_st1, gpr_rd_valid_st2;
|
||||
wire [NUM_BANKS-1:0][PER_BANK_ADDRW-1:0] gpr_rd_addr, gpr_rd_addr_st1;
|
||||
wire [NUM_BANKS-1:0][`NUM_THREADS-1:0][`XLEN-1:0] gpr_rd_data_st2;
|
||||
wire [NUM_BANKS-1:0][REQ_SEL_WIDTH-1:0] gpr_rd_req_idx, gpr_rd_req_idx_st1, gpr_rd_req_idx_st2;
|
||||
`AOS_TO_ITF (per_opc_scoreboard, per_opc_scoreboard_if, `NUM_OPCS, SB_DATAW)
|
||||
|
||||
wire pipe_ready_in;
|
||||
wire pipe_valid_st1, pipe_ready_st1;
|
||||
wire pipe_valid_st2, pipe_ready_st2;
|
||||
wire [META_DATAW-1:0] pipe_data, pipe_data_st1, pipe_data_st2;
|
||||
|
||||
reg [NUM_SRC_OPDS-1:0][(`NUM_THREADS * `XLEN)-1:0] src_data_st2, src_data_m_st2;
|
||||
|
||||
reg [NUM_SRC_OPDS-1:0] data_fetched_st1;
|
||||
|
||||
reg has_collision_n;
|
||||
wire has_collision_st1;
|
||||
|
||||
wire [NUM_SRC_OPDS-1:0][`NR_BITS-1:0] src_opds;
|
||||
assign src_opds = {scoreboard_if.data.rs3, scoreboard_if.data.rs2, scoreboard_if.data.rs1};
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_data_in
|
||||
if (ISSUE_WIS != 0) begin : g_wis
|
||||
assign req_data_in[i] = {src_opds[i][`NR_BITS-1:BANK_SEL_BITS], scoreboard_if.data.wis};
|
||||
end else begin : g_no_wis
|
||||
assign req_data_in[i] = src_opds[i][`NR_BITS-1:BANK_SEL_BITS];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_req_bank_idx
|
||||
if (NUM_BANKS != 1) begin : g_multibanks
|
||||
assign req_bank_idx[i] = src_opds[i][BANK_SEL_BITS-1:0];
|
||||
end else begin : g_singlebank
|
||||
assign req_bank_idx[i] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_SRC_OPDS; ++i) begin : g_src_valid
|
||||
assign src_valid[i] = (src_opds[i] != 0) && ~data_fetched_st1[i];
|
||||
end
|
||||
|
||||
assign req_valid_in = {NUM_SRC_OPDS{scoreboard_if.valid}} & src_valid;
|
||||
|
||||
`UNUSED_VAR (scoreboard_if.data.rd_ext)
|
||||
`UNUSED_VAR (scoreboard_if.data.rs1_ext)
|
||||
`UNUSED_VAR (scoreboard_if.data.rs2_ext)
|
||||
`UNUSED_VAR (scoreboard_if.data.rs3_ext)
|
||||
|
||||
VX_stream_xbar #(
|
||||
.NUM_INPUTS (NUM_SRC_OPDS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (PER_BANK_ADDRW),
|
||||
.ARBITER ("P"), // use priority arbiter
|
||||
.OUT_BUF (0) // no output buffering
|
||||
) req_xbar (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`UNUSED_PIN(collisions),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in),
|
||||
.sel_in (req_bank_idx),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (gpr_rd_valid),
|
||||
.data_out (gpr_rd_addr),
|
||||
.sel_out (gpr_rd_req_idx),
|
||||
.ready_out (gpr_rd_ready)
|
||||
VX_stream_arb #(
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (`NUM_OPCS),
|
||||
.DATAW (SB_DATAW)
|
||||
) scboard_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (scoreboard_if.valid),
|
||||
.data_in (scoreboard_if.data),
|
||||
.ready_in (scoreboard_if.ready),
|
||||
.valid_out (per_opc_scoreboard_valid),
|
||||
.data_out (per_opc_scoreboard_data),
|
||||
.ready_out (per_opc_scoreboard_ready),
|
||||
`UNUSED_PIN(sel_out)
|
||||
);
|
||||
|
||||
assign gpr_rd_ready = {NUM_BANKS{pipe_ready_in}};
|
||||
|
||||
always @(*) begin
|
||||
has_collision_n = 0;
|
||||
for (integer i = 0; i < NUM_SRC_OPDS; ++i) begin
|
||||
for (integer j = 1; j < (NUM_SRC_OPDS-i); ++j) begin
|
||||
has_collision_n |= src_valid[i]
|
||||
&& src_valid[j+i]
|
||||
&& (req_bank_idx[i] == req_bank_idx[j+i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
wire [NUM_SRC_OPDS-1:0] req_fire_in = req_valid_in & req_ready_in;
|
||||
|
||||
assign pipe_data = {
|
||||
scoreboard_if.data.wis,
|
||||
scoreboard_if.data.tmask,
|
||||
scoreboard_if.data.PC,
|
||||
scoreboard_if.data.wb,
|
||||
scoreboard_if.data.ex_type,
|
||||
scoreboard_if.data.op_type,
|
||||
scoreboard_if.data.op_args,
|
||||
scoreboard_if.data.rd,
|
||||
scoreboard_if.data.uuid
|
||||
};
|
||||
|
||||
assign scoreboard_if.ready = pipe_ready_in && ~has_collision_n;
|
||||
|
||||
wire pipe_fire_st1 = pipe_valid_st1 && pipe_ready_st1;
|
||||
wire pipe_fire_st2 = pipe_valid_st2 && pipe_ready_st2;
|
||||
|
||||
VX_pipe_buffer #(
|
||||
.DATAW (NUM_BANKS + META_DATAW + 1 + NUM_BANKS * (PER_BANK_ADDRW + REQ_SEL_WIDTH))
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (scoreboard_if.valid),
|
||||
.ready_in (pipe_ready_in),
|
||||
.data_in ({gpr_rd_valid, pipe_data, has_collision_n, gpr_rd_addr, gpr_rd_req_idx}),
|
||||
.data_out ({gpr_rd_valid_st1, pipe_data_st1, has_collision_st1, gpr_rd_addr_st1, gpr_rd_req_idx_st1}),
|
||||
.valid_out(pipe_valid_st1),
|
||||
.ready_out(pipe_ready_st1)
|
||||
);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || scoreboard_if.ready) begin
|
||||
data_fetched_st1 <= 0;
|
||||
end else begin
|
||||
data_fetched_st1 <= data_fetched_st1 | req_fire_in;
|
||||
end
|
||||
end
|
||||
|
||||
wire pipe_valid2_st1 = pipe_valid_st1 && ~has_collision_st1;
|
||||
|
||||
VX_pipe_buffer #(
|
||||
.DATAW (NUM_BANKS * (1 + REQ_SEL_WIDTH) + META_DATAW)
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (pipe_valid2_st1),
|
||||
.ready_in (pipe_ready_st1),
|
||||
.data_in ({gpr_rd_valid_st1, gpr_rd_req_idx_st1, pipe_data_st1}),
|
||||
.data_out ({gpr_rd_valid_st2, gpr_rd_req_idx_st2, pipe_data_st2}),
|
||||
.valid_out(pipe_valid_st2),
|
||||
.ready_out(pipe_ready_st2)
|
||||
);
|
||||
|
||||
always @(*) begin
|
||||
src_data_m_st2 = src_data_st2;
|
||||
for (integer b = 0; b < NUM_BANKS; ++b) begin
|
||||
if (gpr_rd_valid_st2[b]) begin
|
||||
src_data_m_st2[gpr_rd_req_idx_st2[b]] = gpr_rd_data_st2[b];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || pipe_fire_st2) begin
|
||||
src_data_st2 <= 0;
|
||||
end else begin
|
||||
src_data_st2 <= src_data_m_st2;
|
||||
end
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (pipe_valid_st2),
|
||||
.ready_in (pipe_ready_st2),
|
||||
.data_in ({pipe_data_st2, src_data_m_st2}),
|
||||
.data_out ({
|
||||
operands_if.data.wis,
|
||||
operands_if.data.tmask,
|
||||
operands_if.data.PC,
|
||||
operands_if.data.wb,
|
||||
operands_if.data.ex_type,
|
||||
operands_if.data.op_type,
|
||||
operands_if.data.op_args,
|
||||
operands_if.data.rd,
|
||||
operands_if.data.uuid,
|
||||
operands_if.data.rs3_data,
|
||||
operands_if.data.rs2_data,
|
||||
operands_if.data.rs1_data
|
||||
}),
|
||||
.valid_out (operands_if.valid),
|
||||
.ready_out (operands_if.ready)
|
||||
);
|
||||
|
||||
wire [PER_BANK_ADDRW-1:0] gpr_wr_addr;
|
||||
if (ISSUE_WIS != 0) begin : g_gpr_wr_addr
|
||||
assign gpr_wr_addr = {writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS], writeback_if.data.wis};
|
||||
end else begin : g_gpr_wr_addr_no_wis
|
||||
assign gpr_wr_addr = writeback_if.data.rd[`NR_BITS-1:BANK_SEL_BITS];
|
||||
end
|
||||
|
||||
wire [BANK_SEL_WIDTH-1:0] gpr_wr_bank_idx;
|
||||
if (NUM_BANKS != 1) begin : g_gpr_wr_bank_idx
|
||||
assign gpr_wr_bank_idx = writeback_if.data.rd[BANK_SEL_BITS-1:0];
|
||||
end else begin : g_gpr_wr_bank_idx_0
|
||||
assign gpr_wr_bank_idx = '0;
|
||||
end
|
||||
|
||||
for (genvar b = 0; b < NUM_BANKS; ++b) begin : g_gpr_rams
|
||||
wire gpr_wr_enabled;
|
||||
if (BANK_SEL_BITS != 0) begin : g_gpr_wr_enabled_multibanks
|
||||
assign gpr_wr_enabled = writeback_if.valid
|
||||
&& (gpr_wr_bank_idx == BANK_SEL_BITS'(b));
|
||||
end else begin : g_gpr_wr_enabled
|
||||
assign gpr_wr_enabled = writeback_if.valid;
|
||||
end
|
||||
|
||||
wire [BYTEENW-1:0] wren;
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin : g_wren
|
||||
assign wren[i*XLEN_SIZE+:XLEN_SIZE] = {XLEN_SIZE{writeback_if.data.tmask[i]}};
|
||||
end
|
||||
|
||||
VX_dp_ram #(
|
||||
.DATAW (REGS_DATAW),
|
||||
.SIZE (PER_BANK_REGS * PER_ISSUE_WARPS),
|
||||
.WRENW (BYTEENW),
|
||||
`ifdef GPR_RESET
|
||||
.RESET_RAM (1),
|
||||
`endif
|
||||
.OUT_REG (1),
|
||||
.RDW_MODE ("R")
|
||||
) gpr_ram (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.read (pipe_fire_st1),
|
||||
.wren (wren),
|
||||
.write (gpr_wr_enabled),
|
||||
.waddr (gpr_wr_addr),
|
||||
.wdata (writeback_if.data.data),
|
||||
.raddr (gpr_rd_addr_st1[b]),
|
||||
.rdata (gpr_rd_data_st2[b])
|
||||
for (genvar i = 0; i < `NUM_OPCS; ++i) begin : g_opc_units
|
||||
VX_opc_unit #(
|
||||
.INSTANCE_ID (INSTANCE_ID)
|
||||
) opc_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.wis (per_opc_wis[i]),
|
||||
.pending_regs_n(per_opc_pending_regs_n[i]),
|
||||
.scoreboard_if(per_opc_scoreboard_if[i]),
|
||||
.opc_if (opc_if[i]),
|
||||
.operands_if (operands_if[i])
|
||||
);
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] collisions_r;
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
collisions_r <= '0;
|
||||
end else begin
|
||||
collisions_r <= collisions_r + `PERF_CTR_BITS'(scoreboard_if.valid && pipe_ready_in && has_collision_n);
|
||||
end
|
||||
end
|
||||
assign perf_stalls = collisions_r;
|
||||
`endif
|
||||
VX_gpr_unit #(
|
||||
.INSTANCE_ID (INSTANCE_ID),
|
||||
.NUM_REQS (`NUM_OPCS),
|
||||
.NUM_BANKS (`NUM_GPR_BANKS)
|
||||
) gpr_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_stalls (perf_stalls),
|
||||
`endif
|
||||
.writeback_if (writeback_if),
|
||||
.opc_if (opc_if)
|
||||
);
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -29,10 +29,10 @@ module VX_pe_switch import VX_gpu_pkg::*; #(
|
|||
VX_execute_if.master execute_out_if[PE_COUNT],
|
||||
VX_commit_if .slave commit_in_if[PE_COUNT]
|
||||
);
|
||||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_BITS = `CLOG2(`SIMD_WIDTH / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam REQ_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `INST_ALU_BITS + $bits(op_args_t) + 1 + `NR_BITS + `NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
localparam REQ_DATAW = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + INST_ALU_BITS + $bits(op_args_t) + 1 + NR_BITS + NT_WIDTH + (3 * NUM_LANES * `XLEN) + PID_WIDTH + 1 + 1;
|
||||
localparam RSP_DATAW = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + 1 + NUM_LANES * `XLEN + PID_WIDTH + 1 + 1;
|
||||
|
||||
wire [PE_COUNT-1:0] pe_req_valid;
|
||||
wire [PE_COUNT-1:0][REQ_DATAW-1:0] pe_req_data;
|
||||
|
|
|
@ -50,11 +50,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
reg [`NUM_WARPS-1:0] stalled_warps, stalled_warps_n; // set when branch/gpgpu instructions are issued
|
||||
|
||||
reg [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks, thread_masks_n;
|
||||
reg [`NUM_WARPS-1:0][`PC_BITS-1:0] warp_pcs, warp_pcs_n;
|
||||
reg [`NUM_WARPS-1:0][PC_BITS-1:0] warp_pcs, warp_pcs_n;
|
||||
|
||||
wire [`NW_WIDTH-1:0] schedule_wid;
|
||||
wire [NW_WIDTH-1:0] schedule_wid;
|
||||
wire [`NUM_THREADS-1:0] schedule_tmask;
|
||||
wire [`PC_BITS-1:0] schedule_pc;
|
||||
wire [PC_BITS-1:0] schedule_pc;
|
||||
wire schedule_valid;
|
||||
wire schedule_ready;
|
||||
|
||||
|
@ -62,20 +62,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
wire join_valid;
|
||||
wire join_is_dvg;
|
||||
wire join_is_else;
|
||||
wire [`NW_WIDTH-1:0] join_wid;
|
||||
wire [NW_WIDTH-1:0] join_wid;
|
||||
wire [`NUM_THREADS-1:0] join_tmask;
|
||||
wire [`PC_BITS-1:0] join_pc;
|
||||
wire [PC_BITS-1:0] join_pc;
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] cycles;
|
||||
reg [PERF_CTR_BITS-1:0] cycles;
|
||||
|
||||
wire schedule_fire = schedule_valid && schedule_ready;
|
||||
wire schedule_if_fire = schedule_if.valid && schedule_if.ready;
|
||||
|
||||
// branch
|
||||
wire [`NUM_ALU_BLOCKS-1:0] branch_valid;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][`NW_WIDTH-1:0] branch_wid;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][NW_WIDTH-1:0] branch_wid;
|
||||
wire [`NUM_ALU_BLOCKS-1:0] branch_taken;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][`PC_BITS-1:0] branch_dest;
|
||||
wire [`NUM_ALU_BLOCKS-1:0][PC_BITS-1:0] branch_dest;
|
||||
for (genvar i = 0; i < `NUM_ALU_BLOCKS; ++i) begin : g_branch_init
|
||||
assign branch_valid[i] = branch_ctl_if[i].valid;
|
||||
assign branch_wid[i] = branch_ctl_if[i].wid;
|
||||
|
@ -85,18 +85,18 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
// barriers
|
||||
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
|
||||
reg [`NUM_BARRIERS-1:0][`NW_WIDTH-1:0] barrier_ctrs, barrier_ctrs_n;
|
||||
reg [`NUM_BARRIERS-1:0][NW_WIDTH-1:0] barrier_ctrs, barrier_ctrs_n;
|
||||
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
|
||||
reg [`NUM_WARPS-1:0] curr_barrier_mask_p1;
|
||||
`ifdef GBAR_ENABLE
|
||||
reg gbar_req_valid;
|
||||
reg [`NB_WIDTH-1:0] gbar_req_id;
|
||||
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
|
||||
reg [NB_WIDTH-1:0] gbar_req_id;
|
||||
reg [NC_WIDTH-1:0] gbar_req_size_m1;
|
||||
`endif
|
||||
|
||||
// wspawn
|
||||
wspawn_t wspawn;
|
||||
reg [`NW_WIDTH-1:0] wspawn_wid;
|
||||
reg [NW_WIDTH-1:0] wspawn_wid;
|
||||
reg is_single_warp;
|
||||
|
||||
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_warps_cnt;
|
||||
|
@ -165,13 +165,13 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
|
||||
if (~warp_ctl_if.barrier.is_noop) begin
|
||||
if (~warp_ctl_if.barrier.is_global
|
||||
&& (barrier_ctrs[warp_ctl_if.barrier.id] == `NW_WIDTH'(warp_ctl_if.barrier.size_m1))) begin
|
||||
&& (barrier_ctrs[warp_ctl_if.barrier.id] == NW_WIDTH'(warp_ctl_if.barrier.size_m1))) begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
|
||||
barrier_masks_n[warp_ctl_if.barrier.id] = '0; // reset barrier mask
|
||||
stalled_warps_n &= ~barrier_masks[warp_ctl_if.barrier.id]; // unlock warps
|
||||
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
|
||||
end else begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = barrier_ctrs[warp_ctl_if.barrier.id] + `NW_WIDTH'(1);
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = barrier_ctrs[warp_ctl_if.barrier.id] + NW_WIDTH'(1);
|
||||
barrier_masks_n[warp_ctl_if.barrier.id] = curr_barrier_mask_p1;
|
||||
end
|
||||
end else begin
|
||||
|
@ -204,7 +204,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
// advance PC
|
||||
if (schedule_if_fire) begin
|
||||
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + `PC_BITS'(2);
|
||||
warp_pcs_n[schedule_if.data.wid] = schedule_if.data.PC + PC_BITS'(2);
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -226,7 +226,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
wspawn.valid <= 0;
|
||||
|
||||
// activate first warp
|
||||
warp_pcs[0] <= base_dcrs.startup_addr[1 +: `PC_BITS];
|
||||
warp_pcs[0] <= base_dcrs.startup_addr[1 +: PC_BITS];
|
||||
active_warps[0] <= 1;
|
||||
thread_masks[0][0] <= 1;
|
||||
is_single_warp <= 1;
|
||||
|
@ -259,7 +259,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
&& (curr_barrier_mask_p1 == active_warps)) begin
|
||||
gbar_req_valid <= 1;
|
||||
gbar_req_id <= warp_ctl_if.barrier.id;
|
||||
gbar_req_size_m1 <= `NC_WIDTH'(warp_ctl_if.barrier.size_m1);
|
||||
gbar_req_size_m1 <= NC_WIDTH'(warp_ctl_if.barrier.size_m1);
|
||||
end
|
||||
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
|
||||
gbar_req_valid <= 0;
|
||||
|
@ -278,7 +278,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
assign gbar_bus_if.req_valid = gbar_req_valid;
|
||||
assign gbar_bus_if.req_data.id = gbar_req_id;
|
||||
assign gbar_bus_if.req_data.size_m1 = gbar_req_size_m1;
|
||||
assign gbar_bus_if.req_data.core_id = `NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
assign gbar_bus_if.req_data.core_id = NC_WIDTH'(CORE_ID % `NUM_CORES);
|
||||
`endif
|
||||
|
||||
// split/join handling
|
||||
|
@ -315,21 +315,20 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
.valid_out (schedule_valid)
|
||||
);
|
||||
|
||||
wire [`NUM_WARPS-1:0][(`NUM_THREADS + `PC_BITS)-1:0] schedule_data;
|
||||
wire [`NUM_WARPS-1:0][(`NUM_THREADS + PC_BITS)-1:0] schedule_data;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_schedule_data
|
||||
assign schedule_data[i] = {thread_masks[i], warp_pcs[i]};
|
||||
end
|
||||
|
||||
assign {schedule_tmask, schedule_pc} = {
|
||||
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-1:(`NUM_THREADS + `PC_BITS)-4],
|
||||
schedule_data[schedule_wid][(`NUM_THREADS + `PC_BITS)-5:0]
|
||||
schedule_data[schedule_wid][(`NUM_THREADS + PC_BITS)-1:(`NUM_THREADS + PC_BITS)-4],
|
||||
schedule_data[schedule_wid][(`NUM_THREADS + PC_BITS)-5:0]
|
||||
};
|
||||
|
||||
wire [`UUID_WIDTH-1:0] instr_uuid;
|
||||
wire [UUID_WIDTH-1:0] instr_uuid;
|
||||
`ifdef UUID_ENABLE
|
||||
VX_uuid_gen #(
|
||||
.CORE_ID (CORE_ID),
|
||||
.UUID_WIDTH (`UUID_WIDTH)
|
||||
.CORE_ID (CORE_ID)
|
||||
) uuid_gen (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -342,7 +341,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
`endif
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (`NUM_THREADS + `PC_BITS + `NW_WIDTH + `UUID_WIDTH),
|
||||
.DATAW (`NUM_THREADS + PC_BITS + NW_WIDTH + UUID_WIDTH),
|
||||
.SIZE (2), // need to buffer out ready_in
|
||||
.OUT_REG (1) // should be registered for BRAM access in fetch unit
|
||||
) out_buf (
|
||||
|
@ -368,7 +367,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
) counter (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.incr (schedule_if_fire && (schedule_if.data.wid == `NW_WIDTH'(i))),
|
||||
.incr (schedule_if_fire && (schedule_if.data.wid == NW_WIDTH'(i))),
|
||||
.decr (commit_sched_if.committed_warps[i]),
|
||||
.empty (pending_warp_empty[i]),
|
||||
.alm_empty (pending_warp_alm_empty[i]),
|
||||
|
@ -407,11 +406,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
end
|
||||
`RUNTIME_ASSERT(timeout_ctr < `STALL_TIMEOUT, ("%t: *** %s timeout: stalled_warps=%b", $time, INSTANCE_ID, stalled_warps))
|
||||
`RUNTIME_ASSERT(timeout_ctr < STALL_TIMEOUT, ("%t: *** %s timeout: stalled_warps=%b", $time, INSTANCE_ID, stalled_warps))
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_sched_idles;
|
||||
reg [PERF_CTR_BITS-1:0] perf_sched_stalls;
|
||||
|
||||
wire schedule_idle = ~schedule_valid;
|
||||
wire schedule_stall = schedule_if.valid && ~schedule_if.ready;
|
||||
|
@ -421,8 +420,8 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
perf_sched_idles <= '0;
|
||||
perf_sched_stalls <= '0;
|
||||
end else begin
|
||||
perf_sched_idles <= perf_sched_idles + `PERF_CTR_BITS'(schedule_idle);
|
||||
perf_sched_stalls <= perf_sched_stalls + `PERF_CTR_BITS'(schedule_stall);
|
||||
perf_sched_idles <= perf_sched_idles + PERF_CTR_BITS'(schedule_idle);
|
||||
perf_sched_stalls <= perf_sched_stalls + PERF_CTR_BITS'(schedule_stall);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -20,32 +20,33 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
input wire reset,
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
output reg [`PERF_CTR_BITS-1:0] perf_stalls,
|
||||
output reg [`NUM_EX_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_units_uses,
|
||||
output reg [`NUM_SFU_UNITS-1:0][`PERF_CTR_BITS-1:0] perf_sfu_uses,
|
||||
output reg [PERF_CTR_BITS-1:0] perf_stalls,
|
||||
output reg [NUM_EX_UNITS-1:0][PERF_CTR_BITS-1:0] perf_units_uses,
|
||||
output reg [NUM_SFU_UNITS-1:0][PERF_CTR_BITS-1:0] perf_sfu_uses,
|
||||
`endif
|
||||
|
||||
VX_writeback_if.slave writeback_if,
|
||||
VX_ibuffer_if.slave ibuffer_if [PER_ISSUE_WARPS],
|
||||
VX_scoreboard_if.master scoreboard_if
|
||||
);
|
||||
//`UNUSED_SPARAM (INSTANCE_ID)
|
||||
//localparam NUM_SRC_OPDS = 3;
|
||||
//localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = `UUID_WIDTH + `NUM_THREADS + `PC_BITS + `EX_BITS + `INST_OP_BITS + `INST_ARGS_BITS + (`NR_BITS * 4) + (`REG_EXT_BITS * 4) + 1;
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
`UNUSED_VAR (writeback_if.data.sop)
|
||||
|
||||
localparam NUM_OPDS = NUM_SRC_OPDS + 1;
|
||||
localparam DATAW = UUID_WIDTH + `NUM_THREADS + PC_BITS + EX_BITS + INST_OP_BITS + INST_ARGS_BITS + NUM_OPDS + (REG_IDX_BITS * NUM_OPDS);
|
||||
|
||||
VX_ibuffer_if staging_if [PER_ISSUE_WARPS]();
|
||||
reg [PER_ISSUE_WARPS-1:0] operands_ready;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [PER_ISSUE_WARPS-1:0][`NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
|
||||
wire [`NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
reg [PER_ISSUE_WARPS-1:0][NUM_EX_UNITS-1:0] perf_inuse_units_per_cycle;
|
||||
wire [NUM_EX_UNITS-1:0] perf_units_per_cycle, perf_units_per_cycle_r;
|
||||
|
||||
reg [PER_ISSUE_WARPS-1:0][`NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [`NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
reg [PER_ISSUE_WARPS-1:0][NUM_SFU_UNITS-1:0] perf_inuse_sfu_per_cycle;
|
||||
wire [NUM_SFU_UNITS-1:0] perf_sfu_per_cycle, perf_sfu_per_cycle_r;
|
||||
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`NUM_EX_UNITS),
|
||||
.DATAW_IN (NUM_EX_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
) perf_units_reduce (
|
||||
|
@ -54,7 +55,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
);
|
||||
|
||||
VX_reduce_tree #(
|
||||
.DATAW_IN (`NUM_SFU_UNITS),
|
||||
.DATAW_IN (NUM_SFU_UNITS),
|
||||
.N (PER_ISSUE_WARPS),
|
||||
.OP ("|")
|
||||
) perf_sfu_reduce (
|
||||
|
@ -76,26 +77,26 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
if (reset) begin
|
||||
perf_stalls <= '0;
|
||||
end else begin
|
||||
perf_stalls <= perf_stalls + `PERF_CTR_BITS'(perf_stall_per_cycle);
|
||||
perf_stalls <= perf_stalls + PERF_CTR_BITS'(perf_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_EX_UNITS; ++i) begin : g_perf_units_uses
|
||||
for (genvar i = 0; i < NUM_EX_UNITS; ++i) begin : g_perf_units_uses
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_units_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_units_uses[i] <= perf_units_uses[i] + `PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||
perf_units_uses[i] <= perf_units_uses[i] + PERF_CTR_BITS'(perf_units_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses
|
||||
for (genvar i = 0; i < NUM_SFU_UNITS; ++i) begin : g_perf_sfu_uses
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
perf_sfu_uses[i] <= '0;
|
||||
end else begin
|
||||
perf_sfu_uses[i] <= perf_sfu_uses[i] + `PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||
perf_sfu_uses[i] <= perf_sfu_uses[i] + PERF_CTR_BITS'(perf_sfu_per_cycle_r[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -117,8 +118,8 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
for (genvar w = 0; w < PER_ISSUE_WARPS; ++w) begin : g_scoreboard
|
||||
reg [`NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
reg [`REG_TYPES-1:0] operands_busy, operands_busy_n;
|
||||
reg [NUM_REGS-1:0] inuse_regs, inuse_regs_n;
|
||||
wire [NUM_OPDS-1:0] operands_busy;
|
||||
|
||||
wire ibuffer_fire = ibuffer_if[w].valid && ibuffer_if[w].ready;
|
||||
wire staging_fire = staging_if[w].valid && staging_if[w].ready;
|
||||
|
@ -127,49 +128,27 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
&& (writeback_if.data.wis == ISSUE_WIS_W'(w))
|
||||
&& writeback_if.data.eop;
|
||||
|
||||
wire [`REG_TYPE_WIDTH-1:0] ibf_rs1_type, ibf_rs2_type, ibf_rs3_type, ibf_rd_type;
|
||||
wire [`REG_TYPES-1:0][31:0] ibf_rs1_mask, ibf_rs2_mask, ibf_rs3_mask, ibf_rd_mask;
|
||||
wire [REG_TYPES-1:0][31:0] ibf_rs1_mask, ibf_rs2_mask, ibf_rs3_mask, ibf_rd_mask;
|
||||
wire [REG_TYPES-1:0][31:0] stg_rs1_mask, stg_rs2_mask, stg_rs3_mask, stg_rd_mask;
|
||||
|
||||
wire [`REG_TYPE_WIDTH-1:0] stg_rs1_type, stg_rs2_type, stg_rs3_type, stg_rd_type;
|
||||
wire [`REG_TYPES-1:0][31:0] stg_rs1_mask, stg_rs2_mask, stg_rs3_mask, stg_rd_mask;
|
||||
for (genvar i = 0; i < REG_TYPES; ++i) begin : g_opd_masks
|
||||
assign ibf_rd_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rd.ext, i) << ibuffer_if[w].data.rd.id) & {32{ibuffer_if[w].data.wb && ibuffer_if[w].data.rd.rtype == i}};
|
||||
assign ibf_rs1_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs1.ext, i) << ibuffer_if[w].data.rs1.id) & {32{ibuffer_if[w].data.used_rs[0] && ibuffer_if[w].data.rs1.rtype == i}};
|
||||
assign ibf_rs2_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs2.ext, i) << ibuffer_if[w].data.rs2.id) & {32{ibuffer_if[w].data.used_rs[1] && ibuffer_if[w].data.rs2.rtype == i}};
|
||||
assign ibf_rs3_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs3.ext, i) << ibuffer_if[w].data.rs3.id) & {32{ibuffer_if[w].data.used_rs[2] && ibuffer_if[w].data.rs3.rtype == i}};
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
assign ibf_rs1_type = ibuffer_if[w].data.rs1[5];
|
||||
assign ibf_rs2_type = ibuffer_if[w].data.rs2[5];
|
||||
assign ibf_rs3_type = ibuffer_if[w].data.rs3[5];
|
||||
assign ibf_rd_type = ibuffer_if[w].data.rd[5];
|
||||
|
||||
assign stg_rs1_type = staging_if[w].data.rs1[5];
|
||||
assign stg_rs2_type = staging_if[w].data.rs2[5];
|
||||
assign stg_rs3_type = staging_if[w].data.rs3[5];
|
||||
assign stg_rd_type = staging_if[w].data.rd[5];
|
||||
`else
|
||||
assign ibf_rs1_type = 0;
|
||||
assign ibf_rs2_type = 0;
|
||||
assign ibf_rs3_type = 0;
|
||||
assign ibf_rd_type = 0;
|
||||
|
||||
assign stg_rs1_type = 0;
|
||||
assign stg_rs2_type = 0;
|
||||
assign stg_rs3_type = 0;
|
||||
assign stg_rd_type = 0;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < `REG_TYPES; ++i) begin : g_opd_masks
|
||||
assign ibf_rs1_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs1_ext, i) << ibuffer_if[w].data.rs1[4:0]) & {32{ibf_rs1_type == i}};
|
||||
assign ibf_rs2_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs2_ext, i) << ibuffer_if[w].data.rs2[4:0]) & {32{ibf_rs2_type == i}};
|
||||
assign ibf_rs3_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rs3_ext, i) << ibuffer_if[w].data.rs3[4:0]) & {32{ibf_rs3_type == i}};
|
||||
assign ibf_rd_mask[i] = (`REG_EXT_VAL(ibuffer_if[w].data.rd_ext, i) << ibuffer_if[w].data.rd[4:0]) & {32{ibf_rd_type == i}};
|
||||
|
||||
assign stg_rs1_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs1_ext, i) << staging_if[w].data.rs1[4:0]) & {32{stg_rs1_type == i}};
|
||||
assign stg_rs2_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs2_ext, i) << staging_if[w].data.rs2[4:0]) & {32{stg_rs2_type == i}};
|
||||
assign stg_rs3_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs3_ext, i) << staging_if[w].data.rs3[4:0]) & {32{stg_rs3_type == i}};
|
||||
assign stg_rd_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rd_ext, i) << staging_if[w].data.rd[4:0]) & {32{stg_rd_type == i}};
|
||||
assign stg_rd_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rd.ext, i) << staging_if[w].data.rd.id) & {32{staging_if[w].data.wb && staging_if[w].data.rd.rtype == i}};
|
||||
assign stg_rs1_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs1.ext, i) << staging_if[w].data.rs1.id) & {32{staging_if[w].data.used_rs[0] && staging_if[w].data.rs1.rtype == i}};
|
||||
assign stg_rs2_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs2.ext, i) << staging_if[w].data.rs2.id) & {32{staging_if[w].data.used_rs[1] && staging_if[w].data.rs2.rtype == i}};
|
||||
assign stg_rs3_mask[i] = (`REG_EXT_VAL(staging_if[w].data.rs3.ext, i) << staging_if[w].data.rs3.id) & {32{staging_if[w].data.used_rs[2] && staging_if[w].data.rs3.rtype == i}};
|
||||
end
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
reg [`NUM_REGS-1:0][`EX_WIDTH-1:0] inuse_units;
|
||||
reg [`NUM_REGS-1:0][`SFU_WIDTH-1:0] inuse_sfu;
|
||||
reg [NUM_REGS-1:0][EX_WIDTH-1:0] inuse_units;
|
||||
reg [NUM_REGS-1:0][SFU_WIDTH-1:0] inuse_sfu;
|
||||
|
||||
reg_idx_t [NUM_OPDS-1:0] stg_opds;
|
||||
assign stg_opds = {staging_if[w].data.rs3, staging_if[w].data.rs2, staging_if[w].data.rs1, staging_if[w].data.rd};
|
||||
|
||||
always @(*) begin
|
||||
perf_inuse_units_per_cycle[w] = '0;
|
||||
|
@ -177,7 +156,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
for (integer i = 0; i < NUM_OPDS; ++i) begin
|
||||
if (staging_if[w].valid && operands_busy[i]) begin
|
||||
perf_inuse_units_per_cycle[w][inuse_units[stg_opds[i]]] = 1;
|
||||
if (inuse_units[stg_opds[i]] == `EX_SFU) begin
|
||||
if (inuse_units[stg_opds[i]] == EX_SFU) begin
|
||||
perf_inuse_sfu_per_cycle[w][inuse_sfu[stg_opds[i]]] = 1;
|
||||
end
|
||||
end
|
||||
|
@ -185,29 +164,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
`endif
|
||||
|
||||
/*or (genvar i = 0; i < `REG_TYPES; ++i) begin : g_operands_busy_n
|
||||
wire [31:0] ibf_reg_mask = ibf_rs1_mask[i] | ibf_rs2_mask[i] | ibf_rs3_mask[i] | ibf_rd_mask[i];
|
||||
wire in_use_check = (inuse_regs_n[i * 32 +: 32] & ibf_reg_mask) != 0;;
|
||||
wire in_stg_check = staging_fire && staging_if[w].data.wb && ((ibf_reg_mask & stg_rd_mask[i]) != 0);
|
||||
always @(*) begin
|
||||
operands_busy_n[i] = operands_busy[i];
|
||||
if (ibuffer_fire) begin
|
||||
operands_busy_n[i] = in_use_check | in_stg_check;
|
||||
end
|
||||
if (writeback_fire) begin
|
||||
if (ibuffer_fire) begin
|
||||
if (writeback_if.data.rd == ibuf_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end else begin
|
||||
if (writeback_if.data.rd == stg_opds[i]) begin
|
||||
operands_busy_n[i] = 0;
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
end*/
|
||||
|
||||
always @(*) begin
|
||||
inuse_regs_n = inuse_regs;
|
||||
if (writeback_fire) begin
|
||||
|
@ -218,30 +174,39 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < `REG_TYPES; ++i) begin : g_operands_busy_n
|
||||
wire [REG_TYPES-1:0][31:0] in_use_mask;
|
||||
for (genvar i = 0; i < REG_TYPES; ++i) begin : g_in_use_mask
|
||||
wire [31:0] ibf_reg_mask = ibf_rs1_mask[i] | ibf_rs2_mask[i] | ibf_rs3_mask[i] | ibf_rd_mask[i];
|
||||
wire [31:0] stg_reg_mask = stg_rs1_mask[i] | stg_rs2_mask[i] | stg_rs3_mask[i] | stg_rd_mask[i];
|
||||
wire [31:0] reg_mask = ibuffer_fire ? ibf_reg_mask : stg_reg_mask;
|
||||
assign operands_busy_n[i] = (inuse_regs_n[i * 32 +: 32] & reg_mask) != 0;
|
||||
wire [31:0] regs_mask = ibuffer_fire ? ibf_reg_mask : stg_reg_mask;
|
||||
assign in_use_mask[i] = inuse_regs_n[i * 32 +: 32] & regs_mask;
|
||||
end
|
||||
|
||||
wire [REG_TYPES-1:0] regs_busy;
|
||||
for (genvar i = 0; i < REG_TYPES; ++i) begin : g_regs_busy
|
||||
assign regs_busy[i] = (in_use_mask[i] != 0);
|
||||
end
|
||||
|
||||
// per operand busy
|
||||
assign operands_busy[0] = (in_use_mask[staging_if[w].data.rd.rtype] & stg_rd_mask[staging_if[w].data.rd.rtype]) != 0;
|
||||
assign operands_busy[1] = (in_use_mask[staging_if[w].data.rs1.rtype] & stg_rs1_mask[staging_if[w].data.rs1.rtype]) != 0;
|
||||
assign operands_busy[2] = (in_use_mask[staging_if[w].data.rs2.rtype] & stg_rs2_mask[staging_if[w].data.rs2.rtype]) != 0;
|
||||
assign operands_busy[3] = (in_use_mask[staging_if[w].data.rs3.rtype] & stg_rs3_mask[staging_if[w].data.rs3.rtype]) != 0;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
inuse_regs <= '0;
|
||||
operands_busy <= '0;
|
||||
end else begin
|
||||
inuse_regs <= inuse_regs_n;
|
||||
operands_busy <= operands_busy_n;
|
||||
end
|
||||
operands_ready[w] <= ~(| regs_busy);
|
||||
end
|
||||
|
||||
assign operands_ready[w] = ~(| operands_busy);
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
always @(posedge clk) begin
|
||||
if (staging_fire && staging_if[w].data.wb) begin
|
||||
inuse_units[staging_if[w].data.rd] <= staging_if[w].data.ex_type;
|
||||
if (staging_if[w].data.ex_type == `EX_SFU) begin
|
||||
if (staging_if[w].data.ex_type == EX_SFU) begin
|
||||
inuse_sfu[staging_if[w].data.rd] <= op_to_sfu_type(staging_if[w].data.op_type);
|
||||
end
|
||||
end
|
||||
|
@ -268,7 +233,7 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
`RUNTIME_ASSERT((timeout_ctr < `STALL_TIMEOUT),
|
||||
`RUNTIME_ASSERT((timeout_ctr < STALL_TIMEOUT),
|
||||
("%t: *** %s timeout: wid=%0d, PC=0x%0h, tmask=%b, cycles=%0d, inuse=%b (#%0d)",
|
||||
$time, INSTANCE_ID, w, {staging_if[w].data.PC, 1'b0}, staging_if[w].data.tmask, timeout_ctr,
|
||||
operands_busy, staging_if[w].data.uuid))
|
||||
|
@ -309,14 +274,11 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
|
|||
scoreboard_if.data.op_type,
|
||||
scoreboard_if.data.op_args,
|
||||
scoreboard_if.data.wb,
|
||||
scoreboard_if.data.used_rs,
|
||||
scoreboard_if.data.rd,
|
||||
scoreboard_if.data.rs1,
|
||||
scoreboard_if.data.rs2,
|
||||
scoreboard_if.data.rs3,
|
||||
scoreboard_if.data.rd_ext,
|
||||
scoreboard_if.data.rs1_ext,
|
||||
scoreboard_if.data.rs2_ext,
|
||||
scoreboard_if.data.rs3_ext
|
||||
scoreboard_if.data.rs3
|
||||
}),
|
||||
.valid_out (scoreboard_if.valid),
|
||||
.ready_out (scoreboard_if.ready),
|
||||
|
|
|
@ -78,8 +78,9 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
|
|||
reg [PE_SEL_BITS-1:0] pe_select;
|
||||
always @(*) begin
|
||||
pe_select = PE_IDX_WCTL;
|
||||
if (`INST_SFU_IS_CSR(per_block_execute_if[0].data.op_type))
|
||||
if (inst_sfu_is_csr(per_block_execute_if[0].data.op_type)) begin
|
||||
pe_select = PE_IDX_CSRS;
|
||||
end
|
||||
end
|
||||
|
||||
VX_pe_switch #(
|
||||
|
|
|
@ -19,26 +19,26 @@ module VX_split_join import VX_gpu_pkg::*; #(
|
|||
input wire clk,
|
||||
input wire reset,
|
||||
input wire valid,
|
||||
input wire [`NW_WIDTH-1:0] wid,
|
||||
input wire [NW_WIDTH-1:0] wid,
|
||||
input split_t split,
|
||||
input join_t sjoin,
|
||||
output wire join_valid,
|
||||
output wire join_is_dvg,
|
||||
output wire join_is_else,
|
||||
output wire [`NW_WIDTH-1:0] join_wid,
|
||||
output wire [NW_WIDTH-1:0] join_wid,
|
||||
output wire [`NUM_THREADS-1:0] join_tmask,
|
||||
output wire [`PC_BITS-1:0] join_pc,
|
||||
input wire [`NW_WIDTH-1:0] stack_wid,
|
||||
output wire [`DV_STACK_SIZEW-1:0] stack_ptr
|
||||
output wire [PC_BITS-1:0] join_pc,
|
||||
input wire [NW_WIDTH-1:0] stack_wid,
|
||||
output wire [DV_STACK_SIZEW-1:0] stack_ptr
|
||||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
|
||||
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_data [`NUM_WARPS-1:0];
|
||||
wire [`DV_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
|
||||
wire [(`NUM_THREADS+PC_BITS)-1:0] ipdom_data [`NUM_WARPS-1:0];
|
||||
wire [DV_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
|
||||
wire ipdom_set [`NUM_WARPS-1:0];
|
||||
|
||||
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_q0 = {split.then_tmask | split.else_tmask, `PC_BITS'(0)};
|
||||
wire [(`NUM_THREADS+`PC_BITS)-1:0] ipdom_q1 = {split.else_tmask, split.next_pc};
|
||||
wire [(`NUM_THREADS+PC_BITS)-1:0] ipdom_q0 = {split.then_tmask | split.else_tmask, PC_BITS'(0)};
|
||||
wire [(`NUM_THREADS+PC_BITS)-1:0] ipdom_q1 = {split.else_tmask, split.next_pc};
|
||||
|
||||
wire sjoin_is_dvg = (sjoin.stack_ptr != ipdom_q_ptr[wid]);
|
||||
|
||||
|
@ -47,8 +47,8 @@ module VX_split_join import VX_gpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_ipdom_stacks
|
||||
VX_ipdom_stack #(
|
||||
.WIDTH (`NUM_THREADS+`PC_BITS),
|
||||
.DEPTH (`DV_STACK_SIZE)
|
||||
.WIDTH (`NUM_THREADS+PC_BITS),
|
||||
.DEPTH (DV_STACK_SIZE)
|
||||
) ipdom_stack (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
@ -65,7 +65,7 @@ module VX_split_join import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + 1 + `NW_WIDTH + `NUM_THREADS + `PC_BITS),
|
||||
.DATAW (1 + 1 + 1 + NW_WIDTH + `NUM_THREADS + PC_BITS),
|
||||
.DEPTH (1),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
|
|
|
@ -14,13 +14,12 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
module VX_uuid_gen import VX_gpu_pkg::*; #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter UUID_WIDTH = 48
|
||||
parameter CORE_ID = 0
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
input wire incr,
|
||||
input wire [`NW_WIDTH-1:0] wid,
|
||||
input wire [NW_WIDTH-1:0] wid,
|
||||
output wire [UUID_WIDTH-1:0] uuid
|
||||
);
|
||||
localparam GNW_WIDTH = UUID_WIDTH - 32;
|
||||
|
@ -38,7 +37,7 @@ module VX_uuid_gen import VX_gpu_pkg::*; #(
|
|||
end
|
||||
end
|
||||
|
||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << `NW_BITS) + GNW_WIDTH'(wid);
|
||||
wire [GNW_WIDTH-1:0] g_wid = (GNW_WIDTH'(CORE_ID) << NW_BITS) + GNW_WIDTH'(wid);
|
||||
assign uuid = {g_wid, (has_uuid_cntrs[wid] ? uuid_cntrs[wid] : 0)};
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -32,7 +32,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
|
||||
localparam PID_WIDTH = `UP(PID_BITS);
|
||||
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
|
||||
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `PC_BITS + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1 + `DV_STACK_SIZEW;
|
||||
localparam DATAW = UUID_WIDTH + NW_WIDTH + NUM_LANES + PC_BITS + NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1 + DV_STACK_SIZEW;
|
||||
|
||||
`UNUSED_VAR (execute_if.data.rs3_data)
|
||||
|
||||
|
@ -42,12 +42,12 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
join_t sjoin, sjoin_r;
|
||||
barrier_t barrier, barrier_r;
|
||||
|
||||
wire is_wspawn = (execute_if.data.op_type == `INST_SFU_WSPAWN);
|
||||
wire is_tmc = (execute_if.data.op_type == `INST_SFU_TMC);
|
||||
wire is_pred = (execute_if.data.op_type == `INST_SFU_PRED);
|
||||
wire is_split = (execute_if.data.op_type == `INST_SFU_SPLIT);
|
||||
wire is_join = (execute_if.data.op_type == `INST_SFU_JOIN);
|
||||
wire is_bar = (execute_if.data.op_type == `INST_SFU_BAR);
|
||||
wire is_wspawn = (execute_if.data.op_type == INST_SFU_WSPAWN);
|
||||
wire is_tmc = (execute_if.data.op_type == INST_SFU_TMC);
|
||||
wire is_pred = (execute_if.data.op_type == INST_SFU_PRED);
|
||||
wire is_split = (execute_if.data.op_type == INST_SFU_SPLIT);
|
||||
wire is_join = (execute_if.data.op_type == INST_SFU_JOIN);
|
||||
wire is_bar = (execute_if.data.op_type == INST_SFU_BAR);
|
||||
|
||||
wire [`UP(LANE_BITS)-1:0] tid;
|
||||
if (LANE_BITS != 0) begin : g_tid
|
||||
|
@ -107,19 +107,19 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
assign split.is_dvg = has_then && has_else;
|
||||
assign split.then_tmask = taken_tmask;
|
||||
assign split.else_tmask = ntaken_tmask;
|
||||
assign split.next_pc = execute_if.data.PC + `PC_BITS'(2);
|
||||
assign split.next_pc = execute_if.data.PC + PC_BITS'(2);
|
||||
|
||||
assign warp_ctl_if.dvstack_wid = execute_if.data.wid;
|
||||
wire [`DV_STACK_SIZEW-1:0] dvstack_ptr;
|
||||
wire [DV_STACK_SIZEW-1:0] dvstack_ptr;
|
||||
|
||||
// join
|
||||
|
||||
assign sjoin.valid = is_join;
|
||||
assign sjoin.stack_ptr = rs1_data[`DV_STACK_SIZEW-1:0];
|
||||
assign sjoin.stack_ptr = rs1_data[DV_STACK_SIZEW-1:0];
|
||||
|
||||
// barrier
|
||||
assign barrier.valid = is_bar;
|
||||
assign barrier.id = rs1_data[`NB_WIDTH-1:0];
|
||||
assign barrier.id = rs1_data[NB_WIDTH-1:0];
|
||||
`ifdef GBAR_ENABLE
|
||||
assign barrier.is_global = rs1_data[31];
|
||||
`else
|
||||
|
@ -132,11 +132,11 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; ++i) begin : g_wspawn_wmask
|
||||
assign wspawn_wmask[i] = (i < rs1_data[`NW_BITS:0]) && (i != execute_if.data.wid);
|
||||
assign wspawn_wmask[i] = (i < rs1_data[NW_BITS:0]) && (i != execute_if.data.wid);
|
||||
end
|
||||
assign wspawn.valid = is_wspawn;
|
||||
assign wspawn.wmask = wspawn_wmask;
|
||||
assign wspawn.pc = rs2_data[1 +: `PC_BITS];
|
||||
assign wspawn.pc = rs2_data[1 +: PC_BITS];
|
||||
|
||||
// response
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire enable,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire is_itof,
|
||||
input wire is_signed,
|
||||
|
@ -92,7 +92,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
wire [S_MAN_WIDTH-1:0] encoded_mant_s0;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
|
||||
.DATAW (1 + INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + S_EXP_WIDTH + S_MAN_WIDTH),
|
||||
.DEPTH (LATENCY > 1)
|
||||
) pipe_reg0 (
|
||||
.clk (clk),
|
||||
|
@ -140,7 +140,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
wire [S_EXP_WIDTH-1:0] input_exp_s1;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
|
||||
.DATAW (1 + INST_FRM_BITS + 1 + $bits(fclass_t) + 1 + 1 + S_MAN_WIDTH + S_EXP_WIDTH),
|
||||
.DEPTH (LATENCY > 2)
|
||||
) pipe_reg1 (
|
||||
.clk (clk),
|
||||
|
@ -182,7 +182,7 @@ module VX_fcvt_unit import VX_fpu_pkg::*; #(
|
|||
wire of_before_round_s2;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + `INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
|
||||
.DATAW (1 + 1 + INST_FRM_BITS + $bits(fclass_t) + 1 + 1 + (2*S_MAN_WIDTH+1) + EXP_BITS + 1),
|
||||
.DEPTH (LATENCY > 0)
|
||||
) pipe_reg2 (
|
||||
.clk (clk),
|
||||
|
|
|
@ -29,8 +29,8 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire enable,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FPU_BITS-1:0] op_type,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [31:0] dataa,
|
||||
input wire [31:0] datab,
|
||||
|
@ -98,7 +98,7 @@ module VX_fncp_unit import VX_fpu_pkg::*; #(
|
|||
|
||||
`UNUSED_VAR (b_fclass_s0)
|
||||
|
||||
wire [3:0] op_mod = {(op_type == `INST_FPU_CMP), frm};
|
||||
wire [3:0] op_mod = {(op_type == INST_FPU_CMP), frm};
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (4 + 2 * 32 + 1 + 1 + 8 + 23 + 2 * $bits(fclass_t) + 1 + 1),
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -48,17 +48,17 @@ module VX_fp_rounding #(
|
|||
|
||||
always @(*) begin
|
||||
case (rnd_mode_i)
|
||||
`INST_FRM_RNE: // Decide according to round/sticky bits
|
||||
INST_FRM_RNE: // Decide according to round/sticky bits
|
||||
case (round_sticky_bits_i)
|
||||
2'b00,
|
||||
2'b00,
|
||||
2'b01: round_up = 1'b0; // < ulp/2 away, round down
|
||||
2'b10: round_up = abs_value_i[0]; // = ulp/2 away, round towards even result
|
||||
2'b11: round_up = 1'b1; // > ulp/2 away, round up
|
||||
endcase
|
||||
`INST_FRM_RTZ: round_up = 1'b0; // always round down
|
||||
`INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
||||
`INST_FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
|
||||
`INST_FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
|
||||
INST_FRM_RTZ: round_up = 1'b0; // always round down
|
||||
INST_FRM_RDN: round_up = (| round_sticky_bits_i) & sign_i; // to 0 if +, away if -
|
||||
INST_FRM_RUP: round_up = (| round_sticky_bits_i) & ~sign_i; // to 0 if -, away if +
|
||||
INST_FRM_RMM: round_up = round_sticky_bits_i[1]; // round down if < ulp/2 away, else up
|
||||
default: round_up = 1'bx; // propagate x
|
||||
endcase
|
||||
end
|
||||
|
@ -71,7 +71,7 @@ module VX_fp_rounding #(
|
|||
|
||||
// In case of effective subtraction (thus signs of addition operands must have differed) and a
|
||||
// true zero result, the result sign is '-' in case of RDN and '+' for other modes.
|
||||
assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == `INST_FRM_RDN)
|
||||
assign sign_o = (exact_zero_o && effective_subtraction_i) ? (rnd_mode_i == INST_FRM_RDN)
|
||||
: sign_i;
|
||||
|
||||
endmodule
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire is_itof,
|
||||
input wire is_signed,
|
||||
|
@ -46,7 +46,7 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam DATAW = 32 + `INST_FRM_BITS + 1 + 1;
|
||||
localparam DATAW = 32 + INST_FRM_BITS + 1 + 1;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
|
||||
|
@ -60,9 +60,9 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
|
||||
assign data_in[i][0 +: 32] = dataa[i];
|
||||
assign data_in[i][32 +: `INST_FRM_BITS] = frm;
|
||||
assign data_in[i][32 + `INST_FRM_BITS +: 1] = is_itof;
|
||||
assign data_in[i][32 + `INST_FRM_BITS + 1 +: 1] = is_signed;
|
||||
assign data_in[i][32 +: INST_FRM_BITS] = frm;
|
||||
assign data_in[i][32 + INST_FRM_BITS +: 1] = is_itof;
|
||||
assign data_in[i][32 + INST_FRM_BITS + 1 +: 1] = is_signed;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
|
@ -105,9 +105,9 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (pe_enable),
|
||||
.frm (pe_data_in[0][32 +: `INST_FRM_BITS]),
|
||||
.is_itof (pe_data_in[0][32 + `INST_FRM_BITS +: 1]),
|
||||
.is_signed (pe_data_in[0][32 + `INST_FRM_BITS + 1 +: 1]),
|
||||
.frm (pe_data_in[0][32 +: INST_FRM_BITS]),
|
||||
.is_itof (pe_data_in[0][32 + INST_FRM_BITS +: 1]),
|
||||
.is_signed (pe_data_in[0][32 + INST_FRM_BITS + 1 +: 1]),
|
||||
.dataa (pe_data_in[i][0 +: 32]),
|
||||
.result (pe_data_out[i][0 +: 32]),
|
||||
.fflags (pe_data_out[i][32 +: `FP_FLAGS_BITS])
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
input wire [NUM_LANES-1:0][31:0] datab,
|
||||
|
@ -44,7 +44,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
output wire valid_out,
|
||||
input wire ready_out
|
||||
);
|
||||
localparam DATAW = 2 * 32 + `INST_FRM_BITS;
|
||||
localparam DATAW = 2 * 32 + INST_FRM_BITS;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
|
||||
|
@ -59,7 +59,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
|
||||
assign data_in[i][0 +: 32] = dataa[i];
|
||||
assign data_in[i][32 +: 32] = datab[i];
|
||||
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
|
||||
assign data_in[i][64 +: INST_FRM_BITS] = frm;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
|
@ -149,7 +149,7 @@ module VX_fpu_div import VX_fpu_pkg::*; #(
|
|||
int'(0),
|
||||
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
|
||||
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
|
||||
pe_data_in[0][64 +: `INST_FRM_BITS], // frm
|
||||
pe_data_in[0][64 +: INST_FRM_BITS], // frm
|
||||
r,
|
||||
f
|
||||
);
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
`ifdef FPU_DPI
|
||||
|
||||
module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
||||
module VX_fpu_dpi import VX_gpu_pkg::*, VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter OUT_BUF = 0
|
||||
|
@ -30,9 +30,9 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FMT_BITS-1:0] fmt,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FPU_BITS-1:0] op_type,
|
||||
input wire [INST_FMT_BITS-1:0] fmt,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] dataa,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datab,
|
||||
|
@ -107,18 +107,18 @@ module VX_fpu_dpi import VX_fpu_pkg::*; #(
|
|||
is_f2f = 0;
|
||||
|
||||
case (op_type)
|
||||
`INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end
|
||||
`INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end
|
||||
`INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end
|
||||
`INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
|
||||
`INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
|
||||
`INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
|
||||
`INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
|
||||
`INST_FPU_F2I: begin core_select = FPU_CVT; is_ftoi = 1; end
|
||||
`INST_FPU_F2U: begin core_select = FPU_CVT; is_ftou = 1; end
|
||||
`INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
`INST_FPU_U2F: begin core_select = FPU_CVT; is_utof = 1; end
|
||||
`INST_FPU_F2F: begin core_select = FPU_CVT; is_f2f = 1; end
|
||||
INST_FPU_ADD: begin core_select = FPU_FMA; is_fadd = ~i_fmt; is_fsub = i_fmt; end
|
||||
INST_FPU_MADD: begin core_select = FPU_FMA; is_fmadd = ~i_fmt; is_fmsub = i_fmt; end
|
||||
INST_FPU_NMADD: begin core_select = FPU_FMA; is_fnmadd = ~i_fmt; is_fnmsub = i_fmt; end
|
||||
INST_FPU_MUL: begin core_select = FPU_FMA; is_fmul = 1; end
|
||||
INST_FPU_DIV: begin core_select = FPU_DIVSQRT; is_div = 1; end
|
||||
INST_FPU_SQRT: begin core_select = FPU_DIVSQRT; end
|
||||
INST_FPU_CMP: begin core_select = FPU_NCP; is_fcmp = 1; end
|
||||
INST_FPU_F2I: begin core_select = FPU_CVT; is_ftoi = 1; end
|
||||
INST_FPU_F2U: begin core_select = FPU_CVT; is_ftou = 1; end
|
||||
INST_FPU_I2F: begin core_select = FPU_CVT; is_itof = 1; end
|
||||
INST_FPU_U2F: begin core_select = FPU_CVT; is_utof = 1; end
|
||||
INST_FPU_F2F: begin core_select = FPU_CVT; is_f2f = 1; end
|
||||
default: begin core_select = FPU_NCP; end
|
||||
endcase
|
||||
end
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
|
||||
`ifdef FPU_DSP
|
||||
|
||||
module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
||||
module VX_fpu_dsp import VX_gpu_pkg::*, VX_fpu_pkg::*; #(
|
||||
parameter NUM_LANES = 4,
|
||||
parameter TAG_WIDTH = 4,
|
||||
parameter OUT_BUF = 0
|
||||
|
@ -30,9 +30,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FMT_BITS-1:0] fmt,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FPU_BITS-1:0] op_type,
|
||||
input wire [INST_FMT_BITS-1:0] fmt,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] dataa,
|
||||
input wire [NUM_LANES-1:0][`XLEN-1:0] datab,
|
||||
|
@ -54,7 +54,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
localparam NUM_FPCORES = 4;
|
||||
localparam FPCORES_BITS = `LOG2UP(NUM_FPCORES);
|
||||
|
||||
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + `INST_FPU_BITS + `INST_FMT_BITS + `INST_FRM_BITS + 3 * (NUM_LANES * 32);
|
||||
localparam REQ_DATAW = NUM_LANES + TAG_WIDTH + INST_FPU_BITS + INST_FMT_BITS + INST_FRM_BITS + 3 * (NUM_LANES * 32);
|
||||
localparam RSP_DATAW = (NUM_LANES * 32) + 1 + $bits(fflags_t) + TAG_WIDTH;
|
||||
|
||||
`UNUSED_VAR (fmt)
|
||||
|
@ -65,9 +65,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0] per_core_mask_in;
|
||||
wire [NUM_FPCORES-1:0][TAG_WIDTH-1:0] per_core_tag_in;
|
||||
wire [NUM_FPCORES-1:0][`INST_FPU_BITS-1:0] per_core_op_type;
|
||||
wire [NUM_FPCORES-1:0][`INST_FMT_BITS-1:0] per_core_fmt;
|
||||
wire [NUM_FPCORES-1:0][`INST_FRM_BITS-1:0] per_core_frm;
|
||||
wire [NUM_FPCORES-1:0][INST_FPU_BITS-1:0] per_core_op_type;
|
||||
wire [NUM_FPCORES-1:0][INST_FMT_BITS-1:0] per_core_fmt;
|
||||
wire [NUM_FPCORES-1:0][INST_FRM_BITS-1:0] per_core_frm;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_dataa;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datab;
|
||||
wire [NUM_FPCORES-1:0][NUM_LANES-1:0][31:0] per_core_datac;
|
||||
|
@ -98,6 +98,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_stream_switch #(
|
||||
.DATAW (REQ_DATAW),
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (NUM_FPCORES)
|
||||
) req_switch (
|
||||
.clk (clk),
|
||||
|
@ -163,9 +164,9 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
|
||||
wire [1:0][NUM_LANES-1:0] div_sqrt_mask_in;
|
||||
wire [1:0][TAG_WIDTH-1:0] div_sqrt_tag_in;
|
||||
wire [1:0][`INST_FPU_BITS-1:0] div_sqrt_op_type;
|
||||
wire [1:0][`INST_FMT_BITS-1:0] div_sqrt_fmt;
|
||||
wire [1:0][`INST_FRM_BITS-1:0] div_sqrt_frm;
|
||||
wire [1:0][INST_FPU_BITS-1:0] div_sqrt_op_type;
|
||||
wire [1:0][INST_FMT_BITS-1:0] div_sqrt_fmt;
|
||||
wire [1:0][INST_FRM_BITS-1:0] div_sqrt_frm;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_dataa;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datab;
|
||||
wire [1:0][NUM_LANES-1:0][31:0] div_sqrt_datac;
|
||||
|
@ -198,6 +199,7 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
|
||||
VX_stream_switch #(
|
||||
.DATAW (REQ_DATAW),
|
||||
.NUM_INPUTS (1),
|
||||
.NUM_OUTPUTS (2)
|
||||
) div_sqrt_req_switch (
|
||||
.clk (clk),
|
||||
|
@ -333,12 +335,12 @@ module VX_fpu_dsp import VX_fpu_pkg::*; #(
|
|||
|
||||
// NCP core ///////////////////////////////////////////////////////////////
|
||||
|
||||
wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == `INST_FPU_CMP)
|
||||
|| `INST_FPU_IS_CLASS(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
|
||||
|| `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
|
||||
wire ncp_ret_int_in = (per_core_op_type[FPU_NCP] == INST_FPU_CMP)
|
||||
|| inst_fpu_is_class(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP])
|
||||
|| inst_fpu_is_mvxw(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
|
||||
wire ncp_ret_int_out;
|
||||
|
||||
wire ncp_ret_sext_in = `INST_FPU_IS_MVXW(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
|
||||
wire ncp_ret_sext_in = inst_fpu_is_mvxw(per_core_op_type[FPU_NCP], per_core_frm[FPU_NCP]);
|
||||
wire ncp_ret_sext_out;
|
||||
|
||||
VX_fpu_ncp #(
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire is_madd,
|
||||
input wire is_sub,
|
||||
|
@ -49,7 +49,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam DATAW = 3 * 32 + `INST_FRM_BITS;
|
||||
localparam DATAW = 3 * 32 + INST_FRM_BITS;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
|
||||
|
@ -90,7 +90,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
|
|||
assign data_in[i][0 +: 32] = a[i];
|
||||
assign data_in[i][32 +: 32] = b[i];
|
||||
assign data_in[i][64 +: 32] = c[i];
|
||||
assign data_in[i][96 +: `INST_FRM_BITS] = frm;
|
||||
assign data_in[i][96 +: INST_FRM_BITS] = frm;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
|
@ -184,7 +184,7 @@ module VX_fpu_fma import VX_fpu_pkg::*; #(
|
|||
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
|
||||
{32'hffffffff, pe_data_in[i][32 +: 32]}, // b
|
||||
{32'hffffffff, pe_data_in[i][64 +: 32]}, // c
|
||||
pe_data_in[0][96 +: `INST_FRM_BITS], // frm
|
||||
pe_data_in[0][96 +: INST_FRM_BITS], // frm
|
||||
r,
|
||||
f
|
||||
);
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
`ifdef FPU_FPNEW
|
||||
|
||||
module VX_fpu_fpnew
|
||||
import VX_gpu_pkg::*;
|
||||
import VX_fpu_pkg::*;
|
||||
import fpnew_pkg::*;
|
||||
import cf_math_pkg::*;
|
||||
|
|
|
@ -30,8 +30,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FPU_BITS-1:0] op_type,
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FPU_BITS-1:0] op_type,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
input wire [NUM_LANES-1:0][31:0] datab,
|
||||
|
@ -45,7 +45,7 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam DATAW = 2 * 32 + `INST_FRM_BITS + `INST_FPU_BITS;
|
||||
localparam DATAW = 2 * 32 + INST_FRM_BITS + INST_FPU_BITS;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
|
||||
|
@ -60,8 +60,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
|
|||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
|
||||
assign data_in[i][0 +: 32] = dataa[i];
|
||||
assign data_in[i][32 +: 32] = datab[i];
|
||||
assign data_in[i][64 +: `INST_FRM_BITS] = frm;
|
||||
assign data_in[i][64 + `INST_FRM_BITS +: `INST_FPU_BITS] = op_type;
|
||||
assign data_in[i][64 +: INST_FRM_BITS] = frm;
|
||||
assign data_in[i][64 + INST_FRM_BITS +: INST_FPU_BITS] = op_type;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
|
@ -104,8 +104,8 @@ module VX_fpu_ncp import VX_fpu_pkg::*; #(
|
|||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (pe_enable),
|
||||
.frm (pe_data_in[0][64 +: `INST_FRM_BITS]),
|
||||
.op_type (pe_data_in[0][64 + `INST_FRM_BITS +: `INST_FPU_BITS]),
|
||||
.frm (pe_data_in[0][64 +: INST_FRM_BITS]),
|
||||
.op_type (pe_data_in[0][64 + INST_FRM_BITS +: INST_FPU_BITS]),
|
||||
.dataa (pe_data_in[i][0 +: 32]),
|
||||
.datab (pe_data_in[i][32 +: 32]),
|
||||
.result (pe_data_out[i][0 +: 32]),
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -25,7 +25,7 @@ typedef struct packed {
|
|||
logic is_inf;
|
||||
logic is_nan;
|
||||
logic is_quiet;
|
||||
logic is_signaling;
|
||||
logic is_signaling;
|
||||
} fclass_t;
|
||||
|
||||
typedef struct packed {
|
||||
|
|
|
@ -30,7 +30,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
|
|||
|
||||
input wire [TAG_WIDTH-1:0] tag_in,
|
||||
|
||||
input wire [`INST_FRM_BITS-1:0] frm,
|
||||
input wire [INST_FRM_BITS-1:0] frm,
|
||||
|
||||
input wire [NUM_LANES-1:0][31:0] dataa,
|
||||
output wire [NUM_LANES-1:0][31:0] result,
|
||||
|
@ -43,7 +43,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
|
|||
input wire ready_out,
|
||||
output wire valid_out
|
||||
);
|
||||
localparam DATAW = 32 + `INST_FRM_BITS;
|
||||
localparam DATAW = 32 + INST_FRM_BITS;
|
||||
|
||||
wire [NUM_LANES-1:0][DATAW-1:0] data_in;
|
||||
|
||||
|
@ -57,7 +57,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
|
|||
|
||||
for (genvar i = 0; i < NUM_LANES; ++i) begin : g_data_in
|
||||
assign data_in[i][0 +: 32] = dataa[i];
|
||||
assign data_in[i][32 +: `INST_FRM_BITS] = frm;
|
||||
assign data_in[i][32 +: INST_FRM_BITS] = frm;
|
||||
end
|
||||
|
||||
VX_pe_serializer #(
|
||||
|
@ -144,7 +144,7 @@ module VX_fpu_sqrt import VX_fpu_pkg::*; #(
|
|||
pe_enable,
|
||||
int'(0),
|
||||
{32'hffffffff, pe_data_in[i][0 +: 32]}, // a
|
||||
pe_data_in[0][32 +: `INST_FRM_BITS], // frm
|
||||
pe_data_in[0][32 +: INST_FRM_BITS], // frm
|
||||
r,
|
||||
f
|
||||
);
|
||||
|
|
|
@ -13,12 +13,12 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_branch_ctl_if ();
|
||||
interface VX_branch_ctl_if import VX_gpu_pkg::*; ();
|
||||
|
||||
wire valid;
|
||||
wire [`NW_WIDTH-1:0] wid;
|
||||
wire [NW_WIDTH-1:0] wid;
|
||||
wire taken;
|
||||
wire [`PC_BITS-1:0] dest;
|
||||
wire [PC_BITS-1:0] dest;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -13,9 +13,9 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_commit_csr_if ();
|
||||
interface VX_commit_csr_if import VX_gpu_pkg::*; ();
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] instret;
|
||||
wire [PERF_CTR_BITS-1:0] instret;
|
||||
|
||||
modport master (
|
||||
output instret
|
||||
|
|
|
@ -13,18 +13,18 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_commit_if #(
|
||||
parameter NUM_LANES = `NUM_THREADS,
|
||||
parameter PID_WIDTH = `LOG2UP(`NUM_THREADS / NUM_LANES)
|
||||
interface VX_commit_if import VX_gpu_pkg::*; #(
|
||||
parameter NUM_LANES = `SIMD_WIDTH,
|
||||
parameter PID_WIDTH = `LOG2UP(`SIMD_WIDTH / NUM_LANES)
|
||||
) ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [`NW_WIDTH-1:0] wid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [NW_WIDTH-1:0] wid;
|
||||
logic [NUM_LANES-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [NR_BITS-1:0] rd;
|
||||
logic [NUM_LANES-1:0][`XLEN-1:0] data;
|
||||
logic [PID_WIDTH-1:0] pid;
|
||||
logic sop;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -13,11 +13,11 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_dcr_bus_if ();
|
||||
interface VX_dcr_bus_if import VX_gpu_pkg::*; ();
|
||||
|
||||
wire write_valid;
|
||||
wire [`VX_DCR_ADDR_WIDTH-1:0] write_addr;
|
||||
wire [`VX_DCR_DATA_WIDTH-1:0] write_data;
|
||||
wire write_valid;
|
||||
wire [VX_DCR_ADDR_WIDTH-1:0] write_addr;
|
||||
wire [VX_DCR_DATA_WIDTH-1:0] write_data;
|
||||
|
||||
modport master (
|
||||
output write_valid,
|
||||
|
|
|
@ -14,27 +14,23 @@
|
|||
`include "VX_define.vh"
|
||||
|
||||
interface VX_decode_if import VX_gpu_pkg::*; #(
|
||||
parameter NUM_WARPS = `NUM_WARPS,
|
||||
parameter NW_WIDTH = `LOG2UP(NUM_WARPS)
|
||||
parameter NUM_WARPS = `NUM_WARPS
|
||||
);
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [NW_WIDTH-1:0] wid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [`LOG2UP(NUM_WARPS)-1:0] wid;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [EX_BITS-1:0] ex_type;
|
||||
logic [INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NR_BITS-1:0] rs1;
|
||||
logic [`NR_BITS-1:0] rs2;
|
||||
logic [`NR_BITS-1:0] rs3;
|
||||
logic [`REG_EXT_BITS-1:0] rd_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs1_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs2_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs3_ext;
|
||||
logic [NUM_SRC_OPDS-1:0] used_rs;
|
||||
reg_idx_t rd;
|
||||
reg_idx_t rs1;
|
||||
reg_idx_t rs2;
|
||||
reg_idx_t rs3;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -13,11 +13,11 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_decode_sched_if ();
|
||||
interface VX_decode_sched_if import VX_gpu_pkg::*; ();
|
||||
|
||||
wire valid;
|
||||
wire unlock;
|
||||
wire [`NW_WIDTH-1:0] wid;
|
||||
wire [NW_WIDTH-1:0] wid;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
|
|
|
@ -16,18 +16,18 @@
|
|||
interface VX_dispatch_if import VX_gpu_pkg::*; ();
|
||||
// warning: this layout should not be modified without updating VX_dispatch_unit!!!
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [ISSUE_WIS_W-1:0] wis;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`INST_ALU_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NT_WIDTH-1:0] tid;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [ISSUE_WIS_W-1:0] wis;
|
||||
logic [`SIMD_WIDTH-1:0] tmask;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [INST_ALU_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [NR_BITS-1:0] rd;
|
||||
logic [NT_WIDTH-1:0] tid;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs1_data;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs2_data;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs3_data;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -15,18 +15,18 @@
|
|||
|
||||
interface VX_execute_if import VX_gpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter PID_WIDTH = `LOG2UP(`NUM_THREADS / NUM_LANES)
|
||||
parameter PID_WIDTH = `LOG2UP(`SIMD_WIDTH / NUM_LANES)
|
||||
);
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [`NW_WIDTH-1:0] wid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [NW_WIDTH-1:0] wid;
|
||||
logic [NUM_LANES-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`INST_ALU_BITS-1:0] op_type;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [INST_ALU_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NT_WIDTH-1:0] tid;
|
||||
logic [NR_BITS-1:0] rd;
|
||||
logic [NT_WIDTH-1:0] tid;
|
||||
logic [NUM_LANES-1:0][`XLEN-1:0] rs1_data;
|
||||
logic [NUM_LANES-1:0][`XLEN-1:0] rs2_data;
|
||||
logic [NUM_LANES-1:0][`XLEN-1:0] rs3_data;
|
||||
|
|
|
@ -13,13 +13,13 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_fetch_if ();
|
||||
interface VX_fetch_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [`NW_WIDTH-1:0] wid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [NW_WIDTH-1:0] wid;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [31:0] instr;
|
||||
} data_t;
|
||||
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -13,14 +13,14 @@
|
|||
|
||||
`include "VX_fpu_define.vh"
|
||||
|
||||
interface VX_fpu_csr_if import VX_fpu_pkg::*; ();
|
||||
interface VX_fpu_csr_if import VX_gpu_pkg::*, VX_fpu_pkg::*; ();
|
||||
|
||||
wire write_enable;
|
||||
wire [`NW_WIDTH-1:0] write_wid;
|
||||
wire [NW_WIDTH-1:0] write_wid;
|
||||
fflags_t write_fflags;
|
||||
|
||||
wire [`NW_WIDTH-1:0] read_wid;
|
||||
wire [`INST_FRM_BITS-1:0] read_frm;
|
||||
wire [NW_WIDTH-1:0] read_wid;
|
||||
wire [INST_FRM_BITS-1:0] read_frm;
|
||||
|
||||
modport master (
|
||||
output write_enable,
|
||||
|
@ -35,7 +35,7 @@ interface VX_fpu_csr_if import VX_fpu_pkg::*; ();
|
|||
input write_enable,
|
||||
input write_wid,
|
||||
input write_fflags,
|
||||
|
||||
|
||||
input read_wid,
|
||||
output read_frm
|
||||
);
|
|
@ -16,21 +16,18 @@
|
|||
interface VX_ibuffer_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [EX_BITS-1:0] ex_type;
|
||||
logic [INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NR_BITS-1:0] rs1;
|
||||
logic [`NR_BITS-1:0] rs2;
|
||||
logic [`NR_BITS-1:0] rs3;
|
||||
logic [`REG_EXT_BITS-1:0] rd_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs1_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs2_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs3_ext;
|
||||
logic [NUM_SRC_OPDS-1:0] used_rs;
|
||||
reg_idx_t rd;
|
||||
reg_idx_t rs1;
|
||||
reg_idx_t rs2;
|
||||
reg_idx_t rs3;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
55
hw/rtl/interfaces/VX_opc_if.sv
Normal file
55
hw/rtl/interfaces/VX_opc_if.sv
Normal file
|
@ -0,0 +1,55 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// VX_opc_if: request/response interface carrying a register-read request
// (req_*) in one direction and the resulting per-lane operand values (rsp_*)
// in the other. The request channel is valid/ready handshaked; the response
// channel declares only a valid signal (no rsp_ready exists in this interface).
// NOTE(review): "opc" presumably stands for operand collector - confirm against the users of this interface.
interface VX_opc_if import VX_gpu_pkg::*; ();
|
||||
|
||||
// Request payload: identifies which register to read and for which operand slot.
typedef struct packed {
|
||||
logic [1:0] opd_id; // 2-bit operand identifier; also present in rsp_data_t - presumably echoed back so the requester can match responses (TODO confirm)
|
||||
logic [SIMD_IDX_W-1:0] sid; // presumably the SIMD sub-group index - width comes from VX_gpu_pkg (TODO confirm meaning)
|
||||
logic [ISSUE_WIS_W-1:0] wis; // presumably the warp index within the issue slice (TODO confirm meaning)
|
||||
logic [NR_BITS-1:0] reg_id; // register identifier, NR_BITS wide
|
||||
} req_data_t;
|
||||
|
||||
// Response payload: one XLEN-wide value per SIMD lane, tagged with the operand identifier.
typedef struct packed {
|
||||
logic [1:0] opd_id; // 2-bit operand identifier (same width as in req_data_t)
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] value; // SIMD_WIDTH entries of XLEN bits each
|
||||
} rsp_data_t;
|
||||
|
||||
// Request channel (valid/ready handshake).
logic req_valid;
|
||||
req_data_t req_data;
|
||||
logic req_ready;
|
||||
|
||||
// Response channel (valid only; the interface has no back-pressure signal for responses).
logic rsp_valid;
|
||||
rsp_data_t rsp_data;
|
||||
|
||||
// master: drives the request, observes request-ready and the response.
modport master (
|
||||
output req_valid,
|
||||
output req_data,
|
||||
input req_ready,
|
||||
|
||||
input rsp_valid,
|
||||
input rsp_data
|
||||
);
|
||||
|
||||
// slave: mirror of master - receives the request, drives request-ready and the response.
modport slave (
|
||||
input req_valid,
|
||||
input req_data,
|
||||
output req_ready,
|
||||
|
||||
output rsp_valid,
|
||||
output rsp_data
|
||||
);
|
||||
|
||||
endinterface
|
|
@ -16,18 +16,18 @@
|
|||
interface VX_operands_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [ISSUE_WIS_W-1:0] wis;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [ISSUE_WIS_W-1:0] wis;
|
||||
logic [`SIMD_WIDTH-1:0] tmask;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [EX_BITS-1:0] ex_type;
|
||||
logic [INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [NR_BITS-1:0] rd;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs1_data;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs2_data;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] rs3_data;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -13,15 +13,15 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_sched_csr_if ();
|
||||
interface VX_sched_csr_if import VX_gpu_pkg::*; ();
|
||||
|
||||
wire [`PERF_CTR_BITS-1:0] cycles;
|
||||
wire [PERF_CTR_BITS-1:0] cycles;
|
||||
wire [`NUM_WARPS-1:0] active_warps;
|
||||
wire [`NUM_WARPS-1:0][`NUM_THREADS-1:0] thread_masks;
|
||||
wire alm_empty;
|
||||
wire [`NW_WIDTH-1:0] alm_empty_wid;
|
||||
wire [NW_WIDTH-1:0] alm_empty_wid;
|
||||
wire unlock_warp;
|
||||
wire [`NW_WIDTH-1:0] unlock_wid;
|
||||
wire [NW_WIDTH-1:0] unlock_wid;
|
||||
|
||||
modport master (
|
||||
output cycles,
|
||||
|
@ -29,7 +29,7 @@ interface VX_sched_csr_if ();
|
|||
output thread_masks,
|
||||
input alm_empty_wid,
|
||||
output alm_empty,
|
||||
input unlock_wid,
|
||||
input unlock_wid,
|
||||
input unlock_warp
|
||||
);
|
||||
|
||||
|
|
|
@ -13,13 +13,13 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_schedule_if ();
|
||||
interface VX_schedule_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [`NW_WIDTH-1:0] wid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [NW_WIDTH-1:0] wid;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -16,22 +16,19 @@
|
|||
interface VX_scoreboard_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [ISSUE_WIS_W-1:0] wis;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`EX_BITS-1:0] ex_type;
|
||||
logic [`INST_OP_BITS-1:0] op_type;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [EX_BITS-1:0] ex_type;
|
||||
logic [INST_OP_BITS-1:0] op_type;
|
||||
op_args_t op_args;
|
||||
logic wb;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NR_BITS-1:0] rs1;
|
||||
logic [`NR_BITS-1:0] rs2;
|
||||
logic [`NR_BITS-1:0] rs3;
|
||||
logic [`REG_EXT_BITS-1:0] rd_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs1_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs2_ext;
|
||||
logic [`REG_EXT_BITS-1:0] rs3_ext;
|
||||
logic [NUM_SRC_OPDS-1:0] used_rs;
|
||||
reg_idx_t rd;
|
||||
reg_idx_t rs1;
|
||||
reg_idx_t rs2;
|
||||
reg_idx_t rs3;
|
||||
} data_t;
|
||||
|
||||
logic valid;
|
||||
|
|
|
@ -16,15 +16,15 @@
|
|||
interface VX_warp_ctl_if import VX_gpu_pkg::*; ();
|
||||
|
||||
wire valid;
|
||||
wire [`NW_WIDTH-1:0] wid;
|
||||
wire [NW_WIDTH-1:0] wid;
|
||||
tmc_t tmc;
|
||||
wspawn_t wspawn;
|
||||
split_t split;
|
||||
join_t sjoin;
|
||||
barrier_t barrier;
|
||||
|
||||
wire [`NW_WIDTH-1:0] dvstack_wid;
|
||||
wire [`DV_STACK_SIZEW-1:0] dvstack_ptr;
|
||||
wire [NW_WIDTH-1:0] dvstack_wid;
|
||||
wire [DV_STACK_SIZEW-1:0] dvstack_ptr;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
|
|
|
@ -16,12 +16,13 @@
|
|||
interface VX_writeback_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`UUID_WIDTH-1:0] uuid;
|
||||
logic [UUID_WIDTH-1:0] uuid;
|
||||
logic [ISSUE_WIS_W-1:0] wis;
|
||||
logic [`NUM_THREADS-1:0] tmask;
|
||||
logic [`PC_BITS-1:0] PC;
|
||||
logic [`NR_BITS-1:0] rd;
|
||||
logic [`NUM_THREADS-1:0][`XLEN-1:0] data;
|
||||
logic [SIMD_IDX_W-1:0] sid;
|
||||
logic [`SIMD_WIDTH-1:0] tmask;
|
||||
logic [PC_BITS-1:0] PC;
|
||||
logic [NR_BITS-1:0] rd;
|
||||
logic [`SIMD_WIDTH-1:0][`XLEN-1:0] data;
|
||||
logic sop;
|
||||
logic eop;
|
||||
} data_t;
|
||||
|
|
|
@ -36,13 +36,17 @@ module VX_stream_switch #(
|
|||
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out
|
||||
);
|
||||
logic [NUM_OUTPUTS-1:0] valid_out_w;
|
||||
logic [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
|
||||
logic [NUM_OUTPUTS-1:0] ready_out_w;
|
||||
|
||||
if (NUM_INPUTS > NUM_OUTPUTS) begin : g_input_select
|
||||
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
|
||||
wire [NUM_REQS-1:0] valid_in_w;
|
||||
wire [NUM_REQS-1:0][DATAW-1:0] data_in_w;
|
||||
wire [NUM_REQS-1:0] ready_in_w;
|
||||
logic [NUM_REQS-1:0] valid_in_w;
|
||||
logic [NUM_REQS-1:0][DATAW-1:0] data_in_w;
|
||||
logic [NUM_REQS-1:0] ready_in_w;
|
||||
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
|
||||
localparam i = r * NUM_OUTPUTS + o;
|
||||
|
@ -57,20 +61,15 @@ module VX_stream_switch #(
|
|||
end
|
||||
end
|
||||
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in_w[sel_in[o]]),
|
||||
.ready_in (ready_in_w[sel_in[o]]),
|
||||
.data_in (data_in_w[sel_in[o]]),
|
||||
.data_out (data_out[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
assign valid_out_w[o] = valid_in_w[sel_in[o]];
|
||||
assign data_out_w[o] = data_in_w[sel_in[o]];
|
||||
|
||||
always @(*) begin
|
||||
ready_in_w = '0;
|
||||
for (integer o = 0; o < NUM_OUTPUTS; ++o) begin
|
||||
ready_in_w[sel_in[o]] = ready_out_w[o];
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
end else if (NUM_OUTPUTS > NUM_INPUTS) begin : g_output_select
|
||||
|
@ -79,32 +78,20 @@ module VX_stream_switch #(
|
|||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_out_buf
|
||||
|
||||
wire [NUM_REQS-1:0] ready_out_w;
|
||||
logic [NUM_REQS-1:0] ready_out_s;
|
||||
|
||||
for (genvar r = 0; r < NUM_REQS; ++r) begin : g_r
|
||||
localparam o = r * NUM_INPUTS + i;
|
||||
if (o < NUM_OUTPUTS) begin : g_valid
|
||||
wire valid_out_w = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(r));
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_out_w),
|
||||
.ready_in (ready_out_w[r]),
|
||||
.data_in (data_in[i]),
|
||||
.data_out (data_out[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
assign valid_out_w[o] = valid_in[i] && (sel_in[i] == LOG_NUM_REQS'(r));
|
||||
assign data_out_w[o] = data_in[i];
|
||||
assign ready_out_s[r] = ready_out_w[o];
|
||||
end else begin : g_padding
|
||||
assign ready_out_w[r] = '0;
|
||||
assign ready_out_s[r] = '0;
|
||||
end
|
||||
end
|
||||
|
||||
assign ready_in[i] = ready_out_w[sel_in[i]];
|
||||
assign ready_in[i] = ready_out_s[sel_in[i]];
|
||||
end
|
||||
|
||||
end else begin : g_passthru
|
||||
|
@ -114,22 +101,28 @@ module VX_stream_switch #(
|
|||
`UNUSED_VAR (sel_in)
|
||||
|
||||
for (genvar i = 0; i < NUM_OUTPUTS; ++i) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_in[i]),
|
||||
.ready_in (ready_in[i]),
|
||||
.data_in (data_in[i]),
|
||||
.data_out (data_out[i]),
|
||||
.valid_out (valid_out[i]),
|
||||
.ready_out (ready_out[i])
|
||||
);
|
||||
assign valid_out_w[i] = valid_in[i];
|
||||
assign data_out_w[i] = data_in[i];
|
||||
assign ready_in[i] = ready_out_w[i];
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_out_w[o]),
|
||||
.data_in (data_out_w[o]),
|
||||
.ready_in (ready_out_w[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.data_out (data_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
end
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
||||
|
|
93
hw/rtl/libs/VX_stream_xpoint.sv
Normal file
93
hw/rtl/libs/VX_stream_xpoint.sv
Normal file
|
@ -0,0 +1,93 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
`TRACING_OFF
|
||||
// VX_stream_xpoint: routes NUM_INPUTS valid/ready streams to NUM_OUTPUTS
// valid/ready streams according to sel_in, with one VX_elastic_buffer per output.
//
// OUT_DRIVEN != 0 : sel_in has one entry per OUTPUT; entry o names the input
//                   that output o reads from.
// OUT_DRIVEN == 0 : sel_in has one entry per INPUT; entry i names the output
//                   that input i is sent to.
//
// No arbitration is performed in this module: when several selectors point at
// the same destination the procedural loops below simply let the highest index win.
// NOTE(review): presumably the caller guarantees conflict-free selects - confirm.
module VX_stream_xpoint #(
|
||||
parameter NUM_INPUTS = 1, // number of input streams
|
||||
parameter NUM_OUTPUTS = 1, // number of output streams
|
||||
parameter DATAW = 1, // payload width in bits
|
||||
parameter OUT_DRIVEN = 0, // non-zero: selectors are per-output; zero: selectors are per-input
|
||||
parameter OUT_BUF = 0, // output buffer configuration, decoded by `TO_OUT_BUF_SIZE / `TO_OUT_BUF_REG below
|
||||
parameter SEL_SRC = OUT_DRIVEN ? NUM_OUTPUTS : NUM_INPUTS, // number of selector entries (derived; side that owns the select)
|
||||
parameter SEL_DST = OUT_DRIVEN ? NUM_INPUTS : NUM_OUTPUTS // number of targets each selector can name (derived)
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
input wire [SEL_SRC-1:0][`LOG2UP(SEL_DST)-1:0] sel_in, // one target index per selector entry
|
||||
|
||||
input wire [NUM_INPUTS-1:0] valid_in,
|
||||
input wire [NUM_INPUTS-1:0][DATAW-1:0] data_in,
|
||||
output wire [NUM_INPUTS-1:0] ready_in,
|
||||
|
||||
output wire [NUM_OUTPUTS-1:0] valid_out,
|
||||
output wire [NUM_OUTPUTS-1:0][DATAW-1:0] data_out,
|
||||
input wire [NUM_OUTPUTS-1:0] ready_out
|
||||
);
|
||||
// Pre-buffer signals: the routing logic drives valid/data, and these feed the per-output buffers at the bottom.
logic [NUM_OUTPUTS-1:0] valid_out_w;
|
||||
logic [NUM_OUTPUTS-1:0][DATAW-1:0] data_out_w;
|
||||
logic [NUM_OUTPUTS-1:0] ready_out_w; // connected to each buffer's ready_in port - presumably driven by the buffer (TODO confirm VX_elastic_buffer port direction)
|
||||
|
||||
if (OUT_DRIVEN) begin : g_output_driven
|
||||
|
||||
// Each output muxes valid and data from the input it selects.
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
assign valid_out_w[o] = valid_in[sel_in[o]];
|
||||
assign data_out_w[o] = data_in[sel_in[o]];
|
||||
end
|
||||
|
||||
logic [NUM_INPUTS-1:0] ready_in_w;
|
||||
// Scatter each output's ready back to the input it selected; unselected inputs stay at 0.
always @(*) begin
|
||||
ready_in_w = '0;
|
||||
for (integer o = 0; o < NUM_OUTPUTS; ++o) begin
|
||||
ready_in_w[sel_in[o]] = ready_out_w[o]; // NOTE(review): if two outputs select the same input, the higher-indexed output's ready overwrites the lower one
|
||||
end
|
||||
end
|
||||
assign ready_in = ready_in_w;
|
||||
|
||||
end else begin: g_input_driven
|
||||
|
||||
// Scatter each input's valid and data to the output it selects; untargeted outputs get valid=0 and data='x.
always @(*) begin
|
||||
valid_out_w = '0;
|
||||
data_out_w = 'x;
|
||||
for (integer i = 0; i < NUM_INPUTS; ++i) begin
|
||||
valid_out_w[sel_in[i]] = valid_in[i]; // NOTE(review): written unconditionally, so a higher-indexed input with valid_in=0 pointing at the same output clears a lower-indexed valid - confirm callers never alias selects
|
||||
data_out_w[sel_in[i]] = data_in[i];
|
||||
end
|
||||
end
|
||||
|
||||
// Each input sees the ready of the output it targets.
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_ready_in
|
||||
assign ready_in[i] = ready_out_w[sel_in[i]];
|
||||
end
|
||||
end
|
||||
|
||||
// One elastic buffer per output between the routed signals and the module outputs.
for (genvar o = 0; o < NUM_OUTPUTS; ++o) begin : g_out_buf
|
||||
VX_elastic_buffer #(
|
||||
.DATAW (DATAW),
|
||||
.SIZE (`TO_OUT_BUF_SIZE(OUT_BUF)),
|
||||
.OUT_REG (`TO_OUT_BUF_REG(OUT_BUF))
|
||||
) out_buf (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (valid_out_w[o]),
|
||||
.data_in (data_out_w[o]),
|
||||
.ready_in (ready_out_w[o]),
|
||||
.valid_out (valid_out[o]),
|
||||
.data_out (data_out[o]),
|
||||
.ready_out (ready_out[o])
|
||||
);
|
||||
end
|
||||
|
||||
endmodule
|
||||
`TRACING_ON
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_gbar_arb #(
|
||||
module VX_gbar_arb import VX_gpu_pkg::*; #(
|
||||
parameter NUM_REQS = 1,
|
||||
parameter OUT_BUF = 0,
|
||||
parameter `STRING ARBITER = "R"
|
||||
|
@ -25,7 +25,7 @@ module VX_gbar_arb #(
|
|||
VX_gbar_bus_if.master bus_out_if
|
||||
);
|
||||
|
||||
localparam REQ_DATAW = `NB_WIDTH + `NC_WIDTH + `NC_WIDTH;
|
||||
localparam REQ_DATAW = NB_WIDTH + NC_WIDTH + NC_WIDTH;
|
||||
|
||||
// arbitrate request
|
||||
|
||||
|
@ -60,7 +60,7 @@ module VX_gbar_arb #(
|
|||
// broadcast response
|
||||
|
||||
reg rsp_valid;
|
||||
reg [`NB_WIDTH-1:0] rsp_data;
|
||||
reg [NB_WIDTH-1:0] rsp_data;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
|
|
@ -13,16 +13,16 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_gbar_bus_if ();
|
||||
interface VX_gbar_bus_if import VX_gpu_pkg::*; ();
|
||||
|
||||
typedef struct packed {
|
||||
logic [`NB_WIDTH-1:0] id;
|
||||
logic [`NC_WIDTH-1:0] size_m1;
|
||||
logic [`NC_WIDTH-1:0] core_id;
|
||||
logic [NB_WIDTH-1:0] id;
|
||||
logic [NC_WIDTH-1:0] size_m1;
|
||||
logic [NC_WIDTH-1:0] core_id;
|
||||
} req_data_t;
|
||||
|
||||
typedef struct packed {
|
||||
logic [`NB_WIDTH-1:0] id;
|
||||
logic [NB_WIDTH-1:0] id;
|
||||
} rsp_data_t;
|
||||
|
||||
logic req_valid;
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_gbar_unit #(
|
||||
module VX_gbar_unit import VX_gpu_pkg::*; #(
|
||||
parameter `STRING INSTANCE_ID = ""
|
||||
) (
|
||||
input wire clk,
|
||||
|
@ -23,7 +23,7 @@ module VX_gbar_unit #(
|
|||
);
|
||||
`UNUSED_SPARAM (INSTANCE_ID)
|
||||
|
||||
reg [`NB_WIDTH-1:0][`NUM_CORES-1:0] barrier_masks;
|
||||
reg [NB_WIDTH-1:0][`NUM_CORES-1:0] barrier_masks;
|
||||
wire [`CLOG2(`NUM_CORES+1)-1:0] active_barrier_count;
|
||||
wire [`NUM_CORES-1:0] curr_barrier_mask = barrier_masks[gbar_bus_if.req_data.id];
|
||||
|
||||
|
@ -31,7 +31,7 @@ module VX_gbar_unit #(
|
|||
`UNUSED_VAR (active_barrier_count)
|
||||
|
||||
reg rsp_valid;
|
||||
reg [`NB_WIDTH-1:0] rsp_bar_id;
|
||||
reg [NB_WIDTH-1:0] rsp_bar_id;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -42,7 +42,7 @@ module VX_gbar_unit #(
|
|||
rsp_valid <= 0;
|
||||
end
|
||||
if (gbar_bus_if.req_valid) begin
|
||||
if (active_barrier_count[`NC_WIDTH-1:0] == gbar_bus_if.req_data.size_m1) begin
|
||||
if (active_barrier_count[NC_WIDTH-1:0] == gbar_bus_if.req_data.size_m1) begin
|
||||
barrier_masks[gbar_bus_if.req_data.id] <= '0;
|
||||
rsp_bar_id <= gbar_bus_if.req_data.id;
|
||||
rsp_valid <= 1;
|
||||
|
|
|
@ -25,7 +25,7 @@ module VX_lmem_switch import VX_gpu_pkg::*; #(
|
|||
VX_lsu_mem_if.master global_out_if,
|
||||
VX_lsu_mem_if.master local_out_if
|
||||
);
|
||||
localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
|
||||
localparam REQ_DATAW = `NUM_LSU_LANES + 1 + `NUM_LSU_LANES * (LSU_WORD_SIZE + LSU_ADDR_WIDTH + MEM_FLAGS_WIDTH + LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
|
||||
localparam RSP_DATAW = `NUM_LSU_LANES + `NUM_LSU_LANES * (LSU_WORD_SIZE * 8) + LSU_TAG_WIDTH;
|
||||
|
||||
wire [`NUM_LSU_LANES-1:0] is_addr_local_mask;
|
||||
|
@ -33,7 +33,7 @@ module VX_lmem_switch import VX_gpu_pkg::*; #(
|
|||
wire req_local_ready;
|
||||
|
||||
for (genvar i = 0; i < `NUM_LSU_LANES; ++i) begin : g_is_addr_local_mask
|
||||
assign is_addr_local_mask[i] = lsu_in_if.req_data.flags[i][`MEM_REQ_FLAG_LOCAL];
|
||||
assign is_addr_local_mask[i] = lsu_in_if.req_data.flags[i][MEM_REQ_FLAG_LOCAL];
|
||||
end
|
||||
|
||||
wire is_addr_global = | (lsu_in_if.req_data.mask & ~is_addr_local_mask);
|
||||
|
|
|
@ -29,9 +29,6 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
// Size of a word in bytes
|
||||
parameter WORD_SIZE = `XLEN/8,
|
||||
|
||||
// Request debug identifier
|
||||
parameter UUID_WIDTH = 0,
|
||||
|
||||
// Request tag size
|
||||
parameter TAG_WIDTH = 16,
|
||||
|
||||
|
@ -101,7 +98,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
wire [NUM_REQS-1:0] req_ready_in;
|
||||
|
||||
`ifdef PERF_ENABLE
|
||||
wire [`PERF_CTR_BITS-1:0] perf_collisions;
|
||||
wire [PERF_CTR_BITS-1:0] perf_collisions;
|
||||
`endif
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin : g_req_data_in
|
||||
|
@ -120,7 +117,7 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
.NUM_INPUTS (NUM_REQS),
|
||||
.NUM_OUTPUTS (NUM_BANKS),
|
||||
.DATAW (REQ_DATAW),
|
||||
.PERF_CTR_BITS (`PERF_CTR_BITS),
|
||||
.PERF_CTR_BITS (PERF_CTR_BITS),
|
||||
.ARBITER ("P"),
|
||||
.OUT_BUF (3) // output should be registered for the data_store addressing
|
||||
) req_xbar (
|
||||
|
@ -270,9 +267,9 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
`POP_COUNT(perf_writes_per_cycle, perf_writes_per_req);
|
||||
`POP_COUNT(perf_crsp_stall_per_cycle, perf_crsp_stall_per_req);
|
||||
|
||||
reg [`PERF_CTR_BITS-1:0] perf_reads;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_writes;
|
||||
reg [`PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
reg [PERF_CTR_BITS-1:0] perf_reads;
|
||||
reg [PERF_CTR_BITS-1:0] perf_writes;
|
||||
reg [PERF_CTR_BITS-1:0] perf_crsp_stalls;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
|
@ -280,9 +277,9 @@ module VX_local_mem import VX_gpu_pkg::*; #(
|
|||
perf_writes <= '0;
|
||||
perf_crsp_stalls <= '0;
|
||||
end else begin
|
||||
perf_reads <= perf_reads + `PERF_CTR_BITS'(perf_reads_per_cycle);
|
||||
perf_writes <= perf_writes + `PERF_CTR_BITS'(perf_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + `PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
perf_reads <= perf_reads + PERF_CTR_BITS'(perf_reads_per_cycle);
|
||||
perf_writes <= perf_writes + PERF_CTR_BITS'(perf_writes_per_cycle);
|
||||
perf_crsp_stalls <= perf_crsp_stalls + PERF_CTR_BITS'(perf_crsp_stall_per_cycle);
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -47,7 +47,7 @@ module VX_local_mem_top import VX_gpu_pkg::*; #(
|
|||
input wire [NUM_REQS-1:0] mem_req_rw,
|
||||
input wire [NUM_REQS-1:0][WORD_SIZE-1:0] mem_req_byteen,
|
||||
input wire [NUM_REQS-1:0][ADDR_WIDTH-1:0] mem_req_addr,
|
||||
input wire [NUM_REQS-1:0][`MEM_REQ_FLAGS_WIDTH-1:0] mem_req_flags,
|
||||
input wire [NUM_REQS-1:0][MEM_FLAGS_WIDTH-1:0] mem_req_flags,
|
||||
input wire [NUM_REQS-1:0][WORD_SIZE*8-1:0] mem_req_data,
|
||||
input wire [NUM_REQS-1:0][TAG_WIDTH-1:0] mem_req_tag,
|
||||
output wire [NUM_REQS-1:0] mem_req_ready,
|
||||
|
|
|
@ -29,7 +29,7 @@ module VX_lsu_adapter import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if.master mem_bus_if [NUM_LANES]
|
||||
);
|
||||
localparam REQ_ADDR_WIDTH = `MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE);
|
||||
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + DATA_SIZE * 8;
|
||||
localparam REQ_DATA_WIDTH = 1 + DATA_SIZE + REQ_ADDR_WIDTH + MEM_FLAGS_WIDTH + DATA_SIZE * 8;
|
||||
localparam RSP_DATA_WIDTH = DATA_SIZE * 8;
|
||||
|
||||
// handle request unpacking
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_lsu_mem_arb #(
|
||||
module VX_lsu_mem_arb import VX_gpu_pkg::*; #(
|
||||
parameter NUM_INPUTS = 1,
|
||||
parameter NUM_OUTPUTS = 1,
|
||||
parameter NUM_LANES = 1,
|
||||
|
@ -25,7 +25,7 @@ module VX_lsu_mem_arb #(
|
|||
parameter `STRING ARBITER = "R",
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH
|
||||
parameter FLAGS_WIDTH = MEM_FLAGS_WIDTH
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
@ -40,15 +40,15 @@ module VX_lsu_mem_arb #(
|
|||
|
||||
`STATIC_ASSERT ((NUM_INPUTS >= NUM_OUTPUTS), ("invalid parameter: NUM_INPUTS=%0d, NUM_OUTPUTS=%0d", NUM_INPUTS, NUM_OUTPUTS));
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] req_valid_out;
|
||||
wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_OUTPUTS-1:0] req_ready_out;
|
||||
wire [NUM_OUTPUTS-1:0][`UP(LOG_NUM_REQS)-1:0] req_sel_out;
|
||||
|
||||
wire [NUM_INPUTS-1:0] req_valid_in;
|
||||
wire [NUM_INPUTS-1:0][REQ_DATAW-1:0] req_data_in;
|
||||
wire [NUM_INPUTS-1:0] req_ready_in;
|
||||
|
||||
wire [NUM_OUTPUTS-1:0] req_valid_out;
|
||||
wire [NUM_OUTPUTS-1:0][REQ_DATAW-1:0] req_data_out;
|
||||
wire [NUM_OUTPUTS-1:0][`UP(LOG_NUM_REQS)-1:0] req_sel_out;
|
||||
wire [NUM_OUTPUTS-1:0] req_ready_out;
|
||||
|
||||
for (genvar i = 0; i < NUM_INPUTS; ++i) begin : g_req_data_in
|
||||
assign req_valid_in[i] = bus_in_if[i].req_valid;
|
||||
assign req_data_in[i] = bus_in_if[i].req_data;
|
||||
|
|
|
@ -13,14 +13,13 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_lsu_mem_if #(
|
||||
interface VX_lsu_mem_if import VX_gpu_pkg::*; #(
|
||||
parameter NUM_LANES = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
|
||||
parameter FLAGS_WIDTH = MEM_FLAGS_WIDTH,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE),
|
||||
parameter UUID_WIDTH = `UUID_WIDTH
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
|
||||
) ();
|
||||
|
||||
typedef struct packed {
|
||||
|
|
|
@ -13,7 +13,7 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
module VX_mem_arb #(
|
||||
module VX_mem_arb import VX_gpu_pkg::*; #(
|
||||
parameter NUM_INPUTS = 1,
|
||||
parameter NUM_OUTPUTS = 1,
|
||||
parameter DATA_SIZE = 1,
|
||||
|
@ -24,7 +24,7 @@ module VX_mem_arb #(
|
|||
parameter `STRING ARBITER = "R",
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = (MEM_ADDR_WIDTH-`CLOG2(DATA_SIZE)),
|
||||
parameter FLAGS_WIDTH = `MEM_REQ_FLAGS_WIDTH
|
||||
parameter FLAGS_WIDTH = MEM_FLAGS_WIDTH
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
|
|
@ -13,13 +13,12 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_mem_bus_if #(
|
||||
interface VX_mem_bus_if import VX_gpu_pkg::*; #(
|
||||
parameter DATA_SIZE = 1,
|
||||
parameter FLAGS_WIDTH= `MEM_REQ_FLAGS_WIDTH,
|
||||
parameter FLAGS_WIDTH = MEM_FLAGS_WIDTH,
|
||||
parameter TAG_WIDTH = 1,
|
||||
parameter MEM_ADDR_WIDTH = `MEM_ADDR_WIDTH,
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE),
|
||||
parameter UUID_WIDTH = `UUID_WIDTH
|
||||
parameter ADDR_WIDTH = MEM_ADDR_WIDTH - `CLOG2(DATA_SIZE)
|
||||
) ();
|
||||
|
||||
typedef struct packed {
|
||||
|
|
|
@ -35,7 +35,7 @@ module VX_mem_switch import VX_gpu_pkg::*; #(
|
|||
VX_mem_bus_if.master bus_out_if [NUM_OUTPUTS]
|
||||
);
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE);
|
||||
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + `MEM_REQ_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam REQ_DATAW = TAG_WIDTH + ADDR_WIDTH + MEM_FLAGS_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
localparam RSP_DATAW = TAG_WIDTH + DATA_WIDTH;
|
||||
|
||||
// handle requests ////////////////////////////////////////////////////////
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Reference in a new issue