SIMT Stack compression

This commit is contained in:
Blaise Tine 2024-04-30 02:19:32 -07:00
parent 9df25ff48f
commit ca79e69355
13 changed files with 289 additions and 235 deletions

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -44,6 +44,9 @@
`define NR_BITS `CLOG2(`NUM_REGS)
`define DV_STACK_SIZE `UP(`NUM_THREADS-1)
`define DV_STACK_SIZEW `UP(`CLOG2(`DV_STACK_SIZE))
`define PERF_CTR_BITS 44
`ifndef NDEBUG
@ -90,10 +93,10 @@
`define INST_FL 7'b0000111 // float load instruction
`define INST_FS 7'b0100111 // float store instruction
`define INST_FMADD 7'b1000011
`define INST_FMADD 7'b1000011
`define INST_FMSUB 7'b1000111
`define INST_FNMSUB 7'b1001011
`define INST_FNMADD 7'b1001111
`define INST_FNMADD 7'b1001111
`define INST_FCI 7'b1010011 // float common instructions
// Custom extension opcodes
@ -143,8 +146,8 @@
`define INST_BR_EQ 4'b0000
`define INST_BR_NE 4'b0010
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LTU 4'b0100
`define INST_BR_GEU 4'b0110
`define INST_BR_LT 4'b0101
`define INST_BR_GE 4'b0111
`define INST_BR_JAL 4'b1000
@ -184,14 +187,14 @@
`define INST_FMT_HU 3'b101
`define INST_FMT_WU 3'b110
`define INST_LSU_LB 4'b0000
`define INST_LSU_LB 4'b0000
`define INST_LSU_LH 4'b0001
`define INST_LSU_LW 4'b0010
`define INST_LSU_LD 4'b0011 // new for RV64I LD
`define INST_LSU_LBU 4'b0100
`define INST_LSU_LHU 4'b0101
`define INST_LSU_LWU 4'b0110 // new for RV64I LWU
`define INST_LSU_SB 4'b1000
`define INST_LSU_SB 4'b1000
`define INST_LSU_SH 4'b1001
`define INST_LSU_SW 4'b1010
`define INST_LSU_SD 4'b1011 // new for RV64I SD
@ -205,9 +208,9 @@
`define INST_FENCE_D 1'h0
`define INST_FENCE_I 1'h1
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_ADD 4'b0000
`define INST_FPU_SUB 4'b0001
`define INST_FPU_MUL 4'b0010
`define INST_FPU_DIV 4'b0011
`define INST_FPU_SQRT 4'b0100
`define INST_FPU_CMP 4'b0101 // mod: LE=0, LT=1, EQ=2
@ -217,9 +220,9 @@
`define INST_FPU_F2U 4'b1001
`define INST_FPU_I2F 4'b1010
`define INST_FPU_U2F 4'b1011
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_MADD 4'b1100
`define INST_FPU_MSUB 4'b1101
`define INST_FPU_NMSUB 4'b1110
`define INST_FPU_NMADD 4'b1111
`define INST_FPU_BITS 4
`define INST_FPU_IS_W(mod) (mod[4])
@ -227,7 +230,7 @@
`define INST_FPU_IS_MVXW(op, mod) (op == `INST_FPU_MISC && mod == 4)
`define INST_SFU_TMC 4'h0
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_WSPAWN 4'h1
`define INST_SFU_SPLIT 4'h2
`define INST_SFU_JOIN 4'h3
`define INST_SFU_BAR 4'h4
@ -249,7 +252,7 @@
`define CACHE_MEM_TAG_WIDTH(mshr_size, num_banks) \
(`CLOG2(mshr_size) + `CLOG2(num_banks))
`define CACHE_BYPASS_TAG_WIDTH(num_reqs, line_size, word_size, tag_width) \
(`CLOG2(num_reqs) + `CLOG2(line_size / word_size) + tag_width)
@ -259,7 +262,7 @@
///////////////////////////////////////////////////////////////////////////////
`define CACHE_CLUSTER_CORE_ARB_TAG(tag_width, num_inputs, num_caches) \
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
(tag_width + `ARB_SEL_BITS(num_inputs, `UP(num_caches)))
`define CACHE_CLUSTER_MEM_ARB_TAG(tag_width, num_caches) \
(tag_width + `ARB_SEL_BITS(`UP(num_caches), 1))
@ -286,7 +289,7 @@
`define ADDR_TYPE_LOCAL 1
`define ADDR_TYPE_WIDTH (`LMEM_ENABLED + 1)
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_BYTEEN_WIDTH `L3_LINE_SIZE
`define VX_MEM_ADDR_WIDTH (`MEM_ADDR_WIDTH - `CLOG2(`L3_LINE_SIZE))
`define VX_MEM_DATA_WIDTH (`L3_LINE_SIZE * 8)
`define VX_MEM_TAG_WIDTH L3_MEM_TAG_WIDTH

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -39,7 +39,7 @@ package VX_gpu_pkg;
typedef struct packed {
logic valid;
logic is_dvg;
logic [`DV_STACK_SIZEW-1:0] stack_ptr;
} join_t;
typedef struct packed {
@ -100,14 +100,14 @@ package VX_gpu_pkg;
localparam DCACHE_CHANNELS = `UP((`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE);
localparam DCACHE_NUM_REQS = `NUM_LSU_BLOCKS * DCACHE_CHANNELS;
// Core request tag Id bits
// Core request tag Id bits
localparam DCACHE_MERGED_REQS = (`NUM_LSU_LANES * LSU_WORD_SIZE) / DCACHE_WORD_SIZE;
localparam DCACHE_MEM_BATCHES = `CDIV(DCACHE_MERGED_REQS, DCACHE_CHANNELS);
localparam DCACHE_TAG_ID_BITS = (`CLOG2(`LSUQ_OUT_SIZE) + `CLOG2(DCACHE_MEM_BATCHES));
// Core request tag bits
localparam DCACHE_TAG_WIDTH = (`UUID_WIDTH + DCACHE_TAG_ID_BITS);
// Memory request data bits
localparam DCACHE_MEM_DATA_WIDTH = (DCACHE_LINE_SIZE * 8);
@ -127,7 +127,7 @@ package VX_gpu_pkg;
// Block size in bytes
localparam ICACHE_LINE_SIZE = `L1_LINE_SIZE;
// Core request tag Id bits
// Core request tag Id bits
localparam ICACHE_TAG_ID_BITS = `NW_WIDTH;
// Core request tag bits
@ -147,7 +147,7 @@ package VX_gpu_pkg;
localparam L1_MEM_TAG_WIDTH = `MAX(ICACHE_MEM_TAG_WIDTH, DCACHE_MEM_TAG_WIDTH);
localparam L1_MEM_ARB_TAG_WIDTH = (L1_MEM_TAG_WIDTH + `CLOG2(2));
/////////////////////////////// L2 Parameters /////////////////////////////
localparam ICACHE_MEM_ARB_IDX = 0;
@ -198,21 +198,21 @@ package VX_gpu_pkg;
/////////////////////////////// Issue parameters //////////////////////////
localparam ISSUE_ISW = `CLOG2(`ISSUE_WIDTH);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_ISW_W = `UP(ISSUE_ISW);
localparam ISSUE_RATIO = `NUM_WARPS / `ISSUE_WIDTH;
localparam ISSUE_WIS = `CLOG2(ISSUE_RATIO);
localparam ISSUE_WIS_W = `UP(ISSUE_WIS);
`IGNORE_UNUSED_BEGIN
// Builds a warp id from a per-issue-slice warp index (wis) and an issue
// slice index (isw).
//   - ISSUE_WIS == 0: the result is isw cast to NW_WIDTH bits.
//   - ISSUE_ISW == 0: the result is wis cast to NW_WIDTH bits.
//   - otherwise: the result is {wis, isw}, i.e. wis occupies the upper bits
//     and isw the lower bits, cast to NW_WIDTH bits.
// NOTE(review): the repeated lines below appear to be the before/after sides
// of whitespace-only changes in this diff view, not duplicated RTL — confirm
// against the actual file.
function logic [`NW_WIDTH-1:0] wis_to_wid(
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_WIS_W-1:0] wis,
input logic [ISSUE_ISW_W-1:0] isw
);
if (ISSUE_WIS == 0) begin
wis_to_wid = `NW_WIDTH'(isw);
end else if (ISSUE_ISW == 0) begin
wis_to_wid = `NW_WIDTH'(wis);
end else begin
end else begin
wis_to_wid = `NW_WIDTH'({wis, isw});
end
endfunction
@ -220,7 +220,7 @@ package VX_gpu_pkg;
function logic [ISSUE_ISW_W-1:0] wid_to_isw(
input logic [`NW_WIDTH-1:0] wid
);
if (ISSUE_ISW != 0) begin
if (ISSUE_ISW != 0) begin
wid_to_isw = wid[ISSUE_ISW_W-1:0];
end else begin
wid_to_isw = 0;

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -37,7 +37,7 @@ module VX_decode #(
// inputs
VX_fetch_if.slave fetch_if,
// outputs
// outputs
VX_decode_if.master decode_if,
VX_decode_sched_if.master decode_sched_if
);
@ -47,17 +47,17 @@ module VX_decode #(
`UNUSED_PARAM (CORE_ID)
`UNUSED_VAR (clk)
`UNUSED_VAR (reset)
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
reg [`EX_BITS-1:0] ex_type;
reg [`INST_OP_BITS-1:0] op_type;
reg [`INST_MOD_BITS-1:0] op_mod;
reg [`NR_BITS-1:0] rd_r, rs1_r, rs2_r, rs3_r;
reg [`XLEN-1:0] imm;
reg [`XLEN-1:0] imm;
reg use_rd, use_rs1, use_rs2, use_rs3, use_PC, use_imm;
reg is_wstall;
wire [31:0] instr = fetch_if.data.instr;
wire [6:0] opcode = instr[6:0];
wire [6:0] opcode = instr[6:0];
wire [1:0] func2 = instr[26:25];
wire [2:0] func3 = instr[14:12];
wire [4:0] func5 = instr[31:27];
@ -85,7 +85,7 @@ module VX_decode #(
wire [11:0] iw_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`else
wire [11:0] i_imm = is_itype_sh ? {7'b0, instr[24:20]} : u_12;
`endif
`endif
wire [11:0] s_imm = {func7, rd};
wire [12:0] b_imm = {instr[31], instr[7], instr[30:25], instr[11:8], 1'b0};
wire [20:0] jal_imm = {instr[31], instr[19:12], instr[20], instr[30:21], 1'b0};
@ -121,9 +121,9 @@ module VX_decode #(
always @(*) begin
case (u_12)
12'h000: s_type = `INST_OP_BITS'(`INST_BR_ECALL);
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h001: s_type = `INST_OP_BITS'(`INST_BR_EBREAK);
12'h002: s_type = `INST_OP_BITS'(`INST_BR_URET);
12'h102: s_type = `INST_OP_BITS'(`INST_BR_SRET);
12'h302: s_type = `INST_OP_BITS'(`INST_BR_MRET);
default: s_type = 'x;
endcase
@ -163,7 +163,7 @@ module VX_decode #(
use_rs3 = 0;
is_wstall = 0;
case (opcode)
case (opcode)
`INST_I: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(r_type);
@ -173,17 +173,17 @@ module VX_decode #(
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_R: begin
`INST_R: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
if (func7[0]) begin
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
end else
`endif
begin
op_type = `INST_OP_BITS'(r_type);
end
end
use_rd = 1;
`USED_IREG (rd);
`USED_IREG (rs1);
@ -203,12 +203,12 @@ module VX_decode #(
end
`INST_R_W: begin
ex_type = `EX_ALU;
`ifdef EXT_M_ENABLE
`ifdef EXT_M_ENABLE
if (func7[0]) begin
// MULW, DIVW, DIVUW, REMW, REMUW
op_type = `INST_OP_BITS'(m_type);
op_mod[1] = 1;
end else
op_mod[1] = 1;
end else
`endif
begin
// ADDW, SUBW, SLLW, SRLW, SRAW
@ -221,7 +221,7 @@ module VX_decode #(
`USED_IREG (rs2);
end
`endif
`INST_LUI: begin
`INST_LUI: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_LUI);
use_rd = 1;
@ -229,7 +229,7 @@ module VX_decode #(
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
`INST_AUIPC: begin
`INST_AUIPC: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_ALU_AUIPC);
use_rd = 1;
@ -238,7 +238,7 @@ module VX_decode #(
imm = {{`XLEN-31{ui_imm[19]}}, ui_imm[18:0], 12'(0)};
`USED_IREG (rd);
end
`INST_JAL: begin
`INST_JAL: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JAL);
op_mod[0] = 1;
@ -249,7 +249,7 @@ module VX_decode #(
imm = {{(`XLEN-21){jal_imm[20]}}, jal_imm};
`USED_IREG (rd);
end
`INST_JALR: begin
`INST_JALR: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(`INST_BR_JALR);
op_mod[0] = 1;
@ -260,7 +260,7 @@ module VX_decode #(
`USED_IREG (rd);
`USED_IREG (rs1);
end
`INST_B: begin
`INST_B: begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(b_type);
op_mod[0] = 1;
@ -275,8 +275,8 @@ module VX_decode #(
ex_type = `EX_LSU;
op_type = `INST_LSU_FENCE;
end
`INST_SYS : begin
if (func3[1:0] != 0) begin
`INST_SYS : begin
if (func3[1:0] != 0) begin
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CSR(func3[1:0]));
use_rd = 1;
@ -288,7 +288,7 @@ module VX_decode #(
imm[`VX_CSR_ADDR_BITS +: `NRI_BITS] = rs1; // imm
end else begin
`USED_IREG (rs1);
end
end
end else begin
ex_type = `EX_ALU;
op_type = `INST_OP_BITS'(s_type);
@ -302,9 +302,9 @@ module VX_decode #(
end
end
`ifdef EXT_F_ENABLE
`INST_FL,
`INST_FL,
`endif
`INST_L: begin
`INST_L: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b0, func3});
use_rd = 1;
@ -319,9 +319,9 @@ module VX_decode #(
`USED_IREG (rs1);
end
`ifdef EXT_F_ENABLE
`INST_FS,
`INST_FS,
`endif
`INST_S: begin
`INST_S: begin
ex_type = `EX_LSU;
op_type = `INST_OP_BITS'({1'b1, func3});
imm = {{(`XLEN-12){s_imm[11]}}, s_imm};
@ -338,24 +338,24 @@ module VX_decode #(
`INST_FMADD,
`INST_FMSUB,
`INST_FNMSUB,
`INST_FNMADD: begin
`INST_FNMADD: begin
ex_type = `EX_FPU;
op_type = `INST_OP_BITS'({2'b11, opcode[3:2]});
op_mod = `INST_MOD_BITS'(func3);
imm[0] = func2[0]; // destination is double?
use_rd = 1;
`USED_FREG (rd);
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
`USED_FREG (rs3);
end
`INST_FCI: begin
`INST_FCI: begin
ex_type = `EX_FPU;
op_mod = `INST_MOD_BITS'(func3);
`ifdef FLEN_64
imm[0] = func2[0]; // destination is double?
`endif
use_rd = 1;
use_rd = 1;
case (func5)
5'b00000, // FADD
5'b00001, // FSUB
@ -381,28 +381,28 @@ module VX_decode #(
`USED_FREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
end
`ifdef FLEN_64
5'b01000: begin
5'b01000: begin
// CVT.S.D, CVT.D.S
op_type = `INST_OP_BITS'(`INST_FPU_F2F);
`USED_FREG (rd);
`USED_FREG (rs1);
end
`endif
5'b01011: begin
5'b01011: begin
// SQRT
op_type = `INST_OP_BITS'(`INST_FPU_SQRT);
`USED_FREG (rd);
`USED_FREG (rs1);
end
end
5'b10100: begin
// CMP
op_type = `INST_OP_BITS'(`INST_FPU_CMP);
`USED_IREG (rd);
`USED_FREG (rs1);
`USED_FREG (rs2);
end
end
5'b11000: begin
// CVT.W.X, CVT.WU.X
op_type = (rs2[0]) ? `INST_OP_BITS'(`INST_FPU_F2U) : `INST_OP_BITS'(`INST_FPU_F2I);
@ -421,10 +421,10 @@ module VX_decode #(
`USED_FREG (rd);
`USED_IREG (rs1);
end
5'b11100: begin
5'b11100: begin
if (func3[0]) begin
// NCP: FCLASS=3
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 3;
end else begin
// NCP: FMV.X.W=4
@ -432,11 +432,11 @@ module VX_decode #(
op_mod = 4;
end
`USED_IREG (rd);
`USED_FREG (rs1);
end
5'b11110: begin
`USED_FREG (rs1);
end
5'b11110: begin
// NCP: FMV.W.X=5
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_type = `INST_OP_BITS'(`INST_FPU_MISC);
op_mod = 5;
`USED_FREG (rd);
`USED_IREG (rs1);
@ -445,7 +445,7 @@ module VX_decode #(
endcase
end
`endif
`INST_EXT1: begin
`INST_EXT1: begin
case (func7)
7'h00: begin
ex_type = `EX_SFU;
@ -463,8 +463,9 @@ module VX_decode #(
3'h2: begin // SPLIT
op_type = `INST_OP_BITS'(`INST_SFU_SPLIT);
use_rd = 1;
`USED_IREG (rs1);
`USED_IREG (rd);
op_mod[0] = rs2[0]; // not?
`USED_IREG (rs1);
`USED_IREG (rd);
end
3'h3: begin // JOIN
op_type = `INST_OP_BITS'(`INST_SFU_JOIN);
@ -477,6 +478,7 @@ module VX_decode #(
end
3'h5: begin // PRED
op_type = `INST_OP_BITS'(`INST_SFU_PRED);
op_mod[0] = rd[0]; // not?
`USED_IREG (rs1);
`USED_IREG (rs2);
end
@ -486,10 +488,10 @@ module VX_decode #(
default:;
endcase
end
`INST_EXT2: begin
`INST_EXT2: begin
case (func3)
3'h1: begin
case (func2)
case (func2)
2'h0: begin // CMOV
ex_type = `EX_SFU;
op_type = `INST_OP_BITS'(`INST_SFU_CMOV);
@ -533,7 +535,7 @@ module VX_decode #(
assign decode_sched_if.valid = fetch_fire;
assign decode_sched_if.wid = fetch_if.data.wid;
assign decode_sched_if.is_wstall = is_wstall;
`ifndef L1_ENABLE
`ifndef L1_ENABLE
assign fetch_if.ibuf_pop = decode_if.ibuf_pop;
`endif

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -24,14 +24,15 @@ module VX_ipdom_stack #(
input wire [WIDTH-1:0] q0,
input wire [WIDTH-1:0] q1,
output wire [WIDTH-1:0] d,
output wire d_set,
output wire d_set,
output wire [ADDRW-1:0] q_ptr,
input wire push,
input wire pop,
input wire pop,
output wire empty,
output wire full
);
reg slot_set [DEPTH-1:0];
reg [ADDRW-1:0] rd_ptr, wr_ptr;
reg empty_r, full_r;
@ -41,28 +42,28 @@ module VX_ipdom_stack #(
wire d_set_n = slot_set[rd_ptr];
always @(posedge clk) begin
if (reset) begin
if (reset) begin
rd_ptr <= '0;
wr_ptr <= '0;
empty_r <= 1;
full_r <= 0;
full_r <= 0;
end else begin
`ASSERT(~push || ~full, ("runtime error: writing to a full stack!"));
`ASSERT(~pop || ~empty, ("runtime error: reading an empty stack!"));
`ASSERT(~push || ~pop, ("runtime error: push and pop in same cycle not supported!"));
if (push) begin
if (push) begin
rd_ptr <= wr_ptr;
wr_ptr <= wr_ptr + ADDRW'(1);
wr_ptr <= wr_ptr + ADDRW'(1);
empty_r <= 0;
full_r <= (ADDRW'(DEPTH-1) == wr_ptr);
end else if (pop) begin
end else if (pop) begin
wr_ptr <= wr_ptr - ADDRW'(d_set_n);
rd_ptr <= rd_ptr - ADDRW'(d_set_n);
empty_r <= (rd_ptr == 0) && (d_set_n == 1);
full_r <= 0;
end
end
end
end
VX_dp_ram #(
.DATAW (WIDTH * 2),
@ -72,23 +73,24 @@ module VX_ipdom_stack #(
) store (
.clk (clk),
.read (1'b1),
.write (push),
`UNUSED_PIN (wren),
.write (push),
`UNUSED_PIN (wren),
.waddr (wr_ptr),
.wdata ({q1, q0}),
.raddr (rd_ptr),
.rdata ({d1, d0})
);
always @(posedge clk) begin
if (push) begin
slot_set[wr_ptr] <= 0;
end else if (pop) begin
slot_set[wr_ptr] <= 0;
end else if (pop) begin
slot_set[rd_ptr] <= 1;
end
end
wire d_set_r;
VX_pipe_register #(
.DATAW (1),
.DEPTH (OUT_REG)
@ -102,6 +104,7 @@ module VX_ipdom_stack #(
assign d = d_set_r ? d0 : d1;
assign d_set = ~d_set_r;
assign q_ptr = wr_ptr;
assign empty = empty_r;
assign full = full_r;

View file

@ -296,11 +296,13 @@ module VX_schedule import VX_gpu_pkg::*; #(
.split (warp_ctl_if.split),
.sjoin (warp_ctl_if.sjoin),
.join_valid (join_valid),
.join_is_dvg (join_is_dvg),
.join_is_else (join_is_else),
.join_is_dvg(join_is_dvg),
.join_is_else(join_is_else),
.join_wid (join_wid),
.join_tmask (join_tmask),
.join_pc (join_pc)
.join_pc (join_pc),
.stack_wid (warp_ctl_if.dvstack_wid),
.stack_ptr (warp_ctl_if.dvstack_ptr)
);
// schedule the next ready warp
@ -308,7 +310,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
VX_lzc #(
.N (`NUM_WARPS),
.N (`NUM_WARPS),
.REVERSE (1)
) wid_select (
.data_in (ready_warps),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -15,7 +15,7 @@
module VX_sfu_unit import VX_gpu_pkg::*; #(
parameter CORE_ID = 0
) (
) (
input wire clk,
input wire reset,
@ -28,7 +28,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
// Inputs
VX_dispatch_if.slave dispatch_if [`ISSUE_WIDTH],
`ifdef EXT_F_ENABLE
VX_fpu_csr_if.slave fpu_csr_if [`NUM_FPU_BLOCKS],
`endif
@ -37,13 +37,13 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
// Outputs
VX_commit_if.master commit_if [`ISSUE_WIDTH],
VX_warp_ctl_if.master warp_ctl_if
VX_warp_ctl_if.master warp_ctl_if
);
`UNUSED_PARAM (CORE_ID)
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam BLOCK_SIZE = 1;
localparam NUM_LANES = `NUM_SFU_LANES;
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam RSP_ARB_DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + (NUM_LANES * `XLEN) + `NR_BITS + 1 + `XLEN + PID_WIDTH + 1 + 1;
localparam RSP_ARB_SIZE = 1 + 1;
@ -67,29 +67,29 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
wire [RSP_ARB_SIZE-1:0] rsp_arb_valid_in;
wire [RSP_ARB_SIZE-1:0] rsp_arb_ready_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
wire [RSP_ARB_SIZE-1:0][RSP_ARB_DATAW-1:0] rsp_arb_data_in;
// Warp control block
// Warp control block
VX_execute_if #(
.NUM_LANES (NUM_LANES)
) wctl_execute_if();
VX_commit_if#(
.NUM_LANES (NUM_LANES)
) wctl_commit_if();
assign wctl_execute_if.valid = per_block_execute_if[0].valid && `INST_SFU_IS_WCTL(per_block_execute_if[0].data.op_type);
assign wctl_execute_if.data = per_block_execute_if[0].data;
`RESET_RELAY (wctl_reset, reset);
VX_wctl_unit #(
.CORE_ID (CORE_ID),
.NUM_LANES (NUM_LANES)
) wctl_unit (
.clk (clk),
.reset (wctl_reset),
.execute_if (wctl_execute_if),
.warp_ctl_if(warp_ctl_if),
.execute_if (wctl_execute_if),
.warp_ctl_if(warp_ctl_if),
.commit_if (wctl_commit_if)
);
@ -119,20 +119,20 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.base_dcrs (base_dcrs),
.execute_if (csr_execute_if),
`ifdef PERF_ENABLE
.mem_perf_if (mem_perf_if),
.pipeline_perf_if(pipeline_perf_if),
`endif
`ifdef EXT_F_ENABLE
`ifdef EXT_F_ENABLE
.fpu_csr_if (fpu_csr_if),
`endif
.sched_csr_if (sched_csr_if),
.commit_csr_if (commit_csr_if),
.commit_if (csr_commit_if)
);
);
assign rsp_arb_valid_in[RSP_ARB_IDX_CSRS] = csr_commit_if.valid;
assign rsp_arb_data_in[RSP_ARB_IDX_CSRS] = csr_commit_if.data;
@ -164,7 +164,7 @@ module VX_sfu_unit import VX_gpu_pkg::*; #(
.OUT_BUF (3)
) rsp_arb (
.clk (clk),
.reset (reset),
.reset (reset),
.valid_in (rsp_arb_valid_in),
.ready_in (rsp_arb_ready_in),
.data_in (rsp_arb_data_in),

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -27,50 +27,58 @@ module VX_split_join import VX_gpu_pkg::*; #(
output wire join_is_else,
output wire [`NW_WIDTH-1:0] join_wid,
output wire [`NUM_THREADS-1:0] join_tmask,
output wire [`XLEN-1:0] join_pc
output wire [`XLEN-1:0] join_pc,
input wire [`NW_WIDTH-1:0] stack_wid,
output wire [`DV_STACK_SIZEW-1:0] stack_ptr
);
`UNUSED_PARAM (CORE_ID)
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_data [`NUM_WARPS-1:0];
wire [`DV_STACK_SIZEW-1:0] ipdom_q_ptr [`NUM_WARPS-1:0];
wire ipdom_set [`NUM_WARPS-1:0];
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q0 = {split.then_tmask | split.else_tmask, `XLEN'(0)};
wire [(`XLEN+`NUM_THREADS)-1:0] ipdom_q1 = {split.else_tmask, split.next_pc};
wire sjoin_is_dvg = (sjoin.stack_ptr != ipdom_q_ptr[wid]);
wire ipdom_push = valid && split.valid && split.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin.is_dvg;
wire ipdom_pop = valid && sjoin.valid && sjoin_is_dvg;
for (genvar i = 0; i < `NUM_WARPS; ++i) begin
`RESET_RELAY (ipdom_reset, reset);
VX_ipdom_stack #(
.WIDTH (`XLEN+`NUM_THREADS),
.DEPTH (`UP(`NUM_THREADS-1))
.WIDTH (`XLEN+`NUM_THREADS),
.DEPTH (`DV_STACK_SIZE)
) ipdom_stack (
.clk (clk),
.reset (ipdom_reset),
.push (ipdom_push && (i == wid)),
.pop (ipdom_pop && (i == wid)),
.q0 (ipdom_q0),
.q1 (ipdom_q1),
.d (ipdom_data[i]),
.d_set (ipdom_set[i]),
.q_ptr (ipdom_q_ptr[i]),
.push (ipdom_push && (i == wid)),
.pop (ipdom_pop && (i == wid)),
`UNUSED_PIN (empty),
`UNUSED_PIN (full)
);
end
VX_pipe_register #(
.DATAW (1 + 1 + `NW_WIDTH + 1 + `XLEN + `NUM_THREADS),
.DATAW (1 + 1 + 1 + `NW_WIDTH + `NUM_THREADS + `XLEN),
.DEPTH (1),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (1'b1),
.data_in ({valid && sjoin.valid, sjoin.is_dvg, ipdom_set[wid], wid, ipdom_data[wid]}),
.data_out ({join_valid, join_is_dvg, join_is_else, join_wid, join_tmask, join_pc})
.data_in ({valid && sjoin.valid, sjoin_is_dvg, ipdom_set[wid], wid, ipdom_data[wid]}),
.data_out ({join_valid, join_is_dvg, join_is_else, join_wid, {join_tmask, join_pc}})
);
assign stack_ptr = ipdom_q_ptr[stack_wid];
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -26,17 +26,17 @@ task trace_ex_type(input int level, input [`EX_BITS-1:0] ex_type);
endcase
endtask
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input [`INST_MOD_BITS-1:0] op_mod,
task trace_ex_op(input int level,
input [`EX_BITS-1:0] ex_type,
input [`INST_OP_BITS-1:0] op_type,
input [`INST_MOD_BITS-1:0] op_mod,
input use_imm,
input fdst_d,
input fcvt_l,
input rd_float
);
case (ex_type)
`EX_ALU: begin
`EX_ALU: begin
if (`INST_ALU_IS_BR(op_mod)) begin
case (`INST_BR_BITS'(op_type))
`INST_BR_EQ: `TRACE(level, ("BEQ"));
@ -131,7 +131,7 @@ task trace_ex_op(input int level,
end
end
end
`EX_LSU: begin
`EX_LSU: begin
if (rd_float) begin
case (`INST_LSU_BITS'(op_type))
`INST_LSU_LW: `TRACE(level, ("FLW"));
@ -161,55 +161,55 @@ task trace_ex_op(input int level,
`EX_FPU: begin
case (`INST_FPU_BITS'(op_type))
`INST_FPU_ADD: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FADD.D"));
else
`TRACE(level, ("FADD.S"));
end
`INST_FPU_SUB: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FSUB.D"));
else
`TRACE(level, ("FSUB.S"));
end
`INST_FPU_MUL: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FMUL.D"));
else
`TRACE(level, ("FMUL.S"));
end
`INST_FPU_DIV: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FDIV.D"));
else
`TRACE(level, ("FDIV.S"));
end
`INST_FPU_SQRT: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FSQRT.D"));
else
`TRACE(level, ("FSQRT.S"));
end
`INST_FPU_MADD: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FMADD.D"));
else
`TRACE(level, ("FMADD.S"));
end
`INST_FPU_MSUB: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FMSUB.D"));
else
`TRACE(level, ("FMSUB.S"));
end
`INST_FPU_NMADD: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FNMADD.D"));
else
`TRACE(level, ("FNMADD.S"));
end
`INST_FPU_NMSUB: begin
if (fdst_d)
if (fdst_d)
`TRACE(level, ("FNMSUB.D"));
else
`TRACE(level, ("FNMSUB.S"));
@ -330,10 +330,10 @@ task trace_ex_op(input int level,
case (`INST_SFU_BITS'(op_type))
`INST_SFU_TMC: `TRACE(level, ("TMC"));
`INST_SFU_WSPAWN:`TRACE(level, ("WSPAWN"));
`INST_SFU_SPLIT: `TRACE(level, ("SPLIT"));
`INST_SFU_SPLIT: begin if (op_mod[0]) `TRACE(level, ("SPLIT.N")); else `TRACE(level, ("SPLIT")); end
`INST_SFU_JOIN: `TRACE(level, ("JOIN"));
`INST_SFU_BAR: `TRACE(level, ("BAR"));
`INST_SFU_PRED: `TRACE(level, ("PRED"));
`INST_SFU_PRED: begin if (op_mod[0]) `TRACE(level, ("PRED.N")); else `TRACE(level, ("PRED")); end
`INST_SFU_CSRRW: begin if (use_imm) `TRACE(level, ("CSRRWI")); else `TRACE(level, ("CSRRW")); end
`INST_SFU_CSRRS: begin if (use_imm) `TRACE(level, ("CSRRSI")); else `TRACE(level, ("CSRRS")); end
`INST_SFU_CSRRC: begin if (use_imm) `TRACE(level, ("CSRRCI")); else `TRACE(level, ("CSRRC")); end

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -22,7 +22,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
// Inputs
VX_execute_if.slave execute_if,
// Outputs
VX_warp_ctl_if.master warp_ctl_if,
VX_commit_if.master commit_if
@ -32,12 +32,12 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
localparam PID_BITS = `CLOG2(`NUM_THREADS / NUM_LANES);
localparam PID_WIDTH = `UP(PID_BITS);
localparam WCTL_WIDTH = $bits(tmc_t) + $bits(wspawn_t) + $bits(split_t) + $bits(join_t) + $bits(barrier_t);
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1;
localparam DATAW = `UUID_WIDTH + `NW_WIDTH + NUM_LANES + `XLEN + `NR_BITS + 1 + WCTL_WIDTH + PID_WIDTH + 1 + 1 + `DV_STACK_SIZEW;
`UNUSED_VAR (execute_if.data.rs3_data)
tmc_t tmc, tmc_r;
wspawn_t wspawn, wspawn_r;
wspawn_t wspawn, wspawn_r;
split_t split, split_r;
join_t sjoin, sjoin_r;
barrier_t barrier, barrier_r;
@ -55,14 +55,16 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
end else begin
assign tid = 0;
end
wire [`XLEN-1:0] rs1_data = execute_if.data.rs1_data[tid];
wire [`XLEN-1:0] rs2_data = execute_if.data.rs2_data[tid];
`UNUSED_VAR (rs1_data)
wire not_pred = execute_if.data.op_mod[0];
wire [NUM_LANES-1:0] taken;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign taken[i] = execute_if.data.rs1_data[i][0];
assign taken[i] = (execute_if.data.rs1_data[i][0] ^ not_pred);
end
reg [`NUM_THREADS-1:0] then_tmask_r, then_tmask_n;
@ -93,17 +95,20 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign tmc.tmask = is_pred ? pred_mask : rs1_data[`NUM_THREADS-1:0];
// split
assign split.valid = is_split;
assign split.is_dvg = has_then && has_else;
assign split.then_tmask = then_tmask_n;
assign split.else_tmask = else_tmask_n;
assign split.next_pc = execute_if.data.PC + 4;
assign warp_ctl_if.dvstack_wid = execute_if.data.wid;
wire [`DV_STACK_SIZEW-1:0] dvstack_ptr;
// join
assign sjoin.valid = is_join;
assign sjoin.is_dvg = rs1_data[0];
assign sjoin.valid = is_join;
assign sjoin.stack_ptr = rs1_data[`DV_STACK_SIZEW-1:0];
// barrier
assign barrier.valid = is_bar;
@ -126,7 +131,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign wspawn.pc = rs2_data;
// response
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (2)
@ -135,8 +140,8 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
.reset (reset),
.valid_in (execute_if.valid),
.ready_in (execute_if.ready),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}}),
.data_in ({execute_if.data.uuid, execute_if.data.wid, execute_if.data.tmask, execute_if.data.PC, execute_if.data.rd, execute_if.data.wb, execute_if.data.pid, execute_if.data.sop, execute_if.data.eop, {tmc, wspawn, split, sjoin, barrier}, warp_ctl_if.dvstack_ptr}),
.data_out ({commit_if.data.uuid, commit_if.data.wid, commit_if.data.tmask, commit_if.data.PC, commit_if.data.rd, commit_if.data.wb, commit_if.data.pid, commit_if.data.sop, commit_if.data.eop, {tmc_r, wspawn_r, split_r, sjoin_r, barrier_r}, dvstack_ptr}),
.valid_out (commit_if.valid),
.ready_out (commit_if.ready)
);
@ -148,9 +153,9 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign warp_ctl_if.split = split_r;
assign warp_ctl_if.sjoin = sjoin_r;
assign warp_ctl_if.barrier = barrier_r;
for (genvar i = 0; i < NUM_LANES; ++i) begin
assign commit_if.data.data[i] = `XLEN'(split_r.is_dvg);
assign commit_if.data.data[i] = `XLEN'(dvstack_ptr);
end
endmodule

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -23,6 +23,9 @@ interface VX_warp_ctl_if import VX_gpu_pkg::*; ();
join_t sjoin;
barrier_t barrier;
wire [`NW_WIDTH-1:0] dvstack_wid;
wire [`DV_STACK_SIZEW-1:0] dvstack_ptr;
modport master (
output valid,
output wid,
@ -30,7 +33,10 @@ interface VX_warp_ctl_if import VX_gpu_pkg::*; ();
output tmc,
output split,
output sjoin,
output barrier
output barrier,
output dvstack_wid,
input dvstack_ptr
);
modport slave (
@ -40,7 +46,10 @@ interface VX_warp_ctl_if import VX_gpu_pkg::*; ();
input tmc,
input split,
input sjoin,
input barrier
input barrier,
input dvstack_wid,
output dvstack_ptr
);
endinterface

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -121,6 +121,11 @@ inline void vx_pred(int condition, int thread_mask) {
asm volatile (".insn r %0, 5, 0, x0, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
}
// Set thread predicate, negated form (disassembled as "PRED.N" by the simulator).
// Emits one custom R-type instruction via `.insn r`: opcode RISCV_CUSTOM0,
// func3 = 5, func7 = 0, rs1 = condition, rs2 = thread_mask.
// The rd field is hard-wired to x1 (vx_pred uses x0). No result is written:
// the simulator decodes the field as an immediate whose low bit inverts the
// test, so an active thread passes when the low bit of its condition is 0.
// NOTE(review): thread_mask is presumably the mask to fall back to when no
// thread passes the predicate - confirm against the ISA extension spec.
inline void vx_pred_n(int condition, int thread_mask) {
asm volatile (".insn r %0, 5, 0, x1, %1, %2" :: "i"(RISCV_CUSTOM0), "r"(condition), "r"(thread_mask));
}
typedef void (*vx_wspawn_pfn)();
// Spawn warps
@ -135,6 +140,13 @@ inline int vx_split(int predicate) {
return ret;
}
// Split on a negated predicate (disassembled as "SPLIT.N" by the simulator).
// Emits one custom R-type instruction via `.insn r`: opcode RISCV_CUSTOM0,
// func3 = 2, func7 = 0, rd = ret, rs1 = predicate.
// The rs2 field is hard-wired to x1. It is not read as a register: the
// simulator decodes it as an immediate whose low bit inverts the test, so the
// "then" side is the set of active threads whose predicate low bit is 0.
// Returns the value the instruction writes to rd. In the simulator this is the
// IPDOM stack size sampled before the split; presumably it is meant to be
// handed back to vx_join as its stack_ptr argument - confirm with callers.
inline int vx_split_n(int predicate) {
// NOTE(review): ret is size_t but the function returns int, so the value is
// narrowed implicitly on return.
size_t ret;
asm volatile (".insn r %1, 2, 0, %0, %2, x1" : "=r"(ret) : "i"(RISCV_CUSTOM0), "r"(predicate));
return ret;
}
// Join
inline void vx_join(int stack_ptr) {
asm volatile (".insn r %0, 3, 0, x0, %1, x0" :: "i"(RISCV_CUSTOM0), "r"(stack_ptr));
@ -191,7 +203,7 @@ inline int vx_num_threads() {
inline int vx_num_warps() {
int ret;
asm volatile ("csrr %0, %1" : "=r"(ret) : "i"(VX_CSR_NUM_WARPS));
return ret;
return ret;
}
// Return the number of cores per cluster

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -42,7 +42,7 @@ static const std::unordered_map<Opcode, InstType> sc_instTable = {
{Opcode::AMO, InstType::R},
{Opcode::FL, InstType::I},
{Opcode::FS, InstType::S},
{Opcode::FCI, InstType::R},
{Opcode::FCI, InstType::R},
{Opcode::FMADD, InstType::R4},
{Opcode::FMSUB, InstType::R4},
{Opcode::FMNMADD, InstType::R4},
@ -71,7 +71,7 @@ enum Constants {
shift_func7 = shift_rs2 + width_reg,
shift_rs3 = shift_func7 + width_func2,
mask_opcode = (1 << width_opcode) - 1,
mask_opcode = (1 << width_opcode) - 1,
mask_reg = (1 << width_reg) - 1,
mask_func2 = (1 << width_func2) - 1,
mask_func3 = (1 << width_func3) - 1,
@ -131,7 +131,7 @@ static const char* op_string(const Instr &instr) {
case 7: return "ANDI";
default:
std::abort();
}
}
case Opcode::B:
switch (func3) {
case 0: return "BEQ";
@ -181,7 +181,7 @@ static const char* op_string(const Instr &instr) {
switch (func3) {
case 0: return (func7 & 0x20) ? "SUBW" : "ADDW";
case 1: return "SLLW";
case 5: return (func7 & 0x20) ? "SRAW" : "SRLW";
case 5: return (func7 & 0x20) ? "SRAW" : "SRLW";
default:
std::abort();
}
@ -194,7 +194,7 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case Opcode::SYS:
case Opcode::SYS:
switch (func3) {
case 0:
switch (imm) {
@ -204,7 +204,7 @@ static const char* op_string(const Instr &instr) {
case 0x102: return "SRET";
case 0x302: return "MRET";
default:
std::abort();
std::abort();
}
case 1: return "CSRRW";
case 2: return "CSRRS";
@ -216,20 +216,20 @@ static const char* op_string(const Instr &instr) {
std::abort();
}
case Opcode::FENCE: return "FENCE";
case Opcode::FL:
case Opcode::FL:
switch (func3) {
case 0x1: return "VL";
case 0x2: return "FLW";
case 0x3: return "FLD";
default:
default:
std::abort();
}
case Opcode::FS:
case Opcode::FS:
switch (func3) {
case 0x1: return "VS";
case 0x2: return "FSW";
case 0x3: return "FSD";
default:
default:
std::abort();
}
case Opcode::AMO: {
@ -267,11 +267,11 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
default:
default:
std::abort();
}
}
case Opcode::FCI:
case Opcode::FCI:
switch (func7) {
case 0x00: return "FADD.S";
case 0x01: return "FADD.D";
@ -284,7 +284,7 @@ static const char* op_string(const Instr &instr) {
case 0x2c: return "FSQRT.S";
case 0x2d: return "FSQRT.D";
case 0x10:
switch (func3) {
switch (func3) {
case 0: return "FSGNJ.S";
case 1: return "FSGNJN.S";
case 2: return "FSGNJX.S";
@ -292,7 +292,7 @@ static const char* op_string(const Instr &instr) {
std::abort();
}
case 0x11:
switch (func3) {
switch (func3) {
case 0: return "FSGNJ.D";
case 1: return "FSGNJN.D";
case 2: return "FSGNJX.D";
@ -300,14 +300,14 @@ static const char* op_string(const Instr &instr) {
std::abort();
}
case 0x14:
switch (func3) {
switch (func3) {
case 0: return "FMIN.S";
case 1: return "FMAX.S";
default:
std::abort();
}
case 0x15:
switch (func3) {
switch (func3) {
case 0: return "FMIN.D";
case 1: return "FMAX.D";
default:
@ -315,23 +315,23 @@ static const char* op_string(const Instr &instr) {
}
case 0x20: return "FCVT.S.D";
case 0x21: return "FCVT.D.S";
case 0x50:
switch (func3) {
case 0: return "FLE.S";
case 1: return "FLT.S";
case 0x50:
switch (func3) {
case 0: return "FLE.S";
case 1: return "FLT.S";
case 2: return "FEQ.S";
default:
std::abort();
}
case 0x51:
switch (func3) {
case 0: return "FLE.D";
case 1: return "FLT.D";
case 0x51:
switch (func3) {
case 0: return "FLE.D";
case 1: return "FLT.D";
case 2: return "FEQ.D";
default:
std::abort();
}
case 0x60:
case 0x60:
switch (rs2) {
case 0: return "FCVT.W.S";
case 1: return "FCVT.WU.S";
@ -349,7 +349,7 @@ static const char* op_string(const Instr &instr) {
default:
std::abort();
}
case 0x68:
case 0x68:
switch (rs2) {
case 0: return "FCVT.S.W";
case 1: return "FCVT.S.WU";
@ -381,13 +381,13 @@ static const char* op_string(const Instr &instr) {
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
switch (func3) {
case 0: return "TMC";
case 1: return "WSPAWN";
case 2: return "SPLIT";
case 2: return imm ? "SPLIT.N" : "SPLIT";
case 3: return "JOIN";
case 4: return "BAR";
case 5: return "PRED";
case 5: return imm ? "PRED.N" : "PRED";
default:
std::abort();
}
@ -398,7 +398,7 @@ static const char* op_string(const Instr &instr) {
switch (func3) {
case 1: {
switch (func2) {
case 0: return "CMOV";
case 0: return "CMOV";
default:
std::abort();
}
@ -412,16 +412,16 @@ static const char* op_string(const Instr &instr) {
}
namespace vortex {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
std::ostream &operator<<(std::ostream &os, const Instr &instr) {
os << op_string(instr);
int sep = 0;
if (instr.getRDType() != RegType::None) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
os << instr.getRDType() << std::dec << instr.getRDest();
}
for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {
for (uint32_t i = 0; i < instr.getNRSrc(); ++i) {
if (sep++ != 0) { os << ", "; } else { os << " "; }
if (instr.getRSType(i) != RegType::None) {
if (instr.getRSType(i) != RegType::None) {
os << instr.getRSType(i) << std::dec << instr.getRSrc(i);
} else {
os << "0x" << std::hex << instr.getRSrc(0);
@ -435,7 +435,7 @@ std::ostream &operator<<(std::ostream &os, const Instr &instr) {
}
}
std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
auto instr = std::make_shared<Instr>();
auto op = Opcode((code >> shift_opcode) & mask_opcode);
instr->setOpcode(op);
@ -460,12 +460,12 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
case InstType::R:
switch (op) {
case Opcode::FCI:
switch (func7) {
switch (func7) {
case 0x2c: // FSQRT.S
case 0x2d: // FSQRT.D
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
break;
break;
case 0x50: // FLE.S, FLT.S, FEQ.S
case 0x51: // FLE.D, FLT.D, FEQ.D
instr->setDestReg(rd, RegType::Integer);
@ -485,39 +485,44 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
instr->addSrcReg(rs2, RegType::None);
break;
case 0x70: // FCLASS.S, FMV.X.S
case 0x71: // FCLASS.D, FMV.X.D
case 0x71: // FCLASS.D, FMV.X.D
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Float);
break;
case 0x78: // FMV.S.X
case 0x79: // FMV.D.X
case 0x79: // FMV.D.X
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Integer);
break;
default:
instr->setDestReg(rd, RegType::Float);
instr->addSrcReg(rs1, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
instr->addSrcReg(rs2, RegType::Float);
break;
}
break;
case Opcode::EXT1:
switch (func7) {
case 0:
switch (func3) {
switch (func3) {
case 0: // TMC
case 3: // JOIN
instr->addSrcReg(rs1, RegType::Integer);
break;
case 1: // WSPAWN
case 1: // WSPAWN
case 4: // BAR
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
break;
case 5: // PRED
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, RegType::Integer);
instr->setImm(rd);
break;
case 2: // SPLIT
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->setImm(rs2);
break;
default:
std::abort();
@ -542,7 +547,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
case Opcode::I:
case Opcode::I_W:
case Opcode::JALR:
instr->setDestReg(rd, RegType::Integer);
instr->setDestReg(rd, RegType::Integer);
instr->addSrcReg(rs1, RegType::Integer);
instr->setFunc3(func3);
if (func3 == 0x1 || func3 == 0x5) {
@ -560,7 +565,7 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
auto imm = code >> shift_rs2;
instr->setImm(sext(imm, width_i_imm));
}
break;
break;
case Opcode::L:
case Opcode::FL: {
instr->setDestReg(rd, (op == Opcode::FL) ? RegType::Float : RegType::Integer);
@ -578,24 +583,24 @@ std::shared_ptr<Instr> Emulator::decode(uint32_t code) const {
// CSR instructions
instr->setDestReg(rd, RegType::Integer);
instr->setFunc3(func3);
if (func3 < 5) {
if (func3 < 5) {
instr->addSrcReg(rs1, RegType::Integer);
} else {
// zimm
} else {
// zimm
instr->addSrcReg(rs1, RegType::None);
}
instr->setImm(code >> shift_rs2);
} else {
// ECALL/EBREAK instructions
instr->setImm(code >> shift_rs2);
}
}
break;
default:
std::abort();
std::abort();
break;
}
} break;
case InstType::S: {
case InstType::S: {
instr->addSrcReg(rs1, RegType::Integer);
instr->addSrcReg(rs2, (op == Opcode::FS) ? RegType::Float : RegType::Integer);
instr->setFunc3(func3);

View file

@ -1309,17 +1309,20 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
auto stack_size = warp.ipdom_stack.size();
ThreadMask then_tmask, else_tmask;
auto not_pred = immsrc & 0x1;
for (uint32_t t = 0; t < num_threads; ++t) {
auto cond = warp.ireg_file.at(t).at(rsrc0);
auto cond = (warp.ireg_file.at(t).at(rsrc0) & 0x1) ^ not_pred;
then_tmask[t] = warp.tmask.test(t) && cond;
else_tmask[t] = warp.tmask.test(t) && !cond;
}
bool is_divergent = then_tmask.any() && else_tmask.any();
if (is_divergent) {
if (warp.ipdom_stack.size() == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! size=" << std::dec << warp.ipdom_stack.size() << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush;
if (stack_size == arch_.ipdom_size()) {
std::cout << "IPDOM stack is full! size=" << std::dec << stack_size << ", PC=0x" << std::hex << warp.PC << " (#" << std::dec << trace->uuid << ")\n" << std::flush;
std::abort();
}
// set new thread mask
@ -1331,7 +1334,7 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
}
// return the IPDOM stack size sampled before the split (JOIN compares it with the current stack size)
for (uint32_t t = thread_start; t < num_threads; ++t) {
rddata[t].i = is_divergent;
rddata[t].i = stack_size;
}
rd_write = true;
} break;
@ -1342,8 +1345,8 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
trace->used_iregs.set(rsrc0);
trace->fetch_stall = true;
int is_divergent = warp.ireg_file.at(thread_start).at(rsrc0);
if (is_divergent != 0) {
auto stack_ptr = warp.ireg_file.at(thread_start).at(rsrc0);
if (stack_ptr != warp.ipdom_stack.size()) {
if (warp.ipdom_stack.empty()) {
std::cout << "IPDOM stack is empty!\n" << std::flush;
std::abort();
@ -1372,8 +1375,10 @@ void Emulator::execute(const Instr &instr, uint32_t wid, instr_trace_t *trace) {
trace->used_iregs.set(rsrc1);
trace->fetch_stall = true;
ThreadMask pred;
auto not_pred = immsrc & 0x1;
for (uint32_t t = 0; t < num_threads; ++t) {
pred[t] = warp.tmask.test(t) && (warp.ireg_file.at(t).at(rsrc0) & 0x1);
auto cond = (warp.ireg_file.at(t).at(rsrc0) & 0x1) ^ not_pred;
pred[t] = warp.tmask.test(t) && cond;
}
if (pred.any()) {
next_tmask &= pred;