thread mask redesign

This commit is contained in:
Blaise Tine 2021-08-05 17:32:58 -07:00
parent 7b8fe11e6a
commit e4d9fd8a00
14 changed files with 68 additions and 28 deletions

View file

@ -17,8 +17,8 @@ module VX_commit #(
VX_commit_if gpu_commit_if,
// outputs
VX_writeback_if writeback_if,
VX_cmt_to_csr_if cmt_to_csr_if
VX_writeback_if writeback_if,
VX_cmt_to_csr_if cmt_to_csr_if
);
localparam CMTW = $clog2(3*`NUM_THREADS+1);

View file

@ -223,6 +223,7 @@
`define CSR_LWID 12'hCC3
`define CSR_GWID `CSR_MHARTID
`define CSR_GCID 12'hCC5
`define CSR_TMASK 12'hCC4
// Machine SIMT CSRs
`define CSR_NT 12'hFC0

View file

@ -12,6 +12,7 @@ module VX_csr_data #(
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fetch_to_csr_if fetch_to_csr_if,
`ifdef EXT_F_ENABLE
VX_fpu_to_csr_if fpu_to_csr_if,
@ -62,15 +63,15 @@ module VX_csr_data #(
`CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0];
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0];
`CSR_SATP: csr_satp <= write_data;
`CSR_SATP: csr_satp <= write_data;
`CSR_MSTATUS: csr_mstatus <= write_data;
`CSR_MEDELEG: csr_medeleg <= write_data;
`CSR_MIDELEG: csr_mideleg <= write_data;
`CSR_MIE: csr_mie <= write_data;
`CSR_MTVEC: csr_mtvec <= write_data;
`CSR_MSTATUS: csr_mstatus <= write_data;
`CSR_MEDELEG: csr_medeleg <= write_data;
`CSR_MIDELEG: csr_mideleg <= write_data;
`CSR_MIE: csr_mie <= write_data;
`CSR_MTVEC: csr_mtvec <= write_data;
`CSR_MEPC: csr_mepc <= write_data;
`CSR_MEPC: csr_mepc <= write_data;
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data;
@ -114,6 +115,9 @@ module VX_csr_data #(
/*`CSR_MHARTID ,*/
`CSR_GWID : read_data_r = CORE_ID * `NUM_WARPS + 32'(read_wid);
`CSR_GCID : read_data_r = CORE_ID;
`CSR_TMASK : read_data_r = 32'(fetch_to_csr_if.thread_masks[read_wid]);
`CSR_NT : read_data_r = `NUM_THREADS;
`CSR_NW : read_data_r = `NUM_WARPS;
`CSR_NC : read_data_r = `NUM_CORES * `NUM_CLUSTERS;

View file

@ -12,6 +12,7 @@ module VX_csr_unit #(
`endif
VX_cmt_to_csr_if cmt_to_csr_if,
VX_fetch_to_csr_if fetch_to_csr_if,
VX_csr_req_if csr_req_if,
VX_commit_if csr_commit_if,
@ -42,6 +43,7 @@ module VX_csr_unit #(
.perf_pipeline_if (perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
`ifdef EXT_F_ENABLE
.fpu_to_csr_if (fpu_to_csr_if),
`endif

View file

@ -12,9 +12,12 @@ module VX_execute #(
VX_dcache_req_if dcache_req_if,
VX_dcache_rsp_if dcache_rsp_if,
// commit status
// commit interface
VX_cmt_to_csr_if cmt_to_csr_if,
// fetch interface
VX_fetch_to_csr_if fetch_to_csr_if,
`ifdef PERF_ENABLE
VX_perf_memsys_if perf_memsys_if,
VX_perf_pipeline_if perf_pipeline_if,
@ -84,9 +87,10 @@ module VX_execute #(
.reset (csr_reset),
`ifdef PERF_ENABLE
.perf_memsys_if (perf_memsys_if),
.perf_pipeline_if (perf_pipeline_if),
.perf_pipeline_if(perf_pipeline_if),
`endif
.cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
.csr_req_if (csr_req_if),
.csr_commit_if (csr_commit_if),
`ifdef EXT_F_ENABLE

View file

@ -21,6 +21,10 @@ module VX_fetch #(
// outputs
VX_ifetch_rsp_if ifetch_rsp_if,
// csr interface
VX_fetch_to_csr_if fetch_to_csr_if,
// busy status
output wire busy
);
@ -32,13 +36,18 @@ module VX_fetch #(
`SCOPE_BIND_VX_fetch_warp_sched
.clk (clk),
.reset (reset),
.reset (reset),
.warp_ctl_if (warp_ctl_if),
.wstall_if (wstall_if),
.join_if (join_if),
.branch_ctl_if (branch_ctl_if),
.ifetch_req_if (ifetch_req_if),
.ifetch_rsp_if (ifetch_rsp_if),
.fetch_to_csr_if (fetch_to_csr_if),
.busy (busy)
);

View file

@ -3,14 +3,10 @@
module VX_fpu_unit #(
parameter CORE_ID = 0
) (
// inputs
input wire clk,
input wire reset,
// inputs
VX_fpu_req_if fpu_req_if,
// outputs
VX_fpu_to_csr_if fpu_to_csr_if,
VX_commit_if fpu_commit_if,

View file

@ -32,19 +32,15 @@ module VX_gpu_unit #(
// tmc
wire [`NUM_THREADS-1:0] tmc_new_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
assign tmc_new_mask[i] = (i < gpu_req_if.rs1_data[0]);
end
assign tmc.valid = is_tmc;
assign tmc.tmask = tmc_new_mask;
assign tmc.tmask = `NUM_THREADS'(gpu_req_if.rs1_data[gpu_req_if.tid]);
// wspawn
wire [31:0] wspawn_pc = gpu_req_if.rs2_data;
wire [`NUM_WARPS-1:0] wspawn_wmask;
for (genvar i = 0; i < `NUM_WARPS; i++) begin
assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]);
assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[gpu_req_if.tid]);
end
assign wspawn.valid = is_wspawn;
assign wspawn.wmask = wspawn_wmask;
@ -56,7 +52,7 @@ module VX_gpu_unit #(
wire [`NUM_THREADS-1:0] split_else_mask;
for (genvar i = 0; i < `NUM_THREADS; i++) begin
wire taken = gpu_req_if.rs1_data[i][0];
wire taken = gpu_req_if.rs1_data[i][gpu_req_if.tid];
assign split_then_mask[i] = gpu_req_if.tmask[i] & taken;
assign split_else_mask[i] = gpu_req_if.tmask[i] & ~taken;
end
@ -70,7 +66,7 @@ module VX_gpu_unit #(
// barrier
assign barrier.valid = is_bar;
assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0];
assign barrier.id = gpu_req_if.rs1_data[gpu_req_if.tid][`NB_BITS-1:0];
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1);
// output

View file

@ -119,15 +119,15 @@ module VX_instr_demux (
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
VX_skid_buffer #(
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32)),
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + + `NT_BITS + (`NUM_THREADS * 32 + 32)),
.OUTPUT_REG (1)
) gpu_buffer (
.clk (clk),
.reset (reset),
.valid_in (gpu_req_valid),
.ready_in (gpu_req_ready),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, tid, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.tid, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
.valid_out (gpu_req_if.valid),
.ready_out (gpu_req_if.ready)
);

View file

@ -108,6 +108,7 @@ module VX_pipeline #(
///////////////////////////////////////////////////////////////////////////
VX_fetch_to_csr_if fetch_to_csr_if();
VX_cmt_to_csr_if cmt_to_csr_if();
VX_decode_if decode_if();
VX_branch_ctl_if branch_ctl_if();
@ -155,6 +156,7 @@ module VX_pipeline #(
.warp_ctl_if (warp_ctl_if),
.branch_ctl_if (branch_ctl_if),
.ifetch_rsp_if (ifetch_rsp_if),
.fetch_to_csr_if(fetch_to_csr_if),
.busy (busy)
);
@ -209,7 +211,8 @@ module VX_pipeline #(
.dcache_req_if (dcache_req_if),
.dcache_rsp_if (dcache_rsp_if),
.cmt_to_csr_if (cmt_to_csr_if),
.cmt_to_csr_if (cmt_to_csr_if),
.fetch_to_csr_if(fetch_to_csr_if),
.alu_req_if (alu_req_if),
.lsu_req_if (lsu_req_if),

View file

@ -16,6 +16,8 @@ module VX_warp_sched #(
VX_ifetch_rsp_if ifetch_rsp_if,
VX_ifetch_req_if ifetch_req_if,
VX_fetch_to_csr_if fetch_to_csr_if,
output wire busy
);
@ -153,6 +155,9 @@ module VX_warp_sched #(
end
end
// export thread mask register
assign fetch_to_csr_if.thread_masks = thread_masks;
// calculate active barrier status
`IGNORE_UNUSED_BEGIN

View file

@ -0,0 +1,12 @@
// Include guard: keeps the interface from being declared twice when this
// file is pulled in more than once.
`ifndef VX_FETCH_TO_CSR_IF
`define VX_FETCH_TO_CSR_IF
`include "VX_define.vh"
// Fetch-to-CSR interface.
// Carries the per-warp thread mask registers so they can be read back
// through the CSR_TMASK control/status register:
//   - driven by VX_warp_sched (assign fetch_to_csr_if.thread_masks = thread_masks)
//   - read by VX_csr_data, which selects one entry with the reading warp id
interface VX_fetch_to_csr_if ();
// One `NUM_THREADS-bit mask per warp; the unpacked index is the warp id.
wire [`NUM_THREADS-1:0] thread_masks [`NUM_WARPS-1:0];
endinterface
`endif

View file

@ -12,6 +12,7 @@ interface VX_gpu_req_if();
wire [31:0] PC;
wire [31:0] next_PC;
wire [`GPU_BITS-1:0] op_type;
wire [`NT_BITS-1:0] tid;
wire [`NUM_THREADS-1:0][31:0] rs1_data;
wire [31:0] rs2_data;
wire [`NR_BITS-1:0] rd;

View file

@ -121,6 +121,13 @@ inline int vx_core_id() {
return result;
}
// Return the current thread mask, read from the CSR_TMASK
// control/status register.
inline int vx_thread_mask() {
    int tmask;
    // CSR_TMASK is passed as an immediate ("i") so the CSR number is
    // encoded directly in the csrr instruction.
    asm volatile (
        "csrr %0, %1"
        : "=r"(tmask)
        : "i"(CSR_TMASK)
    );
    return tmask;
}
// Return the number of threads in a warp
inline int vx_num_threads() {
int result;