Enable skip (no-op) mode for barrier and wspawn when N=1

This commit is contained in:
Blaise Tine 2024-05-08 04:23:38 -07:00
parent b6aa44f39f
commit 717b2e9ba1
9 changed files with 54 additions and 39 deletions

View file

@ -51,6 +51,7 @@ package VX_gpu_pkg;
`else
logic [`NW_WIDTH-1:0] size_m1;
`endif
logic is_noop;
} barrier_t;
typedef struct packed {

View file

@ -85,11 +85,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
// barriers
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
reg [`NUM_BARRIERS-1:0][`NW_WIDTH-1:0] barrier_ctrs, barrier_ctrs_n;
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
wire [`NUM_WARPS-1:0] curr_barrier_mask;
reg [`NUM_WARPS-1:0] curr_barrier_mask_p1;
`ifdef GBAR_ENABLE
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
reg gbar_req_valid;
reg [`NB_WIDTH-1:0] gbar_req_id;
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
@ -103,15 +102,12 @@ module VX_schedule import VX_gpu_pkg::*; #(
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_warps_cnt;
`POP_COUNT(active_warps_cnt, active_warps);
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
`POP_COUNT(active_barrier_count, curr_barrier_mask);
`UNUSED_VAR (active_barrier_count)
always @(*) begin
active_warps_n = active_warps;
stalled_warps_n = stalled_warps;
thread_masks_n = thread_masks;
barrier_masks_n = barrier_masks;
barrier_ctrs_n = barrier_ctrs;
barrier_stalls_n= barrier_stalls;
warp_pcs_n = warp_pcs;
@ -154,25 +150,29 @@ module VX_schedule import VX_gpu_pkg::*; #(
end
// barrier handling
`ifdef GBAR_ENABLE
curr_barrier_mask_n = curr_barrier_mask;
curr_barrier_mask_n[warp_ctl_if.wid] = 1;
`endif
curr_barrier_mask_p1 = barrier_masks[warp_ctl_if.barrier.id];
curr_barrier_mask_p1[warp_ctl_if.wid] = 1;
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
if (~warp_ctl_if.barrier.is_global
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
if (~warp_ctl_if.barrier.is_noop) begin
if (~warp_ctl_if.barrier.is_global
&& (barrier_ctrs[warp_ctl_if.barrier.id] == `NW_WIDTH'(warp_ctl_if.barrier.size_m1))) begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[warp_ctl_if.barrier.id] = '0; // reset barrier mask
stalled_warps_n &= ~barrier_masks[warp_ctl_if.barrier.id]; // unlock warps
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end else begin
barrier_ctrs_n[warp_ctl_if.barrier.id] = barrier_ctrs[warp_ctl_if.barrier.id] + 1;
barrier_masks_n[warp_ctl_if.barrier.id] = curr_barrier_mask_p1;
end
end else begin
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
barrier_stalls_n[warp_ctl_if.wid] = 1;
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
end
`ifdef GBAR_ENABLE
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
barrier_stalls_n = '0; // unlock all warps
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
stalled_warps_n = '0; // unlock all warps
end
`endif
@ -212,6 +212,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (reset) begin
barrier_masks <= '0;
barrier_ctrs <= '0;
`ifdef GBAR_ENABLE
gbar_req_valid <= 0;
`endif
@ -235,6 +236,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
thread_masks <= thread_masks_n;
warp_pcs <= warp_pcs_n;
barrier_masks <= barrier_masks_n;
barrier_ctrs <= barrier_ctrs_n;
barrier_stalls <= barrier_stalls_n;
is_single_warp <= (active_warps_cnt == $bits(active_warps_cnt)'(1));
@ -253,10 +255,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
`ifdef GBAR_ENABLE
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid
&& warp_ctl_if.barrier.is_global
&& (curr_barrier_mask_n == active_warps)) begin
&& !warp_ctl_if.barrier.is_noop
&& (curr_barrier_mask_p1 == active_warps)) begin
gbar_req_valid <= 1;
gbar_req_id <= warp_ctl_if.barrier.id;
gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
gbar_req_size_m1 <= `NC_WIDTH'(warp_ctl_if.barrier.size_m1);
end
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
gbar_req_valid <= 0;
@ -307,7 +310,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
// schedule the next ready warp
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~stalled_warps;
VX_lzc #(
.N (`NUM_WARPS),

View file

@ -126,6 +126,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
assign barrier.is_global = 1'b0;
`endif
assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
assign barrier.is_noop = (rs2_data[$bits(barrier.size_m1)-1:0] == $bits(barrier.size_m1)'(1));
// wspawn

View file

@ -757,7 +757,11 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
return _ret;
});
*max_barriers = num_barriers / groups_per_core;
if (warps_per_group < 2) {
*max_barriers = -1;
} else {
*max_barriers = num_barriers / groups_per_core;
}
}
// check local memory capacity

View file

@ -385,12 +385,12 @@ void Core::resume(uint32_t wid) {
emulator_.resume(wid);
}
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
emulator_.barrier(bar_id, count, wid);
bool Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
return emulator_.barrier(bar_id, count, wid);
}
void Core::wspawn(uint32_t num_warps, Word nextPC) {
emulator_.wspawn(num_warps, nextPC);
bool Core::wspawn(uint32_t num_warps, Word nextPC) {
return emulator_.wspawn(num_warps, nextPC);
}
void Core::attach_ram(RAM* ram) {

View file

@ -101,9 +101,9 @@ public:
void resume(uint32_t wid);
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
bool barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
void wspawn(uint32_t num_warps, Word nextPC);
bool wspawn(uint32_t num_warps, Word nextPC);
uint32_t id() const {
return core_id_;

View file

@ -224,13 +224,20 @@ void Emulator::resume(uint32_t wid) {
}
}
void Emulator::wspawn(uint32_t num_warps, Word nextPC) {
bool Emulator::wspawn(uint32_t num_warps, Word nextPC) {
num_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
if (num_warps < 2)
return true;
wspawn_.valid = true;
wspawn_.num_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
wspawn_.num_warps = num_warps;
wspawn_.nextPC = nextPC;
return false;
}
void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
bool Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
if (count < 2)
return true;
uint32_t bar_idx = bar_id & 0x7fffffff;
bool is_global = (bar_id >> 31);
@ -257,6 +264,7 @@ void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
barrier.reset();
}
}
return false;
}
void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) {

View file

@ -48,9 +48,9 @@ public:
void resume(uint32_t wid);
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
bool barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
void wspawn(uint32_t num_warps, Word nextPC);
bool wspawn(uint32_t num_warps, Word nextPC);
int get_exitcode() const;

View file

@ -260,8 +260,7 @@ void SfuUnit::tick() {
output.push(trace, 1);
if (trace->eop) {
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
core_->wspawn(trace_data->arg1, trace_data->arg2);
release_warp = false;
release_warp = core_->wspawn(trace_data->arg1, trace_data->arg2);
}
break;
case SfuType::TMC:
@ -277,8 +276,7 @@ void SfuUnit::tick() {
output.push(trace, 1);
if (trace->eop) {
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
release_warp = false;
release_warp = core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
}
} break;
case SfuType::CMOV: