mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
enable barrier and spawn skip mode if N=1
This commit is contained in:
parent
b6aa44f39f
commit
717b2e9ba1
9 changed files with 54 additions and 39 deletions
|
@ -51,6 +51,7 @@ package VX_gpu_pkg;
|
|||
`else
|
||||
logic [`NW_WIDTH-1:0] size_m1;
|
||||
`endif
|
||||
logic is_noop;
|
||||
} barrier_t;
|
||||
|
||||
typedef struct packed {
|
||||
|
|
|
@ -85,11 +85,10 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
// barriers
|
||||
reg [`NUM_BARRIERS-1:0][`NUM_WARPS-1:0] barrier_masks, barrier_masks_n;
|
||||
reg [`NUM_BARRIERS-1:0][`NW_WIDTH-1:0] barrier_ctrs, barrier_ctrs_n;
|
||||
reg [`NUM_WARPS-1:0] barrier_stalls, barrier_stalls_n;
|
||||
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_barrier_count;
|
||||
wire [`NUM_WARPS-1:0] curr_barrier_mask;
|
||||
reg [`NUM_WARPS-1:0] curr_barrier_mask_p1;
|
||||
`ifdef GBAR_ENABLE
|
||||
reg [`NUM_WARPS-1:0] curr_barrier_mask_n;
|
||||
reg gbar_req_valid;
|
||||
reg [`NB_WIDTH-1:0] gbar_req_id;
|
||||
reg [`NC_WIDTH-1:0] gbar_req_size_m1;
|
||||
|
@ -103,15 +102,12 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
wire [`CLOG2(`NUM_WARPS+1)-1:0] active_warps_cnt;
|
||||
`POP_COUNT(active_warps_cnt, active_warps);
|
||||
|
||||
assign curr_barrier_mask = barrier_masks[warp_ctl_if.barrier.id];
|
||||
`POP_COUNT(active_barrier_count, curr_barrier_mask);
|
||||
`UNUSED_VAR (active_barrier_count)
|
||||
|
||||
always @(*) begin
|
||||
active_warps_n = active_warps;
|
||||
stalled_warps_n = stalled_warps;
|
||||
thread_masks_n = thread_masks;
|
||||
barrier_masks_n = barrier_masks;
|
||||
barrier_ctrs_n = barrier_ctrs;
|
||||
barrier_stalls_n= barrier_stalls;
|
||||
warp_pcs_n = warp_pcs;
|
||||
|
||||
|
@ -154,25 +150,29 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
end
|
||||
|
||||
// barrier handling
|
||||
`ifdef GBAR_ENABLE
|
||||
curr_barrier_mask_n = curr_barrier_mask;
|
||||
curr_barrier_mask_n[warp_ctl_if.wid] = 1;
|
||||
`endif
|
||||
curr_barrier_mask_p1 = barrier_masks[warp_ctl_if.barrier.id];
|
||||
curr_barrier_mask_p1[warp_ctl_if.wid] = 1;
|
||||
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid) begin
|
||||
if (~warp_ctl_if.barrier.is_global
|
||||
&& (active_barrier_count[`NW_WIDTH-1:0] == warp_ctl_if.barrier.size_m1[`NW_WIDTH-1:0])) begin
|
||||
barrier_masks_n[warp_ctl_if.barrier.id] = '0;
|
||||
barrier_stalls_n &= ~barrier_masks[warp_ctl_if.barrier.id];
|
||||
if (~warp_ctl_if.barrier.is_noop) begin
|
||||
if (~warp_ctl_if.barrier.is_global
|
||||
&& (barrier_ctrs[warp_ctl_if.barrier.id] == `NW_WIDTH'(warp_ctl_if.barrier.size_m1))) begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
|
||||
barrier_masks_n[warp_ctl_if.barrier.id] = '0; // reset barrier mask
|
||||
stalled_warps_n &= ~barrier_masks[warp_ctl_if.barrier.id]; // unlock warps
|
||||
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
|
||||
end else begin
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = barrier_ctrs[warp_ctl_if.barrier.id] + 1;
|
||||
barrier_masks_n[warp_ctl_if.barrier.id] = curr_barrier_mask_p1;
|
||||
end
|
||||
end else begin
|
||||
barrier_masks_n[warp_ctl_if.barrier.id][warp_ctl_if.wid] = 1;
|
||||
barrier_stalls_n[warp_ctl_if.wid] = 1;
|
||||
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
|
||||
end
|
||||
stalled_warps_n[warp_ctl_if.wid] = 0; // unlock warp
|
||||
end
|
||||
`ifdef GBAR_ENABLE
|
||||
if (gbar_bus_if.rsp_valid && (gbar_req_id == gbar_bus_if.rsp_id)) begin
|
||||
barrier_masks_n[gbar_bus_if.rsp_id] = '0;
|
||||
barrier_stalls_n = '0; // unlock all warps
|
||||
barrier_ctrs_n[warp_ctl_if.barrier.id] = '0; // reset barrier counter
|
||||
barrier_masks_n[gbar_bus_if.rsp_id] = '0; // reset barrier mask
|
||||
stalled_warps_n = '0; // unlock all warps
|
||||
end
|
||||
`endif
|
||||
|
||||
|
@ -212,6 +212,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
barrier_masks <= '0;
|
||||
barrier_ctrs <= '0;
|
||||
`ifdef GBAR_ENABLE
|
||||
gbar_req_valid <= 0;
|
||||
`endif
|
||||
|
@ -235,6 +236,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
thread_masks <= thread_masks_n;
|
||||
warp_pcs <= warp_pcs_n;
|
||||
barrier_masks <= barrier_masks_n;
|
||||
barrier_ctrs <= barrier_ctrs_n;
|
||||
barrier_stalls <= barrier_stalls_n;
|
||||
is_single_warp <= (active_warps_cnt == $bits(active_warps_cnt)'(1));
|
||||
|
||||
|
@ -253,10 +255,11 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
`ifdef GBAR_ENABLE
|
||||
if (warp_ctl_if.valid && warp_ctl_if.barrier.valid
|
||||
&& warp_ctl_if.barrier.is_global
|
||||
&& (curr_barrier_mask_n == active_warps)) begin
|
||||
&& !warp_ctl_if.barrier.is_noop
|
||||
&& (curr_barrier_mask_p1 == active_warps)) begin
|
||||
gbar_req_valid <= 1;
|
||||
gbar_req_id <= warp_ctl_if.barrier.id;
|
||||
gbar_req_size_m1 <= warp_ctl_if.barrier.size_m1[`NC_WIDTH-1:0];
|
||||
gbar_req_size_m1 <= `NC_WIDTH'(warp_ctl_if.barrier.size_m1);
|
||||
end
|
||||
if (gbar_bus_if.req_valid && gbar_bus_if.req_ready) begin
|
||||
gbar_req_valid <= 0;
|
||||
|
@ -307,7 +310,7 @@ module VX_schedule import VX_gpu_pkg::*; #(
|
|||
|
||||
// schedule the next ready warp
|
||||
|
||||
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~(stalled_warps | barrier_stalls);
|
||||
wire [`NUM_WARPS-1:0] ready_warps = active_warps & ~stalled_warps;
|
||||
|
||||
VX_lzc #(
|
||||
.N (`NUM_WARPS),
|
||||
|
|
|
@ -126,6 +126,7 @@ module VX_wctl_unit import VX_gpu_pkg::*; #(
|
|||
assign barrier.is_global = 1'b0;
|
||||
`endif
|
||||
assign barrier.size_m1 = rs2_data[$bits(barrier.size_m1)-1:0] - $bits(barrier.size_m1)'(1);
|
||||
assign barrier.is_noop = (rs2_data[$bits(barrier.size_m1)-1:0] == $bits(barrier.size_m1)'(1));
|
||||
|
||||
// wspawn
|
||||
|
||||
|
|
|
@ -757,7 +757,11 @@ int vx_check_occupancy(vx_device_h hdevice, uint32_t group_size, uint32_t* max_b
|
|||
RT_CHECK(vx_dev_caps(hdevice, VX_CAPS_NUM_BARRIERS, &num_barriers), {
|
||||
return _ret;
|
||||
});
|
||||
*max_barriers = num_barriers / groups_per_core;
|
||||
if (warps_per_group < 2) {
|
||||
*max_barriers = -1;
|
||||
} else {
|
||||
*max_barriers = num_barriers / groups_per_core;
|
||||
}
|
||||
}
|
||||
|
||||
// check local memory capacity
|
||||
|
|
|
@ -385,12 +385,12 @@ void Core::resume(uint32_t wid) {
|
|||
emulator_.resume(wid);
|
||||
}
|
||||
|
||||
void Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
||||
emulator_.barrier(bar_id, count, wid);
|
||||
bool Core::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
||||
return emulator_.barrier(bar_id, count, wid);
|
||||
}
|
||||
|
||||
void Core::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
emulator_.wspawn(num_warps, nextPC);
|
||||
bool Core::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
return emulator_.wspawn(num_warps, nextPC);
|
||||
}
|
||||
|
||||
void Core::attach_ram(RAM* ram) {
|
||||
|
|
|
@ -101,9 +101,9 @@ public:
|
|||
|
||||
void resume(uint32_t wid);
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
|
||||
bool barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
|
||||
|
||||
void wspawn(uint32_t num_warps, Word nextPC);
|
||||
bool wspawn(uint32_t num_warps, Word nextPC);
|
||||
|
||||
uint32_t id() const {
|
||||
return core_id_;
|
||||
|
|
|
@ -224,13 +224,20 @@ void Emulator::resume(uint32_t wid) {
|
|||
}
|
||||
}
|
||||
|
||||
void Emulator::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
bool Emulator::wspawn(uint32_t num_warps, Word nextPC) {
|
||||
num_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
|
||||
if (num_warps < 2)
|
||||
return true;
|
||||
wspawn_.valid = true;
|
||||
wspawn_.num_warps = std::min<uint32_t>(num_warps, arch_.num_warps());
|
||||
wspawn_.num_warps = num_warps;
|
||||
wspawn_.nextPC = nextPC;
|
||||
return false;
|
||||
}
|
||||
|
||||
void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
||||
bool Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
||||
if (count < 2)
|
||||
return true;
|
||||
|
||||
uint32_t bar_idx = bar_id & 0x7fffffff;
|
||||
bool is_global = (bar_id >> 31);
|
||||
|
||||
|
@ -257,6 +264,7 @@ void Emulator::barrier(uint32_t bar_id, uint32_t count, uint32_t wid) {
|
|||
barrier.reset();
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
void Emulator::icache_read(void *data, uint64_t addr, uint32_t size) {
|
||||
|
|
|
@ -48,9 +48,9 @@ public:
|
|||
|
||||
void resume(uint32_t wid);
|
||||
|
||||
void barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
|
||||
bool barrier(uint32_t bar_id, uint32_t count, uint32_t wid);
|
||||
|
||||
void wspawn(uint32_t num_warps, Word nextPC);
|
||||
bool wspawn(uint32_t num_warps, Word nextPC);
|
||||
|
||||
int get_exitcode() const;
|
||||
|
||||
|
|
|
@ -260,8 +260,7 @@ void SfuUnit::tick() {
|
|||
output.push(trace, 1);
|
||||
if (trace->eop) {
|
||||
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
|
||||
core_->wspawn(trace_data->arg1, trace_data->arg2);
|
||||
release_warp = false;
|
||||
release_warp = core_->wspawn(trace_data->arg1, trace_data->arg2);
|
||||
}
|
||||
break;
|
||||
case SfuType::TMC:
|
||||
|
@ -277,8 +276,7 @@ void SfuUnit::tick() {
|
|||
output.push(trace, 1);
|
||||
if (trace->eop) {
|
||||
auto trace_data = std::dynamic_pointer_cast<SFUTraceData>(trace->data);
|
||||
core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
|
||||
release_warp = false;
|
||||
release_warp = core_->barrier(trace_data->arg1, trace_data->arg2, trace->wid);
|
||||
}
|
||||
} break;
|
||||
case SfuType::CMOV:
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue