Merge branch 'develop' of https://github.com/vortexgpgpu/vortex into develop

This commit is contained in:
Blaise Tine 2023-12-15 14:11:38 -08:00
commit d79ff077b7
9 changed files with 116 additions and 146 deletions

View file

@ -217,7 +217,7 @@ package VX_gpu_pkg;
function logic [ISSUE_WIS_W-1:0] wid_to_wis(
input logic [`NW_WIDTH-1:0] wid
);
wid_to_wis = ISSUE_WIS_W'(wid >> `CLOG2(`ISSUE_WIDTH));
wid_to_wis = ISSUE_WIS_W'({1'b0, wid} >> `CLOG2(`ISSUE_WIDTH));
endfunction
function logic [ISSUE_ADDRW-1:0] wis_to_addr(

View file

@ -49,12 +49,12 @@ module Vortex import VX_gpu_pkg::*; (
cache_perf_t perf_l3cache;
mem_perf_t mem_perf;
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.icache = 'x;
assign mem_perf_if.dcache = 'x;
assign mem_perf_if.l2cache = 'x;
assign mem_perf_if.l3cache = perf_l3cache;
assign mem_perf_if.smem = 'x;
assign mem_perf_if.mem = mem_perf;
assign mem_perf_if.mem = mem_perf;
`endif
VX_mem_bus_if #(

View file

@ -130,6 +130,12 @@ module VX_core_top import VX_gpu_pkg::*; #(
`ifdef PERF_ENABLE
VX_mem_perf_if mem_perf_if();
assign mem_perf_if.smem = '0;
assign mem_perf_if.icache = '0;
assign mem_perf_if.dcache = '0;
assign mem_perf_if.l2cache = '0;
assign mem_perf_if.l3cache = '0;
assign mem_perf_if.mem = '0;
`endif
`ifdef SCOPE

View file

@ -70,8 +70,8 @@ module VX_dispatch_unit import VX_gpu_pkg::*; #(
always @(posedge clk) begin
if (reset) begin
batch_idx <= '0;
end else if (batch_done) begin
batch_idx <= batch_idx + BATCH_COUNT_W'(1);
end else begin
batch_idx <= batch_idx + BATCH_COUNT_W'(batch_done);
end
end
end else begin

View file

@ -46,6 +46,8 @@ module VX_operands import VX_gpu_pkg::*; #(
reg [`NUM_THREADS-1:0] cache_tmask_n [ISSUE_RATIO-1:0];
reg [ISSUE_RATIO-1:0] cache_eop, cache_eop_n;
reg valid_out_r;
reg [DATAW-1:0] data_out_r;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs1_data, rs1_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs2_data, rs2_data_n;
reg [`NUM_THREADS-1:0][`XLEN-1:0] rs3_data, rs3_data_n;
@ -57,11 +59,11 @@ module VX_operands import VX_gpu_pkg::*; #(
reg rs3_ready, rs3_ready_n;
reg data_ready, data_ready_n;
wire ready_out = operands_if[i].ready;
wire is_rs1_zero = (scoreboard_if[i].data.rs1 == 0);
wire is_rs2_zero = (scoreboard_if[i].data.rs2 == 0);
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
VX_operands_if staging_if();
wire is_rs3_zero = (scoreboard_if[i].data.rs3 == 0);
always @(*) begin
state_n = state;
@ -82,7 +84,7 @@ module VX_operands import VX_gpu_pkg::*; #(
case (state)
STATE_IDLE: begin
if (staging_if.valid && staging_if.ready) begin
if (valid_out_r && ready_out) begin
data_ready_n = 0;
end
if (scoreboard_if[i].valid && data_ready_n == 0) begin
@ -170,31 +172,70 @@ module VX_operands import VX_gpu_pkg::*; #(
end
always @(posedge clk) begin
if (reset) begin
if (reset) begin
state <= STATE_IDLE;
gpr_rd_rid <= '0;
gpr_rd_wis <= '0;
cache_eop <= {ISSUE_RATIO{1'b1}};
cache_eop <= {ISSUE_RATIO{1'b1}};
data_ready <= 0;
valid_out_r <= 0;
end else begin
state <= state_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
cache_eop <= cache_eop_n;
data_ready <= data_ready_n;
data_ready <= data_ready_n;
if (~valid_out_r) begin
valid_out_r <= scoreboard_if[i].valid && data_ready;
end else if (ready_out) begin
valid_out_r <= 0;
end
end
end
if (~valid_out_r) begin
data_out_r <= {scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd};
end
gpr_rd_rid <= gpr_rd_rid_n;
gpr_rd_wis <= gpr_rd_wis_n;
rs2_ready <= rs2_ready_n;
rs3_ready <= rs3_ready_n;
rs2 <= rs2_n;
rs3 <= rs3_n;
rs1_data <= rs1_data_n;
rs2_data <= rs2_data_n;
rs3_data <= rs3_data_n;
cache_data <= cache_data_n;
cache_reg <= cache_reg_n;
cache_tmask <= cache_tmask_n;
end
assign operands_if[i].valid = valid_out_r;
assign {operands_if[i].data.uuid,
operands_if[i].data.wis,
operands_if[i].data.tmask,
operands_if[i].data.PC,
operands_if[i].data.wb,
operands_if[i].data.ex_type,
operands_if[i].data.op_type,
operands_if[i].data.op_mod,
operands_if[i].data.use_PC,
operands_if[i].data.use_imm,
operands_if[i].data.imm,
operands_if[i].data.rd} = data_out_r;
assign operands_if[i].data.rs1_data = rs1_data;
assign operands_if[i].data.rs2_data = rs2_data;
assign operands_if[i].data.rs3_data = rs3_data;
assign scoreboard_if[i].ready = ~valid_out_r && data_ready;
// GPR banks
`ifdef GPR_RESET
@ -228,74 +269,6 @@ module VX_operands import VX_gpu_pkg::*; #(
.rdata (gpr_rd_data[j])
);
end
// staging buffer
`RESET_RELAY (stg_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW)
) stg_buf (
.clk (clk),
.reset (stg_buf_reset),
.valid_in (scoreboard_if[i].valid),
.ready_in (scoreboard_if[i].ready),
.data_in ({
scoreboard_if[i].data.uuid,
scoreboard_if[i].data.wis,
scoreboard_if[i].data.tmask,
scoreboard_if[i].data.PC,
scoreboard_if[i].data.wb,
scoreboard_if[i].data.ex_type,
scoreboard_if[i].data.op_type,
scoreboard_if[i].data.op_mod,
scoreboard_if[i].data.use_PC,
scoreboard_if[i].data.use_imm,
scoreboard_if[i].data.imm,
scoreboard_if[i].data.rd}),
.data_out ({
staging_if.data.uuid,
staging_if.data.wis,
staging_if.data.tmask,
staging_if.data.PC,
staging_if.data.wb,
staging_if.data.ex_type,
staging_if.data.op_type,
staging_if.data.op_mod,
staging_if.data.use_PC,
staging_if.data.use_imm,
staging_if.data.imm,
staging_if.data.rd}),
.valid_out (staging_if.valid),
.ready_out (staging_if.ready)
);
assign staging_if.data.rs1_data = rs1_data;
assign staging_if.data.rs2_data = rs2_data;
assign staging_if.data.rs3_data = rs3_data;
// output buffer
wire valid_stg, ready_stg;
assign valid_stg = staging_if.valid && data_ready;
assign staging_if.ready = ready_stg && data_ready;
`RESET_RELAY (out_buf_reset, reset);
VX_elastic_buffer #(
.DATAW (DATAW + (3 * `NUM_THREADS * `XLEN)),
.SIZE (2),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (out_buf_reset),
.valid_in (valid_stg),
.ready_in (ready_stg),
.data_in (staging_if.data),
.data_out (operands_if[i].data),
.valid_out (operands_if[i].valid),
.ready_out (operands_if[i].ready)
);
end
endmodule

View file

@ -51,7 +51,6 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
for (genvar i = 0; i < `ISSUE_WIDTH; ++i) begin
reg [`UP(ISSUE_RATIO)-1:0][`NUM_REGS-1:0] inuse_regs;
VX_ibuffer_if staging_if();
wire writeback_fire = writeback_if[i].valid && writeback_if[i].data.eop;
@ -84,10 +83,17 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
reg [DATAW-1:0] data_out_r;
reg valid_out_r;
wire ready_out;
wire [3:0] ready_masks = ~{inuse_rd, inuse_rs1, inuse_rs2, inuse_rs3};
wire deps_ready = (& ready_masks);
wire valid_in = ibuffer_if[i].valid && deps_ready;
wire ready_in = ~valid_out_r && deps_ready;
wire [DATAW-1:0] data_in = ibuffer_if[i].data;
assign ready_out = scoreboard_if[i].ready;
always @(posedge clk) begin
if (reset) begin
valid_out_r <= 0;
@ -97,40 +103,25 @@ module VX_scoreboard import VX_gpu_pkg::*; #(
inuse_regs[writeback_if[i].data.wis][writeback_if[i].data.rd] <= 0;
end
if (~valid_out_r) begin
valid_out_r <= ibuffer_if[i].valid && deps_ready;
end else if (staging_if.ready) begin
if (staging_if.data.wb) begin
inuse_regs[staging_if.data.wis][staging_if.data.rd] <= 1;
valid_out_r <= valid_in;
end else if (ready_out) begin
if (scoreboard_if[i].data.wb) begin
inuse_regs[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= 1;
`ifdef PERF_ENABLE
inuse_units[staging_if.data.wis][staging_if.data.rd] <= staging_if.data.ex_type;
inuse_units[scoreboard_if[i].data.wis][scoreboard_if[i].data.rd] <= scoreboard_if[i].data.ex_type;
`endif
end
valid_out_r <= 0;
end
end
if (~valid_out_r) begin
data_out_r <= ibuffer_if[i].data;
data_out_r <= data_in;
end
end
assign ibuffer_if[i].ready = ~valid_out_r && deps_ready;
assign staging_if.valid = valid_out_r;
assign staging_if.data = data_out_r;
VX_elastic_buffer #(
.DATAW (DATAW),
.SIZE (0),
.OUT_REG (2)
) out_buf (
.clk (clk),
.reset (reset),
.valid_in (staging_if.valid),
.ready_in (staging_if.ready),
.data_in (staging_if.data),
.data_out (scoreboard_if[i].data),
.valid_out (scoreboard_if[i].valid),
.ready_out (scoreboard_if[i].ready)
);
assign ibuffer_if[i].ready = ready_in;
assign scoreboard_if[i].valid = valid_out_r;
assign scoreboard_if[i].data = data_out_r;
`ifdef SIMULATION
reg [31:0] timeout_ctr;

View file

@ -355,11 +355,14 @@ module VX_fpu_cvt import VX_fpu_pkg::*; #(
wire [NUM_LANES-1:0][INT_WIDTH-1:0] tmp_result_s3;
for (genvar i = 0; i < NUM_LANES; ++i) begin
fflags_t i2f_regular_status_s3 = i2f_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
fflags_t f2i_regular_status_s3 = f2i_round_has_sticky_s3[i] ? 5'h1 : 5'h0;
fflags_t i2f_regular_status_s3, f2i_regular_status_s3;
fflags_t i2f_status_s3, f2i_status_s3;
fflags_t i2f_status_s3 = i2f_regular_status_s3;
fflags_t f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
assign i2f_regular_status_s3 = {4'h0, i2f_round_has_sticky_s3[i]};
assign f2i_regular_status_s3 = {4'h0, f2i_round_has_sticky_s3[i]};
assign i2f_status_s3 = i2f_regular_status_s3;
assign f2i_status_s3 = f2i_result_is_special_s3[i] ? f2i_special_status_s3[i] : f2i_regular_status_s3;
wire [INT_WIDTH-1:0] i2f_result_s3 = fmt_result_s3[i];
wire [INT_WIDTH-1:0] f2i_result_s3 = f2i_result_is_special_s3[i] ? f2i_special_result_s3[i] : rounded_int_res_s3[i];

View file

@ -201,9 +201,7 @@ module VX_fifo_queue #(
rd_ptr_r <= '0;
rd_ptr_n_r <= 1;
end else begin
if (push) begin
wr_ptr_r <= wr_ptr_r + ADDRW'(1);
end
wr_ptr_r <= wr_ptr_r + ADDRW'(push);
if (pop) begin
rd_ptr_r <= rd_ptr_n_r;
if (DEPTH > 2) begin

View file

@ -9,16 +9,16 @@ all:
$(MAKE) -C dotproduct
$(MAKE) -C kmeans
$(MAKE) -C spmv
$(MAKE) -C transpose
$(MAKE) -C cutcp
$(MAKE) -C vectorhypot
$(MAKE) -C stencil
$(MAKE) -C mri-q
$(MAKE) -C lbm
$(MAKE) -C oclprintf
$(MAKE) -C blackscholes
$(MAKE) -C sgemm2
$(MAKE) -C transpose
$(MAKE) -C convolution
# $(MAKE) -C cutcp
# $(MAKE) -C sgemm2
# $(MAKE) -C vectorhypot
# $(MAKE) -C mri-q
run-simx:
$(MAKE) -C vecadd run-simx
@ -37,10 +37,10 @@ run-simx:
$(MAKE) -C blackscholes run-simx
$(MAKE) -C transpose run-simx
$(MAKE) -C convolution run-simx
$(MAKE) -C cutcp run-simx
$(MAKE) -C vectorhypot run-simx
$(MAKE) -C mri-q run-simx
# $(MAKE) -C cutcp run-simx
# $(MAKE) -C sgemm2 run-simx
# $(MAKE) -C vectorhypot run-simx
# $(MAKE) -C mri-q run-simx
run-rtlsim:
$(MAKE) -C vecadd run-rtlsim
@ -98,15 +98,15 @@ clean:
$(MAKE) -C kmeans clean
$(MAKE) -C spmv clean
$(MAKE) -C transpose clean
$(MAKE) -C cutcp clean
$(MAKE) -C vectorhypot clean
$(MAKE) -C stencil clean
$(MAKE) -C mri-q clean
$(MAKE) -C lbm clean
$(MAKE) -C oclprintf clean
$(MAKE) -C blackscholes clean
$(MAKE) -C sgemm2 clean
$(MAKE) -C convolution clean
# $(MAKE) -C cutcp clean
# $(MAKE) -C sgemm2 clean
# $(MAKE) -C vectorhypot clean
# $(MAKE) -C mri-q clean
clean-all:
$(MAKE) -C vecadd clean-all
@ -114,19 +114,18 @@ clean-all:
$(MAKE) -C psort clean-all
$(MAKE) -C saxpy clean-all
$(MAKE) -C sfilter clean-all
$(MAKE) -C sfilter clean-all
$(MAKE) -C nearn clean-all
$(MAKE) -C guassian clean-all
$(MAKE) -C dotproduct clean-all
$(MAKE) -C kmeans clean-all
$(MAKE) -C spmv clean-all
$(MAKE) -C transpose clean-all
$(MAKE) -C cutcp clean-all
$(MAKE) -C vectorhypot clean-all
$(MAKE) -C stencil clean-all
$(MAKE) -C mri-q clean-all
$(MAKE) -C lbm clean-all
$(MAKE) -C oclprintf clean-all
$(MAKE) -C blackscholes clean-all
$(MAKE) -C sgemm2 clean-all
$(MAKE) -C convolution clean-all
# $(MAKE) -C cutcp clean-all
# $(MAKE) -C sgemm2 clean-all
# $(MAKE) -C vectorhypot clean-all
# $(MAKE) -C mri-q clean-all