tex_unit critical path refactoring

This commit is contained in:
Blaise Tine 2021-08-01 00:54:47 -07:00
parent ee46bc8a48
commit fd0d908a68
5 changed files with 59 additions and 48 deletions

View file

@ -120,9 +120,11 @@ module VX_lsu_unit #(
wire [`NUM_THREADS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_req_fire_any = (| dcache_req_fire);
wire dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
wire mbuf_push = (| dcache_req_fire)
wire mbuf_push = dcache_req_fire_any
&& is_req_start // first submission only
&& req_wb; // loads only
@ -333,7 +335,7 @@ module VX_lsu_unit #(
if (lsu_req_if.valid && fence_wait) begin
$display("%t: *** D$%0d fence wait", $time, CORE_ID);
end
if ((| dcache_req_fire)) begin
if (dcache_req_fire_any) begin
if (dcache_req_if.rw[0]) begin
$write("%t: D$%0d Wr Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
`PRINT_ARRAY1D(req_addr, `NUM_THREADS);

View file

@ -40,12 +40,12 @@ module VX_tex_addr #(
wire [NUM_REQS-1:0] tmask_s0;
wire [`TEX_FILTER_BITS-1:0] filter_s0;
wire [REQ_INFO_WIDTH-1:0] req_info_s0;
wire [1:0][NUM_REQS-1:0][`FIXED_FRAC-1:0] clamped_lo, clamped_lo_s0;
wire [1:0][NUM_REQS-1:0][`FIXED_FRAC-1:0] clamped_hi, clamped_hi_s0;
wire [1:0][NUM_REQS-1:0][31:0] coord_lo, coord_lo_s0;
wire [1:0][NUM_REQS-1:0][31:0] coord_hi, coord_hi_s0;
wire [`TEX_STRIDE_BITS-1:0] log_stride, log_stride_s0;
wire [NUM_REQS-1:0][31:0] mip_addr, mip_addr_s0;
wire [1:0][NUM_REQS-1:0][`TEX_DIM_BITS-1:0] log_dims_s0;
wire [1:0][`TEX_WRAP_BITS-1:0] req_wraps_s0;
wire stall_out;
@ -62,39 +62,21 @@ module VX_tex_addr #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 2; ++j) begin
wire [31:0] coord_lo, coord_hi;
assign coord_lo = req_coords[j][i] - (req_filter ? (`FIXED_HALF >> req_logdims[j][i]) : 0);
assign coord_hi = req_coords[j][i] + (req_filter ? (`FIXED_HALF >> req_logdims[j][i]) : 0);
VX_tex_wrap #(
.CORE_ID (CORE_ID)
) tex_wrap_lo (
.wrap_i (req_wraps[j]),
.coord_i (coord_lo),
.coord_o (clamped_lo[j][i])
);
VX_tex_wrap #(
.CORE_ID (CORE_ID)
) tex_wrap_hi (
.wrap_i (req_wraps[j]),
.coord_i (coord_hi),
.coord_o (clamped_hi[j][i])
);
assign coord_lo[j][i] = req_filter ? (req_coords[j][i] - (`FIXED_HALF >> req_logdims[j][i])) : req_coords[j][i];
assign coord_hi[j][i] = req_filter ? (req_coords[j][i] + (`FIXED_HALF >> req_logdims[j][i])) : req_coords[j][i];
end
assign mip_addr[i] = req_baseaddr + 32'(req_mipoffset[i]);
end
VX_pipe_register #(
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFO_WIDTH + NUM_REQS * (2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)),
.DATAW (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFO_WIDTH + 2 * `TEX_WRAP_BITS + NUM_REQS * (2 * `TEX_DIM_BITS + 32 + 2 * 2 * 32)),
.RESETW (1)
) pipe_reg0 (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, req_logdims, mip_addr, clamped_lo, clamped_hi}),
.data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
.data_in ({req_valid, req_tmask, req_filter, log_stride, req_info, req_wraps, req_logdims, mip_addr, coord_lo, coord_hi}),
.data_out ({valid_s0, tmask_s0, filter_s0, log_stride_s0, req_info_s0, req_wraps_s0, log_dims_s0, mip_addr_s0, coord_lo_s0, coord_hi_s0})
);
// addresses generation
@ -105,9 +87,28 @@ module VX_tex_addr #(
for (genvar i = 0; i < NUM_REQS; ++i) begin
for (genvar j = 0; j < 2; ++j) begin
assign scaled_lo[j][i] = `FIXED_INT'(clamped_lo_s0[j][i] >> ((`FIXED_FRAC) - log_dims_s0[j][i]));
assign scaled_hi[j][i] = `FIXED_INT'(clamped_hi_s0[j][i] >> ((`FIXED_FRAC) - log_dims_s0[j][i]));
assign blends[j][i] = filter_s0 ? clamped_lo_s0[j][i][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0);
wire [`FIXED_FRAC-1:0] clamped_lo;
wire [`FIXED_FRAC-1:0] clamped_hi;
VX_tex_wrap #(
.CORE_ID (CORE_ID)
) tex_wrap_lo (
.wrap_i (req_wraps_s0[j]),
.coord_i (coord_lo_s0[j][i]),
.coord_o (clamped_lo)
);
VX_tex_wrap #(
.CORE_ID (CORE_ID)
) tex_wrap_hi (
.wrap_i (req_wraps_s0[j]),
.coord_i (coord_hi_s0[j][i]),
.coord_o (clamped_hi)
);
assign scaled_lo[j][i] = `FIXED_INT'(clamped_lo >> ((`FIXED_FRAC) - log_dims_s0[j][i]));
assign scaled_hi[j][i] = `FIXED_INT'(clamped_hi >> ((`FIXED_FRAC) - log_dims_s0[j][i]));
assign blends[j][i] = filter_s0 ? clamped_lo[`BLEND_FRAC-1:0] : `BLEND_FRAC'(0);
end
end

View file

@ -125,10 +125,10 @@ module VX_tex_memory #(
// DCache Request
reg [NUM_REQS-1:0] texel_sent_mask;
wire [NUM_REQS-1:0] dcache_req_fire;
wire [NUM_REQS-1:0] req_dup_mask;
assign dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire [NUM_REQS-1:0] dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
wire dcache_req_fire_any = (| dcache_req_fire);
assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask))
|| (req_texel_dup & dcache_req_if.ready[0]);
@ -141,7 +141,7 @@ module VX_tex_memory #(
end
end
assign req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};
wire [NUM_REQS-1:0] req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};
assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
assign dcache_req_if.rw = {NUM_REQS{1'b0}};
@ -163,9 +163,8 @@ module VX_tex_memory #(
reg [3:0][NUM_REQS-1:0][31:0] rsp_texels, rsp_texels_n;
wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual;
reg [NUM_REQS-1:0][31:0] rsp_data_qual;
reg [RSP_CTR_W-1:0] rsp_rem_ctr;
wire [NUM_REQS-1:0] rsp_cur_tmask;
wire [$clog2(NUM_REQS + 1)-1:0] rsp_cur_cnt;
reg [RSP_CTR_W-1:0] rsp_rem_ctr, rsp_rem_ctr_init;
wire [RSP_CTR_W-1:0] rsp_rem_ctr_n;
wire dcache_rsp_fire;
wire [1:0] rsp_texel_idx;
wire rsp_texel_dup;
@ -176,10 +175,6 @@ module VX_tex_memory #(
assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
assign rsp_cur_tmask = rsp_texel_dup ? q_req_tmask : dcache_rsp_if.tmask;
assign rsp_cur_cnt = $countones(rsp_cur_tmask);
for (genvar i = 0; i < NUM_REQS; i++) begin
wire [31:0] src_mask = {32{dcache_rsp_if.tmask[i]}};
wire [31:0] src_data = ((i == 0 || rsp_texel_dup) ? dcache_rsp_if.data[0] : (dcache_rsp_if.data[i]) & src_mask);
@ -213,14 +208,25 @@ module VX_tex_memory #(
end
end
always @(*) begin
rsp_rem_ctr_init = RSP_CTR_W'($countones(q_dup_reqs[0] ? NUM_REQS'(1) : q_req_tmask));
if (q_req_filter) begin
for (integer i = 1; i < 4; ++i) begin
rsp_rem_ctr_init += RSP_CTR_W'($countones(q_dup_reqs[i] ? NUM_REQS'(1) : q_req_tmask));
end
end
end
assign rsp_rem_ctr_n = rsp_rem_ctr - RSP_CTR_W'($countones(dcache_rsp_if.tmask));
always @(posedge clk) begin
if (reset) begin
rsp_rem_ctr <= 0;
end else begin
if ((| dcache_req_fire) && 0 == rsp_rem_ctr) begin
rsp_rem_ctr <= q_req_filter ? {$countones(q_req_tmask), 2'b0} : {2'b0, $countones(q_req_tmask)};
if (dcache_req_fire_any && 0 == rsp_rem_ctr) begin
rsp_rem_ctr <= rsp_rem_ctr_init;
end else if (dcache_rsp_fire) begin
rsp_rem_ctr <= rsp_rem_ctr - RSP_CTR_W'(rsp_cur_cnt);
rsp_rem_ctr <= rsp_rem_ctr_n;
end
end
end
@ -233,7 +239,9 @@ module VX_tex_memory #(
wire stall_out = rsp_valid && ~rsp_ready;
wire rsp_texels_done = dcache_rsp_fire && (rsp_rem_ctr == RSP_CTR_W'(rsp_cur_cnt));
wire is_last_rsp = (0 == rsp_rem_ctr_n);
wire rsp_texels_done = dcache_rsp_fire && is_last_rsp;
assign reqq_pop = rsp_texels_done && ~stall_out;
@ -249,7 +257,7 @@ module VX_tex_memory #(
);
// Can accept new cache response?
assign dcache_rsp_if.ready = ~stall_out || (rsp_rem_ctr != RSP_CTR_W'(rsp_cur_cnt));
assign dcache_rsp_if.ready = ~(is_last_rsp && stall_out);
`ifdef DBG_PRINT_TEX
wire [`NW_BITS-1:0] req_wid, rsp_wid;
@ -258,7 +266,7 @@ module VX_tex_memory #(
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
always @(posedge clk) begin
if ((| dcache_req_fire)) begin
if (dcache_req_fire_any) begin
$write("%t: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=",
$time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, dcache_req_if.tag);
`PRINT_ARRAY1D(req_texel_addr, NUM_REQS);

Binary file not shown.

Before

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 256 KiB