increasing 8-bit lerp accuracy in h/w

This commit is contained in:
Blaise Tine 2023-03-14 23:46:15 -04:00
parent d4e5367c18
commit df95544a1c
9 changed files with 108 additions and 54 deletions

View file

@ -124,7 +124,8 @@ echo "begin graphics tests..."
CONFIGS="-DEXT_IMADD_ENABLE" ./ci/blackbox.sh --driver=simx --app=imadd
CONFIGS="-DEXT_IMADD_ENABLE" ./ci/blackbox.sh --driver=rtlsim --app=imadd --args="-n32 -z"
CONFIGS="-DEXT_GFX_ENABLE -DNUM_RASTER_UNITS=2" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-tbox.cgltrace -rbox_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE -DNUM_RASTER_UNITS=2" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-tbox.cgltrace -rbox_ref_128.png"
CONFIGS="-DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-tbox.cgltrace -rbox_ref_128.png" --clusters=2 --cores=2 --warps=1 --threads=2
CONFIGS="-DEXT_GFX_ENABLE -DL1_DISABLE -DSM_DISABLE -DTCACHE_DISABLE -DRCACHE_DISABLE -DOCACHE_DISABLE" ./ci/blackbox.sh --driver=rtlsim --app=draw3d --args="-tbox.cgltrace -rbox_ref_128.png" --clusters=2 --cores=2 --warps=1 --threads=2
CONFIGS="-DEXT_GFX_ENABLE" ./ci/blackbox.sh --driver=simx --app=draw3d --args="-ttriangle.cgltrace -rtriangle_ref_8.png -w8 -h8" --warps=1 --threads=2 --debug=3

View file

@ -11,12 +11,9 @@ module VX_lerp_fx #(
output wire [N-1:0] out
);
wire [F:0] one = {1'b1, {F{1'b0}}};
wire [F+1:0] sub = one - (F+1)'(frac);
wire [(N+F):0] prod = in1 * sub + in2 * frac;
`UNUSED_VAR (prod)
assign out = prod [F +: N];
wire [F-1:0] sub = F'(-1) - frac;
wire [N+F:0] tmp = in1 * sub + in2 * frac;
assign out = N'((tmp + (tmp >> F)) >> F);
endmodule
`TRACING_ON

View file

@ -29,7 +29,7 @@ module VX_rop_blend #(
);
`UNUSED_SPARAM (INSTANCE_ID)
localparam LATENCY = `LATENCY_IMUL + 1;
localparam LATENCY = `LATENCY_IMUL + 2;
`UNUSED_VAR (dcrs)
@ -92,7 +92,8 @@ module VX_rop_blend #(
.LATENCY (LATENCY)
) rop_blend_multadd (
.clk (clk),
.enable (~stall),
.reset (reset),
.enable (~stall),
.mode_rgb (dcrs.blend_mode_rgb),
.mode_a (dcrs.blend_mode_a),
.src_color (src_color_s1[i]),
@ -106,6 +107,7 @@ module VX_rop_blend #(
.LATENCY (LATENCY)
) rop_blend_minmax (
.clk (clk),
.reset (reset),
.enable (~stall),
.src_color (src_color_s1[i]),
.dst_color (dst_color_s1[i]),
@ -117,6 +119,7 @@ module VX_rop_blend #(
.LATENCY (LATENCY)
) rop_logic_op (
.clk (clk),
.reset (reset),
.enable (~stall),
.op (dcrs.logic_op),
.src_color (src_color_s1[i]),

View file

@ -3,8 +3,10 @@
module VX_rop_blend_minmax #(
parameter LATENCY = 1
) (
input clk,
input enable,
input wire clk,
input wire reset,
input wire enable,
input rgba_t src_color,
input rgba_t dst_color,
@ -13,6 +15,8 @@ module VX_rop_blend_minmax #(
output rgba_t max_out
);
`UNUSED_VAR (reset)
rgba_t tmp_min;
rgba_t tmp_max;

View file

@ -15,10 +15,12 @@
)
module VX_rop_blend_multadd #(
parameter LATENCY = (`LATENCY_IMUL + 1)
parameter LATENCY = 1
) (
input clk,
input enable,
input wire clk,
input wire reset,
input wire enable,
input wire [`ROP_BLEND_MODE_BITS-1:0] mode_rgb,
input wire [`ROP_BLEND_MODE_BITS-1:0] mode_a,
@ -31,42 +33,46 @@ module VX_rop_blend_multadd #(
output rgba_t color_out
);
`STATIC_ASSERT((LATENCY >= `LATENCY_IMUL), ("invalid parameter"))
`STATIC_ASSERT((LATENCY > `LATENCY_IMUL), ("invalid parameter"))
`UNUSED_VAR (reset)
localparam LATENCY_REM = LATENCY - `LATENCY_IMUL;
wire [15:0] prod_src_r, prod_src_g, prod_src_b, prod_src_a;
wire [15:0] prod_dst_r, prod_dst_g, prod_dst_b, prod_dst_a;
// src_color x src_factor
`MULT8(clk, enable, prod_src_r, src_color.r, src_factor.r);
`MULT8(clk, enable, prod_src_g, src_color.g, src_factor.g);
`MULT8(clk, enable, prod_src_b, src_color.b, src_factor.b);
`MULT8(clk, enable, prod_src_a, src_color.a, src_factor.a);
// dst_color x dst_factor
`MULT8(clk, enable, prod_dst_r, dst_color.r, dst_factor.r);
`MULT8(clk, enable, prod_dst_g, dst_color.g, dst_factor.g);
`MULT8(clk, enable, prod_dst_b, dst_color.b, dst_factor.b);
`MULT8(clk, enable, prod_dst_a, dst_color.a, dst_factor.a);
reg [15:0] sum_r, sum_g, sum_b, sum_a;
reg [16:0] sum_r, sum_g, sum_b, sum_a;
// apply blend mode
always @(*) begin
// RGB blending
case (mode_rgb)
`ROP_BLEND_MODE_ADD: begin
sum_r = prod_src_r + prod_dst_r + 16'hff;
sum_g = prod_src_g + prod_dst_g + 16'hff;
sum_b = prod_src_b + prod_dst_b + 16'hff;
sum_r = prod_src_r + prod_dst_r;
sum_g = prod_src_g + prod_dst_g;
sum_b = prod_src_b + prod_dst_b;
end
`ROP_BLEND_MODE_SUB: begin
sum_r = prod_src_r - prod_dst_r + 16'hff;
sum_g = prod_src_g - prod_dst_g + 16'hff;
sum_b = prod_src_b - prod_dst_b + 16'hff;
sum_r = prod_src_r - prod_dst_r;
sum_g = prod_src_g - prod_dst_g;
sum_b = prod_src_b - prod_dst_b;
end
`ROP_BLEND_MODE_REV_SUB: begin
sum_r = prod_dst_r - prod_src_r + 16'hff;
sum_g = prod_dst_g - prod_src_g + 16'hff;
sum_b = prod_dst_b - prod_src_b + 16'hff;
sum_r = prod_dst_r - prod_src_r;
sum_g = prod_dst_g - prod_src_g;
sum_b = prod_dst_b - prod_src_b;
end
default: begin
sum_r = 'x;
@ -74,16 +80,15 @@ module VX_rop_blend_multadd #(
sum_b = 'x;
end
endcase
// Alpha blending
case (mode_a)
`ROP_BLEND_MODE_ADD: begin
sum_a = prod_src_a + prod_dst_a + 16'hff;
sum_a = prod_src_a + prod_dst_a;
end
`ROP_BLEND_MODE_SUB: begin
sum_a = prod_src_a - prod_dst_a + 16'hff;
sum_a = prod_src_a - prod_dst_a;
end
`ROP_BLEND_MODE_REV_SUB: begin
sum_a = prod_dst_a - prod_src_a + 16'hff;
sum_a = prod_dst_a - prod_src_a;
end
default: begin
sum_a = 'x;
@ -91,10 +96,49 @@ module VX_rop_blend_multadd #(
endcase
end
`UNUSED_VAR (sum_r)
`UNUSED_VAR (sum_g)
`UNUSED_VAR (sum_b)
`UNUSED_VAR (sum_a)
reg [15:0] clamp_r, clamp_g, clamp_b, clamp_a;
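// clamp to the inclusive range [0, 255 * 255], where 255 * 255 = 16'hFE01:
// ADD saturates at the upper bound, SUB/REV_SUB saturate at zero (bit 16 set)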
always @(*) begin
case (mode_rgb)
`ROP_BLEND_MODE_ADD: begin
clamp_r = (sum_r > 17'hFE01) ? 16'hFE01 : sum_r[15:0];
clamp_g = (sum_g > 17'hFE01) ? 16'hFE01 : sum_g[15:0];
clamp_b = (sum_b > 17'hFE01) ? 16'hFE01 : sum_b[15:0];
end
`ROP_BLEND_MODE_SUB,
`ROP_BLEND_MODE_REV_SUB: begin
clamp_r = sum_r[16] ? 16'h0 : sum_r[15:0];
clamp_g = sum_g[16] ? 16'h0 : sum_g[15:0];
clamp_b = sum_b[16] ? 16'h0 : sum_b[15:0];
end
default: begin
clamp_r = 'x;
clamp_g = 'x;
clamp_b = 'x;
end
endcase
case (mode_a)
`ROP_BLEND_MODE_ADD: begin
clamp_a = (sum_a > 17'hFE01) ? 16'hFE01 : sum_a[15:0];
end
`ROP_BLEND_MODE_SUB,
`ROP_BLEND_MODE_REV_SUB: begin
clamp_a = sum_a[16] ? 16'h0 : sum_a[15:0];
end
default: begin
clamp_a = 'x;
end
endcase
end
rgba_t result;
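// approximate divide by 255 using (x + (x >> 8)) >> 8
// NOTE(review): exact multiples of 255 come out one low (255*255 -> 254, 255 -> 0);
// the exact form for 16-bit x is (x + 1 + (x >> 8)) >> 8 - confirm intended rounding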
assign result.r = 8'((clamp_r + (clamp_r >> 8)) >> 8);
assign result.g = 8'((clamp_g + (clamp_g >> 8)) >> 8);
assign result.b = 8'((clamp_b + (clamp_b >> 8)) >> 8);
assign result.a = 8'((clamp_a + (clamp_a >> 8)) >> 8);
VX_shift_register #(
.DATAW (32),
@ -103,8 +147,8 @@ module VX_rop_blend_multadd #(
.clk (clk),
`UNUSED_PIN (reset),
.enable (enable),
.data_in ({sum_a[15:8], sum_r[15:8], sum_g[15:8], sum_b[15:8]}),
.data_out ({color_out.a, color_out.r, color_out.g, color_out.b})
.data_in (result),
.data_out (color_out)
);
endmodule

View file

@ -3,8 +3,10 @@
module VX_rop_logic_op #(
parameter LATENCY = 1
) (
input clk,
input enable,
input wire clk,
input wire reset,
input wire enable,
input wire [`ROP_LOGIC_OP_BITS-1:0] op,
input wire [31:0] src_color,
@ -13,6 +15,8 @@ module VX_rop_logic_op #(
output wire [31:0] color_out
);
`UNUSED_VAR (reset)
reg [31:0] tmp_color;
always @(*) begin

View file

@ -52,6 +52,7 @@ module VX_rop_mem #(
wire [NUM_REQS-1:0][3:0] mreq_byteen, mreq_byteen_r;
wire [TAG_WIDTH-1:0] mreq_tag, mreq_tag_r;
wire mreq_ready_r;
wire mreq_stall;
wire mrsp_valid;
wire [NUM_REQS-1:0] mrsp_mask;
@ -68,9 +69,9 @@ module VX_rop_mem #(
assign stencil_byteen[i] = (dcrs.stencil_writemask[req_face[i]] != 0);
end
wire mul_ready_in;
wire mul_enable;
// DS submission
// depth/stencil values submission
for (genvar i = 0; i < NUM_LANES; ++i) begin
wire [31:0] m_y_pitch, baddr_s;
@ -81,7 +82,7 @@ module VX_rop_mem #(
.LATENCY (`LATENCY_IMUL)
) multiplier (
.clk (clk),
.enable (mul_ready_in),
.enable (mul_enable),
.dataa (req_pos_y[i]),
.datab (dcrs.zbuf_pitch),
.result (m_y_pitch)
@ -98,7 +99,7 @@ module VX_rop_mem #(
) shift_reg (
.clk (clk),
`UNUSED_PIN (reset),
.enable (mul_ready_in),
.enable (mul_enable),
.data_in ({mask, byteen, baddr, data}),
.data_out ({mreq_mask[i], mreq_byteen[i], baddr_s, mreq_data[i]})
);
@ -109,7 +110,7 @@ module VX_rop_mem #(
`UNUSED_VAR (addr)
end
// Blend submission
// blend color submission
for (genvar i = NUM_LANES; i < NUM_REQS; ++i) begin
wire [31:0] m_y_pitch, baddr_s;
@ -120,7 +121,7 @@ module VX_rop_mem #(
.LATENCY (`LATENCY_IMUL)
) multiplier (
.clk (clk),
.enable (mul_ready_in),
.enable (mul_enable),
.dataa (req_pos_y[i - NUM_LANES]),
.datab (dcrs.cbuf_pitch),
.result (m_y_pitch)
@ -137,7 +138,7 @@ module VX_rop_mem #(
) shift_reg (
.clk (clk),
`UNUSED_PIN (reset),
.enable (mul_ready_in),
.enable (mul_enable),
.data_in ({mask, byteen, baddr, data}),
.data_out ({mreq_mask[i], mreq_byteen[i], baddr_s, mreq_data[i]})
);
@ -145,7 +146,7 @@ module VX_rop_mem #(
wire [31:0] addr = baddr_s + m_y_pitch;
assign mreq_addr[i] = addr[(32-OCACHE_ADDR_WIDTH) +: OCACHE_ADDR_WIDTH];
`UNUSED_VAR (addr)
`UNUSED_VAR (addr)
end
VX_shift_register #(
@ -155,16 +156,14 @@ module VX_rop_mem #(
) shift_reg (
.clk (clk),
.reset (reset),
.enable (mul_ready_in),
.enable (mul_enable),
.data_in ({req_valid, req_rw, req_tag}),
.data_out ({mreq_valid, mreq_rw, mreq_tag})
);
wire mreq_stall = mreq_valid_r && ~mreq_ready_r;
assign req_ready = mul_enable;
assign mul_ready_in = ~(mreq_valid && mreq_stall);
assign req_ready = mul_ready_in;
assign mul_enable = ~(mreq_valid && mreq_stall);
VX_pipe_register #(
.DATAW (1 + 1 + NUM_REQS * (1 + 4 + OCACHE_ADDR_WIDTH + 32) + TAG_WIDTH),
@ -177,6 +176,8 @@ module VX_rop_mem #(
.data_out ({mreq_valid_r, mreq_rw_r, mreq_mask_r, mreq_byteen_r, mreq_addr_r, mreq_data_r, mreq_tag_r})
);
assign mreq_stall = mreq_valid_r && ~mreq_ready_r;
// schedule memory request
VX_mem_scheduler #(

View file

@ -40,7 +40,7 @@ bool sw_interp = false;
uint32_t start_draw = 0;
uint32_t end_draw = -1;
uint32_t clear_color = 0x00000000;
uint32_t clear_color = 0xff000000;
uint32_t clear_depth = 0xffffffff;
uint32_t dst_width = 128;
@ -542,7 +542,7 @@ int main(int argc, char *argv[]) {
cleanup();
if (reference_file) {
auto errors = CompareImages(output_file, reference_file, FORMAT_A8R8G8B8, 0);
auto errors = CompareImages(output_file, reference_file, FORMAT_A8R8G8B8, 2);
if (0 == errors) {
std::cout << "PASSED!" << std::endl;
} else {

View file

@ -33,7 +33,7 @@ const char* trace_file = "triangle.cgltrace";
const char* output_file = "output.png";
const char* reference_file = nullptr;
uint32_t clear_color = 0x00000000;
uint32_t clear_color = 0xff000000;
uint32_t dst_width = 128;
uint32_t dst_height = 128;