Adding new serial multiplier for low-cost 64-bit integer multiplication

This commit is contained in:
Blaise Tine 2023-06-20 09:49:21 -04:00
parent fdf82842b2
commit 6117fb48fe
8 changed files with 241 additions and 72 deletions

View file

@ -250,7 +250,7 @@ CONFIGS="-DENABLE_DPI -DNUM_FPU_UNITS=2" ./ci/blackbox.sh --driver=rtlsim --app=
CONFIGS="-DENABLE_DPI" AXI_BUS=1 ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=demo
# adjust l1 block size to match l2
CONFIGS="-DENABLE_DPI DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
CONFIGS="-DENABLE_DPI -DL1_LINE_SIZE=64" ./ci/blackbox.sh --driver=rtlsim --cores=2 --l2cache --app=io_addr --args="-n1"
# test cache banking
CONFIGS="-DENABLE_DPI -DSMEM_NUM_BANKS=4 -DDCACHE_NUM_BANKS=1" ./ci/blackbox.sh --driver=rtlsim --app=sgemm

View file

@ -428,7 +428,7 @@ module VX_core_top #(
`ifdef EXT_RASTER_ENABLE
input wire raster_req_valid,
input raster_stamp_t [`NUM_THREADS-1:0] raster_req_stamps,
input wire raster_req_empty,
input wire raster_req_done,
output wire raster_req_ready,
`endif
@ -568,7 +568,7 @@ module VX_core_top #(
assign raster_req_if.valid = raster_req_valid;
assign raster_req_if.stamps = raster_req_stamps;
assign raster_req_if.empty=raster_req_empty;
assign raster_req_if.done = raster_req_done;
assign raster_req_ready = raster_req_if.ready;
`endif

View file

@ -33,6 +33,7 @@ module VX_muldiv (
);
localparam UUID_WIDTH = `UP(`UUID_BITS);
localparam NW_WIDTH = `UP(`NW_BITS);
localparam TAGW = UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1;
`UNUSED_VAR (alu_op)
`UNUSED_VAR (op_mod)
@ -57,8 +58,7 @@ module VX_muldiv (
wire mul_valid_out;
wire mul_valid_in = valid_in && is_mulx_op;
wire mul_ready_in = ~stall_out || ~mul_valid_out;
wire is_mulh_in = `INST_M_IS_MULH(alu_op);
wire is_signed_mul_a = `INST_M_SIGNED_A(alu_op);
wire is_signed_mul_b = is_signed_op;
@ -67,6 +67,7 @@ module VX_muldiv (
wire [`NUM_THREADS-1:0][`XLEN-1:0] mul_result_tmp;
wire mul_ready_in = ~stall_out || ~mul_valid_out;
wire mul_fire_in = mul_valid_in && mul_ready_in;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
@ -97,14 +98,54 @@ module VX_muldiv (
wire is_mulh_out;
wire is_mul_w_out;
`ifdef XLEN_64
wire [`NUM_THREADS-1:0][`XLEN:0] mul_in1;
wire [`NUM_THREADS-1:0][`XLEN:0] mul_in2;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
assign mul_in1[i] = is_alu_w ? {{(`XLEN-31){alu_in1[i][31]}}, alu_in1[i][31:0]} : {is_signed_mul_a && alu_in1[i][`XLEN-1], alu_in1[i]};
assign mul_in2[i] = is_alu_w ? {{(`XLEN-31){alu_in2[i][31]}}, alu_in2[i][31:0]} : {is_signed_mul_b && alu_in2[i][`XLEN-1], alu_in2[i]};
end
wire mul_ready_in;
wire mul_ready_out = ~stall_out;
VX_serial_mul #(
.A_WIDTH (`XLEN+1),
.LANES (`NUM_THREADS),
.SIGNED (1)
) multiplier (
.clk (clk),
.reset (reset),
.valid_in (mul_valid_in),
.ready_in (mul_ready_in),
.valid_out (mul_valid_out),
.ready_out (mul_ready_out),
.dataa (mul_in1),
.datab (mul_in2),
.result (mul_result_tmp)
);
reg [TAGW+2-1:0] mul_tag_r;
always @(posedge clk) begin
if (mul_valid_in && mul_ready_in) begin
mul_tag_r <= {uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_mulh_in, is_alu_w};
end
end
assign {mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out, is_mul_w_out} = mul_tag_r;
`else
wire mul_ready_in = ~stall_out || ~mul_valid_out;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
wire [`XLEN:0] mul_in1 = is_alu_w ? {{(`XLEN-31){alu_in1[i][31]}}, alu_in1[i][31:0]} : {is_signed_mul_a && alu_in1[i][`XLEN-1], alu_in1[i]};
wire [`XLEN:0] mul_in2 = is_alu_w ? {{(`XLEN-31){alu_in2[i][31]}}, alu_in2[i][31:0]} : {is_signed_mul_b && alu_in2[i][`XLEN-1], alu_in2[i]};
VX_multiplier #(
.A_WIDTH (`XLEN+1),
.B_WIDTH (`XLEN+1),
.R_WIDTH (2*(`XLEN+1)),
.SIGNED (1),
.LATENCY (`LATENCY_IMUL)
) multiplier (
@ -117,7 +158,7 @@ module VX_muldiv (
end
VX_shift_register #(
.DATAW (1 + UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + 1 + 1),
.DATAW (1 + TAGW + 1 + 1),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) mul_shift_reg (
@ -128,6 +169,8 @@ module VX_muldiv (
.data_out ({mul_valid_out, mul_uuid_out, mul_wid_out, mul_tmask_out, mul_PC_out, mul_rd_out, mul_wb_out, is_mulh_out, is_mul_w_out})
);
`endif
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
`ifdef XLEN_64
assign mul_result_out[i] = is_mulh_out ? mul_result_tmp[i][2*(`XLEN)-1:`XLEN] :
@ -181,7 +224,7 @@ module VX_muldiv (
end
VX_shift_register #(
.DATAW (1 + UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + (`NUM_THREADS * `XLEN)),
.DATAW (1 + TAGW + (`NUM_THREADS * `XLEN)),
.DEPTH (`LATENCY_IMUL),
.RESETW (1)
) div_shift_reg (
@ -205,19 +248,16 @@ module VX_muldiv (
.WIDTHD (`XLEN),
.WIDTHQ (`XLEN),
.WIDTHR (`XLEN),
.LANES (`NUM_THREADS),
.TAGW (UUID_WIDTH + NW_WIDTH + `NUM_THREADS + `XLEN + `NR_BITS + 1 + 1 + 1)
.LANES (`NUM_THREADS)
) divide (
.clk (clk),
.reset (reset),
.valid_in (div_valid_in),
.ready_in (div_ready_in),
.tag_in ({uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op, is_alu_w}),
.ready_out (div_ready_out),
.valid_out (div_valid_out),
.tag_out ({div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out}),
.ready_out (div_ready_out),
.is_signed (is_signed_op),
.numer (div_in1),
@ -227,6 +267,15 @@ module VX_muldiv (
.remainder (div_remainder)
);
reg [TAGW+2-1:0] div_tag_r;
always @(posedge clk) begin
if (div_valid_in && div_ready_in) begin
div_tag_r <= {uuid_in, wid_in, tmask_in, PC_in, rd_in, wb_in, is_rem_op, is_alu_w};
end
end
assign {div_uuid_out, div_wid_out, div_tmask_out, div_PC_out, div_rd_out, div_wb_out, is_rem_op_out, is_div_w_out} = div_tag_r;
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
`ifdef XLEN_64
assign div_result_out[i] = is_rem_op_out ? (is_div_w_out ? `XLEN'($signed(div_remainder[i][31:0])) : div_remainder[i]) :
@ -236,6 +285,7 @@ module VX_muldiv (
`UNUSED_VAR (is_div_w_out)
`endif
end
`endif
///////////////////////////////////////////////////////////////////////////

View file

@ -2,9 +2,9 @@
`TRACING_OFF
module VX_multiplier #(
parameter A_WIDTH = 1,
parameter B_WIDTH = 1,
parameter R_WIDTH = 1,
parameter A_WIDTH = 1,
parameter B_WIDTH = A_WIDTH,
parameter R_WIDTH = A_WIDTH + B_WIDTH,
parameter SIGNED = 0,
parameter LATENCY = 0
) (
@ -16,31 +16,51 @@ module VX_multiplier #(
);
`STATIC_ASSERT ((LATENCY <= 3), ("invalid parameter"))
wire [R_WIDTH-1:0] result_unqual;
wire [A_WIDTH-1:0] dataa_w;
wire [B_WIDTH-1:0] datab_w;
wire [R_WIDTH-1:0] result_w;
if (SIGNED != 0) begin
assign result_unqual = $signed(dataa) * $signed(datab);
assign result_w = $signed(dataa_w) * $signed(datab_w);
end else begin
assign result_unqual = dataa * datab;
assign result_w = dataa_w * datab_w;
end
if (LATENCY == 0) begin
assign result = result_unqual;
assign dataa_w = dataa;
assign datab_w = datab;
assign result = result_w;
end else begin
reg [R_WIDTH-1:0] result_pipe [LATENCY-1:0];
always @(posedge clk) begin
if (enable) begin
result_pipe[0] <= result_unqual;
end
end
for (genvar i = 1; i < LATENCY; ++i) begin
if (LATENCY >= 2) begin
reg [A_WIDTH-1:0] dataa_p [LATENCY-2:0];
reg [B_WIDTH-1:0] datab_p [LATENCY-2:0];
always @(posedge clk) begin
if (enable) begin
result_pipe[i] <= result_pipe[i-1];
dataa_p[0] <= dataa;
datab_p[0] <= datab;
end
end
for (genvar i = 2; i < LATENCY; ++i) begin
always @(posedge clk) begin
if (enable) begin
dataa_p[i-1] <= dataa_p[i-2];
datab_p[i-1] <= datab_p[i-2];
end
end
end
assign dataa_w = dataa_p[LATENCY-2];
assign datab_w = datab_p[LATENCY-2];
end else begin
assign dataa_w = dataa;
assign datab_w = datab;
end
assign result = result_pipe[LATENCY-1];
reg [R_WIDTH-1:0] result_r;
always @(posedge clk) begin
if (enable) begin
result_r <= result_w;
end
end
assign result = result_r;
end
endmodule

View file

@ -1,6 +1,6 @@
`include "VX_platform.vh"
//`TRACING_OFF
`TRACING_OFF
module VX_scope_switch #(
parameter N = 0
) (
@ -47,4 +47,4 @@ module VX_scope_switch #(
end
endmodule
//`TRACING_ON
`TRACING_ON

View file

@ -1,6 +1,6 @@
`include "VX_platform.vh"
//`TRACING_OFF
`TRACING_OFF
module VX_scope_tap #(
parameter SCOPE_ID = 0, // scope identifier
parameter SCOPE_IDW = 8, // scope identifier width
@ -297,4 +297,4 @@ module VX_scope_tap #(
assign bus_out = bus_out_r;
endmodule
//`TRACING_ON
`TRACING_ON

View file

@ -6,19 +6,16 @@ module VX_serial_div #(
parameter WIDTHD = 1,
parameter WIDTHQ = 1,
parameter WIDTHR = 1,
parameter LANES = 1,
parameter TAGW = 1
parameter LANES = 1
) (
input wire clk,
input wire reset,
input wire valid_in,
output wire ready_in,
input wire [TAGW-1:0] tag_in,
input wire ready_out,
output wire valid_out,
output wire [TAGW-1:0] tag_out,
input wire is_signed,
input wire [LANES-1:0][WIDTHN-1:0] numer,
@ -28,7 +25,7 @@ module VX_serial_div #(
output wire [LANES-1:0][WIDTHR-1:0] remainder
);
localparam MIN_ND = (WIDTHN < WIDTHD) ? WIDTHN : WIDTHD;
localparam CNTRW = $clog2(WIDTHN+1);
localparam CNTRW = $clog2(WIDTHN);
reg [LANES-1:0][WIDTHN + MIN_ND:0] working;
reg [LANES-1:0][WIDTHD-1:0] denom_r;
@ -40,12 +37,7 @@ module VX_serial_div #(
reg [LANES-1:0] inv_quot, inv_rem;
reg [CNTRW-1:0] cntr;
reg loaded;
reg [TAGW-1:0] tag_r;
wire push = valid_in && ready_in;
wire pop = valid_out && ready_out;
reg busy, done;
for (genvar i = 0; i < LANES; ++i) begin
wire negate_numer = is_signed && numer[i][WIDTHN-1];
@ -54,51 +46,52 @@ module VX_serial_div #(
assign denom_qual[i] = negate_denom ? -$signed(denom[i]) : denom[i];
assign sub_result[i] = working[i][WIDTHN + MIN_ND : WIDTHN] - denom_r[i];
end
wire busy = (cntr != 0);
wire push = valid_in && ready_in;
wire pop = valid_out && ready_out;
always @(posedge clk) begin
if (reset) begin
cntr <= '0;
loaded <= 0;
busy <= 0;
done <= 0;
end else begin
if (push) begin
cntr <= WIDTHN;
loaded <= 1;
end else if (busy) begin
cntr <= cntr - CNTRW'(1);
if (push) begin
busy <= 1;
end
if (pop) begin
loaded <= 0;
if (busy && cntr == 0) begin
busy <= 0;
done <= 1;
end
if (pop) begin
done <= 0;
end
end
cntr <= cntr - CNTRW'(1);
if (push) begin
for (integer i = 0; i < LANES; ++i) begin
working[i] <= {{WIDTHD{1'b0}}, numer_qual[i], 1'b0};
denom_r[i] <= denom_qual[i];
inv_quot[i] <= (denom[i] != 0) && is_signed && (numer[i][31] ^ denom[i][31]);
inv_rem[i] <= is_signed && numer[i][31];
end
tag_r <= tag_in;
end else if (busy) begin
for (integer i = 0; i < LANES; ++i) begin
working[i] <= sub_result[i][WIDTHD] ? {working[i][WIDTHN+MIN_ND-1:0], 1'b0} :
{sub_result[i][WIDTHD-1:0], working[i][WIDTHN-1:0], 1'b1};
end
cntr <= CNTRW'(WIDTHN-1);
end
end
for (genvar i = 0; i < LANES; ++i) begin
always @(posedge clk) begin
if (push) begin
working[i] <= {{WIDTHD{1'b0}}, numer_qual[i], 1'b0};
denom_r[i] <= denom_qual[i];
inv_quot[i] <= (denom[i] != 0) && is_signed && (numer[i][31] ^ denom[i][31]);
inv_rem[i] <= is_signed && numer[i][31];
end else if (busy) begin
working[i] <= sub_result[i][WIDTHD] ? {working[i][WIDTHN+MIN_ND-1:0], 1'b0} :
{sub_result[i][WIDTHD-1:0], working[i][WIDTHN-1:0], 1'b1};
end
end
wire [WIDTHQ-1:0] q = working[i][WIDTHQ-1:0];
wire [WIDTHR-1:0] r = working[i][WIDTHN+WIDTHR:WIDTHN+1];
assign quotient[i] = inv_quot[i] ? -$signed(q) : q;
assign remainder[i] = inv_rem[i] ? -$signed(r) : r;
end
assign ready_in = ~loaded;
assign tag_out = tag_r;
assign valid_out = loaded && ~busy;
assign ready_in = ~busy && ~done;
assign valid_out = done;
endmodule
`TRACING_ON

View file

@ -0,0 +1,106 @@
`include "VX_platform.vh"
// Iterative integer multiplier
// An adaptation of ZipCPU algorithm for a multi-lane elastic architecture.
// https://zipcpu.com/zipcpu/2021/07/03/slowmpy.html
`TRACING_OFF
// Multi-lane iterative (shift-and-add) integer multiplier with an elastic
// valid/ready handshake on both sides.
//
// Parameters:
//   A_WIDTH - bit width of each `dataa` lane (multiplicand)
//   B_WIDTH - bit width of each `datab` lane (multiplier)
//   R_WIDTH - bit width of each `result` lane (product is truncated to this)
//   SIGNED  - non-zero: operands are two's complement; both are sign-extended
//             to MAX(A_WIDTH, B_WIDTH) before multiplying
//   LANES   - number of independent multiplications executed in lock-step
//
// Protocol:
//   * A request is accepted in the cycle where valid_in && ready_in.
//   * One bit of the multiplier is consumed per clock; after Y_WIDTH steps
//     valid_out is raised and `result` is held stable until the consumer
//     acknowledges it with ready_out.
//   * ready_in stays low from acceptance until the result has been popped,
//     so only one operation is in flight at a time.
//
// NOTE: the bit slices below (e.g. p[Y_WIDTH-2:0], axb[X_WIDTH-2:0]) assume
// operand widths of at least 2 bits.
module VX_serial_mul #(
    parameter A_WIDTH = 1,
    parameter B_WIDTH = A_WIDTH,
    parameter R_WIDTH = A_WIDTH + B_WIDTH,
    parameter SIGNED  = 0,
    parameter LANES   = 1
) (
    input wire clk,
    input wire reset,

    input wire valid_in,
    output wire ready_in,

    input wire ready_out,
    output wire valid_out,

    input wire [LANES-1:0][A_WIDTH-1:0] dataa,
    input wire [LANES-1:0][B_WIDTH-1:0] datab,
    output wire [LANES-1:0][R_WIDTH-1:0] result
);
    // Internal operand widths: the signed scheme needs both operands at the
    // same (widest) width; unsigned keeps the native widths.
    localparam X_WIDTH = SIGNED ? `MAX(A_WIDTH, B_WIDTH) : A_WIDTH;
    localparam Y_WIDTH = SIGNED ? `MAX(A_WIDTH, B_WIDTH) : B_WIDTH;
    localparam P_WIDTH = X_WIDTH + Y_WIDTH;

    // One step per multiplier (b) bit, hence the counter is sized from
    // Y_WIDTH. (X_WIDTH == Y_WIDTH whenever SIGNED is set.)
    localparam CNTRW = $clog2(Y_WIDTH);

    reg [LANES-1:0][X_WIDTH-1:0] a;     // latched multiplicand
    reg [LANES-1:0][Y_WIDTH-1:0] b;     // remaining multiplier bits, LSB first
    reg [LANES-1:0][P_WIDTH-1:0] p;     // partial product accumulator

    reg [CNTRW-1:0] cntr;               // steps remaining minus one
    reg busy;                           // datapath is iterating
    reg done;                           // result is ready and not yet popped

    wire push = valid_in && ready_in;
    wire pop  = valid_out && ready_out;

    // Control FSM: idle -> busy (Y_WIDTH steps) -> done (until pop) -> idle.
    always @(posedge clk) begin
        if (reset) begin
            busy <= 0;
            done <= 0;
        end else begin
            if (push) begin
                busy <= 1;
            end
            if (busy && cntr == 0) begin
                // Last step is executing this cycle: stop iterating so the
                // accumulator is frozen while waiting for the consumer.
                busy <= 0;
                done <= 1;
            end
            if (pop) begin
                done <= 0;
            end
        end
        // Free-running down counter, reloaded on every accepted request.
        // It needs no reset because it is only examined while busy.
        cntr <= cntr - CNTRW'(1);
        if (push) begin
            cntr <= CNTRW'(Y_WIDTH-1);
        end
    end

    for (genvar i = 0; i < LANES; ++i) begin
        // Current partial-product row: a when the multiplier LSB is set.
        wire [X_WIDTH-1:0] axb = b[i][0] ? a[i] : '0;

        always @(posedge clk) begin
            if (push) begin
                if (SIGNED) begin
                    // Sign-extend both operands to the common width.
                    a[i] <= X_WIDTH'($signed(dataa[i]));
                    b[i] <= Y_WIDTH'($signed(datab[i]));
                end else begin
                    a[i] <= dataa[i];
                    b[i] <= datab[i];
                end
                p[i] <= 0;
            end else if (busy) begin
                // Consume one multiplier bit and shift the low half of the
                // accumulator right; the new row is added into the upper
                // X_WIDTH+1 bits (the extra bit keeps the carry).
                b[i] <= (b[i] >> 1);
                p[i][Y_WIDTH-2:0] <= p[i][Y_WIDTH-1:1];
                if (SIGNED) begin
                    // Signed rows are added with selectively inverted bits
                    // (scheme from the ZipCPU "slowmpy" multiplier): every
                    // row inverts its MSB, except the final row (cntr == 0)
                    // which inverts all bits but the MSB. The constant added
                    // at the output completes the correction.
                    if (cntr == 0) begin
                        p[i][P_WIDTH-1:Y_WIDTH-1] <= {1'b0, p[i][P_WIDTH-1:Y_WIDTH]} + {1'b0, axb[X_WIDTH-1], ~axb[X_WIDTH-2:0]};
                    end else begin
                        p[i][P_WIDTH-1:Y_WIDTH-1] <= {1'b0, p[i][P_WIDTH-1:Y_WIDTH]} + {1'b0, ~axb[X_WIDTH-1], axb[X_WIDTH-2:0]};
                    end
                end else begin
                    p[i][P_WIDTH-1:Y_WIDTH-1] <= {1'b0, p[i][P_WIDTH-1:Y_WIDTH]} + ((b[i][0]) ? {1'b0, a[i]} : 0);
                end
            end
        end

        if (SIGNED) begin
            // Correction term for the inverted bits above: ones at bit
            // P_WIDTH-1 and bit Y_WIDTH.
            assign result[i] = R_WIDTH'(p[i][P_WIDTH-1:0] + {1'b1, {(X_WIDTH-2){1'b0}}, 1'b1, {(Y_WIDTH){1'b0}}});
        end else begin
            assign result[i] = R_WIDTH'(p[i]);
        end
    end
    // Upper product bits may be dropped when R_WIDTH < P_WIDTH.
    `UNUSED_VAR (p)

    assign ready_in  = ~busy && ~done;
    assign valid_out = done;

endmodule
`TRACING_ON