[bitmanip] Add ZBR instruction group

This commit implements the Bit Manipulation Extension ZBR instruction
group: crc32[c].[bhw].

CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly
implemented. The CRC operation solves the following equation using
binary polynomial arithmetic:

rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x),

where {1,P}(x) denotes the CRC polynomial. Using Barrett reduction, one
can write this as

rd = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
                      ^-- cycle 0--------------------^
     ^-- cycle 1 ------------------------------------------^

Where cx denotes carry-less multiplication and mu = polydiv(x**64,
{1,P}), omitting the MSB (bit 32).

The implementation increases area consumption by ~0.6kGE for synthesis
with relaxed timing constraints. With tight timing constraints that is
~1.6kGE. There is no significant impact on frequency.

Signed-off-by: ganoam <gnoam@live.com>
This commit is contained in:
ganoam 2020-05-19 15:24:31 +02:00 committed by Pirmin Vogel
parent 42eee90cf8
commit 66687e927c
8 changed files with 274 additions and 113 deletions

View file

@ -66,8 +66,9 @@ Other blocks use the ALU for the following tasks:
Support for the RISC-V Bitmanipulation Extension (Document Version 0.92, November 8, 2019) is enabled via the parameter ``RV32B``.
This feature is *EXPERIMENTAL* and the details of its impact are not yet documented here.
Currently the Zbb, Zbs, Zbp, Zbe, Zbf, Zbc and Zbt sub-extensions are implemented.
All instructions are carried out in a single clock cycle.
Currently the Zbb, Zbs, Zbp, Zbe, Zbf, Zbc, Zbr and Zbt sub-extensions are implemented.
The rotate instructions ``ror`` and ``rol`` (Zbb), ternary instructions ``cmov``, ``cmix``, ``fsl`` and ``fsr`` as well as cyclic redundancy checks ``crc32[c]`` (Zbr) are completed in 2 cycles. All remaining instructions complete in one cycle.
.. _mult-div:

View file

@ -94,8 +94,8 @@ Parameters
| ``RV32B`` | bit | 0 | *EXPERIMENTAL* - B(itmanipulation) extension enable: |
| | | | Currently supported Z-extensions: Zbb (base), Zbs (single-bit) |
| | | | Zbp (bit permutation), Zbe (bit extract/deposit), |
| | | | Zbf (bit-field place) Zbc (carry-less multiplication)
| | | | and Zbt (ternary) |
| | | | Zbf (bit-field place) Zbc (carry-less multiplication) |
| | | | Zbr (cyclic redundancy check) and Zbt (ternary) |
+------------------------------+-------------+------------+-----------------------------------------------------------------+
| ``BranchTargetALU`` | bit | 0 | *EXPERIMENTAL* - Enables branch target ALU removing a stall |
| | | | cycle from taken branches |

View file

@ -595,5 +595,5 @@
gen_test: riscv_rand_instr_test
gen_opts: >
+enable_b_extension=1
+enable_bitmanip_groups=zbb,zbt,zbs,zbp,zbf,zbe,zbc
+enable_bitmanip_groups=zbb,zbt,zbs,zbp,zbf,zbe,zbc,zbr
rtl_test: core_ibex_base_test

View file

@ -307,12 +307,16 @@ module ibex_alu #(
// select shifter input
// for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen.
unique case (1'b1)
bfp_op: shift_result = bfp_mask_rev;
shift_sbmode: shift_result = 32'h8000_0000; // rev(32'h1)
shift_left: shift_result = operand_a_rev;
default: shift_result = operand_a_i;
endcase
if (shift_sbmode) begin
shift_result = 32'h8000_0000; // rev(32'h1)
end else begin
unique case (1'b1)
bfp_op: shift_result = bfp_mask_rev;
shift_left: shift_result = operand_a_rev;
default: shift_result = operand_a_i;
endcase
end
shift_result_ext =
$signed({shift_ones | (shift_arith & shift_result[31]), shift_result}) >>> shift_amt[4:0];
@ -774,14 +778,182 @@ module ibex_alu #(
end
end
///////////////////////////////////////////////////
// Carry-less Multiply + Cyclic Redundancy Check //
///////////////////////////////////////////////////
// Carry-less multiplication can be understood as multiplication based on
// the addition interpreted as the bit-wise xor operation.
//
// Example: 1101 X 1011 = 1111111:
//
// 1011 X 1101
// -----------
// 1101
// xor 1101
// ---------
// 10111
// xor 0000
// ----------
// 010111
// xor 1101
// -----------
// 1111111
//
// Architectural details:
// A 32 x 32-bit array
// [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
// is generated. The entries of the array are pairwise 'xor-ed'
// together in a 5-stage binary tree.
//
//
// Cyclic Redundancy Check:
//
// CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For
// documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.)
// see http://reveng.sourceforge.net/crc-catalogue/all.htm
// A useful guide to crc arithmetic and algorithms is given here:
// http://www.piclist.com/techref/method/math/crcguide.html.
//
// The CRC operation solves the following equation using binary polynomial arithmetic:
//
// rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x)
//
// where P denotes the lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal
// of a, and n = 8, 16, or 32 for the .b, .h, .w variants. {a, b} denotes bit concatenation.
//
// Using Barrett reduction, one can show that
//
// M(x) * x**n mod P(x) = R(x) =
// ((M(x) * x**n) & {deg(P(x)){1'b1}}) ^ ((M(x) * x**-(deg(P(x)) - n)) cx mu(x) cx P(x)),
//
// Where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less
// multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for
// rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get
//
// rd = rev( (rev(rs1) << n) ^ ((rev(rs1) >> (32-n)) cx mu cx P) )
// = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P)
// ^-- cycle 0--------------------^
// ^- cycle 1 -------------------------------------------^
//
// In the last step we used the fact that carry-less multiplication is bit-order agnostic:
// rev(a cx b) = rev(a) cx rev(b).
logic clmul_rmode;
logic clmul_hmode;
logic [31:0] clmul_op_a;
logic [31:0] clmul_op_b;
logic [31:0] operand_b_rev;
logic [31:0] clmul_and_stage[32];
logic [31:0] clmul_xor_stage1[16];
logic [31:0] clmul_xor_stage2[8];
logic [31:0] clmul_xor_stage3[4];
logic [31:0] clmul_xor_stage4[2];
logic [31:0] clmul_result_raw;
logic [31:0] clmul_result_rev;
for (genvar i=0; i<32; i++) begin: gen_rev_operand_b
assign operand_b_rev[i] = operand_b_i[31-i];
end
assign clmul_rmode = operator_i == ALU_CLMULR;
assign clmul_hmode = operator_i == ALU_CLMULH;
// CRC
// *_POLYNOMIAL: lower 32 bits (P) of the CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI)
// generator polynomials {1, P}.
// *_MU_REV: presumably rev(mu) for the respective polynomial, where
// mu = polydiv(x**64, {1,P}) & 0xffffffff -- NOTE(review): values not re-derived here.
localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7;
localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641;
localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41;
localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1;
logic crc_op;
logic crc_hmode;
logic crc_bmode;
logic crc_cpoly;
logic [31:0] crc_operand;
logic [31:0] crc_poly;
logic [31:0] crc_mu_rev;
assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) |
(operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) |
(operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B);
assign crc_cpoly = (operator_i == ALU_CRC32C_W) |
(operator_i == ALU_CRC32C_H) |
(operator_i == ALU_CRC32C_B);
assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H);
assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B);
assign crc_poly = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL;
assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV;
// Left-align the CRC input data: for the .b variants the low byte and for the .h variants the
// low half-word of operand_a_i is moved to the most significant bits and zero-padded below;
// the .w variants (default) use operand_a_i unchanged. This is operand_a_i << (32-n) with
// n = 8, 16 or 32, respectively.
always_comb begin
unique case(1'b1)
crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0};
crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0};
default: crc_operand = operand_a_i;
endcase
end
// Select clmul input
// CRC operations reuse the carry-less multiplier in both cycles:
//   - first cycle (instr_first_cycle_i set): crc_operand cx crc_mu_rev
//   - otherwise:                             imd_val_q_i cx crc_poly
// imd_val_q_i is presumably the registered first-cycle result (imd_val_d_o is driven with
// clmul_result_rev for CRC operators) -- the register itself is outside this module.
// For all non-CRC operators, clmulr / clmulh feed the bit-reversed operands and every other
// operator feeds operand_a_i / operand_b_i unchanged.
always_comb begin
if (crc_op) begin
clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i;
clmul_op_b = instr_first_cycle_i ? crc_mu_rev : crc_poly;
end else begin
clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
end
end
for (genvar i=0; i<32; i++) begin : gen_clmul_and_op
assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0;
end
for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1
assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1];
end
for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2
assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1];
end
for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3
assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1];
end
for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4
assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1];
end
assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];
for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result
assign clmul_result_rev[i] = clmul_result_raw[31-i];
end
// clmulr_result = rev(clmul(rev(a), rev(b)))
// clmulh_result = clmulr_result >> 1
always_comb begin
case(1'b1)
clmul_rmode: clmul_result = clmul_result_rev;
clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]};
default: clmul_result = clmul_result_raw;
endcase
end
//////////////////////////////////////
// Multicycle Bitmanip Instructions //
//////////////////////////////////////
// Ternary instructions + Shift Rotations
// Ternary instructions + Shift Rotations + CRC
// For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the
// second cycle. operand_b_i is always tied to rs2.
always_comb begin
unique case (operator_i)
ALU_CMOV: begin
@ -818,6 +990,23 @@ module ibex_alu #(
imd_val_we_o = 1'b0;
end
end
ALU_CRC32_W, ALU_CRC32C_W,
ALU_CRC32_H, ALU_CRC32C_H,
ALU_CRC32_B, ALU_CRC32C_B: begin
imd_val_d_o = clmul_result_rev;
unique case(1'b1)
crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8);
crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16);
default: multicycle_result = clmul_result_rev;
endcase
if (instr_first_cycle_i) begin
imd_val_we_o = 1'b1;
end else begin
imd_val_we_o = 1'b0;
end
end
default: begin
imd_val_d_o = operand_a_i;
imd_val_we_o = 1'b0;
@ -870,94 +1059,6 @@ module ibex_alu #(
assign sext_result = (operator_i == ALU_SEXTB) ?
{ {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]};
/////////////////////////
// Carry-less Multiply //
/////////////////////////
// Carry-less multiplication can be understood as multiplication based on
// the addition interpreted as the bit-wise xor operation.
//
// Example: 1101 X 1011 = 1111111:
//
// 1011 X 1101
// -----------
// 1101
// xor 1101
// ---------
// 10111
// xor 0000
// ----------
// 010111
// xor 1101
// -----------
// 1111111
//
// Architectural details:
// A 32 x 32-bit array
// [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ]
// is generated. The entries of the array are pairwise 'xor-ed'
// together in a 5-stage binary tree.
logic clmul_rmode;
logic clmul_hmode;
logic [31:0] clmul_op_a;
logic [31:0] clmul_op_b;
logic [31:0] operand_b_rev;
logic [31:0] clmul_and_stage[32];
logic [31:0] clmul_xor_stage1[16];
logic [31:0] clmul_xor_stage2[8];
logic [31:0] clmul_xor_stage3[4];
logic [31:0] clmul_xor_stage4[2];
logic [31:0] clmul_result_raw;
logic [31:0] clmul_result_rev;
for (genvar i=0; i<32; i++) begin: gen_rev_operand_b
assign operand_b_rev[i] = operand_b_i[31-i];
end
assign clmul_rmode = operator_i == ALU_CLMULR;
assign clmul_hmode = operator_i == ALU_CLMULH;
assign clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i;
assign clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i;
for (genvar i=0; i<32; i++) begin : gen_clmul_and_op
assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0;
end
for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1
assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1];
end
for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2
assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1];
end
for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3
assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1];
end
for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4
assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1];
end
assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1];
for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result
assign clmul_result_rev[i] = clmul_result_raw[31-i];
end
// clmulr_result = rev(clmul(rev(a), rev(b)))
// clmulh_result = clmulr_result >> 1
always_comb begin
case(1'b1)
clmul_rmode: clmul_result = clmul_result_rev;
clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]};
default: clmul_result = clmul_result_raw;
endcase
end
end else begin : g_no_alu_rvb
// RV32B result signals
assign minmax_result = '0;
@ -1024,7 +1125,12 @@ module ibex_alu #(
// Ternary Bitmanip Operations (RV32B)
ALU_CMIX, ALU_CMOV,
ALU_FSL, ALU_FSR,
ALU_ROL, ALU_ROR: result_o = multicycle_result;
// Rotate Shift (RV32B)
ALU_ROL, ALU_ROR,
// Cyclic Redundancy Checks (RV32B)
ALU_CRC32_W, ALU_CRC32C_W,
ALU_CRC32_H, ALU_CRC32C_H,
ALU_CRC32_B, ALU_CRC32C_B: result_o = multicycle_result;
// Single-Bit Bitmanip Operations (RV32B)
ALU_SBSET, ALU_SBCLR,
@ -1040,7 +1146,7 @@ module ibex_alu #(
// Bit Field Place (RV32B)
ALU_BFP: result_o = bfp_result;
// Carry-less Multiply Operations (RV32B Ops)
// Carry-less Multiply Operations (RV32B)
ALU_CLMUL, ALU_CLMULR,
ALU_CLMULH: result_o = clmul_result;

View file

@ -346,11 +346,17 @@ module ibex_decoder #(
end
5'b0_1100: begin
unique case(instr[26:20])
7'b00_00000, // clz
7'b00_00001, // ctz
7'b00_00010, // pcnt
7'b00_00100, // sext.b
7'b00_00101: illegal_insn = RV32B ? 1'b0 : 1'b1; // sext.h
7'b000_0000, // clz
7'b000_0001, // ctz
7'b000_0010, // pcnt
7'b000_0100, // sext.b
7'b000_0101, // sext.h
7'b001_0000, // crc32.b
7'b001_0001, // crc32.h
7'b001_0010, // crc32.w
7'b001_1000, // crc32c.b
7'b001_1001, // crc32c.h
7'b001_1010: illegal_insn = RV32B ? 1'b0 : 1'b1; // crc32c.w
default: illegal_insn = 1'b1;
endcase
@ -775,11 +781,35 @@ module ibex_decoder #(
5'b0_0001: if (instr_alu[26] == 0) alu_operator_o = ALU_SHFL;
5'b0_1100: begin
unique case (instr_alu[26:20])
7'b000_0000: alu_operator_o = ALU_CLZ; // Count Leading Zeros
7'b000_0001: alu_operator_o = ALU_CTZ; // Count Trailing Zeros
7'b000_0010: alu_operator_o = ALU_PCNT; // Count Set Bits
7'b000_0100: alu_operator_o = ALU_SEXTB; // Sign-extend Byte
7'b000_0101: alu_operator_o = ALU_SEXTH; // Sign-extend Half-word
7'b000_0000: alu_operator_o = ALU_CLZ; // clz
7'b000_0001: alu_operator_o = ALU_CTZ; // ctz
7'b000_0010: alu_operator_o = ALU_PCNT; // pcnt
7'b000_0100: alu_operator_o = ALU_SEXTB; // sext.b
7'b000_0101: alu_operator_o = ALU_SEXTH; // sext.h
7'b001_0000: begin
alu_operator_o = ALU_CRC32_B; // crc32.b
alu_multicycle_o = 1'b1;
end
7'b001_0001: begin
alu_operator_o = ALU_CRC32_H; // crc32.h
alu_multicycle_o = 1'b1;
end
7'b001_0010: begin
alu_operator_o = ALU_CRC32_W; // crc32.w
alu_multicycle_o = 1'b1;
end
7'b001_1000: begin
alu_operator_o = ALU_CRC32C_B; // crc32c.b
alu_multicycle_o = 1'b1;
end
7'b001_1001: begin
alu_operator_o = ALU_CRC32C_H; // crc32c.h
alu_multicycle_o = 1'b1;
end
7'b001_1010: begin
alu_operator_o = ALU_CRC32C_W; // crc32c.w
alu_multicycle_o = 1'b1;
end
default: ;
endcase
end

View file

@ -121,7 +121,15 @@ typedef enum logic [5:0] {
// RV32B
ALU_CLMUL,
ALU_CLMULR,
ALU_CLMULH
ALU_CLMULH,
// Cyclic Redundancy Check
ALU_CRC32_B,
ALU_CRC32C_B,
ALU_CRC32_H,
ALU_CRC32C_H,
ALU_CRC32_W,
ALU_CRC32C_W
} alu_op_e;
typedef enum logic [1:0] {

View file

@ -1019,6 +1019,14 @@ module ibex_tracer (
INSN_CLMULR: decode_r_insn("clmulr");
INSN_CLMULH: decode_r_insn("clmulh");
// RV32B - ZBR
INSN_CRC32_B: decode_r1_insn("crc32.b");
INSN_CRC32_H: decode_r1_insn("crc32.h");
INSN_CRC32_W: decode_r1_insn("crc32.w");
INSN_CRC32C_B: decode_r1_insn("crc32c.b");
INSN_CRC32C_H: decode_r1_insn("crc32c.h");
INSN_CRC32C_W: decode_r1_insn("crc32c.w");
default: decode_mnemonic("INVALID");
endcase
end

View file

@ -251,6 +251,14 @@ parameter logic [31:0] INSN_CLMUL = {7'b0000101, 10'b?, 3'b001, 5'b?, {OPCODE_O
parameter logic [31:0] INSN_CLMULR = {7'b0000101, 10'b?, 3'b010, 5'b?, {OPCODE_OP} };
parameter logic [31:0] INSN_CLMULH = {7'b0000101, 10'b?, 3'b011, 5'b?, {OPCODE_OP} };
// ZBR
parameter logic [31:0] INSN_CRC32_B = {7'b0110000, 5'b10000, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} };
parameter logic [31:0] INSN_CRC32_H = {7'b0110000, 5'b10001, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} };
parameter logic [31:0] INSN_CRC32_W = {7'b0110000, 5'b10010, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} };
parameter logic [31:0] INSN_CRC32C_B = {7'b0110000, 5'b11000, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} };
parameter logic [31:0] INSN_CRC32C_H = {7'b0110000, 5'b11001, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} };
parameter logic [31:0] INSN_CRC32C_W = {7'b0110000, 5'b11010, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} };
// LOAD & STORE
parameter logic [31:0] INSN_LOAD = {25'b?, {OPCODE_LOAD } };
parameter logic [31:0] INSN_STORE = {25'b?, {OPCODE_STORE} };