diff --git a/doc/instruction_decode_execute.rst b/doc/instruction_decode_execute.rst index 3880479e..00e090f1 100644 --- a/doc/instruction_decode_execute.rst +++ b/doc/instruction_decode_execute.rst @@ -66,8 +66,9 @@ Other blocks use the ALU for the following tasks: Support for the RISC-V Bitmanipulation Extension (Document Version 0.92, November 8, 2019) is enabled via the parameter ``RV32B``. This feature is *EXPERIMENTAL* and the details of its impact are not yet documented here. -Currently the Zbb, Zbs, Zbp, Zbe, Zbf, Zbc and Zbt sub-extensions are implemented. -All instructions are carried out in a single clock cycle. +Currently the Zbb, Zbs, Zbp, Zbe, Zbf, Zbc, Zbr and Zbt sub-extensions are implemented. +The rotate instructions ``ror`` and ``rol`` (Zbb), ternary instructions ``cmov``, ``cmix``, ``fsl`` and ``fsr`` as well as cyclic redundancy checks ``crc32[c]`` (Zbr) are completed in 2 cycles. All remaining instructions complete in one cycle. + .. _mult-div: diff --git a/doc/integration.rst b/doc/integration.rst index 2825e430..5cd7742f 100644 --- a/doc/integration.rst +++ b/doc/integration.rst @@ -94,8 +94,8 @@ Parameters | ``RV32B`` | bit | 0 | *EXPERIMENTAL* - B(itmanipulation) extension enable: | | | | | Currently supported Z-extensions: Zbb (base), Zbs (single-bit) | | | | | Zbp (bit permutation), Zbe (bit extract/deposit), | -| | | | Zbf (bit-field place) Zbc (carry-less multiplication) -| | | | and Zbt (ternary) | +| | | | Zbf (bit-field place) Zbc (carry-less multiplication) | +| | | | Zbr (cyclic redundancy check) and Zbt (ternary) | +------------------------------+-------------+------------+-----------------------------------------------------------------+ | ``BranchTargetALU`` | bit | 0 | *EXPERIMENTAL* - Enables branch target ALU removing a stall | | | | | cycle from taken branches | diff --git a/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml b/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml index d9d973dc..1ab636a6 100644 --- 
a/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml +++ b/dv/uvm/core_ibex/riscv_dv_extension/testlist.yaml @@ -595,5 +595,5 @@ gen_test: riscv_rand_instr_test gen_opts: > +enable_b_extension=1 - +enable_bitmanip_groups=zbb,zbt,zbs,zbp,zbf,zbe,zbc + +enable_bitmanip_groups=zbb,zbt,zbs,zbp,zbf,zbe,zbc,zbr rtl_test: core_ibex_base_test diff --git a/rtl/ibex_alu.sv b/rtl/ibex_alu.sv index b2d55359..de0e59f5 100644 --- a/rtl/ibex_alu.sv +++ b/rtl/ibex_alu.sv @@ -307,12 +307,16 @@ module ibex_alu #( // select shifter input // for bfp, sbmode and shift_left the corresponding bit-reversed input is chosen. - unique case (1'b1) - bfp_op: shift_result = bfp_mask_rev; - shift_sbmode: shift_result = 32'h8000_0000; // rev(32'h1) - shift_left: shift_result = operand_a_rev; - default: shift_result = operand_a_i; - endcase + if (shift_sbmode) begin + shift_result = 32'h8000_0000; // rev(32'h1) + end else begin + unique case (1'b1) + bfp_op: shift_result = bfp_mask_rev; + shift_left: shift_result = operand_a_rev; + default: shift_result = operand_a_i; + endcase + end + shift_result_ext = $signed({shift_ones | (shift_arith & shift_result[31]), shift_result}) >>> shift_amt[4:0]; @@ -774,14 +778,182 @@ module ibex_alu #( end end + /////////////////////////////////////////////////// + // Carry-less Multiply + Cyclic Redundancy Check // + /////////////////////////////////////////////////// + + // Carry-less multiplication can be understood as multiplication based on + // the addition interpreted as the bit-wise xor operation. + // + // Example: 1101 X 1011 = 1111111: + // + // 1011 X 1101 + // ----------- + // 1101 + // xor 1101 + // --------- + // 10111 + // xor 0000 + // ---------- + // 010111 + // xor 1101 + // ----------- + // 1111111 + // + // Architectural details: + // A 32 x 32-bit array + // [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ] + // is generated. The entries of the array are pairwise 'xor-ed' + // together in a 5-stage binary tree. 
+ // + // + // Cyclic Redundancy Check: + // + // CRC-32 (CRC-32/ISO-HDLC) and CRC-32C (CRC-32/ISCSI) are directly implemented. For + // documentation of the crc configuration (crc-polynomials, initialization, reflection, etc.) + // see http://reveng.sourceforge.net/crc-catalogue/all.htm + // A useful guide to crc arithmetic and algorithms is given here: + // http://www.piclist.com/techref/method/math/crcguide.html. + // + // The CRC operation solves the following equation using binary polynomial arithmetic: + // + // rev(rd)(x) = rev(rs1)(x) * x**n mod {1, P}(x) + // + // where P denotes the lower 32 bits of the corresponding CRC polynomial, rev(a) the bit reversal + // of a, n = 8, 16, or 32 for .b, .h, .w -variants. {a, b} denotes bit concatenation. + // + // Using Barrett reduction, one can show that + // + // M(x) mod P(x) = R(x) = + // (M(x) * x**n) & {deg(P(x)){1'b1}} ^ (M(x) * x**-(deg(P(x)) - n)) cx mu(x) cx P(x), + // + // where mu(x) = polydiv(x**64, {1,P}) & 0xffffffff. Here, 'cx' refers to carry-less + // multiplication. Substituting rev(rd)(x) for R(x) and rev(rs1)(x) for M(x) and solving for + // rd(x) with P(x) a crc32 polynomial (deg(P(x)) = 32), we get + // + // rd = rev( (rev(rs1) << n) ^ ((rev(rs1) >> (32-n)) cx mu cx P) ) + // = (rs1 >> n) ^ rev(rev( (rs1 << (32-n)) cx rev(mu)) cx P) + // ^-- cycle 0--------------------^ + // ^- cycle 1 -------------------------------------------^ + // + // In the last step we used the fact that carry-less multiplication is bit-order agnostic: + // rev(a cx b) = rev(a) cx rev(b). 
+ + logic clmul_rmode; + logic clmul_hmode; + logic [31:0] clmul_op_a; + logic [31:0] clmul_op_b; + logic [31:0] operand_b_rev; + logic [31:0] clmul_and_stage[32]; + logic [31:0] clmul_xor_stage1[16]; + logic [31:0] clmul_xor_stage2[8]; + logic [31:0] clmul_xor_stage3[4]; + logic [31:0] clmul_xor_stage4[2]; + + logic [31:0] clmul_result_raw; + logic [31:0] clmul_result_rev; + + for (genvar i=0; i<32; i++) begin: gen_rev_operand_b + assign operand_b_rev[i] = operand_b_i[31-i]; + end + + assign clmul_rmode = operator_i == ALU_CLMULR; + assign clmul_hmode = operator_i == ALU_CLMULH; + + // CRC + localparam logic [31:0] CRC32_POLYNOMIAL = 32'h04c1_1db7; + localparam logic [31:0] CRC32_MU_REV = 32'hf701_1641; + + localparam logic [31:0] CRC32C_POLYNOMIAL = 32'h1edc_6f41; + localparam logic [31:0] CRC32C_MU_REV = 32'hdea7_13f1; + + logic crc_op; + logic crc_hmode; + logic crc_bmode; + + logic crc_cpoly; + + logic [31:0] crc_operand; + logic [31:0] crc_poly; + logic [31:0] crc_mu_rev; + + assign crc_op = (operator_i == ALU_CRC32C_W) | (operator_i == ALU_CRC32_W) | + (operator_i == ALU_CRC32C_H) | (operator_i == ALU_CRC32_H) | + (operator_i == ALU_CRC32C_B) | (operator_i == ALU_CRC32_B); + + assign crc_cpoly = (operator_i == ALU_CRC32C_W) | + (operator_i == ALU_CRC32C_H) | + (operator_i == ALU_CRC32C_B); + + assign crc_hmode = (operator_i == ALU_CRC32_H) | (operator_i == ALU_CRC32C_H); + assign crc_bmode = (operator_i == ALU_CRC32_B) | (operator_i == ALU_CRC32C_B); + + assign crc_poly = crc_cpoly ? CRC32C_POLYNOMIAL : CRC32_POLYNOMIAL; + assign crc_mu_rev = crc_cpoly ? CRC32C_MU_REV : CRC32_MU_REV; + + always_comb begin + unique case(1'b1) + crc_bmode: crc_operand = {operand_a_i[7:0], 24'h0}; + crc_hmode: crc_operand = {operand_a_i[15:0], 16'h0}; + default: crc_operand = operand_a_i; + endcase + end + + // Select clmul input + always_comb begin + if (crc_op) begin + clmul_op_a = instr_first_cycle_i ? crc_operand : imd_val_q_i; + clmul_op_b = instr_first_cycle_i ? 
crc_mu_rev : crc_poly; + end else begin + clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i; + clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i; + end + end + + for (genvar i=0; i<32; i++) begin : gen_clmul_and_op + assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0; + end + + for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1 + assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1]; + end + + for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2 + assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1]; + end + + for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3 + assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1]; + end + + for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4 + assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1]; + end + + assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1]; + + for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result + assign clmul_result_rev[i] = clmul_result_raw[31-i]; + end + + // clmulr_result = rev(clmul(rev(a), rev(b))) + // clmulh_result = clmulr_result >> 1 + always_comb begin + case(1'b1) + clmul_rmode: clmul_result = clmul_result_rev; + clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]}; + default: clmul_result = clmul_result_raw; + endcase + end ////////////////////////////////////// // Multicycle Bitmanip Instructions // ////////////////////////////////////// - // Ternary instructions + Shift Rotations + // Ternary instructions + Shift Rotations + CRC // For ternary instructions (zbt), operand_a_i is tied to rs1 in the first cycle and rs3 in the // second cycle. operand_b_i is always tied to rs2. 
+ always_comb begin unique case (operator_i) ALU_CMOV: begin @@ -818,6 +990,23 @@ module ibex_alu #( imd_val_we_o = 1'b0; end end + + ALU_CRC32_W, ALU_CRC32C_W, + ALU_CRC32_H, ALU_CRC32C_H, + ALU_CRC32_B, ALU_CRC32C_B: begin + imd_val_d_o = clmul_result_rev; + unique case(1'b1) + crc_bmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 8); + crc_hmode: multicycle_result = clmul_result_rev ^ (operand_a_i >> 16); + default: multicycle_result = clmul_result_rev; + endcase + if (instr_first_cycle_i) begin + imd_val_we_o = 1'b1; + end else begin + imd_val_we_o = 1'b0; + end + end + default: begin imd_val_d_o = operand_a_i; imd_val_we_o = 1'b0; @@ -870,94 +1059,6 @@ module ibex_alu #( assign sext_result = (operator_i == ALU_SEXTB) ? { {24{operand_a_i[7]}}, operand_a_i[7:0]} : { {16{operand_a_i[15]}}, operand_a_i[15:0]}; - ///////////////////////// - // Carry-less Multiply // - ///////////////////////// - - // Carry-less multiplication can be understood as multiplication based on - // the addition interpreted as the bit-wise xor operation. - // - // Example: 1101 X 1011 = 1111111: - // - // 1011 X 1101 - // ----------- - // 1101 - // xor 1101 - // --------- - // 10111 - // xor 0000 - // ---------- - // 010111 - // xor 1101 - // ----------- - // 1111111 - // - // Architectural details: - // A 32 x 32-bit array - // [ operand_b[i] ? (operand_a << i) : '0 for i in 0 ... 31 ] - // is generated. The entries of the array are pairwise 'xor-ed' - // together in a 5-stage binary tree. 
- - logic clmul_rmode; - logic clmul_hmode; - logic [31:0] clmul_op_a; - logic [31:0] clmul_op_b; - logic [31:0] operand_b_rev; - logic [31:0] clmul_and_stage[32]; - logic [31:0] clmul_xor_stage1[16]; - logic [31:0] clmul_xor_stage2[8]; - logic [31:0] clmul_xor_stage3[4]; - logic [31:0] clmul_xor_stage4[2]; - - logic [31:0] clmul_result_raw; - logic [31:0] clmul_result_rev; - - for (genvar i=0; i<32; i++) begin: gen_rev_operand_b - assign operand_b_rev[i] = operand_b_i[31-i]; - end - - assign clmul_rmode = operator_i == ALU_CLMULR; - assign clmul_hmode = operator_i == ALU_CLMULH; - - assign clmul_op_a = clmul_rmode | clmul_hmode ? operand_a_rev : operand_a_i; - assign clmul_op_b = clmul_rmode | clmul_hmode ? operand_b_rev : operand_b_i; - - for (genvar i=0; i<32; i++) begin : gen_clmul_and_op - assign clmul_and_stage[i] = clmul_op_b[i] ? clmul_op_a << i : '0; - end - - for (genvar i=0; i<16; i++) begin : gen_clmul_xor_op_l1 - assign clmul_xor_stage1[i] = clmul_and_stage[2*i] ^ clmul_and_stage[2*i+1]; - end - - for (genvar i=0; i<8; i++) begin : gen_clmul_xor_op_l2 - assign clmul_xor_stage2[i] = clmul_xor_stage1[2*i] ^ clmul_xor_stage1[2*i+1]; - end - - for (genvar i=0; i<4; i++) begin : gen_clmul_xor_op_l3 - assign clmul_xor_stage3[i] = clmul_xor_stage2[2*i] ^ clmul_xor_stage2[2*i+1]; - end - - for (genvar i=0; i<2; i++) begin : gen_clmul_xor_op_l4 - assign clmul_xor_stage4[i] = clmul_xor_stage3[2*i] ^ clmul_xor_stage3[2*i+1]; - end - - assign clmul_result_raw = clmul_xor_stage4[0] ^ clmul_xor_stage4[1]; - - for (genvar i=0; i<32; i++) begin : gen_rev_clmul_result - assign clmul_result_rev[i] = clmul_result_raw[31-i]; - end - - // clmulr_result = rev(clmul(rev(a), rev(b))) - // clmulh_result = clmulr_result >> 1 - always_comb begin - case(1'b1) - clmul_rmode: clmul_result = clmul_result_rev; - clmul_hmode: clmul_result = {1'b0, clmul_result_rev[31:1]}; - default: clmul_result = clmul_result_raw; - endcase - end - end else begin : g_no_alu_rvb // RV32B result 
signals assign minmax_result = '0; @@ -1024,7 +1125,12 @@ module ibex_alu #( // Ternary Bitmanip Operations (RV32B) ALU_CMIX, ALU_CMOV, ALU_FSL, ALU_FSR, - ALU_ROL, ALU_ROR: result_o = multicycle_result; + // Rotate Shift (RV32B) + ALU_ROL, ALU_ROR, + // Cyclic Redundancy Checks (RV32B) + ALU_CRC32_W, ALU_CRC32C_W, + ALU_CRC32_H, ALU_CRC32C_H, + ALU_CRC32_B, ALU_CRC32C_B: result_o = multicycle_result; // Single-Bit Bitmanip Operations (RV32B) ALU_SBSET, ALU_SBCLR, @@ -1040,7 +1146,7 @@ module ibex_alu #( // Bit Field Place (RV32B) ALU_BFP: result_o = bfp_result; - // Carry-less Multiply Operations (RV32B Ops) + // Carry-less Multiply Operations (RV32B) ALU_CLMUL, ALU_CLMULR, ALU_CLMULH: result_o = clmul_result; diff --git a/rtl/ibex_decoder.sv b/rtl/ibex_decoder.sv index 9bbaa252..96acd3d3 100644 --- a/rtl/ibex_decoder.sv +++ b/rtl/ibex_decoder.sv @@ -346,11 +346,17 @@ module ibex_decoder #( end 5'b0_1100: begin unique case(instr[26:20]) - 7'b00_00000, // clz - 7'b00_00001, // ctz - 7'b00_00010, // pcnt - 7'b00_00100, // sext.b - 7'b00_00101: illegal_insn = RV32B ? 1'b0 : 1'b1; // sext.h + 7'b000_0000, // clz + 7'b000_0001, // ctz + 7'b000_0010, // pcnt + 7'b000_0100, // sext.b + 7'b000_0101, // sext.h + 7'b001_0000, // crc32.b + 7'b001_0001, // crc32.h + 7'b001_0010, // crc32.w + 7'b001_1000, // crc32c.b + 7'b001_1001, // crc32c.h + 7'b001_1010: illegal_insn = RV32B ? 
1'b0 : 1'b1; // crc32c.w default: illegal_insn = 1'b1; endcase @@ -775,11 +781,35 @@ module ibex_decoder #( 5'b0_0001: if (instr_alu[26] == 0) alu_operator_o = ALU_SHFL; 5'b0_1100: begin unique case (instr_alu[26:20]) - 7'b000_0000: alu_operator_o = ALU_CLZ; // Count Leading Zeros - 7'b000_0001: alu_operator_o = ALU_CTZ; // Count Trailing Zeros - 7'b000_0010: alu_operator_o = ALU_PCNT; // Count Set Bits - 7'b000_0100: alu_operator_o = ALU_SEXTB; // Sign-extend Byte - 7'b000_0101: alu_operator_o = ALU_SEXTH; // Sign-extend Half-word + 7'b000_0000: alu_operator_o = ALU_CLZ; // clz + 7'b000_0001: alu_operator_o = ALU_CTZ; // ctz + 7'b000_0010: alu_operator_o = ALU_PCNT; // pcnt + 7'b000_0100: alu_operator_o = ALU_SEXTB; // sext.b + 7'b000_0101: alu_operator_o = ALU_SEXTH; // sext.h + 7'b001_0000: begin + alu_operator_o = ALU_CRC32_B; // crc32.b + alu_multicycle_o = 1'b1; + end + 7'b001_0001: begin + alu_operator_o = ALU_CRC32_H; // crc32.h + alu_multicycle_o = 1'b1; + end + 7'b001_0010: begin + alu_operator_o = ALU_CRC32_W; // crc32.w + alu_multicycle_o = 1'b1; + end + 7'b001_1000: begin + alu_operator_o = ALU_CRC32C_B; // crc32c.b + alu_multicycle_o = 1'b1; + end + 7'b001_1001: begin + alu_operator_o = ALU_CRC32C_H; // crc32c.h + alu_multicycle_o = 1'b1; + end + 7'b001_1010: begin + alu_operator_o = ALU_CRC32C_W; // crc32c.w + alu_multicycle_o = 1'b1; + end default: ; endcase end diff --git a/rtl/ibex_pkg.sv b/rtl/ibex_pkg.sv index 1bd61098..3150dbff 100644 --- a/rtl/ibex_pkg.sv +++ b/rtl/ibex_pkg.sv @@ -121,7 +121,15 @@ typedef enum logic [5:0] { // RV32B ALU_CLMUL, ALU_CLMULR, - ALU_CLMULH + ALU_CLMULH, + + // Cyclic Redundancy Check + ALU_CRC32_B, + ALU_CRC32C_B, + ALU_CRC32_H, + ALU_CRC32C_H, + ALU_CRC32_W, + ALU_CRC32C_W } alu_op_e; typedef enum logic [1:0] { diff --git a/rtl/ibex_tracer.sv b/rtl/ibex_tracer.sv index f6b00d4d..efd6b9f5 100644 --- a/rtl/ibex_tracer.sv +++ b/rtl/ibex_tracer.sv @@ -1019,6 +1019,14 @@ module ibex_tracer ( INSN_CLMULR: 
decode_r_insn("clmulr"); INSN_CLMULH: decode_r_insn("clmulh"); + // RV32B - ZBR + INSN_CRC32_B: decode_r1_insn("crc32.b"); + INSN_CRC32_H: decode_r1_insn("crc32.h"); + INSN_CRC32_W: decode_r1_insn("crc32.w"); + INSN_CRC32C_B: decode_r1_insn("crc32c.b"); + INSN_CRC32C_H: decode_r1_insn("crc32c.h"); + INSN_CRC32C_W: decode_r1_insn("crc32c.w"); + default: decode_mnemonic("INVALID"); endcase end diff --git a/rtl/ibex_tracer_pkg.sv b/rtl/ibex_tracer_pkg.sv index e9329f84..9d11c88c 100644 --- a/rtl/ibex_tracer_pkg.sv +++ b/rtl/ibex_tracer_pkg.sv @@ -251,6 +251,14 @@ parameter logic [31:0] INSN_CLMUL = {7'b0000101, 10'b?, 3'b001, 5'b?, {OPCODE_O parameter logic [31:0] INSN_CLMULR = {7'b0000101, 10'b?, 3'b010, 5'b?, {OPCODE_OP} }; parameter logic [31:0] INSN_CLMULH = {7'b0000101, 10'b?, 3'b011, 5'b?, {OPCODE_OP} }; +// ZBR +parameter logic [31:0] INSN_CRC32_B = {7'b0110000, 5'b10000, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} }; +parameter logic [31:0] INSN_CRC32_H = {7'b0110000, 5'b10001, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} }; +parameter logic [31:0] INSN_CRC32_W = {7'b0110000, 5'b10010, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} }; +parameter logic [31:0] INSN_CRC32C_B = {7'b0110000, 5'b11000, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} }; +parameter logic [31:0] INSN_CRC32C_H = {7'b0110000, 5'b11001, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} }; +parameter logic [31:0] INSN_CRC32C_W = {7'b0110000, 5'b11010, 5'b?, 3'b001, 5'b?, {OPCODE_OP_IMM} }; + // LOAD & STORE parameter logic [31:0] INSN_LOAD = {25'b?, {OPCODE_LOAD } }; parameter logic [31:0] INSN_STORE = {25'b?, {OPCODE_STORE} };