mirror of
https://github.com/lowRISC/ibex.git
synced 2025-04-22 04:47:25 -04:00
Add chapters about hardware loops and multiply-accumulate
Add instruction specification of pulp extensions
This commit is contained in:
parent
dc8144a459
commit
e4cbf45209
4 changed files with 283 additions and 7 deletions
|
@ -1,3 +1,189 @@
|
|||
\chapter{PULP Hardware Loop Extensions}
|
||||
\label{chap:hwloop}
|
||||
|
||||
For increased efficiency of small loops, RI5CY supports hardware loops.
|
||||
Hardware loops make it possible to execute a piece of code multiple times,
|
||||
without the overhead of branches or updating a counter. Hardware loops involve
|
||||
zero stall cycles for jumping to the first instruction of a loop.
|
||||
|
||||
A hardware loop is defined by its start address (pointing to the first
|
||||
instruction in the loop), its end address (pointing to the instruction that
|
||||
will be executed last in the loop) and a counter that is decremented every time
|
||||
the loop body is executed. RI5CY contains two hardware loop register sets, each
|
||||
of which can store these three values.
|
||||
If the end address of the two hardware loops is identical, loop 0 has higher
|
||||
priority and only the loop counter for hardware loop 0 is decremented.
|
||||
|
||||
The instructions described below are used to set up the hardware loop registers.
|
||||
Note that the minimum loop size is two instructions.
|
||||
|
||||
For debugging and context switches the hardware loop registers are mapped into
|
||||
the CSR address space and thus it is possible to read and write them via
|
||||
\instr{csrr} and \instr{csrw} instructions.
|
||||
|
||||
|
||||
\section{Instructions}
|
||||
|
||||
\subsection{lp.starti L, uimmL}
|
||||
Sets the start address of a hardware loop.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{12}{ uimmL[11:0] }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{4}{ 0000 }
|
||||
\bitbox{1}{ L }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{12}{ uimmL[11:0] }
|
||||
\bitbox[]{5}{ 00000 }
|
||||
\bitbox[]{3}{ 000 }
|
||||
\bitbox[]{4}{ 0000 }
|
||||
\bitbox[]{1}{ L }
|
||||
\bitbox[]{7}{ 111 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{lpstart[L] = pc + (Zext(uimmL[11:0]) << 1)}
|
||||
|
||||
|
||||
\subsection{lp.endi L, uimmL}
|
||||
Sets the end address of a hardware loop.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{12}{ uimmL[11:0] }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{4}{ 0000 }
|
||||
\bitbox{1}{ L }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{12}{ uimmL[11:0] }
|
||||
\bitbox[]{5}{ 00000 }
|
||||
\bitbox[]{3}{ 001 }
|
||||
\bitbox[]{4}{ 0000 }
|
||||
\bitbox[]{1}{ L }
|
||||
\bitbox[]{7}{ 111 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{lpend[L] = pc + (Zext(uimmL[11:0]) << 1)}
|
||||
|
||||
|
||||
\subsection{lp.count L, rs1}
|
||||
Sets the number of iterations of a hardware loop.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{12}{ uimmL[11:0] }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{4}{ 0000 }
|
||||
\bitbox{1}{ L }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{12}{ 0000 0000 0000}
|
||||
\bitbox[]{5}{ src1 }
|
||||
\bitbox[]{3}{ 010 }
|
||||
\bitbox[]{4}{ 0000 }
|
||||
\bitbox[]{1}{ L }
|
||||
\bitbox[]{7}{ 111 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{lpcount[L] = rs1}
|
||||
|
||||
|
||||
\subsection{lp.counti L, uimmL}
|
||||
Sets the number of iterations of a hardware loop.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{12}{ uimmL[11:0] }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{4}{ 0000 }
|
||||
\bitbox{1}{ L }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{12}{ uimmL[11:0] }
|
||||
\bitbox[]{5}{ 00000 }
|
||||
\bitbox[]{3}{ 011 }
|
||||
\bitbox[]{4}{ 0000 }
|
||||
\bitbox[]{1}{ L }
|
||||
\bitbox[]{7}{ 111 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{lpcount[L] = Zext(uimmL[11:0])}
|
||||
|
||||
|
||||
\subsection{lp.setup L, rs1, uimmL}
|
||||
Sets up a hardware loop in one instruction. This instruction assumes that the
|
||||
next instruction is the start address of the loop.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{12}{ uimmL[11:0] }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{4}{ 0000 }
|
||||
\bitbox{1}{ L }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{12}{ uimmL[11:0] }
|
||||
\bitbox[]{5}{ src1 }
|
||||
\bitbox[]{3}{ 100 }
|
||||
\bitbox[]{4}{ 0000 }
|
||||
\bitbox[]{1}{ L }
|
||||
\bitbox[]{7}{ 111 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{lpstart[L] = pc + 4; lpend[L] = pc + Zext(uimmL[11:0]); lpcount[L] = rs1}
|
||||
|
||||
|
||||
\subsection{lp.setupi L, uimmS, uimmL}
|
||||
Sets up a hardware loop in one instruction. This instruction assumes that the
|
||||
next instruction is the start address of the loop. The number of iterations is
|
||||
given as an immediate.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{12}{ uimmL[11:0] }
|
||||
\bitbox{5}{ uimmS }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{4}{ 0000 }
|
||||
\bitbox{1}{ L }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{12}{ uimmL[11:0] }
|
||||
\bitbox[]{5}{ uimmS }
|
||||
\bitbox[]{3}{ 101 }
|
||||
\bitbox[]{4}{ 0000 }
|
||||
\bitbox[]{1}{ L }
|
||||
\bitbox[]{7}{ 111 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{lpstart[L] = pc + 4; lpend[L] = pc + Zext(uimmL[11:0]); lpcount[L] = Zext(uimmS)}
|
||||
|
||||
|
||||
\section{CSR Mapping}
|
||||
|
||||
\begin{table}[H]
|
||||
\caption{Control and Status Register Map}
|
||||
\label{tab:csr_map}
|
||||
\centering\begin{tabularx}{\linewidth}{@{}|cc|c|l|l|X|@{}} \toprule
|
||||
\multicolumn{2}{|c|}{\textbf{CSR Address}} & \textbf{Hex} & \textbf{Name} & \textbf{Access} & \textbf{Description} \\ \hline
|
||||
\textbf{[11:6]} & \textbf{[5:0]} & & & & \\ \toprule
|
||||
011110 & 110000 & 0x7B0 & lpstart[0] & R/W & Hardware Loop 0 Start \\ \hline
|
||||
011110 & 110001 & 0x7B1 & lpend[0] & R/W & Hardware Loop 0 End \\ \hline
|
||||
011110 & 110010 & 0x7B2 & lpcount[0] & R/W & Hardware Loop 0 Counter \\ \hline
|
||||
011110 & 110100 & 0x7B4 & lpstart[1] & R/W & Hardware Loop 1 Start \\ \hline
|
||||
011110 & 110101 & 0x7B5 & lpend[1] & R/W & Hardware Loop 1 End \\ \hline
|
||||
011110 & 110110 & 0x7B6 & lpcount[1] & R/W & Hardware Loop 1 Counter \\ \bottomrule
|
||||
\end{tabularx}
|
||||
\end{table}
|
||||
|
|
|
@ -1,7 +1,89 @@
|
|||
\chapter{Multiply-Accumulate}
|
||||
\label{chap:mac}
|
||||
|
||||
\rvcore uses a single-cycle 32 bit lower result multiplier. Only a subset of the
|
||||
standard M extension is implemented, i.e. the \instr{mul} instruction.
|
||||
Divisions and multiplications that return the upper half of the result are not
|
||||
supported.
|
||||
\rvcore uses a single-cycle 32-bit $\times$ 32-bit multiplier with a 32-bit
|
||||
result. Only a subset of the standard M extension is implemented, i.e.\ the
|
||||
\instr{mul} instruction. Divisions and multiplications that return the upper
|
||||
32 bits of the result are not supported.
|
||||
|
||||
Specifically the following instruction is supported:
|
||||
\begin{itemize}
|
||||
\item \instr{mul}
|
||||
\end{itemize}
|
||||
|
||||
The following instructions are \textbf{not} supported:
|
||||
\begin{itemize}
|
||||
\item \instr{mulh}
|
||||
\item \instr{mulhsu}
|
||||
\item \instr{mulhu}
|
||||
\item \instr{div}
|
||||
\item \instr{divu}
|
||||
\item \instr{rem}
|
||||
\item \instr{remu}
|
||||
\end{itemize}
|
||||
|
||||
Instead \rvcore supports non-standard extensions for multiply-accumulate and
|
||||
half-word multiplications.
|
||||
|
||||
\section{Instructions}
|
||||
|
||||
\subsection{p.mac rD, rs1, rs2}
|
||||
Multiply-Accumulate on 32-bit $\times$ 32-bit with a 32-bit result.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,30,29,25,24,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{2}{ 00 }
|
||||
\bitbox{5}{ rs3 }
|
||||
\bitbox{5}{ rs2 }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{3}{ funct3 }
|
||||
\bitbox{5}{ rd }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{2}{ 00 }
|
||||
\bitbox[]{5}{ src3 }
|
||||
\bitbox[]{5}{ src2 }
|
||||
\bitbox[]{5}{ src1 }
|
||||
\bitbox[]{3}{ 000 }
|
||||
\bitbox[]{5}{ dest }
|
||||
\bitbox[]{7}{ 101 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{rD = rs1 * rs2 + rs3}
|
||||
|
||||
|
||||
\subsection{p.mac\{.zl,.sl,.zh,.sh\}\{.zl,.sl,.zh,.sh\} rD, rs1, rs2}
|
||||
Multiply-Accumulate on 16-bit $\times$ 16-bit with a 32-bit result. The
|
||||
half-word and sign-mode that are used for the multiplication can be selected.
|
||||
|
||||
\begin{center}
|
||||
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
|
||||
\bitheader{31,30,29,25,24,20,19,15,14,12,11,7,6,0} \\
|
||||
\bitbox{1}{ S1 }
|
||||
\bitbox{1}{ S2 }
|
||||
\bitbox{5}{ rs3 }
|
||||
\bitbox{5}{ rs2 }
|
||||
\bitbox{5}{ rs1 }
|
||||
\bitbox{1}{ 1 }
|
||||
\bitbox{1}{ H1 }
|
||||
\bitbox{1}{ H2 }
|
||||
\bitbox{5}{ rd }
|
||||
\bitbox{7}{ opcode } \\
|
||||
|
||||
\bitbox[]{1}{ S1 }
|
||||
\bitbox[]{1}{ S2 }
|
||||
\bitbox[]{5}{ unused }
|
||||
\bitbox[]{5}{ src2 }
|
||||
\bitbox[]{5}{ src1 }
|
||||
\bitbox[]{1}{ 1 }
|
||||
\bitbox[]{1}{ H1 }
|
||||
\bitbox[]{1}{ H2 }
|
||||
\bitbox[]{5}{ dest }
|
||||
\bitbox[]{7}{ 101 1011 }
|
||||
\end{bytefield}
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{rD = rs1[H1*16+15:H1*16] * rs2[H2*16+15:H2*16] + rs3}
|
||||
|
||||
S1 and S2 determine the zero/sign-extension of rs1 and rs2. A value of
|
||||
\texttt{1} means sign-extension.
|
||||
|
|
|
@ -27,10 +27,10 @@ Figure~\ref{fig:ri5cy_overview} shows a block diagram of the core.
|
|||
Only the \instr{mul} instruction is supported.
|
||||
\item PULP-specific extensions \\
|
||||
\begin{itemize}
|
||||
\item Hardware Loops, see Chapter~\ref{chap:hwloop}
|
||||
\item ALU extensions, see Chapter~\ref{chap:aluext}
|
||||
\item Multiply-Accumulate extensions, see Chapter~\ref{chap:mac}
|
||||
\item Post-Incrementing load and stores, see Chapter~\ref{chap:lsu}
|
||||
\item Multiply-Accumulate extensions, see Chapter~\ref{chap:mac}
|
||||
\item ALU extensions, see Chapter~\ref{chap:aluext}
|
||||
\item Hardware Loops, see Chapter~\ref{chap:hwloop}
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
|
|
|
@ -107,3 +107,11 @@
|
|||
\caption{#3}
|
||||
\end{figure}}
|
||||
|
||||
% \instrDesc{<title>}{<operation>}{<content>}
%   #1: text used as the \subsection title.
%   #2: text typeset in typewriter font after a bold "Operation:" label.
%   #3: material placed inside a center environment between the title and
%       the operation line (presumably the bytefield encoding diagram --
%       TODO confirm against call sites).
% The trailing % on the first and last body lines suppresses the spurious
% spaces that the line ends would otherwise insert.
\newcommand\instrDesc[3]{%
|
||||
\subsection{#1}
|
||||
\begin{center}
|
||||
#3
|
||||
\end{center}
|
||||
\textbf{Operation:} \texttt{#2}%
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue