Add chapters about hardware loops and multiply-accumulate

Add instruction specification of pulp extensions
This commit is contained in:
Andreas Traber 2015-12-22 11:16:47 +01:00
parent dc8144a459
commit e4cbf45209
4 changed files with 283 additions and 7 deletions

View file

@ -1,3 +1,189 @@
\chapter{PULP Hardware Loop Extensions}
\label{chap:hwloop}
For increased efficiency of small loops, RI5CY supports hardware loops.
Hardware loops make it possible to execute a piece of code multiple times,
without the overhead of branches or updating a counter. Hardware loops involve
zero stall cycles for jumping to the first instruction of a loop.
A hardware loop is defined by its start address (pointing to the first
instruction in the loop), its end address (pointing to the instruction that
will be executed last in the loop) and a counter that is decremented every time
the loop body is executed. RI5CY contains two hardware loop register sets, each
of them can store these three values.
If the end address of the two hardware loops is identical, loop 0 has higher
priority and only the loop counter for hardware loop 0 is decremented.
The instructions described below are used to setup the hardware loop registers.
Note that the minimum loop size is two instructions.
For debugging and context switches the hardware loop registers are mapped into
the CSR address space and thus it is possible to read and write them via
\instr{csrr} and \instr{csrw} instructions.
\section{Instructions}
\subsection{lp.starti L, uimmL}
Sets the start address of a hardware loop.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
\bitbox{12}{ uimmL[11:0] }
\bitbox{5}{ rs1 }
\bitbox{3}{ funct3 }
\bitbox{4}{ 0000 }
\bitbox{1}{ L }
\bitbox{7}{ opcode } \\
\bitbox[]{12}{ uimmL[11:0] }
\bitbox[]{5}{ 00000 }
\bitbox[]{3}{ 000 }
\bitbox[]{4}{ 0000 }
\bitbox[]{1}{ L }
\bitbox[]{7}{ 111 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{lpstart[L] = pc + (Zext(uimmL[11:0]) << 1)}
\subsection{lp.endi L, uimmL}
Sets the end address of a hardware loop.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
\bitbox{12}{ uimmL[11:0] }
\bitbox{5}{ rs1 }
\bitbox{3}{ funct3 }
\bitbox{4}{ 0000 }
\bitbox{1}{ L }
\bitbox{7}{ opcode } \\
\bitbox[]{12}{ uimmL[11:0] }
\bitbox[]{5}{ 00000 }
\bitbox[]{3}{ 001 }
\bitbox[]{4}{ 0000 }
\bitbox[]{1}{ L }
\bitbox[]{7}{ 111 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{lpend[L] = pc + (Zext(uimmL[11:0]) << 1)}
\subsection{lp.count L, rs1}
Sets the number of iterations of a hardware loop.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
\bitbox{12}{ uimmL[11:0] }
\bitbox{5}{ rs1 }
\bitbox{3}{ funct3 }
\bitbox{4}{ 0000 }
\bitbox{1}{ L }
\bitbox{7}{ opcode } \\
\bitbox[]{12}{ 0000 0000 0000}
\bitbox[]{5}{ src1 }
\bitbox[]{3}{ 010 }
\bitbox[]{4}{ 0000 }
\bitbox[]{1}{ L }
\bitbox[]{7}{ 111 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{lpcount[L] = rs1}
\subsection{lp.counti L, uimmL}
Sets the number of iterations of a hardware loop.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
\bitbox{12}{ uimmL[11:0] }
\bitbox{5}{ rs1 }
\bitbox{3}{ funct3 }
\bitbox{4}{ 0000 }
\bitbox{1}{ L }
\bitbox{7}{ opcode } \\
\bitbox[]{12}{ uimmL[11:0] }
\bitbox[]{5}{ 00000 }
\bitbox[]{3}{ 011 }
\bitbox[]{4}{ 0000 }
\bitbox[]{1}{ L }
\bitbox[]{7}{ 111 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{lpcount[L] = Zext(uimmL[11:0])}
\subsection{lp.setup L, rs1, uimmL}
Sets up a hardware loop in one instruction. This instruction assumes that the
next instruction is the start address of the loop.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
\bitbox{12}{ uimmL[11:0] }
\bitbox{5}{ rs1 }
\bitbox{3}{ funct3 }
\bitbox{4}{ 0000 }
\bitbox{1}{ L }
\bitbox{7}{ opcode } \\
\bitbox[]{12}{ uimmL[11:0] }
\bitbox[]{5}{ src1 }
\bitbox[]{3}{ 100 }
\bitbox[]{4}{ 0000 }
\bitbox[]{1}{ L }
\bitbox[]{7}{ 111 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{lpstart[L] = pc + 4; lpend[L] = pc + Zext(uimmL[11:0]); lpcount[L] = rs1}
\subsection{lp.setupi L, uimmS, uimmL}
Sets up a hardware loop in one instruction. This instruction assumes that the
next instruction is the start address of the loop. The number of iterations is
given as an immediate.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,20,19,15,14,12,11,7,6,0} \\
\bitbox{12}{ uimmL[11:0] }
\bitbox{5}{ uimmS }
\bitbox{3}{ funct3 }
\bitbox{4}{ 0000 }
\bitbox{1}{ L }
\bitbox{7}{ opcode } \\
\bitbox[]{12}{ uimmL[11:0] }
\bitbox[]{5}{ uimmS }
\bitbox[]{3}{ 101 }
\bitbox[]{4}{ 0000 }
\bitbox[]{1}{ L }
\bitbox[]{7}{ 111 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{lpstart[L] = pc + 4; lpend[L] = pc + Zext(uimmL[11:0]); lpcount[L] = Zext(uimmS)}
\section{CSR Mapping}
\begin{table}[H]
\caption{Control and Status Register Map}
\label{tab:csr_map}
\centering\begin{tabularx}{\linewidth}{@{}|cc|c|l|l|X|@{}} \toprule
\multicolumn{2}{|c|}{\textbf{CSR Address}} & \textbf{Hex} & \textbf{Name} & \textbf{Access} & \textbf{Description} \\ \hline
\textbf{[11:6]} & \textbf{[5:0]} & & & & \\ \toprule
011110 & 110000 & 0x7B0 & lpstart[0] & R/W & Hardware Loop 0 Start \\ \hline
011110 & 110001 & 0x7B1 & lpend[0] & R/W & Hardware Loop 0 End \\ \hline
011110 & 110010 & 0x7B2 & lpcount[0] & R/W & Hardware Loop 0 Counter \\ \hline
011110 & 110100 & 0x7B4 & lpstart[1] & R/W & Hardware Loop 1 Start \\ \hline
011110 & 110101 & 0x7B5 & lpend[1] & R/W & Hardware Loop 1 End \\ \hline
011110 & 110110 & 0x7B6 & lpcount[1] & R/W & Hardware Loop 1 Counter \\ \bottomrule
\end{tabularx}
\end{table}

View file

@ -1,7 +1,89 @@
\chapter{Multiply-Accumulate}
\label{chap:mac}
\rvcore uses a single-cycle 32 bit lower result multiplier. Only a subset of the
standard M extension is implemented, i.e. the \instr{mul} instruction.
Divisions and multiplications that return the upper half of the result are not
supported.
\rvcore uses a single-cycle 32-bit $\times$ 32-bit multiplier with a 32-bit
result. Only a subset of the standard M extension is implemented, i.e. the
\instr{mul} instruction. Divisions and multiplications that return the upper
32-bit of the result are not supported.
Specifically the following instruction is supported:
\begin{itemize}
\item \instr{mul}
\end{itemize}
The following instructions are \textbf{not} supported:
\begin{itemize}
\item \instr{mulh}
\item \instr{mulhs}
\item \instr{mulhu}
\item \instr{div}
\item \instr{divu}
\item \instr{rem}
\item \instr{remu}
\end{itemize}
Instead \rvcore supports non-standard extensions for multiply-accumulate and
half-word multiplications.
\section{Instructions}
\subsection{p.mac rD, rs1, rs2}
Multiply-Accumulate on 32-bit $\times$ 32-bit with a 32-bit result.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,30,29,25,24,20,19,15,14,12,11,7,6,0} \\
\bitbox{2}{ 00 }
\bitbox{5}{ rs3 }
\bitbox{5}{ rs2 }
\bitbox{5}{ rs1 }
\bitbox{3}{ funct3 }
\bitbox{5}{ rd }
\bitbox{7}{ opcode } \\
\bitbox[]{2}{ 00 }
\bitbox[]{5}{ src3 }
\bitbox[]{5}{ src2 }
\bitbox[]{5}{ src1 }
\bitbox[]{3}{ 000 }
\bitbox[]{5}{ dest }
\bitbox[]{7}{ 101 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{rD = rs1 * rs2 + rs3}
\subsection{p.mac\{.zl,.sl,.zh,.sh\}\{.zl,.sl,.zh,.sh\} rD, rs1, rs2}
Multiply-Accumulate on 16-bit $\times$ 16-bit with a 32-bit result. The
half-word and sign-mode that is used for the multiplication can be selected.
\begin{center}
\begin{bytefield}[endianness=big,bitwidth=1.3em]{32}
\bitheader{31,30,29,25,24,20,19,15,14,12,11,7,6,0} \\
\bitbox{1}{ S1 }
\bitbox{1}{ S2 }
\bitbox{5}{ rs3 }
\bitbox{5}{ rs2 }
\bitbox{5}{ rs1 }
\bitbox{1}{ 1 }
\bitbox{1}{ H1 }
\bitbox{1}{ H2 }
\bitbox{5}{ rd }
\bitbox{7}{ opcode } \\
\bitbox[]{1}{ S1 }
\bitbox[]{1}{ S2 }
\bitbox[]{5}{ unused }
\bitbox[]{5}{ src2 }
\bitbox[]{5}{ src1 }
\bitbox[]{1}{ 0 }
\bitbox[]{1}{ H1 }
\bitbox[]{1}{ H2 }
\bitbox[]{5}{ dest }
\bitbox[]{7}{ 101 1011 }
\end{bytefield}
\end{center}
\textbf{Operation:} \texttt{rD = rs1[H1*16+15:H1*16] * rs2[H2*16+15:H2*16] + rs3}
S1 and S2 determine the zero/sign-extension of rs1 and rs2. A value of
\texttt{1} means sign-extension.

View file

@ -27,10 +27,10 @@ Figure~\ref{fig:ri5cy_overview} shows a block diagram of the core.
Only the \instr{mul} instruction is supported.
\item PULP specific extensions \\
\begin{itemize}
\item Hardware Loops, see Chapter~\ref{chap:hwloop}
\item ALU extensions, see Chapter~\ref{chap:aluext}
\item Multiply-Accumulate extensions, see Chapter~\ref{chap:mac}
\item Post-Incrementing load and stores, see Chapter~\ref{chap:lsu}
\item Multiply-Accumulate extensions, see Chapter~\ref{chap:mac}
\item ALU extensions, see Chapter~\ref{chap:aluext}
\item Hardware Loops, see Chapter~\ref{chap:hwloop}
\end{itemize}
\end{itemize}

View file

@ -107,3 +107,11 @@
\caption{#3}
\end{figure}}
\newcommand\instrDesc[3]{%
\subsection{#1}
\begin{center}
#3
\end{center}
\textbf{Operation:} \texttt{#2}%
}