mirror of
https://github.com/lowRISC/ibex.git
synced 2025-04-22 04:47:25 -04:00
Working on the documentation
This commit is contained in:
parent
2b8fdcd6d3
commit
e9197db83c
14 changed files with 640 additions and 432 deletions
2
docs/datasheet/content/aluext.tex
Normal file
2
docs/datasheet/content/aluext.tex
Normal file
|
@ -0,0 +1,2 @@
|
|||
\chapter{PULP ALU Extensions}
|
||||
\label{chap:aluext}
|
|
@ -1,7 +1,108 @@
|
|||
\chapter{Control and Status Registers}
|
||||
\label{chap:csr}
|
||||
|
||||
\rvcore does not implement all control and status registers specified in the
|
||||
\rvcore does not implement all control and status registers specified in the
|
||||
\riscv privileged specifications, but is limited to the registers that were
|
||||
needed for the PULP system.
|
||||
The reason for this is that we wanted to keep the footprint of the core as low
|
||||
as possible and avoid any overhead that we do not explicitly need.
|
||||
|
||||
\begin{landscape}
|
||||
\begin{table}[H]
|
||||
\caption{Control and Status Register Map}
|
||||
\label{tab:csr_map}
|
||||
\centering\begin{tabularx}{\linewidth}{@{}|cccc|c|l|l|X|@{}} \toprule
|
||||
\multicolumn{4}{|c|}{\textbf{CSR Address}} & \textbf{Hex} & \textbf{Name} & \textbf{Access} & \textbf{Description} \\ \hline
|
||||
\textbf{[11:10]} & \textbf{[9:8]} & \textbf{[7:6]} & \textbf{[5:0]} & & & & \\ \toprule
|
||||
00 & 11 & 00 & 000000 & 0x300 & MSTATUS & R/W & Machine Status Register \\ \hline
|
||||
00 & 11 & 01 & 000000 & 0x340 & MSCRATCH & R/W & Scratch Register for machine trap handlers \\ \hline
|
||||
00 & 11 & 01 & 000001 & 0x341 & MEPC & R/W & Machine exception program counter \\ \hline
|
||||
00 & 11 & 01 & 000010 & 0x342 & MCAUSE & R/W & Machine trap cause \\ \hline
|
||||
01 & 11 & 10 & 0XXXXX & 0x780 - 0x79F & PCCRs & R/W & Performance Counter Counter Registers \\ \hline
|
||||
01 & 11 & 10 & 100000 & 0x7A0 & PCER & R/W & Performance Counter Enable Register \\ \hline
|
||||
01 & 11 & 10 & 100001 & 0x7A1 & PCMR & R/W & Performance Counter Mode Register \\ \hline
|
||||
01 & 11 & 10 & 110XXX & 0x7B0 - 0x7B6 & HWLP & R/W & Hardware Loop Registers \\ \hline
|
||||
11 & 11 & 00 & 000000 & 0xF00 & MCPUID & R & CPU description \\ \hline
|
||||
11 & 11 & 00 & 000001 & 0xF01 & MIMPID & R & Vendor ID and version number \\ \hline
|
||||
11 & 11 & 00 & 010000 & 0xF10 & MHARTID & R & Hardware Thread ID \\ \bottomrule
|
||||
\end{tabularx}
|
||||
\end{table}
|
||||
\end{landscape}
|
||||
|
||||
\section{Register Description}
|
||||
|
||||
\subsection{MSTATUS}
|
||||
\csrDesc{0x300}{0x0000\_0006}{MSTATUS}{
|
||||
\begin{bytefield}[endianness=big,bitheight=60pt]{32}
|
||||
\bitheader{31,2,1,0} \\
|
||||
\bitbox{29}{ Unused }
|
||||
\bitbox{2}{\rotatebox{90}{\tiny PRV[1:0] }}
|
||||
\bitbox{1}{\rotatebox{90}{\tiny Interrupt Enable }}
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
Note that \signal{PRV[1:0]} is statically \signal{2'b11} and cannot be altered (read-only).
|
||||
|
||||
|
||||
\subsection{MSCRATCH}
|
||||
\csrDesc{0x340}{0x0000\_0000}{MSCRATCH}{
|
||||
\begin{bytefield}[endianness=big]{32}
|
||||
\bitheader{31,0} \\
|
||||
\bitbox{32}{ mscratch }
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
\subsection{MEPC}
|
||||
\csrDesc{0x341}{0x0000\_0000}{MEPC}{
|
||||
\begin{bytefield}[endianness=big]{32}
|
||||
\bitheader{31,0} \\
|
||||
\bitbox{32}{ mepc }
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
When an exception is encountered, the current program counter is saved in
|
||||
\signal{mepc} and the core jumps to the exception address.
|
||||
When an \instr{eret} instruction is executed, the value from \signal{mepc}
|
||||
replaces the current program counter.
|
||||
|
||||
\subsection{MCAUSE}
|
||||
\csrDesc{0x342}{0x0000\_0000}{MCAUSE}{
|
||||
\begin{bytefield}[endianness=big,bitheight=60pt]{32}
|
||||
\bitheader{31,30,4,0} \\
|
||||
\bitbox{1}{\rotatebox{90}{\tiny Interrupt }}
|
||||
\bitbox{27}{ Unused }
|
||||
\bitbox{5}{\rotatebox{90}{\tiny Exception Code }}
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
\subsection{MCPUID}
|
||||
\csrDesc{0xF00}{0x0000\_0100}{MCPUID}{
|
||||
\begin{bytefield}[endianness=big,bitheight=60pt]{32}
|
||||
\bitheader{31,29,26,25,0} \\
|
||||
\bitbox{2}{\rotatebox{90}{\tiny Base }}
|
||||
\bitbox{4}{ 0 }
|
||||
\bitbox{26}{ Extensions }
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
\subsection{MIMPID}
|
||||
\csrDesc{0xF01}{0x0000\_8000}{MIMPID}{
|
||||
\begin{bytefield}[endianness=big]{32}
|
||||
\bitheader{31,16,15,0} \\
|
||||
\bitbox{16}{ Implementation }
|
||||
\bitbox{16}{ Source }
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
\subsection{MHARTID}
|
||||
\csrDesc{0xF10}{Defined}{MHARTID}{
|
||||
\begin{bytefield}[endianness=big,bitheight=60pt]{32}
|
||||
\bitheader{31,9,5,4,0} \\
|
||||
\bitbox{22}{ Unused }
|
||||
\bitbox{5}{\rotatebox{90}{\tiny Cluster ID }}
|
||||
\bitbox{5}{\rotatebox{90}{\tiny Core ID }}
|
||||
\end{bytefield}
|
||||
}
|
||||
|
||||
Both \signal{core id} and \signal{cluster id} are set on the top-level module
|
||||
of the core and are read-only.
|
||||
|
|
2
docs/datasheet/content/debug.tex
Normal file
2
docs/datasheet/content/debug.tex
Normal file
|
@ -0,0 +1,2 @@
|
|||
\chapter{Debug}
|
||||
\label{chap:debug}
|
4
docs/datasheet/content/exceptions.tex
Normal file
4
docs/datasheet/content/exceptions.tex
Normal file
|
@ -0,0 +1,4 @@
|
|||
\chapter{Exceptions and Interrupts}
|
||||
\label{chap:exceptions}
|
||||
|
||||
\rvcore supports
|
3
docs/datasheet/content/hwloop.tex
Normal file
3
docs/datasheet/content/hwloop.tex
Normal file
|
@ -0,0 +1,3 @@
|
|||
\chapter{PULP Hardware Loop Extensions}
|
||||
\label{chap:hwloop}
|
||||
|
|
@ -1,21 +1,25 @@
|
|||
\chapter{Instruction Fetch}
|
||||
\label{chap:if}
|
||||
|
||||
The instruction fetcher of the core is able to supply one instruction to the ID
|
||||
stage per cycle if the instruction cache or the instruction memory is able to
|
||||
deliver an instruction after one cycle.
|
||||
The instruction address must be word-aligned. It is not possible to jump to
|
||||
misaligned memory addresses.
|
||||
serve one instruction after one cycle.
|
||||
The instruction address must be half-word-aligned. It is not possible to jump to
|
||||
instruction addresses that have the LSB set.
|
||||
|
||||
Branch prediction is used for branches where the branch decision is not yet
|
||||
known, i.e. if the \instr{l.sf*} instruction precedes the \instr{l.bf} or
|
||||
\instr{l.bnf} instruction directly.
|
||||
Branch prediction assumes that backward branches are never taken and forward
|
||||
branches are always taken. If the branch prediction guessed wrong, one fetched
|
||||
instruction is wasted.
|
||||
|
||||
Table~\ref{tab:instr_signals} describes the signals that are used by to fetch
|
||||
instructions.
|
||||
For optimal performance and timing closure reasons, a prefetcher is used to
|
||||
fetch instructions. There are two prefetch flavors available:
|
||||
\begin{itemize}
|
||||
\item 32-Bit Word prefetcher. It stores the fetched words in a FIFO with three
|
||||
entries.
|
||||
\item 128-Bit Cache line prefetcher. It stores one 128-bit wide cache line
|
||||
plus 32-bit to allow for cross-cache line misaligned instructions.
|
||||
\end{itemize}
|
||||
|
||||
Table~\ref{tab:instr_signals} describes the signals that are used to fetch
|
||||
instructions. This interface is a simplified version that is used by the
|
||||
LSU that is described in Chapter~\ref{chap:lsu}. The difference is that no
|
||||
writes are possible and thus it needs fewer signals.
|
||||
|
||||
\begin{table}[H]
|
||||
\caption{Instruction Fetch Signals}
|
||||
|
@ -34,4 +38,4 @@ instructions.
|
|||
\section{Protocol}
|
||||
The protocol used to communicate with the instruction cache or the instruction
|
||||
memory is the same as the protocol used by the LSU. See the description of the
|
||||
LSU in Section~\ref{sec:lsu_protocol} for details about the protocol.
|
||||
LSU in Section~\ref{sec:lsu_protocol} for details about the protocol.
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
\chapter{Load-Store-Unit (LSU)}
|
||||
\label{chap:lsu}
|
||||
|
||||
The LSU of the core takes care of accessing the data memory. Loads and stores on
|
||||
words (32 bit), half words (16 bit) and bytes (8 bit) are supported.
|
||||
|
@ -12,13 +13,13 @@ Table~\ref{tab:lsu_signals} describes the signals that are used by the LSU.
|
|||
\begin{tabularx}{\textwidth}{@{}llX@{}} \toprule
|
||||
\textbf{Signal} & \textbf{Direction} & \textbf{Description} \\ \toprule
|
||||
\signal{data\_req\_o} & \textbf{output} & Request ready, must stay high until \signal{data\_gnt\_i} is high for one cycle \\ \hline
|
||||
\signal{data\_addr\_o[31:0]} & \textbf{output} & Address \\ \hline
|
||||
\signal{data\_we\_o} & \textbf{output} & Write Enable, high if we want to write, low if we want to read \\ \hline
|
||||
\signal{data\_be\_o[3:0]} & \textbf{output} & Byte Enable, is set for the bytes to write/read \\ \hline
|
||||
\signal{data\_wdata\_o[31:0]} & \textbf{output} & Data to be written to memory \\ \hline
|
||||
\signal{data\_rdata\_i[31:0]} & \textbf{input} & Data read from memory \\ \hline
|
||||
\signal{data\_rvalid\_i} & \textbf{input} & \signal{data\_rdata\_i} is valid. This signal must always be identical to \signal{data\_gnt\_i} delayed by one cycle. \\ \hline
|
||||
\signal{data\_gnt\_i} & \textbf{input} & The memory accepted the request and will answer in the next cycle with valid rdata \\ \bottomrule
|
||||
\signal{data\_addr\_o[31:0]} & \textbf{output} & Address, sent together with \signal{data\_req\_o} \\ \hline
|
||||
\signal{data\_we\_o} & \textbf{output} & Write Enable, high for writes, low for reads, sent together with \signal{data\_req\_o} \\ \hline
|
||||
\signal{data\_be\_o[3:0]} & \textbf{output} & Byte Enable, is set for the bytes to write/read, sent together with \signal{data\_req\_o} \\ \hline
|
||||
\signal{data\_wdata\_o[31:0]} & \textbf{output} & Data to be written to memory, sent together with \signal{data\_req\_o} \\ \hline
|
||||
\signal{data\_rdata\_i[31:0]} & \textbf{input} & Data read from memory, valid when \signal{data\_rvalid\_i} is set \\ \hline
|
||||
\signal{data\_rvalid\_i} & \textbf{input} & \signal{data\_rdata\_i} is valid. \\ \hline
|
||||
\signal{data\_gnt\_i} & \textbf{input} & The memory accepted the request, another request can be sent in the next cycle \\ \bottomrule
|
||||
\end{tabularx}
|
||||
\end{table}
|
||||
|
||||
|
@ -29,14 +30,18 @@ word-aligned accesses internally.
|
|||
This means that at least two cycles are needed for misaligned loads and stores.
|
||||
|
||||
|
||||
\section{Post-Increment Load and Stores}
|
||||
\section{Post-Incrementing Load and Store Instructions}
|
||||
|
||||
Post-incrementing load and store instructions perform a load/store operation
|
||||
from/to the data memory while at the same time increasing the base address by
|
||||
the specified offset.
|
||||
the specified offset. For the memory access the base address without offset is
|
||||
used.
|
||||
|
||||
Post-incrementing load and stores reduce the number of instructions necessary to
|
||||
execute when running in a loop, i.e. the address increment can be embedded in
|
||||
the post-increment instructions.
|
||||
the post-increment instructions. Coupled with the hardware loop extension a
|
||||
significant reduction in the number of instructions necessary to execute small
|
||||
loops can be achieved.
|
||||
|
||||
|
||||
\section{Protocol}
|
||||
|
@ -44,22 +49,23 @@ the post-increment instructions.
|
|||
|
||||
The protocol that is used by the LSU to communicate with a memory works as
|
||||
follows:
|
||||
|
||||
The LSU provides a valid address in \signal{data\_addr\_o} and sets
|
||||
\signal{data\_req\_o} high. The memory then answers with a \signal{data\_gnt\_i}
|
||||
set high as soon as it is ready to serve the request. This may happen in the
|
||||
same cycle as the request was sent or any number of cycles later. After a grant
|
||||
was received, the address may be changed by the LSU without impact. Also the
|
||||
was received, the address may be changed in the next cycle by the LSU. Also the
|
||||
\signal{data\_wdata\_o}, \signal{data\_we\_o} and \signal{data\_be\_o} signals
|
||||
may be changed as it is assumed that the memory has already processed that
|
||||
information. In the case of a read, the memory answers with a
|
||||
may be changed as it is assumed that the memory has already processed and stored that
|
||||
information. After the grant, the memory answers with a
|
||||
\signal{data\_rvalid\_i} set high when \signal{data\_rdata\_i} is valid. This
|
||||
may happen one cycle after the grant was received, but may take any number of
|
||||
cycles after the grant was received.
|
||||
Starting from the cycle when \signal{data\_rvalid\_i} was asserted, another
|
||||
request may be sent.
|
||||
Note that \signal{data\_rvalid\_i} must also be set when a write was performed,
|
||||
although the \signal{data\_rdata\_i} has no meaning in this case.
|
||||
|
||||
Figure~\ref{fig:lsu_trans_basic}, Figure~\ref{fig:lsu_trans_b2b} and
|
||||
Figure~\ref{fig:lsu_trans_slow} show timing diagrams of the protocol.
|
||||
Figure~\ref{fig:lsu_trans_slow} show example timing diagrams of the protocol.
|
||||
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
\chapter{Multiplier}
|
||||
\chapter{Multiply-Accumulate}
|
||||
\label{chap:mac}
|
||||
|
||||
\rvcore uses a single-cycle 32 bit lower result multiplier. Only a subset of the
|
||||
standard M extension is implemented, i.e. the \instr{mul} instruction.
|
||||
|
|
|
@ -2,8 +2,8 @@
|
|||
|
||||
\rvcore is a 4-stage in-order \riscv CPU. The ISA of \rvcore was extended to
|
||||
also support multiple additional instructions including hardware loops,
|
||||
post-increment load and store instructions and packed-SIMD instructions that
|
||||
were not part of the standard \riscv ISA.
|
||||
post-increment load and store instructions and additional ALU instructions that
|
||||
are not part of the standard \riscv ISA.
|
||||
|
||||
Figure~\ref{fig:ri5cy_overview} shows a block diagram of the core.
|
||||
|
||||
|
@ -13,3 +13,35 @@ Figure~\ref{fig:ri5cy_overview} shows a block diagram of the core.
|
|||
\caption{\rvcore Overview.}
|
||||
\label{fig:ri5cy_overview}
|
||||
\end{figure}
|
||||
|
||||
|
||||
\section{Supported Instruction Set}
|
||||
|
||||
\rvcore supports the following instructions:
|
||||
|
||||
\begin{itemize}
|
||||
\item Full support for RV32I Base Integer Instruction Set
|
||||
\item Full support for RV32C Standard Extension for Compressed Instructions
|
||||
\item Partial support for RV32M Standard Extension for Integer Multiplication
|
||||
and Division \\
|
||||
Only the \instr{mul} instruction is supported.
|
||||
\item PULP specific extensions
|
||||
\begin{itemize}
|
||||
\item Hardware Loops, see Chapter~\ref{chap:hwloop}
|
||||
\item ALU extensions, see Chapter~\ref{chap:aluext}
|
||||
\item Multiply-Accumulate extensions, see Chapter~\ref{chap:mac}
|
||||
\item Post-Incrementing load and stores, see Chapter~\ref{chap:lsu}
|
||||
\end{itemize}
|
||||
\end{itemize}
|
||||
|
||||
\section{ASIC Synthesis}
|
||||
ASIC synthesis is supported for \rvcore. The whole design is completely
|
||||
synchronous and uses positive-edge triggered flip-flops, except for the register
|
||||
file, where there is an option to use latches instead of flip-flops. See
|
||||
Chapter~\ref{chap:rf} for more details about the register file. The core
|
||||
occupies an area of about 35~kGE when the latch based register file is used.
|
||||
|
||||
\section{FPGA Synthesis}
|
||||
FPGA synthesis is supported for \rvcore when the flip-flop based register file
|
||||
is used. Since latches are not well supported on FPGAs, it is crucial to select
|
||||
the flip-flop based register file.
|
||||
|
|
|
@ -19,9 +19,9 @@ access to the performance counters.
|
|||
\end{table}
|
||||
|
||||
|
||||
\section{Performance Counters Mode Register (PCMR)}
|
||||
\section{Performance Counter Mode Register (PCMR)}
|
||||
|
||||
\sprDesc{0x3821}{0x0000\_0003}{PCMR}{
|
||||
\csrDesc{0x7A1}{0x0000\_0003}{PCMR}{
|
||||
\begin{bytefield}[endianness=big,bitheight=60pt]{32}
|
||||
\bitheader{31,1,0} \\
|
||||
\bitbox{30}{ Unused }
|
||||
|
@ -38,9 +38,9 @@ The \instr{Saturation} bit controls saturation behaviour of the performance
|
|||
counters. If it is set, saturating arithmetic is used.
|
||||
After reset, the \instr{Saturation} bit is set.
|
||||
|
||||
\section{Performance Counters Event Register (PCER)}
|
||||
\section{Performance Counter Event Register (PCER)}
|
||||
|
||||
\sprDesc{0x3820}{0x0000\_0000}{PCER}{
|
||||
\csrDesc{0x7A0}{0x0000\_0000}{PCER}{
|
||||
\begin{bytefield}[endianness=big,bitheight=60pt]{32}
|
||||
\bitheader{31,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0} \\
|
||||
\bitbox{1}{\rotatebox{90}{\tiny (ALL) }}
|
||||
|
@ -78,9 +78,9 @@ In the FPGA or Simulation version each event has its own counter and can be
|
|||
accessed separately.
|
||||
|
||||
|
||||
\section{Performance Counters Counter Registers (PCCR0-31)}
|
||||
\section{Performance Counter Counter Registers (PCCR0-31)}
|
||||
|
||||
\sprDesc{0x3800 - 0x381F}{0x0000\_0000}{PCCR0-31}{
|
||||
\csrDesc{0x780 - 0x79F}{0x0000\_0000}{PCCR0-31}{
|
||||
\begin{bytefield}[endianness=big]{32}
|
||||
\bitheader{31,0} \\
|
||||
\bitbox{32}{Unsigned integer counter value}
|
||||
|
|
34
docs/datasheet/content/rf.tex
Normal file
34
docs/datasheet/content/rf.tex
Normal file
|
@ -0,0 +1,34 @@
|
|||
\chapter{Register File}
|
||||
\label{chap:rf}
|
||||
|
||||
\rvcore has 31 $\times$ 32-bit wide registers which form registers \signal{x1} to
|
||||
\signal{x31}. Register \signal{x0} is statically bound to \signal{0} and can only be
|
||||
read and not written, it does not contain any sequential logic.
|
||||
|
||||
There are two flavors of register file available:
|
||||
|
||||
\begin{enumerate}
|
||||
\item Latch-based
|
||||
\item Flip-flop based
|
||||
\end{enumerate}
|
||||
|
||||
While the latch-based register file is recommended for ASICs, the flip-flop
|
||||
based register file is recommended for FPGA synthesis, although both are
|
||||
compatible with either synthesis target.
|
||||
Note that the flip-flop based register file is significantly larger than the
|
||||
latch-based register-file for an ASIC implementation.
|
||||
|
||||
\section{Latch-based Register File}
|
||||
The latch based register file contains manually instantiated clock gating cells
|
||||
to keep the clock inactive when the latches are not written.
|
||||
|
||||
It is assumed that there is a clock gating cell for the target technology that
|
||||
is wrapped in a module called \signal{cluster\_clock\_gating} and has the following
|
||||
ports:
|
||||
\begin{itemize}
|
||||
\item \signal{clk\_i}: Clock Input
|
||||
\item \signal{en\_i}: Clock Enable Input
|
||||
\item \signal{test\_en\_i}: Test Enable Input (activates the clock even though
|
||||
\signal{en\_i} is not set)
|
||||
\item \signal{clk\_o}: Gated Clock Output
|
||||
\end{itemize}
|
|
@ -17,7 +17,7 @@
|
|||
|
||||
|
||||
%%%%% Mandatory title page settings.
|
||||
\title{RI5CY: Datasheet}
|
||||
\title{RI5CY Core: Datasheet}
|
||||
|
||||
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
|
||||
%%%%% %
|
||||
|
@ -34,8 +34,13 @@
|
|||
\input{./content/if.tex}
|
||||
\input{./content/lsu.tex}
|
||||
\input{./content/mac.tex}
|
||||
\input{./content/aluext.tex}
|
||||
\input{./content/hwloop.tex}
|
||||
\input{./content/rf.tex}
|
||||
\input{./content/csr.tex}
|
||||
\input{./content/perfcounters.tex}
|
||||
\input{./content/exceptions.tex}
|
||||
\input{./content/debug.tex}
|
||||
|
||||
|
||||
\end{document}
|
||||
|
|
File diff suppressed because it is too large
Load diff
|
@ -55,6 +55,8 @@
|
|||
|
||||
\usepackage{enumitem}
|
||||
|
||||
\usepackage{pdflscape}
|
||||
|
||||
|
||||
\usepackage{tikz-timing}[2009/05/15]
|
||||
|
||||
|
@ -96,8 +98,8 @@
|
|||
|
||||
\newcommand\signal[1]{{\ttfamily\bfseries #1}}
|
||||
|
||||
\newcommand\sprDesc[4]{%
|
||||
\textbf{SPR Address:} \texttt{#1}\\%
|
||||
\newcommand\csrDesc[4]{%
|
||||
\textbf{CSR Address:} \texttt{#1}\\%
|
||||
\textbf{Reset Value:} \texttt{#2}\\%
|
||||
\begin{figure}[H]
|
||||
\centering
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue