Restructure

This commit is contained in:
felsabbagh3 2019-03-22 04:14:52 -04:00
parent 097e0217de
commit 6c64fa35f8
239 changed files with 3839 additions and 819 deletions

BIN
.DS_Store vendored

Binary file not shown.

View file

@ -1,21 +0,0 @@
FIGS = fig/simd_core.pdf fig/byte_enc.pdf fig/word_enc.pdf
DIRS = fig
NAME = harp_iset
all: $(NAME).pdf
$(NAME).pdf: $(NAME).tex $(FIGS)
%.pdf: %.tex
pdflatex $<
fig/%.pdf: fig/%.fig
cd fig; $(MAKE) $(MFLAGS) $(@F); cd ..
clean:
rm -f $(NAME).pdf $(NAME).aux $(NAME).log $(NAME).toc $(NAME).snm \
$(NAME).out $(NAME).nav fig/*.pdf
force_look:
true

View file

@ -1,2 +0,0 @@
%.pdf: %.fig
fig2mpdf $<

View file

@ -1,35 +0,0 @@
#FIG 3.2 Produced by xfig version 3.2.5b
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
6 90 225 1260 540
4 0 0 50 -1 16 8 0.0000 4 120 1095 90 345 Pred. reg number or\001
4 0 0 50 -1 16 8 0.0000 4 120 1155 90 510 0xff for no predicate\001
-6
6 1710 225 2925 540
4 0 0 50 -1 16 8 0.0000 4 120 1200 1710 360 Opcode according to \001
4 0 0 50 -1 16 8 0.0000 4 105 1080 1710 525 table in Section 3.2\001
-6
6 3330 225 4635 720
4 0 0 50 -1 16 8 0.0000 4 120 1275 3330 360 Operands according to\001
4 0 0 50 -1 16 8 0.0000 4 120 1290 3330 525 instructioncoding type.\001
4 0 0 50 -1 16 8 0.0000 4 120 1050 3330 690 (based on opcode)\001
-6
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
0 -180 1440 -180 1440 135 0 135 0 -180
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1620 -180 3060 -180 3060 135 1620 135 1620 -180
2 2 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 5
3240 -180 4680 -180 4680 135 3240 135 3240 -180
4 1 0 50 -1 18 8 0.0000 4 105 570 720 0 Predicate\001
4 1 0 50 -1 18 8 0.0000 4 120 450 2340 0 Opcode\001
4 1 0 50 -1 18 8 0.0000 4 30 150 5085 0 . . .\001
4 1 0 50 -1 16 8 0.0000 4 120 345 720 -225 byte 0\001
4 1 0 50 -1 16 8 0.0000 4 120 345 2340 -225 byte 1\001
4 1 0 50 -1 16 8 0.0000 4 120 345 3960 -225 byte 2\001
4 1 0 50 -1 18 8 0.0000 4 120 615 3960 0 Operand 1\001

Binary file not shown.

View file

@ -1,95 +0,0 @@
#FIG 3.2 Produced by xfig version 3.2.5b
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
6 90 630 2925 990
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
90 630 2925 630 2925 990 90 990 90 630
4 1 0 50 -1 16 12 0.0000 4 165 1485 1485 885 Decode/Control\001
-6
6 90 1215 630 2250
6 90 1215 630 1575
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
90 1215 630 1215 630 1575 90 1575 90 1215
4 1 0 50 -1 16 12 0.0000 4 165 405 360 1485 ALU\001
-6
6 270 1890 630 2250
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
270 1890 630 1890 630 2250 270 2250 270 1890
4 1 0 50 -1 16 12 0.0000 4 165 270 450 2160 RF\001
-6
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
450 1575 450 1890
-6
6 900 1215 1440 2250
6 900 1215 1440 1575
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
900 1215 1440 1215 1440 1575 900 1575 900 1215
4 1 0 50 -1 16 12 0.0000 4 165 405 1170 1485 ALU\001
-6
6 1080 1890 1440 2250
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1080 1890 1440 1890 1440 2250 1080 2250 1080 1890
4 1 0 50 -1 16 12 0.0000 4 165 270 1260 2160 RF\001
-6
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
1260 1575 1260 1890
-6
6 1710 1215 2250 2250
6 1710 1215 2250 1575
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1710 1215 2250 1215 2250 1575 1710 1575 1710 1215
4 1 0 50 -1 16 12 0.0000 4 165 405 1980 1485 ALU\001
-6
6 1890 1890 2250 2250
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1890 1890 2250 1890 2250 2250 1890 2250 1890 1890
4 1 0 50 -1 16 12 0.0000 4 165 270 2070 2160 RF\001
-6
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
2070 1575 2070 1890
-6
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
360 990 360 1215
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
1170 990 1170 1215
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
1980 990 1980 1215
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
180 1575 180 2475
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
990 1575 990 2475
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 1 2
0 0 1.00 60.00 60.00
0 0 1.00 60.00 60.00
1800 1575 1800 2475
2 2 1 1 0 7 50 -1 -1 4.000 0 0 -1 0 0 5
0 0 3015 0 3015 2925 0 2925 0 0
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
90 90 2925 90 2925 450 90 450 90 90
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
90 2475 2925 2475 2925 2835 90 2835 90 2475
4 1 0 50 -1 18 18 0.0000 4 45 375 2700 1755 . . .\001
4 1 0 50 -1 16 12 0.0000 4 210 570 1485 2745 L1 D$\001
4 1 0 50 -1 16 12 0.0000 4 210 480 1485 375 L1 I$\001

View file

@ -1,51 +0,0 @@
#FIG 3.2 Produced by xfig version 3.2.5b
Landscape
Center
Metric
A4
100.00
Single
-2
1200 2
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
0 1080 270 1080 270 1395 0 1395 0 1080
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 1 0 3
0 0 1.00 60.00 120.00
2295 1665 135 1665 135 1395
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
0 1035 0 990 270 990 270 1035
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
315 1080 1755 1080 1755 1395 315 1395 315 1080
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
315 1035 315 990 1755 990 1755 1035
2 2 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 5
1800 1080 3240 1080 3240 1395 1800 1395 1800 1080
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
1800 1035 1800 990 3240 990 3240 1035
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
0 540 0 495 7425 495 7425 540
2 2 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 5
3285 1080 4725 1080 4725 1395 3285 1395 3285 1080
2 2 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 5
4770 1080 6210 1080 6210 1395 4770 1395 4770 1080
2 1 2 1 0 7 50 -1 -1 3.000 0 0 -1 0 0 4
7425 1080 6255 1080 6255 1395 7425 1395
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
3285 1035 3285 990 4725 990 4725 1035
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 4
4770 1035 4770 990 6210 990 6210 1035
2 1 0 1 0 7 50 -1 -1 0.000 0 0 -1 0 0 3
7425 990 6255 990 6255 1035
4 0 0 50 -1 18 8 0.0000 4 135 2070 2385 1710 Predicated (1 if predicated, 0 if not)\001
4 1 0 50 -1 18 8 0.0000 4 120 585 990 1305 Pred Reg.\001
4 1 0 50 -1 16 8 0.0000 4 90 75 135 945 1\001
4 1 0 50 -1 16 8 0.0000 4 120 735 990 945 log2(#pregs)\001
4 1 0 50 -1 16 8 0.0000 4 90 75 2520 945 6\001
4 1 0 50 -1 18 8 0.0000 4 120 450 2520 1305 Opcode\001
4 1 0 50 -1 18 8 0.0000 4 120 810 4005 1305 Reg. Operand\001
4 1 0 50 -1 16 8 0.0000 4 120 735 3960 945 log2(#pregs)\001
4 1 0 50 -1 16 8 0.0000 4 120 735 5445 945 log2(#pregs)\001
4 1 0 50 -1 18 8 0.0000 4 120 1155 5400 1305 Pred. Reg. Operand\001
4 1 0 50 -1 18 8 0.0000 4 120 810 6840 1305 Imm. Operand\001
4 1 0 50 -1 16 8 0.0000 4 120 825 6840 945 Remaining bits\001
4 1 0 50 -1 18 8 0.0000 4 135 1110 3600 405 One (aligned) word\001

View file

@ -1,442 +0,0 @@
\documentclass[10pt,letterpaper]{article}
\usepackage[margin=1in]{geometry}
\usepackage{graphicx}
\title{HARP Instruction Set Manual}
\author{Chad D. Kersey}
\begin{document}
\maketitle
\section*{Disclaimer}
This document, like the work it documents, is very much a work in progress.
Send any corrections, updates, suggestions, or complaints (preferably in patch form, this file is under \texttt{harptool/doc} in the HarpTool repo) to \texttt{cdkersey@gatech.edu}.
\section*{Introduction}
HARP is two things, a multi-year Heterogeneous Architecture Research Project, and an implementation of a specific Heterogeneous Architecture Research Prototype.
It is for the latter that the HARP instruction set architectures have been created.
This is a space of SIMT(GPU) oriented RISC-like instruction sets with the following properties:
\begin{itemize}
\item{Full predication}
\item{Assembly language level compatibility}
\item{SIM[DT] parallelism}
\item{Little endianness}
\item{8-bit byte size}
\item{Customizability}
\end{itemize}
The customizability of the HARP ISAs is illustrated by facts missing from this list of features.
The data path width, instruction encodings, number of registers (general purpose and predicate) are all left up to the implementation.
Harptool; the HARP assembler, linker, emulator, and disassembler, is passed information about the ISA through an architecture identifier string, or \texttt{ArchID}.
An \texttt{ArchID} uniquely identifies a HARP ISA.
\section{Architecture Identifier String (\texttt{ArchID})}
The best way to understand the multifaceted parameterizablity of the HARP ISAs is to study the architecture identifier strings used to uniquely identify a single HARP instruction set architecture.
We'll start by breaking down Harptool's default \texttt{ArchID}: \texttt{8w32/32/8/8}:
\begin{center}
\begin{tabular}{rl}
\textbf{Field}&\textbf{Meaning}\\
\hline
\texttt{8} &8-byte (64-bit) registers and addresses\\
\texttt{w} &Word-based (64-bit) fixed-width instruction encoding\\
\texttt{32}&32 general-purpose registers per lane\\
\texttt{32}&32 predicate registers per lane\\
\texttt{8} &8 SIMD lanes\\
\texttt{8} &8 warps (thread groups)\\
\end{tabular}
\end{center}
All ArchIDs have a similar format, although the final two fields can be omitted, as object files are still fully compatible even if the dimensions of the core change.
\section{HarpTool}
The assembler/linker/emulator/disassembler program for HARP is called HarpTool.
It is a multiple-function executable, its function selected with the first command line argument.
When run with no command line arugments the HarpTool executable prints a help message explaining the available command line arguments.
All of the HARP utilities can take an archID as a command line parameter.
If none is provided, a default will be assumed.
\subsection{Assembler}
The assembler converts assembly files to HOF, the Harp Object Format.
\subsection{Linker}
The linker combines HOF files and produces raw RAM images for use by the emulator.
An intended future use is the conversion of multiple HOF files to statically-linked HOF executables.
%\subsection{Emulator}
%The emulator emulates a HARP system.
%It can be provided with a
\subsection{Disassembler}
The disassembler is used to convert HOF files to equivalent assembly files.
One of its intended uses is the conversion of HOF object files between different HARP ISAs, say from 8w32/32 to 8b32/32.
\subsection{Test Programs}
In the \texttt{harptool/test} directory there is a set of test programs.
The makefile in this directory assembles, links, and emulates them, placing the output in plain text files.
\subsubsection{\texttt{hello.s}}
The simplest example prints a message and exits.
\subsubsection{\texttt{2thread.s}}
\texttt{2thread} performs a vector addition across two threads.
\subsubsection{\texttt{sieve.s}}
\texttt{sieve} performs the Sieve of Eratosthenes in a single thread and prints the results, including the count of total prime numbers found.
\section{Instruction Encoding}
There are two currently-supported types of instruction encoding, but they all share a similar basic structure.
The opcodes and types of fields required by each instruction are identical, differentiated only by the number of bits available for each type of field and the way predication is specified.
\subsection{Argument Classes}
Instructions can be broadly categorized by the types of arguments they require.
The bit fields in the instruction encodings depend heavily on this quality.
\begin{center}
\begin{tabular}{c|c|l}
\textbf{Argument Class}&\textbf{Description}&\textbf{Example}\\
\hline
\texttt{AC\_NONE} &No arguments &\texttt{di;} \\
\texttt{AC\_2REG} &2 GPRs, 1 in, 1 out &\texttt{neg \%r1, \%r2;} \\
\texttt{AC\_2IMM} &1 immediate in, 1 GPR out&\texttt{ldi \%r1, \#0xff;}\\
\texttt{AC\_3REG} &3 GPRs, 2 in, 1 out &\texttt{add \%r1, \%r2, \%r2;}\\
\texttt{AC\_3PREG} &3 pred. regs, 2 in, 1 out&\texttt{andp @p0, @p0, @p1;}\\
\texttt{AC\_3IMM} &GPR in, imm. in, GPR out &\texttt{andi \%r1, \%r3, \#3;}\\
\texttt{AC\_3REGSRC} &3 GPRs in &\texttt{tlbadd \%r0, \%r1, \%r2;}\\
\texttt{AC\_1IMM} &1 imm in &\texttt{jmpi label;} \\
\texttt{AC\_1REG} &1 reg in &\texttt{jmpr \%r2} \\
\texttt{AC\_3IMMSRC} &2 GPRs in, 1 imm. in &\texttt{st \%r1, \%r2, \#10;}\\
\texttt{AC\_PREG\_REG}&GPR in, pred. reg. out &\texttt{iszero @p0, \%r3;} \\
\texttt{AC\_2PREG} &2 pred. regs, 1 in, 1 out&\texttt{notp @p0, @p0;} \\
\end{tabular}
\end{center}
\subsection{Opcode/Instruction Class Table}
\begin{verbatim}
00 "nop" NONE 01 "di" NONE 02 "ei" NONE
03 "tlbadd" 3REGSRC 04 "tlbflush" NONE 05 "neg" 2REG
06 "not" 2REG 07 "and" 3REG 08 "or" 3REG
09 "xor" 3REG 0a "add" 3REG 0b "sub" 3REG
0c "mul" 3REG 0d "div" 3REG 0e "mod" 3REG
0f "shl" 3REG 10 "shr" 3REG 11 "andi" 3IMM
12 "ori" 3IMM 13 "xori" 3IMM 14 "addi" 3IMM
15 "subi" 3IMM 16 "muli" 3IMM 17 "divi" 3IMM
18 "modi" 3IMM 19 "shli" 3IMM 1a "shri" 3IMM
1b "jali" 2IMM 1c "jalr" 2REG 1d "jmpi" 1IMM
1e "jmpr" 1REG 1f "clone" 1REG 20 "jalis" 3IMM
21 "jalrs" 3REG 22 "jmprt" 1REG 23 "ld" 3IMM
24 "st" 3IMMSRC 25 "ldi" 2IMM 26 "rtop" PREG_REG
27 "andp" 3PREG 28 "orp" 3PREG 29 "xorp" 3PREG
2a "notp" 2PREG 2b "isneg" PREG_REG 2c "iszero" PREG_REG
2d "halt" NONE 2e "trap" NONE 2f "jmpru" 1REG
30 "skep" 1REG 31 "reti" NONE 32 "tlbrm" 1REG
33 "itof" 2REG 34 "ftoi" 2REG 35 "fadd" 3REG
36 "fsub" 3REG 37 "fmul" 3REG 38 "fdiv" 3REG
39 "fneg" 2REG 3a "wspawn" 3REG 3b "split" NONE
3c "join" NONE 3d "bar"
\end{verbatim}
\subsection{Word Encoding}
Word-based instruction encodings all share the initial fields:
\begin{itemize}
\item The most-significant bit is 1 if the instruction is predicated and 0 otherwise.
\item The next $log_2(\mathrm{\#pred\_regs})$ specify the predicate register.
\item The next 6 bits are used for the opcode.
\end{itemize}
After this, the operands of the instruction are ordered corresponding to their ordering in the assembly language, sized according to the following rules:
\begin{itemize}
\item Register operands are $log_2(\mathrm{\#GPRs})$ bits long, or just enough bits to uniquely identify a register.
\item Predicate register operands are $log_2(\mathrm{\#pred\_regs})$ bits long, or just enough bits to uniquely identify a predicate register.
\item Immediate fields are always the last field and occupy the remaining bits of the instruction. All immediate fields are sign extended to the length of a machine word.
\end{itemize}
\begin{center}
\includegraphics{fig/word_enc}
\end{center}
\subsection{Byte Encoding}
In the byte encoding, each field of the instruction (predicate, opcode, operands) occupies a byte, with the exception of immediates, which occupy an unaligned word.
All instructions have a predicate and opcode byte.
The predicate byte is all ones if the instruction is not predicated; oterwise the predicate byte contains the predicate register number used to predicate the instruction.
Just like the word-based instruction encoding, registers appear in the same order as the assembly language, destination-first.
\begin{center}
\includegraphics{fig/byte_enc}
\end{center}
\section{Assembly Language}
The assembly language is fairly easy to pick up from the HarpTools examples. It is RISC-like, and written destination register first (in this it differs from Unix assembly syntax).
Registers names are prefixed with the percent sign (\%) and predicate register names with the at symbol (@).
Predicated instructions are prefixed with the predicate register name and a question mark:
\begin{verbatim}
@p0 ? addi %r7, %r1, #1
\end{verbatim}
A small set of directives is provided to express non-instruction data:
\begin{center}
\begin{tabular}{cl}
\textbf{Directive}&\textbf{Use}\\
\hline
\texttt{.align 256} &Align next symbol to a multiple of 256 bytes.\\
\texttt{.word 0x1234} &Insert a word with the value \texttt{0x1234}.\\
\texttt{.byte 0xff} &Insert a byte with the value \texttt{0xff}.\\
\texttt{.def SYM 123} &Replace SYM with 123 in immediate operands.\\
\texttt{.entry} &Make the next label the HOF executable entry point.\\
\texttt{.global} &Give the next label global (external) linkage.\\
\texttt{.perm rw} &Set HOF permissions of the next label to read/write.\\
\texttt{.string "Str"} &Create a null terminated string.\\
\end{tabular}
\end{center}
\section{Instruction Set}
\subsection{Trivial Instruction}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{nop}&No operation.\\
\end{tabular}
\end{center}
\subsection{Privileged Instructions}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{ei} &Enable interrupts.\\
\texttt{di} &Disable interrupts.\\
\texttt{skep} \%addr&Set kernel entry point.\\
\texttt{tlbadd} \%virt, \%phys, \%flags&Add an entry to the TLB.\\
\texttt{tlbrm} \%virt&Remove entry corresponding to virt. address from TLB.\\
\texttt{tlbflush}&Remove all but default entry from TLB.\\
\texttt{jmpru} \%addr&Jump indirect and switch to user mode.\\
\texttt{reti}&Return from interrupt.\\
\texttt{halt}&Halt CPU until next interrupt.\\
\end{tabular}
\end{center}
The flags register used by \texttt{tlbadd} stores, in its least-significant four bits, in order from most to least significant:
\begin{center}
\begin{tabular}{cl}\
\textbf{Bit}&\textbf{Meaning}\\
\hline
\texttt{kx}&Kernel can execute.\\
\texttt{kw}&Kernel can write.\\
\texttt{kr}&Kernel can read.\\
\texttt{ux}&User can execute.\\
\texttt{uw}&User can write.\\
\texttt{ur}&User can read.\\
\end{tabular}
\end{center}
\subsection{Memory Loads/Stores}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{st} \%val, \%base, \textsc{\#Offset}&Store.\\
\texttt{ld} \%dest, \%base, \textsc{\#Offset}&Load.\\
\end{tabular}
\end{center}
\subsection{Predicate Manipulation}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{andp} @dest, @src1, @src2&Logical and.\\
\texttt{orp} @dest, @src1, @src2&Logical or.\\
\texttt{xorp} @dest, @src1, @src2&Exclusive or.\\
\texttt{notp} @dest, @src&Complement.\\
\end{tabular}
\end{center}
\subsection{Value Tests}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{rtop} @dest, \%src&Set @dest if \%src is nonzero.\\
\texttt{isneg} @dest, \%src&Set @dest if \%src is negative.\\
\texttt{iszero} @dest, \%src&Set @dest if \%src is zero.\\
\end{tabular}
\end{center}
\subsection{Immediate Integer Arithmetic/Logic}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{ldi} \%dest, \textsc{\#Imm}&Load immediate.\\
\texttt{addi} \%dest, \%src1, \textsc{\#Imm}&Add immediate.\\
\texttt{subi} \%dest, \%src1, \textsc{\#Imm}&Subtract immediate.\\
\texttt{muli} \%dest, \%src1, \textsc{\#Imm}&Multiply immediate.\\
\texttt{divi} \%dest, \%src1, \textsc{\#Imm}&Divide immediate.\\
\texttt{modi} \%dest, \%src1, \textsc{\#Imm}&Modulus immediate.\\
\texttt{shli} \%dest, \%src1, \textsc{\#Imm}&Shift left immediate.\\
\texttt{shri} \%dest, \%src1, \textsc{\#Imm}&Shift right immediate.\\
\texttt{andi} \%dest, \%src1, \textsc{\#Imm}&And immediate.\\
\texttt{ori} \%dest, \%src1, \textsc{\#Imm}&Or immediate.\\
\texttt{xori} \%dest, \%src1, \textsc{\#Imm}&Xor immediate.\\
\end{tabular}
\end{center}
\subsection{Register Integer Arithmetic/Logic}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{add} \%dest, \%src1, \%src2&Add.\\
\texttt{sub} \%dest, \%src1, \%src2&Subtract.\\
\texttt{mul} \%dest, \%src1, \%src2&Multiply.\\
\texttt{div} \%dest, \%src1, \%src2&Divide.\\
\texttt{mod} \%dest, \%src1, \%src2&Modulus.\\
\texttt{shl} \%dest, \%src1, \%src2&Shift left.\\
\texttt{shr} \%dest, \%src1, \%src2&Shift right.\\
\texttt{and} \%dest, \%src1, \%src2&And.\\
\texttt{or} \%dest, \%src1, \%src2&Or.\\
\texttt{xor} \%dest, \%src1, \%src2&Xor.\\
\texttt{neg} \%dest, \%src1&Two's complement.\\
\texttt{not} \%dest, \%src1&Bitwise complement.\\
\end{tabular}
\end{center}
\subsection{Floating Point Arithmetic}
These operations operate on real numbers in an implementation-determined
format, which can be fixed point or floating point.
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{itof} \%dest, \%src&Signed integer to floating point.\\
\texttt{ftoi} \%dest, \%src&Floating point to signed integer.\\
\texttt{fneg} \%dest, \%src&Negate (complement sign bit).\\
\texttt{fadd} \%dest, \%src1, \%src2&Floating point add.\\
\texttt{fsub} \%dest, \%src1, \%src2&Floating point subtract.\\
\texttt{fmul} \%dest, \%src1, \%src2&Floating point multiply.\\
\texttt{fdiv} \%dest, \%src1, \%src2&Floating point divide.\\
\end{tabular}
\end{center}
\subsection{Control Flow}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{jmpi} \textsc{\#RelDest}&Jump to immediate (PC-relative).\\
\texttt{jmpr} \%addr&Jump indirect.\\
\texttt{jali} \%link, \textsc{\#RelDest}&Jump and link immediate.\\
\texttt{jalr} \%link, \%reg&Jump and link indirect.\\
\end{tabular}
\end{center}
\subsection{SIMD Control}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{clone} \%lane&Clone register state into specified lane.\\
\texttt{jalis} \%link, \%n, \textsc{\#RelDest}&Jump and link immediate, spawning N active lanes.\\
\texttt{jalrs} \%link, \%n, \%dest&Jump and link indirect, spawning N active lanes.\\
\texttt{jmprt} \%addr&Jump indirect, terminating execution on all but a single lane.\\
\texttt{split}&Control flow diverge.\\
\texttt{join}&Control flow reconverge.\\
\end{tabular}
\end{center}
\subsection{Warp Control}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{wspawn} \%dest, \%pc, \%src&Create new warp, copying \%src in current warp to to \%dest in new warp.\\
\texttt{bar} \%id, \%n&Barrier of \%n warps. Identified by \%id.\\
\end{tabular}
\end{center}
\subsection{User/Kernel Interteraction}
\begin{center}
\begin{tabular}{cl}
\textbf{Instruction}&\textbf{Description}\\
\hline
\texttt{trap}&User-generated interrupt.\\
\end{tabular}
\end{center}
\section{Interrupts}
The HARP interrupt mechanism is simple.
For SIMD lane 0, there is a shadow register file, program counter, and active lane count.
When an interrupt occurs, the state of lane zero is saved into these shadow registers, and execution resumes at the kernel entry point.
The type of interrupt is specified by the value placed in register 0 at this time, according to the following table:
\begin{center}
\begin{tabular}{cl}
\textbf{Number}&\textbf{Description}\\
\hline
0 &Trap (user-generated interrupt)\\
1 &Page fault due to absence from TLB\\
2 &Page fault due to permission violation\\
3 &Invalid/unsupported instruction\\
4 &Divergent branch\\
5 &Numerical domain (divide by zero)\\
6-7&(reserved for future exceptions)\\
8 &Console input\\
\end{tabular}
\end{center}
The first eight interrupt numbers are reserved for internal CPU-generated exceptions, and all of the remaining numbers are free for use by hardware.
\section{Application Binary Interface}
The ABI assumes a set of at least four general purpose registers.
The frame pointer is optional and can be stored on the stack itself if necessary.
The stack pointer and link register, in this order, are always the two highest-numbered registers.
If 8 or more registers are available, the frame pointer may be the register one less than the register number of the stack pointer.
\begin{itemize}
\item The lower-numbered half of the registers are caller-saved (temporary).
\item The upper-numbered half are therefore callee-saved.
\item The callee is responsible for adjusting the stack and frame pointers, if such adjustment is required.
\item The stack grows toward smaller addresses (subtract to push, add to pop).
\item Pointer function arguments and numerical arguments that can fit in a single register are passed through temporary registers, starting with register \texttt{\%r0}. If more registers are required than there are temporary registers available, stack space at addresses less than the stack pointer is used.
\item Record (struct) return values and numerical return values larger than the word size are always passed on the stack. The caller is responsible for allocating the necessary space. The stack pointer at the time of call is a pointer to the returned structure. All other return values are returned in \texttt{\%r0}.
\end{itemize}
\section{SIMD Operation}
The HARP ISAs are inherently SIMD.
In addition to designs with a single set of functional units and architectural registers, designs are allowed that replicate these while retaining a single front-end and memory system.
This allows for multiple threads executing the same stream of instructions to simultaneously occupy multiple ``lanes'' of the processor.
When a predicated control flow instruction occurs without unanimous agreement among predicate registers, a divergent branch has occurred.
The current response to this is to trap to the operating system (interrupt number 4).
%\begin{center}
%\includegraphics{fig/simd_core}
%\end{center}
\subsection{Instructions for SIMD Operation}
The \texttt{clone}, \texttt{jalis}, \texttt{jalrs}, and \texttt{jmprt} instructions form the basis of SIMD context control in the HARP instruction set.
Context is created using \texttt{clone}, the waiting threads are spawned using \texttt{jalrs} or \texttt{jalis}, ``jump-and-link immediate/register and spawn'', and finally the parallel section returns using \texttt{jmprt}, ``jump register and terminate'', best thought of as ``return and terminate.''
There are times when a control flow operation will need to be predicated, going one direction on some lanes and the other direction on other lanes.
For this, the HARP instruction set provides the \texttt{split} and \texttt{join} instructions.
When a predicated \texttt{split} is first encountered, only the lanes for which the \texttt{split}'s predicate are true are allowed to continue.
The other lanes are masked out until the corresponding \texttt{join} is encountered.
The first time \texttt{join} is reached, control flow returns to the instruction following the corresponding \texttt{split} with the set of masked-out lanes complemented.
The second time the same \texttt{join} is reached, control flow falls through and the original lane mask is restored.
A hardware stack is maintained to keep track of nested \texttt{split}s.
\section{Default I/O Devices}
The emulator currently only supports a single I/O device, simple console I/O.
Writing to the address \texttt{0x800...0} (an address with its MSB set and all other bits cleared) causes text to be written to the display.
Input on this console interface causes an interrupt (number 8).
\end{document}

View file

BIN
emulator/harptool Executable file

Binary file not shown.

BIN
emulator/libharplib.a Normal file

Binary file not shown.

BIN
emulator/libharplib.so Executable file

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show more