Merged in harp-iset and fixed instruction support.

This commit is contained in:
chad 2014-09-26 07:50:31 -04:00
commit ddea87689c
9 changed files with 175 additions and 68 deletions

View file

@ -132,7 +132,8 @@ The bit fields in the instruction encodings depend heavily on this quality.
30 "skep" 1REG 31 "reti" NONE 32 "tlbrm" 1REG
33 "itof" 2REG 34 "ftoi" 2REG 35 "fadd" 3REG
36 "fsub" 3REG 37 "fmul" 3REG 38 "fdiv" 3REG
39 "fneg" 2REG
39 "fneg" 2REG 3a "wspawn" 2REG 3b "split" NONE
3c "join" NONE
\end{verbatim}
\subsection{Word Encoding}
@ -346,6 +347,8 @@ format, which can be fixed point or floating point.
\texttt{jalis} \%link, \%n, \textsc{\#RelDest}&Jump and link immediate, spawning N active lanes.\\
\texttt{jalrs} \%link, \%n, \%dest&Jump and link indirect, spawning N active lanes.\\
\texttt{jmprt} \%addr&Jump indirect, terminating execution on all but a single lane.\\
\texttt{split}&Control flow diverge.\\
\texttt{join}&Control flow reconverge.\\
\end{tabular}
\end{center}
@ -412,6 +415,14 @@ The current response to this is to trap to the operating system (interrupt numbe
The \texttt{clone}, \texttt{jalis}, \texttt{jalrs}, and \texttt{jmprt} instructions form the basis of SIMD context control in the HARP instruction set.
Context is created using \texttt{clone}, the waiting threads are spawned using \texttt{jalrs} or \texttt{jalis}, ``jump-and-link immediate/register and spawn'', and finally the parallel section returns using \texttt{jmprt}, ``jump register and terminate'', best thought of as ``return and terminate.''
There are times when a control flow operation will need to be predicated, going one direction on some lanes and the other direction on other lanes.
For this, the HARP instruction set provides the \texttt{split} and \texttt{join} instructions.
When a predicated \texttt{split} is first encountered, only the lanes for which the \texttt{split}'s predicate is true are allowed to continue.
The other lanes are masked out until the corresponding \texttt{join} is encountered.
The first time \texttt{join} is reached, control flow returns to the instruction following the corresponding \texttt{split} with the set of masked-out lanes complemented.
The second time the same \texttt{join} is reached, control flow falls through and the original lane mask is restored.
A hardware stack is maintained to keep track of nested \texttt{split}s.
\section{Default I/O Devices}
The emulator currently only supports a single I/O device, simple console I/O.
Writing to the address \texttt{0x800...0} (an address with its MSB set and all other bits cleared) causes text to be written to the display.

View file

@ -1,7 +1,7 @@
################################################################################
# HARPtools by Chad D. Kersey, Summer 2011 #
################################################################################
CXXFLAGS ?= -fPIC -O3 # -g -DUSE_DEBUG=3
CXXFLAGS ?= -fPIC -O3 #-g -DUSE_DEBUG=3
LDLIBS ?= -pthread
PREFIX ?= /usr/local

View file

@ -46,6 +46,8 @@ Core::Core(const ArchDef &a, Decoder &d, MemoryUnit &mem, Word id) :
for (Word i = 0; i < a.getNPRegs(); ++i) {
pred[j].push_back(Reg<bool>(id, regNum++));
}
tmask.push_back(true);
}
/* Set initial register contents. */
@ -131,6 +133,11 @@ void Core::step() {
D_RAW(" (");
for (unsigned i = 0; i < shadowPReg.size(); ++i) D_RAW(shadowPReg[i]);
D_RAW(')' << endl);
D(3, "Thread mask:");
D_RAW(" ");
for (unsigned i = 0; i < tmask.size(); ++i) D_RAW(tmask[i] << ' ');
D_RAW(endl);
}
#endif

View file

@ -6,6 +6,7 @@
#include <string>
#include <vector>
#include <stack>
#include "types.h"
#include "archdef.h"
@ -26,7 +27,7 @@ namespace Harp {
Reg &operator=(T r) { val = r; doWrite(); return *this; }
operator T() { doRead(); return val; }
operator T() const { doRead(); return val; }
void trunc(Size s) {
Word mask((~0ull >> (sizeof(Word)-s)*8));
@ -39,14 +40,36 @@ namespace Harp {
#ifdef EMU_INSTRUMENTATION
/* Access size here is 8, representing the register size of 64-bit cores. */
void doWrite() { reg_doWrite(cpuId, regNum); }
void doRead() { reg_doRead(cpuId, regNum); }
void doWrite() const { reg_doWrite(cpuId, regNum); }
void doRead() const { reg_doRead(cpuId, regNum); }
#else
void doWrite() {}
void doRead() {}
void doWrite() const {}
void doRead() const {}
#endif
};
// Entry in the IPDOM (immediate post-dominator) stack, which tracks nested
// control-flow divergence between lanes.
//
// Two kinds of entry exist:
//  * A divergence entry (fallThrough == false) records the PC to resume at and
//    the lane mask for the lanes whose predicate was false.
//  * A fall-through entry (fallThrough == true) records only a lane mask to be
//    restored; its pc is unused and is zero-initialized.
struct DomStackEntry {
  // Builds a divergence entry.
  //   p  - index of the predicate register to sample in every lane.
  //   m  - per-lane predicate register files; m[i][p] is lane i's predicate.
  //   pc - program counter to store in the entry.
  // tmask[i] is set to the complement of lane i's predicate. Each predicate is
  // read exactly once, so the debug trace below does not cause a second
  // instrumented register read.
  DomStackEntry(
    unsigned p, const std::vector<std::vector<Reg<bool> > >& m, Word pc
  ): fallThrough(false), tmask(), pc(pc)
  {
    tmask.reserve(m.size());
    std::cout << "New DomStackEntry:";
    for (unsigned i = 0; i < m.size(); ++i) {
      const bool predicate = m[i][p];
      tmask.push_back(!predicate);
      std::cout << ' ' << predicate;
    }
    std::cout << std::endl;
  }

  // Builds a fall-through entry that simply remembers `tmask`.
  // Deliberately not `explicit`: a plain std::vector<bool> mask may be pushed
  // onto a std::stack<DomStackEntry> and converted implicitly.
  // pc is given a defined value so copying the entry never reads an
  // indeterminate Word.
  DomStackEntry(const std::vector<bool> &tmask):
    fallThrough(true), tmask(tmask), pc(0) {}

  bool fallThrough;         // true: only restore tmask; false: also jump to pc.
  std::vector<bool> tmask;  // Lane mask stored by this entry.
  Word pc;                  // Resume address; meaningful only if !fallThrough.
};
class Core {
public:
Core(const ArchDef &a, Decoder &d, MemoryUnit &mem, Word id=0);
@ -67,6 +90,9 @@ namespace Harp {
std::vector<std::vector<Reg<Word> > > reg;
std::vector<std::vector<Reg<bool> > > pred;
std::vector<bool> tmask;
std::stack<DomStackEntry> domStack;
std::vector<Word> shadowReg;
std::vector<bool> shadowPReg;

View file

@ -30,10 +30,11 @@ namespace Harp {
JALI, JALR, JMPI, JMPR, CLONE, JALIS, JALRS,
JMPRT, LD, ST, LDI, RTOP, ANDP, ORP, XORP, NOTP, ISNEG,
ISZERO, HALT, TRAP, JMPRU, SKEP, RETI, TLBRM,
ITOF, FTOI, FADD, FSUB, FMUL, FDIV, FNEG };
ITOF, FTOI, FADD, FSUB, FMUL, FDIV, FNEG, WSPAWN,
SPLIT, JOIN };
enum ArgClass {
AC_NONE, AC_2REG, AC_2IMM, AC_3REG, AC_3PREG, AC_3IMM, AC_3REGSRC,
AC_1IMM, AC_1REG, AC_3IMMSRC, AC_PREG_REG, AC_2PREG
AC_1IMM, AC_1REG, AC_3IMMSRC, AC_PREG_REG, AC_2PREG, AC_2REGSRC
};
enum InstType {
ITYPE_NULL, ITYPE_INTBASIC, ITYPE_INTMUL, ITYPE_INTDIV, ITYPE_STACK, ITYPE_BR,

View file

@ -80,6 +80,9 @@ Instruction::InstTableEntry Instruction::instTable[] = {
{"fmul", false, false, false, false, AC_3REG, ITYPE_FPMUL },
{"fdiv", false, false, false, false, AC_3REG, ITYPE_FPDIV },
{"fneg", false, false, false, false, AC_2REG, ITYPE_FPBASIC },
{"wspawn", false, false, true, false, AC_2REGSRC, ITYPE_NULL },
{"split", false, false, true, false, AC_NONE, ITYPE_NULL },
{"join", false, false, true, false, AC_NONE, ITYPE_NULL },
{NULL,false,false,false,false,AC_NONE,ITYPE_NULL}/////// End of table.
};
@ -118,12 +121,16 @@ void Instruction::executeOn(Core &c) {
return;
}
/* Also throw exceptions on divergent branches. */
if (predicated && instTable[op].controlFlow) {
bool p0 = c.pred[0][pred];
for (Size t = 1; t < c.activeThreads; t++) {
if (c.pred[t][pred] != p0) throw DivergentBranchException();
/* Also throw exceptions on non-masked divergent branches. */
if (instTable[op].controlFlow) {
Size t, count, active;
for (t = 0, count = 0, active = 0; t < c.activeThreads; ++t) {
if ((!predicated || c.pred[t][pred]) && c.tmask[t]) ++count;
if (c.tmask[t]) ++active;
}
if (count != 0 && count != active)
throw DivergentBranchException();
}
Size nextActiveThreads = c.activeThreads;
@ -132,8 +139,12 @@ void Instruction::executeOn(Core &c) {
for (Size t = 0; t < c.activeThreads; t++) {
vector<Reg<Word> > &reg(c.reg[t]);
vector<Reg<bool> > &pReg(c.pred[t]);
stack<DomStackEntry> &domStack(c.domStack);
if (predicated && !pReg[pred]) continue;
// If this thread is masked out, don't execute the instruction, unless it's
// a split or join.
if (((predicated && !pReg[pred]) || !c.tmask[t]) &&
op != SPLIT && op != JOIN) continue;
Word memAddr;
switch (op) {
@ -241,6 +252,10 @@ void Instruction::executeOn(Core &c) {
break;
case NOTP: pReg[pdest] = !(pReg[psrc[0]]);
break;
case ANDP: pReg[pdest] = pReg[psrc[0]] & pReg[psrc[1]];
break;
case ORP: pReg[pdest] = pReg[psrc[0]] | pReg[psrc[1]];
break;
case ISNEG: pReg[pdest] = (1ll<<(wordSz*8 - 1))&reg[rsrc[0]];
break;
case HALT: c.activeThreads = 0;
@ -283,6 +298,23 @@ void Instruction::executeOn(Core &c) {
case FDIV: reg[rdest] = Float(double(Float(reg[rsrc[0]], wordSz)) /
double(Float(reg[rsrc[1]], wordSz)),wordSz);
break;
case SPLIT:if (t == 0) {
// TODO: if mask becomes all-zero, fall through
DomStackEntry e(pred, c.pred, c.pc);
c.domStack.push(c.tmask);
c.domStack.push(e);
for (unsigned i = 0; i < e.tmask.size(); ++i)
c.tmask[i] = !e.tmask[i];
}
break;
case JOIN: if (t == 0) {
// TODO: if mask becomes all-zero, fall through
if (!c.domStack.top().fallThrough)
c.pc = c.domStack.top().pc;
c.tmask = c.domStack.top().tmask;
c.domStack.pop();
}
break;
default:
cout << "ERROR: Unsupported instruction: " << *this << "\n";
exit(1);

View file

@ -5,12 +5,14 @@ HARPDIS = ../harptool -D
4BARCH = 4b16/16/2
all: simple.bin sieve.bin 2thread.bin simple.4b.bin sieve.4b.bin 2thread.4b.bin bubble.bin bubble.4b.bin dotprod.bin dotprod.4b.bin matmul.bin matmul.4b.bin \
matmul-mt.s lfsr.bin
matmul-mt.s lfsr.bin diverge.bin
run: simple.out sieve.out 2thread.out simple.4b.out sieve.4b.out 2thread.4b.out bubble.out bubble.4b.out dotprod.out dotprod.4b.out matmul.out matmul.4b.out\
matmul-mt.out lfsr.4b.out lfsr.out
matmul-mt.out lfsr.4b.out lfsr.out diverge.out
disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d lfsr.d
disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d \
bubble.4b.d dotprod.d dotprod.4b.d matmul.d matmul.4b.d matmul-mt.d lfsr.d \
diverge.d
%.4b.out : %.4b.bin
$(HARPEM) -a $(4BARCH) -c $< > $@
@ -18,56 +20,11 @@ disas: simple.d sieve.d 2thread.d simple.4b.d sieve.4b.d 2thread.4b.d bubble.d b
%.out : %.bin
$(HARPEM) -c $< > $@
2thread.bin : boot.HOF lib.HOF 2thread.HOF
$(HARPLD) -o 2thread.bin $^
2thread.4b.bin : boot.4b.HOF lib.4b.HOF 2thread.4b.HOF
$(HARPLD) --arch $(4BARCH) -o 2thread.4b.bin $^
bubble.bin : boot.HOF lib.HOF bubble.HOF
$(HARPLD) -o bubble.bin $^
bubble.4b.bin : boot.4b.HOF lib.4b.HOF bubble.4b.HOF
$(HARPLD) --arch $(4BARCH) -o bubble.4b.bin $^
simple.bin : boot.HOF lib.HOF simple.HOF
$(HARPLD) -o $@ $^
sieve.bin : boot.HOF lib.HOF sieve.HOF
$(HARPLD) -o $@ $^
lfsr.bin : boot.HOF lib.HOF lfsr.HOF
$(HARPLD) -o $@ $^
dotprod.bin : boot.HOF lib.HOF dotprod.HOF
$(HARPLD) -o $@ $^
matmul.bin : boot.HOF lib.HOF matmul.HOF
$(HARPLD) -o $@ $^
matmul-mt.bin : boot.HOF lib.HOF matmul-mt.HOF
$(HARPLD) -o $@ $^
simple.4b.bin : boot.4b.HOF lib.4b.HOF simple.4b.HOF
%.4b.bin : boot.4b.HOF lib.4b.HOF %.4b.HOF
$(HARPLD) --arch $(4BARCH) -o $@ $^
sieve.4b.bin : boot.4b.HOF lib.4b.HOF sieve.4b.HOF
$(HARPLD) --arch $(4BARCH) -o $@ $^
dotprod.4b.bin : boot.4b.HOF lib.4b.HOF dotprod.4b.HOF
$(HARPLD) --arch $(4BARCH) -o $@ $^
matmul.4b.bin : boot.4b.HOF lib.4b.HOF matmul.4b.HOF
$(HARPLD) --arch $(4BARCH) -o $@ $^
lfsr.4b.bin : boot.4b.HOF lib.4b.HOF lfsr.4b.HOF
$(HARPLD) --arch $(4BARCH) -o $@ $^
%.4b.bin : %.4b.HOF
$(HARPLD) --arch $(4BARCH) -o $@ $<
%.bin : %.HOF
$(HARPLD) -o $@ $<
%.bin : boot.HOF lib.HOF %.HOF
$(HARPLD) -o $@ $^
%.4b.HOF : %.s
$(HARPAS) --arch $(4BARCH) -o $@ $<

64
src/test/diverge.s Normal file
View file

@ -0,0 +1,64 @@
/*******************************************************************************
Harptools by Chad D. Kersey, Summer 2011
********************************************************************************
Sample HARP assembly program.
*******************************************************************************/
/* Divergent branch: test immediate postdominator branch divergence support. */
/* Overview: the entry code issues clone once per extra lane, runs dthread on */
/* THREADS lanes, then prints the THREADS words the lanes stored in array. */
/* THREADS: number of lanes spawned and number of result words printed. */
.def THREADS 8
.align 4096
.perm x
.entry
.global
entry:
/* Clone loop: issue clone with %r0 = 1 .. THREADS-1. */
/* presumably %r0 becomes each cloned lane's id, since dthread reads it -- TODO confirm */
ldi %r0, #1
ldi %r1, THREADS
sloop: clone %r0
addi %r0, %r0, #1
sub %r2, %r1, %r0
rtop @p0, %r2
/* Repeat until %r0 reaches THREADS (assumes rtop sets @p0 when %r2 is nonzero). */
@p0 ? jmpi sloop
/* %r0 = 0 for the current lane; jalis links through %r5 and spawns %r1 lanes at dthread. */
ldi %r0, #0
jalis %r5, %r1, dthread;
/* Print loop: %r0 is the offset into array, %r1 the offset at which to stop. */
/* assumes __WORD is the word size in bytes -- TODO confirm */
ldi %r0, #0
ldi %r1, (__WORD * THREADS)
ploop: ld %r7, %r0, array
/* printdec is not defined in this file; presumably it prints %r7 in decimal -- TODO confirm */
jali %r5, printdec
addi %r0, %r0, __WORD
sub %r7, %r1, %r0
rtop @p0, %r7
@p0 ? jmpi ploop
trap;
/* dthread: per-lane body. %r1 counts down 10 iterations; %r2 is the accumulator. */
dthread: ldi %r1, #10
ldi %r2, #0
/* @p1 is derived from the low bit of %r0, so lanes can disagree on the branch below. */
loop: andi %r3, %r0, #1
rtop @p1, %r3
/* split/join bracket the divergent region so each side runs on its own set of lanes. */
@p1 ? split
@p1 ? jmpi else
/* Not-taken side: %r2 += %r0. */
add %r2, %r2, %r0
jmpi after
/* Taken side: %r2 -= %r0. */
else: sub %r2, %r2, %r0
after: join
subi %r1, %r1, #1
rtop @p0, %r1
@p0 ? jmpi loop
/* Store %r2 at array + %r4, where %r4 is %r0 shifted left by (`__WORD). */
/* NOTE(review): the backtick looks like a log2 operator, making %r4 = %r0 * __WORD -- confirm */
shli %r4, %r0, (`__WORD)
st %r2, %r4, array
/* Jump back through the link register, terminating all but a single lane. */
jmprt %r5;
.align 4096
/* Result buffer: written by dthread, read back by the print loop. */
array: .space 4096

View file

@ -11,13 +11,22 @@
.perm x
.entry
.global
entry: ldi %r7, hello
entry: ldi %r0, wentry
ldi %r7, hello2
/* wspawn %r0, %r7 */
ldi %r0, hello1
wentry: ori %r7, %r0, #0
jali %r5, puts
trap; /* All traps currently cause a halt. */
.perm rw
hello:
hello1:
.byte 0x22
.string "Harp!\" is how a harp seal says hello!\n"
hello2:
.string "This is a string for another thread!\n"