Working + Stable - Mutex

This commit is contained in:
felsabbagh3 2019-02-22 04:01:07 -05:00
parent 96a8615a5f
commit 087a39ccf4
18 changed files with 845 additions and 182676 deletions

View file

@ -304,6 +304,7 @@ Instruction *WordDecoder::decode(const std::vector<Byte> &v, Size &idx) {
case InstType::N_TYPE:
break;
case InstType::R_TYPE:
inst.setPred((code>>shift_rs1) & reg_mask);
inst.setDestReg((code>>shift_rd) & reg_mask);
inst.setSrcReg((code>>shift_rs1) & reg_mask);
inst.setSrcReg((code>>shift_rs2) & reg_mask);

View file

@ -53,7 +53,7 @@ namespace Harp {
// Entry in the IPDOM Stack
struct DomStackEntry {
DomStackEntry(
unsigned p, const std::vector<std::vector<Reg<bool> > >& m,
unsigned p, const std::vector<std::vector<Reg<Word> > >& m,
std::vector<bool> &tm, Word pc
): pc(pc), fallThrough(false), uni(false)
{

View file

@ -4,7 +4,7 @@
#ifndef __DEBUG_H
#define __DEBUG_H
#define USE_DEBUG 9
// #define USE_DEBUG 9
#ifdef USE_DEBUG
#include <iostream>

View file

@ -85,17 +85,17 @@ void Instruction::executeOn(Warp &c) {
return;
}
/* Also throw exceptions on non-masked divergent branches. */
if (instTable[op].controlFlow) {
Size t, count, active;
for (t = 0, count = 0, active = 0; t < c.activeThreads; ++t) {
if ((!predicated || c.pred[t][pred]) && c.tmask[t]) ++count;
if (c.tmask[t]) ++active;
}
// /* Also throw exceptions on non-masked divergent branches. */
// if (instTable[op].controlFlow) {
// Size t, count, active;
// for (t = 0, count = 0, active = 0; t < c.activeThreads; ++t) {
// if ((!predicated || c.pred[t][pred]) && c.tmask[t]) ++count;
// if (c.tmask[t]) ++active;
// }
if (count != 0 && count != active)
throw DivergentBranchException();
}
// if (count != 0 && count != active)
// throw DivergentBranchException();
// }
Size nextActiveThreads = c.activeThreads;
Size wordSz = c.core->a.getWordSize();
@ -106,14 +106,14 @@ void Instruction::executeOn(Warp &c) {
// If we have a load, overwriting a register's contents, we have to make sure
// ahead of time it will not fault. Otherwise we may perform an indirect load
// by mistake.
if (op == L_INST && rdest == rsrc[0]) {
for (Size t = 0; t < c.activeThreads; t++) {
if ((!predicated || c.pred[t][pred]) && c.tmask[t]) {
Word memAddr = c.reg[t][rsrc[0]] + immsrc;
c.core->mem.read(memAddr, c.supervisorMode);
}
}
}
// if (op == L_INST && rdest == rsrc[0]) {
// for (Size t = 0; t < c.activeThreads; t++) {
// if ((!predicated || c.pred[t][pred]) && c.tmask[t]) {
// Word memAddr = c.reg[t][rsrc[0]] + immsrc;
// c.core->mem.read(memAddr, c.supervisorMode);
// }
// }
// }
bool sjOnce(true), // Has not yet split or joined once.
pcSet(false); // PC has already been set
@ -134,6 +134,8 @@ void Instruction::executeOn(Warp &c) {
Word shamt;
Word temp;
Word data_read;
// Word pred;
DomStackEntry e(pred, c.reg, c.tmask, c.pc);
int op1, op2;
switch (op) {
@ -331,16 +333,19 @@ void Instruction::executeOn(Warp &c) {
++c.stores;
memAddr = reg[rsrc[0]] + immsrc;
// std::cout << "STORE MEM ADDRESS: " << std::hex << reg[rsrc[0]] << " + " << immsrc << "\n";
// std::cout << "FUNC3: " << func3 << "\n";
switch (func3)
{
case 0:
// std::cout << "SB\n";
c.core->mem.write(memAddr, reg[rsrc[1]] & 0x000000FF, c.supervisorMode, 1);
break;
case 1:
// std::cout << std::hex << "INST: about to write: " << reg[rsrc[1]] << " to " << memAddr << "\n";
// std::cout << "SH\n";
c.core->mem.write(memAddr, reg[rsrc[1]], c.supervisorMode, 2);
break;
case 2:
// std::cout << std::hex << "SW: about to write: " << reg[rsrc[1]] << " to " << memAddr << "\n";
c.core->mem.write(memAddr, reg[rsrc[1]], c.supervisorMode, 4);
break;
default:
@ -525,6 +530,26 @@ void Instruction::executeOn(Warp &c) {
}
}
break;
case 2:
// SPLIT
c.domStack.push(c.tmask);
c.domStack.push(e);
for (unsigned i = 0; i < e.tmask.size(); ++i)
{
c.tmask[i] = !e.tmask[i] && c.tmask[i];
}
break;
case 3:
// JOIN
if (!c.domStack.top().fallThrough) {
if (!pcSet) nextPc = c.domStack.top().pc;
pcSet = true;
}
c.tmask = c.domStack.top().tmask;
c.domStack.pop();
break;
case 4:
// JMPRT
nextActiveThreads = 1;
@ -534,7 +559,7 @@ void Instruction::executeOn(Warp &c) {
case 5:
// CLONE
// std::cout << "CLONE\n";
// std::cout << "CLONING THREAD: " << reg[rsrc[0]] << "\n";
// std::cout << "CLONING REG: " << rsrc[0] << " lane: " << reg[rsrc[0]] << "\n";
c.reg[reg[rsrc[0]]] = reg;
break;
case 6:
@ -544,7 +569,15 @@ void Instruction::executeOn(Warp &c) {
if (!pcSet) nextPc = reg[rsrc[0]];
pcSet = true;
// std::cout << "ACTIVE_THREDS: " << rsrc[1] << " val: " << reg[rsrc[1]] << "\n";
// std::cout << "nextPC: " << rsrc[0] << " val: " << reg[rsrc[0]] << "\n";
// std::cout << "nextPC: " << rsrc[0] << " val: " << std::hex << reg[rsrc[0]] << "\n";
break;
case 7:
// pred jump reg
if (reg[rsrc[0]])
{
nextPc = reg[rsrc[1]];
pcSet = true;
}
break;
default:
cout << "ERROR: UNSUPPORTED GPGPU INSTRUCTION " << *this << "\n";

File diff suppressed because it is too large Load diff

View file

@ -9,40 +9,56 @@ void matAddition (unsigned, unsigned);
#include "./lib/lib.h"
unsigned x[] = {1, 1, 6, 0, 3, 1, 1, 2, 0, 3, 6, 7, 5, 7};
unsigned y[] = {0, 2, 2, 0, 5, 0, 1, 1, 4, 2, 0, 0, 3, 2};
unsigned z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// unsigned x[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
// unsigned y[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
// unsigned z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
// unsigned x[] = {1, 1, 6, 0, 3, 1, 1, 2, 0, 3, 6, 7, 5, 7, 7, 9};
// unsigned y[] = {0, 2, 2, 0, 5, 0, 1, 1, 4, 2, 0, 0, 3, 2, 3, 2};
// unsigned z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
unsigned x[] = {1, 1, 6, 0, 3, 1, 1, 2, 0, 3, 6, 7, 5, 7, 7, 9};
unsigned y[] = {0, 2, 2, 0, 5, 0, 1, 1, 4, 2, 0, 0, 3, 2, 3, 2};
unsigned z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
#define NUM_WARPS 8
#define NUM_THREADS 2
// unsigned x[] = {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2};
// unsigned y[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
// unsigned z[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
#define NUM_WARPS 16
#define NUM_THREADS 1
int main()
{
initiate_stack();
queue_initialize();
createWarps(NUM_WARPS, NUM_THREADS, matAddition, x, y, z);
while(!queue_isEmpty()) {}
ECALL;
return 0;
}
void matAddition(unsigned tid, unsigned wid)
{
unsigned * x_ptr = get_1st_arg();
unsigned * y_ptr = get_2nd_arg();
unsigned * z_ptr = get_3rd_arg();
unsigned i = (wid * NUM_THREADS) + tid;
// int cond = i < 16;
// __if(cond)
// // DO SOMETHING
// __else
// // DO SOMETHING ELSE
// __end_if
z_ptr[i] = x_ptr[i] + y_ptr[i];
sleep((100 * wid)+100);
return;
}

File diff suppressed because it is too large Load diff

Binary file not shown.

View file

@ -1,149 +1,99 @@
:0200000480007A
:10000000130101FF232611002324810013040101A1
:10001000EF004046EF000048B707008193870700D4
:10002000370700811307C719B70600819386461664
:10003000370600801306060793052000130580008D
:10004000EF00C02D13000000EF00407E9307050075
:10005000E38C07FE73000000930700001385070080
:100060008320C10003248100130101016780000087
:10007000130101FD2326110223248102130401032D
:10008000232EA4FC232CB4FCEF0040372326A4FE2F
:10009000EF0040392324A4FEEF00403B2322A4FEBE
:1000A000832784FD939717000327C4FDB307F70048
:1000B0002320F4FE832704FE939727000327C4FE22
:1000C000B307F70083A60700832704FE9397270052
:1000D000032784FEB307F70003A70700832704FE66
:1000E00093972700032644FEB307F6003387E60004
:1000F00023A0E700130000008320C1020324810233
:100100001301010367800000130101FC232E81020B
:10011000232CA10313040104232EA4FC232CB4FCE0
:10012000232AC4FC2328D4FC2326E4FC2324F4FC47
:10013000930F05009300060093850500938B06003E
:10014000130C0700938C0300130F010013820F00A0
:10015000930710002326F4FE6F0000020325C4FE5F
:100160000323C4FE130101806B5003008327C4FEE8
:10017000938717002326F4FE13870F008327C4FEFE
:10018000E3EEE7FC13010F0013050000938F00005E
:10019000930D0200EBE0BF01B70700809387472073
:1001A000138507006B400500130000000324C10302
:1001B000032D81031301010467800000130101FD79
:1001C00023268102130401032326A4FE2324B4FE64
:1001D0002322C4FE2320D4FE232EE4FC232CF4FC93
:1001E00093830700B7070080938787101383070066
:1001F0006B000300130000000324C102130101037C
:1002000067800000130101FD2326110223248102CF
:1002100013040103EF008061930705006384070066
:1002200073000000930744FD13850700EF00004BA7
:100230008327C4FD13810700032584FD832544FD26
:10024000032604FE832644FE032784FE8327C4FE80
:10025000EFF09FEB130000008320C1020324810212
:100260001301010367800000130101FD2326110221
:100270002324810213040103130F01006F004003C4
:10028000930744FD13850700EF0040458327C4FD15
:1002900013810700032584FD832544FD032604FE06
:1002A000832644FE032784FE8327C4FEEFF01FF15C
:1002B000EF00C0579307050063980700EF00005A4E
:1002C00093070500E39E07FA13010F0013000000D7
:1002D0008320C1020324810213010103678000000F
:1002E000130101FE232E810013040102232604FEC4
:1002F0006F0000018327C4FE938717002326F4FEB6
:100300000327C4FE93073006E3D6E7FE1300000080
:100310000324C1011301010267800000130101FBE6
:100320002326110423248104130401052326A4FC9D
:100330002324B4FC2322C4FC2320D4FC232EE4FA7F
:10034000232CF4FA130F0100232604FE6F00C005CE
:10035000B70FFFFF3301F1018327C4FE2328F4FC0C
:10036000832784FC232AF4FC93070100232CF4FC4C
:10037000832744FC232EF4FC832704FC2320F4FE73
:100380008327C4FB2322F4FE832784FB2324F4FE6B
:10039000930704FD13850700EF0080158327C4FE33
:1003A000938717002326F4FE0327C4FE8327C4FC8B
:1003B000E360F7FA13010F00EFF01FEBEFF05FF2CD
:1003C000130000008320C1040324810413010105EC
:1003D00067800000130101FF23268100232471019F
:1003E0001304010193870B00138507000324C10048
:1003F000832B81001301010167800000130101FFBD
:1004000023268100232471011304010193870B002B
:10041000138507000324C100832B81001301010110
:1004200067800000130101FF23268100232481013E
:100430001304010193070C00138507000324C10076
:10044000032C81001301010167800000130101FFEB
:1004500023268100232491011304010193870C00BA
:10046000138507000324C100832C810013010101BF
:1004700067800000130101FF23268100130401019E
:1004800037F1FF7F130000000324C10013010101B5
:1004900067800000130101FF23268100130401017E
:1004A000B70700819387870323AC0710B707008144
:1004B0009387870323AE0710B707008193878703CD
:1004C00023A00712B70700819387870313077000E3
:1004D00023A2E712B70700819387870323A407129B
:1004E000130000000324C100130101016780000014
:1004F000130101FE232E8100130401022326A4FE12
:10050000B70700819387870383A707121387170014
:10051000B70700819387870323A0E712B7070081FD
:100520009387870303A7C7118327C4FE83A6070009
:10053000370600819307070093973700B387E7409A
:100540009397270013078603B387E70023A0D700FC
:10055000B70700819387870303A7C7118327C4FECA
:1005600083A64700B7070081138687039307070018
:1005700093973700B387E74093972700B307F600B8
:1005800023A2D700B70700819387870303A7C7116A
:100590008327C4FE83A68700B707008113868703DD
:1005A0009307070093973700B387E7409397270097
:1005B000B307F60023A4D700B7070081938787030A
:1005C00003A7C7118327C4FE83A6C700B70700810E
:1005D000138687039307070093973700B387E74095
:1005E00093972700B307F60023A6D700B70700812B
:1005F0009387870303A7C7118327C4FE83A6070138
:10060000370600819307070093973700B387E740C9
:100610009397270013078603B387E70023A8D70023
:10062000B70700819387870303A7C7118327C4FEF9
:1006300083A64701B7070081138687039307070046
:1006400093973700B387E74093972700B307F600E7
:1006500023AAD700B70700819387870303A7C71191
:100660008327C4FE83A68701B7070081138687030B
:100670009307070093973700B387E74093972700C6
:10068000B307F60023ACD700B70700819387870331
:1006900083A7C711138717009307900063E2E7024F
:1006A000B70700819387870383A7C71113871700B4
:1006B000B70700819387870323AEE7106F0000011F
:1006C000B70700819387870323AE0710130000004C
:1006D0000324C1011301010267800000130101FD21
:1006E0002326810213040103232EA4FCB7070081F3
:1006F0009387870383A707121387F7FFB707008144
:100700009387870323A0E712B707008193878703A6
:1007100003A787119307070093973700B387E74034
:10072000939727003707008113078703B387E700F4
:100730002326F4FEB70700819387870383A78711D9
:10074000138717009307900063E2E702B707008161
:100750009387870383A7871113871700B707008143
:100760009387870323ACE7106F000001B707008170
:100770009387870323AC07108327C4FE03A70700D2
:100780008327C4FD23A0E7008327C4FE03A74700F7
:100790008327C4FD23A2E7008327C4FE03A78700A5
:1007A0008327C4FD23A4E7008327C4FE03A7C70053
:1007B0008327C4FD23A6E7008327C4FE03A7070100
:1007C0008327C4FD23A8E7008327C4FE03A74701AE
:1007D0008327C4FD23AAE7008327C4FE03A787015C
:1007E0008327C4FD23ACE700130000000324C102EB
:1007F0001301010367800000130101FF232681001C
:1008000013040101B70700819387870383A70712A9
:10081000938767FF93B7170093F7F70F13850700C8
:100820000324C1001301010167800000130101FFCF
:100830002326810013040101B707008193878703F2
:1008400083A7071293B7170093F7F70F13850700D5
:100850000324C1001301010167800000130101FF9F
:100860002326810013040101B707008193878703C2
:1008700003A78712B70700819387870383A74712CF
:10088000B337F70093F7F70F138507000324C10070
:08089000130101016780000063
:1000000037F1FF7FEF00C018EF008006730000009B
:10001000938B0600130C0700938C0700130F01004D
:100020009303050013051000635C75001301018044
:10003000130305006B500300130515006FF0DFFE7E
:1000400013010F0013050000930F0600938D0300AA
:10005000EBE0BF01170500001305852A6B40050082
:1000600017030000130303FB6B000300678000000D
:10007000130101FF23261100232481001304010131
:10008000B707008193870700370700811307C76010
:10009000B70600819386C65C370600801306460DBE
:1000A0009305100013050001EF0000391300000054
:1000B000EF00402193070500E38C07FE9307000043
:1000C000138507008320C10003248100130101016F
:1000D00067800000130101FD232611022324810201
:1000E00013040103232EA4FC232CB4FCEF00804056
:1000F0002326A4FEEF0080422324A4FEEF008044C8
:100100002322A4FE032784FD8327C4FDB307F70041
:100110002320F4FE832704FE939727000327C4FEC1
:10012000B307F70083A60700832704FE93972700F1
:10013000032784FEB307F70003A70700832704FE05
:1001400093972700032644FEB307F6003387E600A3
:1001500023A0E700832784FD138717009307070078
:1001600093971700B387E70093973700B387E700AB
:100170009397270013850700EF0000281300000065
:100180008320C10203248102130101036780000060
:1001900097020001938202EB1303000093037000A7
:1001A00023A0620023A2620023A4620023A672009F
:1001B00023A862006780000097020001938282E812
:1001C00003A382001303130023A4620013834201DC
:1001D00083AE420093935E0033037300032E050049
:1001E0002320C301032E45002322C301032E8500D3
:1001F0002324C301032EC5002326C301032E0501BA
:100200002328C301032E4501232AC301032E8501A0
:10021000232CC301938E1E00130F20036394EE0161
:10022000930E000023A2D201678000009702000114
:10023000938242E103A382001303F3FF23A462002D
:100240001383420183AE0200930F2003138F0E002D
:10025000130F1F006314FF01130F000023A0E2011E
:1002600093935E0033037300032E03002320C50124
:10027000032E43002322C501032E83002324C5013E
:10028000032EC3002326C501032E03012328C50125
:10029000032E4301232AC501032E8301232CC5010C
:1002A00067800000970200019382C2D903A38200F5
:1002B00013050000130E200363146E0013051500D0
:1002C00067800000970200019382C2D703A38200D7
:1002D00013050000130E000063146E0013051500D3
:1002E00067800000970200019382C2D503A3C20079
:1002F00083A3020133B5630067800000130101FD91
:10030000232611022324810213040103EFF09FFB33
:10031000930705006384070073000000930744FD02
:1003200013850700EFF09FF08327C4FD13810700BA
:10033000032584FD832544FD032604FE832644FE15
:10034000032784FE8327C4FEEFF09FCC73000000D8
:10035000130000008320C102032481021301010362
:1003600067800000130101FD23261102232481026E
:1003700013040103930901006F000005B707008112
:100380009387070483A7070113871700B707008126
:100390009387070423A8E700930744FD138507000C
:1003A000EFF0DFE88327C4FD13810700032584FDF8
:1003B000832544FD032604FE832644FE032784FE92
:1003C0008327C4FEEFF0DFC9EFF0DFEF93070500EE
:1003D00063980700EFF01FF193070500E39007FA19
:1003E00013810900130000008320C102032481024D
:1003F0001301010367800000130101FD2326810220
:1004000013040103232EA4FC232604FE6F00000125
:100410008327C4FE938717002326F4FE0327C4FE18
:100420008327C4FDE346F7FE130000000324C10246
:100430001301010367800000130101FB232611044F
:1004400023248104130401052326A4FC2324B4FCE3
:100450002322C4FC2320D4FC232EE4FA232CF4FA18
:1004600013090100232604FE6F00C005B709FFFF32
:10047000330131018327C4FE2328F4FC832784FC45
:10048000232AF4FC93070100232CF4FC832744FC6B
:10049000232EF4FC832704FC2320F4FE8327C4FBD3
:1004A0002322F4FE832784FB2324F4FE930704FD18
:1004B00013850700EFF05FD08327C4FE93871700F2
:1004C0002326F4FE0327C4FE8327C4FCE360F7FA67
:1004D00013010900EFF01FE913054006EFF0DFF10B
:1004E000130000008320C1040324810413010105CB
:1004F00067800000130101FF23268100232471017E
:100500001304010193870B00138507000324C10026
:10051000832B81001301010167800000130101FF9B
:1005200023268100232481011304010193070C0079
:10053000138507000324C100032C8100130101016E
:1005400067800000130101FF23268100232491010D
:100550001304010193870C00138507000324C100D5
:0C056000832C8100130101016780000062
:02000004810079
:100164000100000001000000060000000000000083
:100174000300000001000000010000000200000074
:10018400000000000300000006000000070000005B
:10019400050000000700000000000000020000004D
:1001A4000200000000000000050000000000000044
:1001B4000100000001000000040000000200000033
:1001C4000000000000000000030000000200000026
:1005CC000100000001000000060000000000000017
:1005DC000300000001000000010000000200000008
:1005EC0000000000030000000600000007000000EF
:1005FC0005000000070000000700000009000000D3
:10060C0000000000020000000200000000000000DA
:10061C0005000000000000000100000001000000C7
:10062C0004000000020000000000000000000000B8
:10063C0003000000020000000300000002000000A4
:040000058000000077
:00000001FF

View file

@ -1,3 +1,3 @@
/opt/riscv/bin/riscv32-unknown-linux-gnu-gcc -march=rv32i -mabi=ilp32 -O0 -Wl,-Bstatic,-T,linker.ld -ffreestanding -nostdlib gpgpu_test.c ./lib/lib.c ./lib/queue.c -o gpgpu_test.elf
/opt/riscv/bin/riscv32-unknown-linux-gnu-gcc -march=rv32i -mabi=ilp32 -O0 -Wl,-Bstatic,-T,linker.ld -ffreestanding -nostdlib ./lib/lib.s gpgpu_test.c ./lib/queue.s ./lib/lib.c -o gpgpu_test.elf
/opt/riscv/bin/riscv32-unknown-linux-gnu-objdump -D gpgpu_test.elf > gpgpu_test.dump
/opt/riscv/bin/riscv32-unknown-linux-gnu-objcopy -O ihex gpgpu_test.elf gpgpu_test.hex

Binary file not shown.

View file

@ -1,111 +1,9 @@
#include "lib.h"
void createThreads(unsigned num_threads, unsigned wid, unsigned func_addr, unsigned * x_ptr, unsigned * y_ptr, unsigned * z_ptr)
{
asm __volatile__("mv t6, a0");
asm __volatile__("mv ra, a2");
asm __volatile__("mv a1, a1");
extern void createThreads(unsigned, unsigned, unsigned, unsigned *, unsigned *, unsigned *);
extern void wspawn(unsigned, unsigned, unsigned, unsigned *, unsigned *, unsigned *);
asm __volatile__("mv s7, a3");
asm __volatile__("mv s8, a4");
asm __volatile__("mv s9, t2");
asm __volatile__("addi t5, sp, 0");
register unsigned num_threads_ asm("t6");
asm __volatile__("mv tp, t6");
for (unsigned i = 1; i < num_threads_; i++)
{
register unsigned cur_tid asm("a0") = i;
register unsigned not_sure asm("t1") = i;
asm __volatile__("addi sp, sp, -2048");
CLONE;
}
asm __volatile__("addi sp, t5, 0");
register unsigned cur_tid asm("a0") = 0;
// jalis TO FUNC
// register unsigned num_lanes asm("t6") = func_addr;
// register unsigned link asm("s11") = num_threads;
asm __volatile__("mv t6, ra");
asm __volatile__("mv s11, tp");
JALRS;
register unsigned jump_dest asm("a0") = (unsigned) reschedule_warps;
JMPRT;
// // register unsigned *xx asm("s7") = x_ptr;
// // register unsigned *yy asm("s8") = y_ptr;
// // register unsigned *zz asm("s9") = z_ptr;
// register unsigned wid_ asm("a1") = wid;
// asm __volatile__("addi t5, sp, 0");
// for (unsigned i = 1; i < num_threads; i++)
// {
// register unsigned cur_tid asm("a0") = i;
// register unsigned not_sure asm("t1") = i;
// asm __volatile__("addi sp, sp, -256");
// CLONE;
// }
// asm __volatile__("addi sp, t5, 0");
// register unsigned cur_tid asm("a0") = 0;
// // jalis TO FUNC
// register unsigned num_lanes asm("t6") = func_addr;
// register unsigned link asm("s11") = num_threads;
// JALRS;
// register unsigned jump_dest asm("a0") = (unsigned) reschedule_warps;
// JMPRT;
}
void wspawn(unsigned num_threads, unsigned wid, unsigned func, unsigned * x_ptr, unsigned * y_ptr, unsigned * z_ptr)
{
asm __volatile__("mv t2, a5");
// asm __volatile__("mv t1, a5");
register unsigned func_add asm("t1") = (unsigned) &createThreads;
WSPAWN; // THIS SHOULD COPY THE CSR REGISTERS TO THE NEW WARP
// register unsigned *tzz asm("t2") = z_ptr;
// register unsigned func_add asm("t1") = (unsigned) &createThreads;
// register unsigned n_threads asm("a0") = num_threads;
// register unsigned wwid asm("a1") = wid;
// register unsigned ffunc asm("a2") = func;
// register unsigned *xx asm("a3") = x_ptr;
// register unsigned *yy asm("a4") = y_ptr;
// register unsigned *zz asm("a5") = tzz;
// WSPAWN; // THIS SHOULD COPY THE CSR REGISTERS TO THE NEW WARP
}
void reschedule_warps()
{
@ -120,25 +18,28 @@ void reschedule_warps()
asm __volatile__("mv sp,%0"::"r" (j.base_sp):);
createThreads(j.n_threads, j.wid, j.func_ptr, j.x, j.y, j.z);
ECALL;
}
void schedule_warps()
{
asm __volatile__("mv t5, sp");
asm __volatile__("mv s3, sp");
while (!queue_isEmpty() && queue_availableWarps())
{
++q.active_warps;
Job j;
queue_dequeue(&j);
asm __volatile__("mv sp,%0"::"r" (j.base_sp):);
wspawn(j.n_threads, j.wid, j.func_ptr, j.x, j.y, j.z);
}
asm __volatile__("mv sp, t5");
asm __volatile__("mv sp, s3");
}
void sleep()
void sleep(int t)
{
for(int z = 0; z < 100; z++) {}
for(int z = 0; z < t; z++) {}
}
@ -146,11 +47,11 @@ void sleep()
void createWarps(unsigned num_Warps, unsigned num_threads, FUNC, unsigned * x_ptr, unsigned * y_ptr, unsigned * z_ptr)
{
asm __volatile__("addi t5, sp, 0");
asm __volatile__("addi s2, sp, 0");
for (unsigned i = 0; i < num_Warps; i++)
{
asm __volatile__("lui t6, 0xFFFF0");
asm __volatile__("add sp, sp, t6");
asm __volatile__("lui s3, 0xFFFF0");
asm __volatile__("add sp, sp, s3");
register unsigned stack_ptr asm("sp");
Job j;
@ -164,12 +65,12 @@ void createWarps(unsigned num_Warps, unsigned num_threads, FUNC, unsigned * x_pt
queue_enqueue(&j);
}
asm __volatile__("addi sp, t5, 0");
asm __volatile__("addi sp, s2, 0");
schedule_warps();
sleep();
sleep(100);
// asm __volatile__("addi t5, sp, 0");
@ -186,11 +87,11 @@ void createWarps(unsigned num_Warps, unsigned num_threads, FUNC, unsigned * x_pt
}
unsigned get_wid()
{
register unsigned ret asm("s7");
return ret;
}
// unsigned get_wid()
// {
// register unsigned ret asm("s7");
// return ret;
// }
unsigned * get_1st_arg(void)
{
@ -208,7 +109,3 @@ unsigned * get_3rd_arg(void)
return ret;
}
void initiate_stack()
{
asm __volatile__("lui sp,0x7ffff":::);
}

View file

@ -6,21 +6,38 @@
#define WSPAWN asm __volatile__(".word 0x3006b"::);
#define CLONE asm __volatile__(".word 0x3506b":::);
#define JALRS asm __volatile__(".word 0x1bfe0eb":::"s10")
#define ECALL asm __volatile__(".word 0x00000073")
#define JMPRT asm __volatile__(".word 0x5406b")
#define JALRS asm __volatile__(".word 0x1bfe0eb":::"s10");
#define ECALL asm __volatile__(".word 0x00000073");
#define JMPRT asm __volatile__(".word 0x5406b");
#define SPLIT asm __volatile__(".word 0xf206b");
#define P_JUMP asm __volatile__(".word 0x1ff706b");
#define JOIN asm __volatile__(".word 0x306b");
// #define __if(val) { \
// register unsigned p asm("t5") = val; \
// register unsigned * e asm("t6") = &&ELSE; \
// SPLIT; \
// P_JUMP; \
// }
// #define __else asm __volatile__("j AFTER"); \
// ELSE: asm __volatile__("nop");
// #define __end_if AFTER: JOIN;
#define FUNC void (func)(unsigned, unsigned)
void createWarps(unsigned num_Warps, unsigned num_threads, FUNC, unsigned *, unsigned *, unsigned *);
void reschedule_warps(void);
unsigned get_wid();
unsigned * get_1st_arg(void);
unsigned * get_2nd_arg(void);
unsigned * get_3rd_arg(void);
void initiate_stack();
void sleep(int);
#endif

49
src/riscv_gpgpu/lib/lib.s Normal file
View file

@ -0,0 +1,49 @@
.section .text
# _start: program entry point (named by ENTRY(_start) in the linker script).
# Sets the initial stack pointer, initializes the global job queue, calls
# main, and issues ecall once main returns.
.type _start, @function
.global _start
_start:
lui sp, 0x7ffff # sp = 0x7ffff000 (lui fills the upper 20 bits, low 12 bits are zero)
jal queue_initialize # reset the job queue before main enqueues anything
jal main # run the user program
ecall # only reached when main returns; NOTE(review): presumably ends the simulation - confirm
# createThreads(num_threads, wid, func_addr, x_ptr, y_ptr, z_ptr)
#   a0 = num_threads, a1 = wid, a2 = func_addr, a3..a5 = x/y/z pointers.
# Stashes the three data pointers in s7/s8/s9, clones the register state into
# lanes 1..num_threads-1 (each clone is taken after sp has been lowered by a
# further 2 KiB), then jumps to func_addr with tid (a0) = 0 for this lane.
# The routine has no `ret`: after the JALRS word it loads the address of
# reschedule_warps into a0 and executes the JMPRT word.
# The three .word values are the CLONE / JALRS / JMPRT encodings from lib.h.
.type createThreads, @function
.global createThreads
createThreads:
mv s7,a3 # Moving x_ptr to s7
mv s8,a4 # Moving y_ptr to s8
mv s9,a5 # Moving z_ptr to s9
mv t5,sp # Saving the current stack pointer to t5
mv t2, a0 # t2 = num_threads
loop_init:
li a0,1 # i = 1 (lane 0 is the current thread, so cloning starts at lane 1)
loop_cond:
bge a0, t2, loop_done # leave the loop once i >= num_threads
loop_body:
addi sp,sp,-2048 # Allocate 2k stack for new thread (cumulative: lane i sees sp = saved sp - 2048*i)
mv t1, a0 # #lane = i
.word 0x3506b # CLONE: clone register state; NOTE(review): encoding has rs1 = t1, i.e. the target lane - confirm against the emulator
loop_inc:
addi a0, a0, 1 # ++i (a0 doubles as the tid value captured by the next clone)
j loop_cond
loop_done:
mv sp,t5 # Restoring the stack
li a0,0 # setting tid = 0 for main thread
mv t6,a2 # setting func_addr
mv s11,t2 # setting num_threads to spawn
.word 0x1bfe0eb # JALRS; NOTE(review): encoding decodes to rd = ra, rs1 = t6, rs2 = s11 - presumably calls func_addr with s11 lanes active; confirm
la a0, reschedule_warps # a0 = address of reschedule_warps, the operand of the JMPRT word below
.word 0x5406b # JMPRT; NOTE(review): encoding has rs1 = a0 - presumably continues at reschedule_warps on a single lane; confirm
# wspawn(num_threads, wid, func, x_ptr, y_ptr, z_ptr)
# Loads the address of createThreads into t1 and executes the WSPAWN word
# (encoding from lib.h). Argument registers a0..a5 are not modified here.
# NOTE(review): presumably the spawned warp starts at createThreads with a
# copy of this warp's registers, which is why the arguments are left in
# place - confirm against the emulator.
.type wspawn, @function
.global wspawn
wspawn:
la t1, createThreads # t1 = entry point handed to WSPAWN
.word 0x3006b # WSPAWN instruction
ret # the calling warp returns to its caller

View file

@ -1,73 +0,0 @@
#include "queue.h"
/* Reset the global job queue: no pending jobs, both ring indices at slot 0,
 * no warps running, and a budget of 7 warps available for scheduling. */
void queue_initialize(void)
{
    q.total_warps = 7;
    q.active_warps = 0;
    q.num_j = 0;
    q.end_i = 0;
    q.start_i = 0;
}
/* Append a copy of *j at the tail of the ring buffer and advance the tail
 * index, wrapping to 0 after the last slot. The job count is bumped
 * unconditionally; there is no check for a full queue.
 * NOTE(review): the job is copied field by field rather than by struct
 * assignment - presumably to avoid a compiler-generated memcpy call in the
 * -nostdlib build; confirm before simplifying. */
void queue_enqueue(Job * j)
{
    Job * slot = &(q.jobs[q.end_i]);
    unsigned next_tail = q.end_i + 1;

    q.num_j++;

    slot->wid = j->wid;
    slot->n_threads = j->n_threads;
    slot->base_sp = j->base_sp;
    slot->func_ptr = j->func_ptr;
    slot->x = j->x;
    slot->y = j->y;
    slot->z = j->z;

    /* Conditional wrap instead of '%': same result, no division helper. */
    q.end_i = (next_tail < SIZE) ? next_tail : 0;
}
/* Copy the job at the head of the ring buffer into *r and advance the head
 * index, wrapping to 0 after the last slot. The job count is decremented
 * unconditionally; there is no check for an empty queue. */
void queue_dequeue(Job * r)
{
    Job * head = &(q.jobs[q.start_i]);
    unsigned next_head = q.start_i + 1;

    q.num_j--;
    q.start_i = (next_head < SIZE) ? next_head : 0;

    r->wid = head->wid;
    r->n_threads = head->n_threads;
    r->base_sp = head->base_sp;
    r->func_ptr = head->func_ptr;
    r->x = head->x;
    r->y = head->y;
    r->z = head->z;
}
/* Returns 1 when the number of queued jobs equals SIZE, otherwise 0. */
int queue_isFull(void)
{
    if (q.num_j == SIZE)
    {
        return 1;
    }
    return 0;
}
/* Returns 1 when no jobs are queued, otherwise 0. */
int queue_isEmpty(void)
{
    if (q.num_j == 0)
    {
        return 1;
    }
    return 0;
}
/* Returns 1 while fewer warps are active than the queue's warp budget
 * (q.total_warps), otherwise 0. */
int queue_availableWarps()
{
    if (q.active_warps < q.total_warps)
    {
        return 1;
    }
    return 0;
}

View file

@ -5,7 +5,8 @@
#define SIZE 10
#define SIZE 50
#define WARPS 7
typedef struct Job_t
@ -22,13 +23,12 @@ typedef struct Job_t
typedef struct Queue_t
{
struct Job_t jobs[SIZE];
unsigned start_i;
unsigned end_i;
unsigned num_j;
unsigned total_warps;
unsigned active_warps;
struct Job_t jobs[SIZE];
} Queue;

128
src/riscv_gpgpu/lib/queue.s Normal file
View file

@ -0,0 +1,128 @@
.equ A_WARPS, 7
.equ SIZE, 50
.section .text
# void queue_initialize(void)
# Resets the global queue `q`: start_i, end_i, num_j and active_warps are
# cleared and total_warps is set to the A_WARPS constant defined at the top
# of this file (previously a duplicated literal 7).
# Field offsets used here: start_i = 0, end_i = 4, num_j = 8,
# total_warps = 12, active_warps = 16.
# Clobbers t0, t1, t2.
.type queue_initialize, @function
.global queue_initialize
queue_initialize:
    la   t0, q              # t0 = &q
    li   t1, 0              # zero, used to clear the counters
    li   t2, A_WARPS        # warp budget (single source of truth: A_WARPS)
    sw   t1, 0(t0)          # q.start_i = 0
    sw   t1, 4(t0)          # q.end_i = 0
    sw   t1, 8(t0)          # q.num_j = 0
    sw   t2, 12(t0)         # q.total_warps = A_WARPS
    sw   t1, 16(t0)         # q.active_warps = 0
    ret
# void queue_enqueue(Job *j)          a0 = j
# Copies the seven words of *j into q.jobs[q.end_i], increments q.num_j and
# advances q.end_i, wrapping to 0 when it reaches SIZE. There is no
# full-queue check; the caller must not enqueue into a full queue.
#
# Slot addressing: q.jobs starts at offset 20 in q and each slot is the
# seven words copied below (28 bytes). The previous code used a stride of
# 32 (slli by 5), which walks past the end of a jobs[SIZE] array of 28-byte
# entries for high indices. rv32i has no multiply, so index*28 is computed
# as (index << 5) - (index << 2). queue_dequeue uses the same stride; the
# two routines must always agree.
# NOTE(review): assumes sizeof(Job) == 28 with no trailing padding - confirm
# against the Job_t definition in queue.h.
# Clobbers t0-t5.
.type queue_enqueue, @function
.global queue_enqueue
queue_enqueue:
    la   t0, q              # t0 = &q
    lw   t1, 8(t0)          # t1 = q.num_j
    addi t1, t1, 1
    sw   t1, 8(t0)          # q.num_j++
    addi t1, t0, 20         # t1 = &q.jobs[0]
    lw   t4, 4(t0)          # t4 = q.end_i
    slli t2, t4, 5          # t2 = end_i * 32
    slli t3, t4, 2          # t3 = end_i * 4
    sub  t2, t2, t3         # t2 = end_i * 28 (byte offset of the slot)
    add  t1, t1, t2         # t1 = &q.jobs[end_i]
    lw   t3, 0(a0)          # wid
    sw   t3, 0(t1)
    lw   t3, 4(a0)          # n_threads
    sw   t3, 4(t1)
    lw   t3, 8(a0)          # base_sp
    sw   t3, 8(t1)
    lw   t3, 12(a0)         # func_ptr
    sw   t3, 12(t1)
    lw   t3, 16(a0)         # x
    sw   t3, 16(t1)
    lw   t3, 20(a0)         # y
    sw   t3, 20(t1)
    lw   t3, 24(a0)         # z
    sw   t3, 24(t1)
    addi t4, t4, 1          # candidate end_i + 1
    li   t5, SIZE
    bne  t4, t5, ec         # keep it unless it reached SIZE
    mv   t4, zero           # wrap to slot 0
ec:
    sw   t4, 4(t0)          # q.end_i = next tail
    ret

# void queue_dequeue(Job *r)          a0 = r
# Copies the seven words of q.jobs[q.start_i] into *r, decrements q.num_j
# and advances q.start_i, wrapping to 0 when it reaches SIZE. There is no
# empty-queue check; the caller must not dequeue from an empty queue.
# Uses the same 28-byte slot stride as queue_enqueue above.
# Clobbers t0-t6.
.type queue_dequeue, @function
.global queue_dequeue
queue_dequeue:
    la   t0, q              # t0 = &q
    lw   t1, 8(t0)          # t1 = q.num_j
    addi t1, t1, -1
    sw   t1, 8(t0)          # q.num_j--
    addi t1, t0, 20         # t1 = &q.jobs[0]
    lw   t4, 0(t0)          # t4 = q.start_i (slot being read)
    li   t6, SIZE
    mv   t5, t4
    addi t5, t5, 1          # candidate start_i + 1
    bne  t5, t6, dc         # keep it unless it reached SIZE
    mv   t5, zero           # wrap to slot 0
dc:
    sw   t5, 0(t0)          # q.start_i = next head
    slli t2, t4, 5          # t2 = start_i * 32
    slli t3, t4, 2          # t3 = start_i * 4
    sub  t2, t2, t3         # t2 = start_i * 28 (byte offset of the slot)
    add  t1, t1, t2         # t1 = &q.jobs[old start_i]
    lw   t3, 0(t1)          # wid
    sw   t3, 0(a0)
    lw   t3, 4(t1)          # n_threads
    sw   t3, 4(a0)
    lw   t3, 8(t1)          # base_sp
    sw   t3, 8(a0)
    lw   t3, 12(t1)         # func_ptr
    sw   t3, 12(a0)
    lw   t3, 16(t1)         # x
    sw   t3, 16(a0)
    lw   t3, 20(t1)         # y
    sw   t3, 20(a0)
    lw   t3, 24(t1)         # z
    sw   t3, 24(a0)
    ret
# int queue_isFull(void)
# Returns 1 in a0 when q.num_j equals SIZE, otherwise 0.
# Clobbers t0, t1, t3.
.type queue_isFull, @function
.global queue_isFull
queue_isFull:
la t0, q # loading base address of q
lw t1, 8 (t0) # t1 = num_j
mv a0, zero # ret_val = 0
li t3, SIZE # t3 = SIZE
bne t3, t1, qf # skip the increment unless num_j == SIZE
addi a0, a0, 1 # ret_val = 1;
qf:
ret
# int queue_isEmpty(void)
# Returns 1 in a0 when q.num_j is zero, otherwise 0.
# Clobbers t0, t1, t3.
.type queue_isEmpty, @function
.global queue_isEmpty
queue_isEmpty:
la t0, q # loading base address of q
lw t1, 8 (t0) # t1 = num_j
mv a0, zero # ret_val = 0
mv t3, zero # t3 = 0
bne t3, t1, qe # skip the increment unless num_j == 0
addi a0, a0, 1 # ret_val = 1;
qe:
ret
# int queue_availableWarps(void)
# Returns 1 in a0 while q.active_warps < q.total_warps (unsigned compare),
# otherwise 0.
# Clobbers t0, t1, t2.
.type queue_availableWarps, @function
.global queue_availableWarps
queue_availableWarps:
la t0, q # loading base address of q
lw t1, 12(t0) # t1 = total_warps
lw t2, 16(t0) # t2 = active_warps
sltu a0, t2, t1 # a0 = (active_warps < total_warps)
ret

View file

@ -7,7 +7,7 @@
OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv",
"elf32-littleriscv")
OUTPUT_ARCH(riscv)
ENTRY(main)
ENTRY(_start)
SECTIONS
{
. = 0x80000000;