mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
Merge branch 'master' of https://github.com/vortexgpgpu/vortex
This commit is contained in:
commit
e248f744d5
9 changed files with 172 additions and 15 deletions
128
doc/execute_opencl_on_vortex.md
Normal file
128
doc/execute_opencl_on_vortex.md
Normal file
|
@ -0,0 +1,128 @@
|
|||
# Execute OpenCL on Vortex backend
|
||||
|
||||
## Requirements
|
||||
- [Vortex](https://github.com/vortexgpgpu/vortex)
|
||||
- [POCL for Vortex](https://github.com/vortexgpgpu/pocl)
|
||||
- [riscv-toolchain](https://github.com/riscv-collab/riscv-gnu-toolchain)
|
||||
- [llvm-riscv](https://github.com/llvm-mirror/llvm)
|
||||
|
||||
For installation, please see [Basic Installation](https://github.com/vortexgpgpu/vortex#basic-installation) for more details.
|
||||
|
||||
**For Ubuntu 18.04 users, you can directly download pre-built toolchains with the [toolchain_install.sh](https://github.com/vortexgpgpu/vortex/blob/master/ci/toolchain_install.sh) script.**
|
||||
```bash
|
||||
# please modify the DESTDIR variable in the script before execution
|
||||
bash toolchain_install.sh -all
|
||||
```
|
||||
Assuming all dependencies have been installed under the `/opt` path, we get the following environment:
|
||||
```bash
|
||||
tree -L 2 /opt
|
||||
'''
|
||||
/opt/
|
||||
├── llvm-riscv
|
||||
│ ├── bin
|
||||
│ ├── include
|
||||
│ ├── lib
|
||||
│ ├── libexec
|
||||
│ └── share
|
||||
├── pocl
|
||||
│ ├── compiler
|
||||
│ └── runtime
|
||||
├── riscv-gnu-toolchain
|
||||
│ ├── bin
|
||||
│ ├── drops
|
||||
│ ├── include
|
||||
│ ├── lib
|
||||
│ ├── libexec
|
||||
│ ├── riscv32-unknown-elf
|
||||
│ ├── share
|
||||
│ └── var
|
||||
└── verilator
|
||||
├── bin
|
||||
├── examples
|
||||
├── include
|
||||
├── verilator-config.cmake
|
||||
└── verilator-config-version.cmake
|
||||
'''
|
||||
```
|
||||
## Execute OpenCL on Vortex
|
||||
In this tutorial, we show an example of executing the vecadd program on the SIMX backend.
|
||||
To execute an OpenCL program on Vortex, we follow these steps:
|
||||
- Compile the [OpenCL kernels](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/kernel.cl) into a RISC-V binary with the POCL compiler.
|
||||
- Compile the [OpenCL host](https://github.com/vortexgpgpu/vortex/blob/master/tests/opencl/vecadd/main.cc) and link it with the Vortex driver (```-lvortex```).
|
||||
- Execute the compiled host program on a backend.
|
||||
|
||||
Thus, we can write a Makefile as follows:
|
||||
```Makefile
|
||||
LLVM_PREFIX ?= /opt/llvm-riscv
|
||||
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
|
||||
SYSROOT ?= $(RISCV_TOOLCHAIN_PATH)/riscv32-unknown-elf
|
||||
POCL_CC_PATH ?= /opt/pocl/compiler
|
||||
POCL_RT_PATH ?= /opt/pocl/runtime
|
||||
|
||||
OPTS ?= -n64
|
||||
|
||||
# please edit these two variables to match your environment
|
||||
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
|
||||
VORTEX_RT_PATH ?= $(realpath ../../../runtime)
|
||||
|
||||
K_LLCFLAGS += "-O3 -march=riscv32 -target-abi=ilp32f -mcpu=generic-rv32 -mattr=+m,+f -mattr=+vortex -float-abi=hard -code-model=small"
|
||||
K_CFLAGS += "-v -O3 --sysroot=$(SYSROOT) --gcc-toolchain=$(RISCV_TOOLCHAIN_PATH) -march=rv32imf -mabi=ilp32f -Xclang -target-feature -Xclang +vortex -I$(VORTEX_RT_PATH)/include -fno-rtti -fno-exceptions -ffreestanding -nostartfiles -fdata-sections -ffunction-sections"
|
||||
K_LDFLAGS += "-Wl,-Bstatic,-T$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a -lm"
|
||||
|
||||
CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
|
||||
|
||||
CXXFLAGS += -Wno-deprecated-declarations -Wno-unused-parameter
|
||||
|
||||
CXXFLAGS += -I$(POCL_RT_PATH)/include
|
||||
|
||||
LDFLAGS += -L$(POCL_RT_PATH)/lib -L$(VORTEX_DRV_PATH)/stub -lOpenCL -lvortex
|
||||
|
||||
PROJECT = vecadd
|
||||
|
||||
SRCS = main.cc
|
||||
|
||||
all: $(PROJECT) kernel.pocl
|
||||
|
||||
kernel.pocl: kernel.cl
|
||||
LLVM_PREFIX=$(LLVM_PREFIX) POCL_DEBUG=all LD_LIBRARY_PATH=$(LLVM_PREFIX)/lib:$(POCL_CC_PATH)/lib $(POCL_CC_PATH)/bin/poclcc -LLCFLAGS $(K_LLCFLAGS) -CFLAGS $(K_CFLAGS) -LDFLAGS $(K_LDFLAGS) -o kernel.pocl kernel.cl
|
||||
|
||||
$(PROJECT): $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@
|
||||
|
||||
run-fpga: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/fpga:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-asesim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/asesim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-vlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
|
||||
.depend: $(SRCS)
|
||||
$(CXX) $(CXXFLAGS) -MM $^ > .depend;
|
||||
|
||||
clean:
|
||||
rm -rf $(PROJECT) *.o .depend
|
||||
|
||||
clean-all: clean
|
||||
rm -rf *.pocl *.dump
|
||||
|
||||
ifneq ($(MAKECMDGOALS),clean)
|
||||
-include .depend
|
||||
endif
|
||||
```
|
||||
|
||||
First, build the host program and the kernel binary (`kernel.pocl`).
|
||||
```bash
|
||||
make all
|
||||
```
|
||||
If we want to execute on SIMX, we can run the command below.
|
||||
```bash
|
||||
make run-simx
|
||||
```
|
|
@ -194,7 +194,6 @@ module VX_decode #(
|
|||
end
|
||||
`INST_F: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(func3[0]);
|
||||
op_mod = `INST_MOD_BITS'(1);
|
||||
end
|
||||
`INST_SYS : begin
|
||||
|
@ -387,6 +386,12 @@ module VX_decode #(
|
|||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
3'h6: begin
|
||||
ex_type = `EX_LSU;
|
||||
op_type = `INST_OP_BITS'(`INST_LSU_LW);
|
||||
op_mod = `INST_MOD_BITS'(2);
|
||||
`USED_IREG (rs1);
|
||||
end
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
|
|
@ -154,7 +154,8 @@
|
|||
`define INST_LSU_BITS 4
|
||||
`define INST_LSU_FMT(x) x[2:0]
|
||||
`define INST_LSU_WSIZE(x) x[1:0]
|
||||
`define INST_LSU_IS_FENCE(x) x[0]
|
||||
`define INST_LSU_IS_FENCE(x) (3'h1 == x)
|
||||
`define INST_LSU_IS_PREFETCH(x) (3'h2 == x)
|
||||
|
||||
`define INST_FENCE_BITS 1
|
||||
`define INST_FENCE_D 1'h0
|
||||
|
|
|
@ -60,17 +60,18 @@ module VX_instr_demux (
|
|||
wire lsu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_LSU);
|
||||
wire [`INST_LSU_BITS-1:0] lsu_op_type = `INST_LSU_BITS'(ibuffer_if.op_type);
|
||||
wire lsu_is_fence = `INST_LSU_IS_FENCE(ibuffer_if.op_mod);
|
||||
wire lsu_is_prefetch = `INST_LSU_IS_PREFETCH(ibuffer_if.op_mod);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32)),
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + `INST_LSU_BITS + 1 + 32 + `NR_BITS + 1 + (2 * `NUM_THREADS * 32) + 1),
|
||||
.OUT_REG (1)
|
||||
) lsu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (lsu_req_valid),
|
||||
.ready_in (lsu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data}),
|
||||
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data}),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, lsu_op_type, lsu_is_fence, ibuffer_if.imm, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, lsu_is_prefetch}),
|
||||
.data_out ({lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, lsu_req_if.op_type, lsu_req_if.is_fence, lsu_req_if.offset, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.base_addr, lsu_req_if.store_data, lsu_req_if.is_prefetch}),
|
||||
.valid_out (lsu_req_if.valid),
|
||||
.ready_out (lsu_req_if.ready)
|
||||
);
|
||||
|
|
|
@ -40,6 +40,7 @@ module VX_lsu_unit #(
|
|||
wire [`NW_BITS-1:0] req_wid;
|
||||
wire [31:0] req_pc;
|
||||
wire req_is_dup;
|
||||
wire req_is_prefetch;
|
||||
|
||||
wire mbuf_empty;
|
||||
|
||||
|
@ -79,15 +80,17 @@ module VX_lsu_unit #(
|
|||
|
||||
wire lsu_valid = lsu_req_if.valid && ~fence_wait;
|
||||
|
||||
wire lsu_wb = lsu_req_if.wb | lsu_req_if.is_prefetch;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.DATAW (1 + 1 + 1 + `NW_BITS + `NUM_THREADS + 32 + (`NUM_THREADS * 32) + (`NUM_THREADS * ADDR_TYPEW) + `INST_LSU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32)),
|
||||
.RESETW (1)
|
||||
) req_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall_in),
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_req_if.wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
.data_in ({lsu_valid, lsu_is_dup, lsu_req_if.is_prefetch, lsu_req_if.wid, lsu_req_if.tmask, lsu_req_if.PC, full_addr, lsu_addr_type, lsu_req_if.op_type, lsu_req_if.rd, lsu_wb, lsu_req_if.store_data}),
|
||||
.data_out ({req_valid, req_is_dup, req_is_prefetch, req_wid, req_tmask, req_pc, req_addr, req_addr_type, req_type, req_rd, req_wb, req_data})
|
||||
);
|
||||
|
||||
// Can accept new request?
|
||||
|
@ -99,8 +102,10 @@ module VX_lsu_unit #(
|
|||
wire rsp_wb;
|
||||
wire [`INST_LSU_BITS-1:0] rsp_type;
|
||||
wire rsp_is_dup;
|
||||
wire rsp_is_prefetch;
|
||||
|
||||
`UNUSED_VAR (rsp_type)
|
||||
`UNUSED_VAR (rsp_is_prefetch)
|
||||
|
||||
reg [`LSUQ_SIZE-1:0][`NUM_THREADS-1:0] rsp_rem_mask;
|
||||
wire [`NUM_THREADS-1:0] rsp_rem_mask_n;
|
||||
|
@ -131,10 +136,13 @@ module VX_lsu_unit #(
|
|||
wire mbuf_pop = dcache_rsp_fire && (0 == rsp_rem_mask_n);
|
||||
|
||||
assign mbuf_raddr = dcache_rsp_if.tag[ADDR_TYPEW +: `LSUQ_ADDR_BITS];
|
||||
`UNUSED_VAR (dcache_rsp_if.tag)
|
||||
`UNUSED_VAR (dcache_rsp_if.tag)
|
||||
|
||||
// do not writeback from software prefetch
|
||||
wire req_wb2 = req_wb && ~req_is_prefetch;
|
||||
|
||||
VX_index_buffer #(
|
||||
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1),
|
||||
.DATAW (`NW_BITS + 32 + `NUM_THREADS + `NR_BITS + 1 + `INST_LSU_BITS + (`NUM_THREADS * REQ_ASHIFT) + 1 + 1),
|
||||
.SIZE (`LSUQ_SIZE)
|
||||
) req_metadata (
|
||||
.clk (clk),
|
||||
|
@ -142,8 +150,8 @@ module VX_lsu_unit #(
|
|||
.write_addr (mbuf_waddr),
|
||||
.acquire_slot (mbuf_push),
|
||||
.read_addr (mbuf_raddr),
|
||||
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb, req_type, req_offset, req_is_dup}),
|
||||
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup}),
|
||||
.write_data ({req_wid, req_pc, req_tmask, req_rd, req_wb2, req_type, req_offset, req_is_dup, req_is_prefetch}),
|
||||
.read_data ({rsp_wid, rsp_pc, rsp_tmask, rsp_rd, rsp_wb, rsp_type, rsp_offset, rsp_is_dup, rsp_is_prefetch}),
|
||||
.release_addr (mbuf_raddr),
|
||||
.release_slot (mbuf_pop),
|
||||
.full (mbuf_full),
|
||||
|
@ -346,7 +354,7 @@ module VX_lsu_unit #(
|
|||
`TRACE_ARRAY1D(dcache_req_if.data, `NUM_THREADS);
|
||||
dpi_trace("\n");
|
||||
end else begin
|
||||
dpi_trace("%d: D$%0d Rd Req: wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_wid, req_pc, dcache_req_fire);
|
||||
dpi_trace("%d: D$%0d Rd Req: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, addr=", $time, CORE_ID, req_is_prefetch, req_wid, req_pc, dcache_req_fire);
|
||||
`TRACE_ARRAY1D(req_addr, `NUM_THREADS);
|
||||
dpi_trace(", tag=%0h, byteen=%0h, type=", req_tag, dcache_req_if.byteen);
|
||||
`TRACE_ARRAY1D(req_addr_type, `NUM_THREADS);
|
||||
|
@ -354,8 +362,8 @@ module VX_lsu_unit #(
|
|||
end
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
dpi_trace("%d: D$%0d Rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
|
||||
dpi_trace("%d: D$%0d Rsp: prefetch=%b, wid=%0d, PC=%0h, tmask=%b, tag=%0h, rd=%0d, data=",
|
||||
$time, CORE_ID, rsp_is_prefetch, rsp_wid, rsp_pc, dcache_rsp_if.tmask, mbuf_raddr, rsp_rd);
|
||||
`TRACE_ARRAY1D(dcache_rsp_if.data, `NUM_THREADS);
|
||||
dpi_trace(", is_dup=%b\n", rsp_is_dup);
|
||||
end
|
||||
|
|
|
@ -17,6 +17,7 @@ interface VX_lsu_req_if ();
|
|||
wire [`NR_BITS-1:0] rd;
|
||||
wire wb;
|
||||
wire ready;
|
||||
wire is_prefetch;
|
||||
|
||||
modport master (
|
||||
output valid,
|
||||
|
@ -30,6 +31,7 @@ interface VX_lsu_req_if ();
|
|||
output offset,
|
||||
output rd,
|
||||
output wb,
|
||||
output is_prefetch,
|
||||
input ready
|
||||
);
|
||||
|
||||
|
@ -45,6 +47,7 @@ interface VX_lsu_req_if ();
|
|||
input offset,
|
||||
input rd,
|
||||
input wb,
|
||||
input is_prefetch,
|
||||
output ready
|
||||
);
|
||||
|
||||
|
|
|
@ -149,6 +149,11 @@ inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
|||
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Prefetch: issue the custom prefetch instruction for the address in `addr`.
// The instruction is emitted with `.insn s` as opcode 0x6b, funct3 6,
// rs2 = x0, and a zero immediate offset from the base register holding `addr`.
// No value is returned and no output operand is declared.
// NOTE(review): `addr` is `unsigned`, which presumably matches the 32-bit
// address width of the target — confirm before using on a wider target.
|
||||
inline void vx_prefetch(unsigned addr) {
|
||||
asm volatile (".insn s 0x6b, 6, x0, 0(%0)" :: "r"(addr) );
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
inline int vx_thread_id() {
|
||||
int result;
|
||||
|
|
|
@ -182,6 +182,7 @@ static const char* op_string(const Instr &instr) {
|
|||
case 2: return "SPLIT";
|
||||
case 3: return "JOIN";
|
||||
case 4: return "BAR";
|
||||
case 6: return "PREFETCH";
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
|
@ -712,6 +712,11 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) {
|
|||
pipeline->stall_warp = true;
|
||||
runOnce = true;
|
||||
} break;
|
||||
case 6: {
|
||||
// PREFETCH
|
||||
int addr = rsdata[0];
|
||||
printf("*** PREFETCHED %d ***\n", addr);
|
||||
} break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue