minor update

This commit is contained in:
Blaise Tine 2024-05-12 20:21:23 -07:00
parent 60107cf2b6
commit 19beb0728e
7 changed files with 87 additions and 85 deletions

View file

@ -1,10 +1,10 @@
// Copyright © 2019-2023
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -65,7 +65,7 @@ enum class RegType {
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
switch (type) {
case RegType::None: break;
case RegType::Integer: os << "x"; break;
case RegType::Integer: os << "x"; break;
case RegType::Float: os << "f"; break;
default: assert(false);
}
@ -142,14 +142,14 @@ enum class AddrType {
};
inline AddrType get_addr_type(uint64_t addr) {
if (LMEM_ENABLED) {
if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + (1 << LMEM_LOG_SIZE))) {
return AddrType::Shared;
}
}
if (addr >= IO_BASE_ADDR) {
return AddrType::IO;
}
if (LMEM_ENABLED) {
if (addr >= LMEM_BASE_ADDR && (addr-LMEM_BASE_ADDR) < (1 << LMEM_LOG_SIZE)) {
return AddrType::Shared;
}
}
return AddrType::Global;
}
@ -247,13 +247,13 @@ struct MemReq {
bool write;
AddrType type;
uint32_t tag;
uint32_t cid;
uint32_t cid;
uint64_t uuid;
MemReq(uint64_t _addr = 0,
MemReq(uint64_t _addr = 0,
bool _write = false,
AddrType _type = AddrType::Global,
uint64_t _tag = 0,
uint64_t _tag = 0,
uint32_t _cid = 0,
uint64_t _uuid = 0
) : addr(_addr)
@ -276,12 +276,12 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
///////////////////////////////////////////////////////////////////////////////
struct MemRsp {
uint64_t tag;
uint64_t tag;
uint32_t cid;
uint64_t uuid;
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
: tag (_tag)
: tag (_tag)
, cid(_cid)
, uuid(_uuid)
{}
@ -297,16 +297,16 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
template <typename T>
class HashTable {
public:
public:
HashTable(uint32_t capacity)
: entries_(capacity)
, size_(0)
, size_(0)
{}
bool empty() const {
return (0 == size_);
}
bool full() const {
return (size_ == entries_.size());
}
@ -337,7 +337,7 @@ public:
if (!entry.first) {
entry.first = true;
entry.second = value;
++size_;
++size_;
return i;
}
}
@ -374,13 +374,13 @@ public:
std::vector<SimPort<Type>> Outputs;
Mux(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_outputs = 1,
uint32_t delay = 1
) : SimObject<Mux<Type>>(ctx, name)
) : SimObject<Mux<Type>>(ctx, name)
, Inputs(num_inputs, this)
, Outputs(num_outputs, this)
, type_(type)
@ -388,13 +388,13 @@ public:
, cursors_(num_outputs, 0)
, num_reqs_(num_inputs / num_outputs)
{
assert(delay != 0);
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
// bypass mode
if (num_inputs == num_outputs) {
if (num_inputs == num_outputs) {
for (uint32_t i = 0; i < num_inputs; ++i) {
Inputs.at(i).bind(&Outputs.at(i));
}
@ -415,20 +415,20 @@ public:
// skip bypass mode
if (I == O)
return;
// process inputs
// process inputs
for (uint32_t o = 0; o < O; ++o) {
for (uint32_t r = 0; r < R; ++r) {
uint32_t i = (cursors_.at(o) + r) & (R-1);
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = Inputs.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
DT(4, this->name() << "-" << req);
Outputs.at(o).push(req, delay_);
Outputs.at(o).push(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
@ -446,7 +446,7 @@ private:
}
ArbiterType type_;
uint32_t delay_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t num_reqs_;
};
@ -459,33 +459,33 @@ public:
std::vector<SimPort<Req>> ReqIn;
std::vector<SimPort<Rsp>> RspIn;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Req>> ReqOut;
std::vector<SimPort<Rsp>> RspOut;
Switch(
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
const SimContext& ctx,
const char* name,
ArbiterType type,
uint32_t num_inputs,
uint32_t num_outputs = 1,
uint32_t delay = 1
)
: SimObject<Switch<Req, Rsp>>(ctx, name)
)
: SimObject<Switch<Req, Rsp>>(ctx, name)
, ReqIn(num_inputs, this)
, RspIn(num_inputs, this)
, ReqOut(num_outputs, this)
, ReqOut(num_outputs, this)
, RspOut(num_outputs, this)
, type_(type)
, delay_(delay)
, cursors_(num_outputs, 0)
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
{
assert(delay != 0);
assert(delay != 0);
assert(num_inputs <= 32);
assert(num_outputs <= 32);
assert(num_inputs >= num_outputs);
// bypass mode
// bypass mode
if (num_inputs == num_outputs) {
for (uint32_t i = 0; i < num_inputs; ++i) {
ReqIn.at(i).bind(&ReqOut.at(i));
@ -508,7 +508,7 @@ public:
// skip bypass mode
if (I == O)
return;
for (uint32_t o = 0; o < O; ++o) {
// process incoming responses
if (!RspOut.at(o).empty()) {
@ -517,10 +517,10 @@ public:
if (lg_num_reqs_ != 0) {
i = rsp.tag & (R-1);
rsp.tag >>= lg_num_reqs_;
}
}
DT(4, this->name() << "-" << rsp);
uint32_t j = o * R + i;
RspIn.at(j).push(rsp, 1);
RspIn.at(j).push(rsp, 1);
RspOut.at(o).pop();
}
@ -530,7 +530,7 @@ public:
uint32_t j = o * R + i;
if (j >= I)
continue;
auto& req_in = ReqIn.at(j);
if (!req_in.empty()) {
auto& req = req_in.front();
@ -538,7 +538,7 @@ public:
req.tag = (req.tag << lg_num_reqs_) | i;
}
DT(4, this->name() << "-" << req);
ReqOut.at(o).push(req, delay_);
ReqOut.at(o).push(req, delay_);
req_in.pop();
this->update_cursor(o, i);
break;
@ -555,7 +555,7 @@ public:
private:
ArbiterType type_;
uint32_t delay_;
uint32_t delay_;
std::vector<uint32_t> cursors_;
uint32_t lg_num_reqs_;
};
@ -576,8 +576,8 @@ public:
SimPort<MemRsp> RspDC;
LocalMemDemux(
const SimContext& ctx,
const char* name,
const SimContext& ctx,
const char* name,
uint32_t delay
);

View file

@ -46,6 +46,7 @@ CXXFLAGS += -I$(POCL_RT_PATH)/include
# Debugigng
ifdef DEBUG
CXXFLAGS += -g -O0
POCL_RT_FLAGS += POCL_DEBUG=all
else
CXXFLAGS += -O2 -DNDEBUG
endif
@ -72,7 +73,7 @@ kernel.cl: $(SRC_DIR)/kernel.cl
cp $< $@
kernel.pocl: $(SRC_DIR)/kernel.cl
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_KERNEL_CACHE=0 POCL_VORTEX_BINTOOL="OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_KERNEL_CACHE=0 POCL_VORTEX_BINTOOL="OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
%.cc.o: $(SRC_DIR)/%.cc
$(CXX) $(CXXFLAGS) -c $< -o $@
@ -106,19 +107,19 @@ run-gpu: $(PROJECT).host kernel.cl
./$(PROJECT).host $(OPTS)
run-simx: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/simx:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
run-rtlsim: $(PROJECT) kernel.pocl
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/rtlsim:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
run-opae: $(PROJECT) kernel.pocl
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/opae/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/opae/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/opae:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
run-xrt: $(PROJECT) kernel.pocl
ifeq ($(TARGET), hw)
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
else
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
endif
.depend: $(SRCS)

View file

@ -10,8 +10,6 @@
#include <algorithm>
#include <numeric>
#define LOCAL_SIZE 16
#define FLOAT_ULP 6
#define KERNEL_NAME "parallelSum"
@ -97,19 +95,23 @@ static void cleanup() {
if (kernel_bin) free(kernel_bin);
}
int size = 16;
size_t size = 16;
size_t local_size = 8;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
printf("Usage: [-n size] [-l local size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
while ((c = getopt(argc, argv, "n:l:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 'l':
local_size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
@ -126,14 +128,14 @@ int main (int argc, char **argv) {
// parse command arguments
parse_args(argc, argv);
printf("input size=%d\n", size);
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
printf("Error: input size must be a multiple of %d\n", LOCAL_SIZE);
printf("input size=%ld, local size=%ld\n", size, local_size);
if ((size / local_size) * local_size != size) {
printf("Error: input size must be a multiple of %ld\n", local_size);
return -1;
}
uint32_t num_inputs = size;
uint32_t num_outputs = size / LOCAL_SIZE;
uint32_t num_outputs = size / local_size;
cl_platform_id platform_id;
size_t kernel_size;
@ -178,14 +180,11 @@ int main (int argc, char **argv) {
// Create kernel
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t global_size[1] = {size};
size_t local_size[1] = {LOCAL_SIZE};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&c_memobj));
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(uint32_t), &size));
CL_CHECK(clSetKernelArg(kernel, 3, local_size[0]*sizeof(float), NULL));
CL_CHECK(clSetKernelArg(kernel, 3, local_size*sizeof(float), NULL));
// Allocate memories for input arrays and output arrays.
std::vector<float> h_a(num_inputs);
@ -204,7 +203,7 @@ int main (int argc, char **argv) {
printf("Execute the kernel\n");
auto time_start = std::chrono::high_resolution_clock::now();
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_size, local_size, 0, NULL, NULL));
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &size, &local_size, 0, NULL, NULL));
CL_CHECK(clFinish(commandQueue));
auto time_end = std::chrono::high_resolution_clock::now();
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();

View file

@ -1,7 +1,7 @@
#ifndef COMMON_H
#define COMMON_H
#define LOCAL_SIZE 4
#define TILE_SIZE 8
#ifndef TYPE
#define TYPE float

View file

@ -11,13 +11,13 @@ __kernel void sgemm2(__global TYPE *A,
int localCol = get_local_id(0);
// Static local memory declaration
__local TYPE localA[LOCAL_SIZE][LOCAL_SIZE];
__local TYPE localB[LOCAL_SIZE][LOCAL_SIZE];
__local TYPE localA[TILE_SIZE][TILE_SIZE];
__local TYPE localB[TILE_SIZE][TILE_SIZE];
TYPE sum = 0;
// Iterate over blocks
for (int k = 0; k < N; k += LOCAL_SIZE) {
for (int k = 0; k < N; k += TILE_SIZE) {
// Load block of matrix A & B to local memory
localA[localRow][localCol] = A[globalRow * N + (k + localCol)];
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
@ -26,7 +26,7 @@ __kernel void sgemm2(__global TYPE *A,
barrier(CLK_LOCAL_MEM_FENCE);
// Compute multiplication for this block
for (int j = 0; j < LOCAL_SIZE; j++) {
for (int j = 0; j < TILE_SIZE; j++) {
sum += localA[localRow][j] * localB[j][localCol];
}

View file

@ -170,9 +170,9 @@ int main (int argc, char **argv) {
uint32_t size_sq = size * size;
printf("Matrix size=%d\n", size);
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
printf("Matrix size=%dx%d, tile size=%dx%d\n", size, size, TILE_SIZE, TILE_SIZE);
if ((size / TILE_SIZE) * TILE_SIZE != size) {
printf("Error: matrix size must be a multiple of %d\n", TILE_SIZE);
return -1;
}
@ -220,7 +220,7 @@ int main (int argc, char **argv) {
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t global_size[2] = {size, size};
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
size_t local_size[2] = {TILE_SIZE, TILE_SIZE};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));

View file

@ -9,8 +9,6 @@
#include <vector>
#include "common.h"
#define LOCAL_SIZE 4
#define FLOAT_ULP 6
#define KERNEL_NAME "sgemm3"
@ -142,18 +140,22 @@ static void cleanup() {
}
int size = 16;
int tile_size = 8;
static void show_usage() {
printf("Usage: [-n size] [-h: help]\n");
printf("Usage: [-n size] [-t tile size] [-h: help]\n");
}
static void parse_args(int argc, char **argv) {
int c;
while ((c = getopt(argc, argv, "n:h?")) != -1) {
while ((c = getopt(argc, argv, "n:t:h?")) != -1) {
switch (c) {
case 'n':
size = atoi(optarg);
break;
case 't':
tile_size = atoi(optarg);
break;
case 'h':
case '?': {
show_usage();
@ -172,9 +174,9 @@ int main (int argc, char **argv) {
uint32_t size_sq = size * size;
printf("Matrix size=%d\n", size);
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
printf("Matrix size=%dx%d, tile size=%dx%d\n", size, size, tile_size, tile_size);
if ((size / tile_size) * tile_size != size) {
printf("Error: matrix size must be a multiple of %d\n", tile_size);
return -1;
}
@ -222,7 +224,7 @@ int main (int argc, char **argv) {
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
size_t global_size[2] = {size, size};
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
size_t local_size[2] = {tile_size, tile_size};
// Set kernel arguments
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));