mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
minor update
This commit is contained in:
parent
60107cf2b6
commit
19beb0728e
7 changed files with 87 additions and 85 deletions
|
@ -1,10 +1,10 @@
|
|||
// Copyright © 2019-2023
|
||||
//
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
|
@ -65,7 +65,7 @@ enum class RegType {
|
|||
inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
|
||||
switch (type) {
|
||||
case RegType::None: break;
|
||||
case RegType::Integer: os << "x"; break;
|
||||
case RegType::Integer: os << "x"; break;
|
||||
case RegType::Float: os << "f"; break;
|
||||
default: assert(false);
|
||||
}
|
||||
|
@ -142,14 +142,14 @@ enum class AddrType {
|
|||
};
|
||||
|
||||
inline AddrType get_addr_type(uint64_t addr) {
|
||||
if (LMEM_ENABLED) {
|
||||
if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + (1 << LMEM_LOG_SIZE))) {
|
||||
return AddrType::Shared;
|
||||
}
|
||||
}
|
||||
if (addr >= IO_BASE_ADDR) {
|
||||
return AddrType::IO;
|
||||
}
|
||||
if (LMEM_ENABLED) {
|
||||
if (addr >= LMEM_BASE_ADDR && (addr-LMEM_BASE_ADDR) < (1 << LMEM_LOG_SIZE)) {
|
||||
return AddrType::Shared;
|
||||
}
|
||||
}
|
||||
return AddrType::Global;
|
||||
}
|
||||
|
||||
|
@ -247,13 +247,13 @@ struct MemReq {
|
|||
bool write;
|
||||
AddrType type;
|
||||
uint32_t tag;
|
||||
uint32_t cid;
|
||||
uint32_t cid;
|
||||
uint64_t uuid;
|
||||
|
||||
MemReq(uint64_t _addr = 0,
|
||||
MemReq(uint64_t _addr = 0,
|
||||
bool _write = false,
|
||||
AddrType _type = AddrType::Global,
|
||||
uint64_t _tag = 0,
|
||||
uint64_t _tag = 0,
|
||||
uint32_t _cid = 0,
|
||||
uint64_t _uuid = 0
|
||||
) : addr(_addr)
|
||||
|
@ -276,12 +276,12 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
|
|||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
struct MemRsp {
|
||||
uint64_t tag;
|
||||
uint64_t tag;
|
||||
uint32_t cid;
|
||||
uint64_t uuid;
|
||||
|
||||
|
||||
MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
|
||||
: tag (_tag)
|
||||
: tag (_tag)
|
||||
, cid(_cid)
|
||||
, uuid(_uuid)
|
||||
{}
|
||||
|
@ -297,16 +297,16 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {
|
|||
|
||||
template <typename T>
|
||||
class HashTable {
|
||||
public:
|
||||
public:
|
||||
HashTable(uint32_t capacity)
|
||||
: entries_(capacity)
|
||||
, size_(0)
|
||||
, size_(0)
|
||||
{}
|
||||
|
||||
bool empty() const {
|
||||
return (0 == size_);
|
||||
}
|
||||
|
||||
|
||||
bool full() const {
|
||||
return (size_ == entries_.size());
|
||||
}
|
||||
|
@ -337,7 +337,7 @@ public:
|
|||
if (!entry.first) {
|
||||
entry.first = true;
|
||||
entry.second = value;
|
||||
++size_;
|
||||
++size_;
|
||||
return i;
|
||||
}
|
||||
}
|
||||
|
@ -374,13 +374,13 @@ public:
|
|||
std::vector<SimPort<Type>> Outputs;
|
||||
|
||||
Mux(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t delay = 1
|
||||
) : SimObject<Mux<Type>>(ctx, name)
|
||||
) : SimObject<Mux<Type>>(ctx, name)
|
||||
, Inputs(num_inputs, this)
|
||||
, Outputs(num_outputs, this)
|
||||
, type_(type)
|
||||
|
@ -388,13 +388,13 @@ public:
|
|||
, cursors_(num_outputs, 0)
|
||||
, num_reqs_(num_inputs / num_outputs)
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 32);
|
||||
assert(num_outputs <= 32);
|
||||
assert(num_inputs >= num_outputs);
|
||||
|
||||
// bypass mode
|
||||
if (num_inputs == num_outputs) {
|
||||
if (num_inputs == num_outputs) {
|
||||
for (uint32_t i = 0; i < num_inputs; ++i) {
|
||||
Inputs.at(i).bind(&Outputs.at(i));
|
||||
}
|
||||
|
@ -415,20 +415,20 @@ public:
|
|||
// skip bypass mode
|
||||
if (I == O)
|
||||
return;
|
||||
|
||||
// process inputs
|
||||
|
||||
// process inputs
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
for (uint32_t r = 0; r < R; ++r) {
|
||||
uint32_t i = (cursors_.at(o) + r) & (R-1);
|
||||
uint32_t j = o * R + i;
|
||||
if (j >= I)
|
||||
continue;
|
||||
|
||||
|
||||
auto& req_in = Inputs.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
DT(4, this->name() << "-" << req);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
Outputs.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_cursor(o, i);
|
||||
break;
|
||||
|
@ -446,7 +446,7 @@ private:
|
|||
}
|
||||
|
||||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> cursors_;
|
||||
uint32_t num_reqs_;
|
||||
};
|
||||
|
@ -459,33 +459,33 @@ public:
|
|||
std::vector<SimPort<Req>> ReqIn;
|
||||
std::vector<SimPort<Rsp>> RspIn;
|
||||
|
||||
std::vector<SimPort<Req>> ReqOut;
|
||||
std::vector<SimPort<Req>> ReqOut;
|
||||
std::vector<SimPort<Rsp>> RspOut;
|
||||
|
||||
Switch(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
ArbiterType type,
|
||||
uint32_t num_inputs,
|
||||
uint32_t num_outputs = 1,
|
||||
uint32_t delay = 1
|
||||
)
|
||||
: SimObject<Switch<Req, Rsp>>(ctx, name)
|
||||
)
|
||||
: SimObject<Switch<Req, Rsp>>(ctx, name)
|
||||
, ReqIn(num_inputs, this)
|
||||
, RspIn(num_inputs, this)
|
||||
, ReqOut(num_outputs, this)
|
||||
, ReqOut(num_outputs, this)
|
||||
, RspOut(num_outputs, this)
|
||||
, type_(type)
|
||||
, delay_(delay)
|
||||
, cursors_(num_outputs, 0)
|
||||
, lg_num_reqs_(log2ceil(num_inputs / num_outputs))
|
||||
{
|
||||
assert(delay != 0);
|
||||
assert(delay != 0);
|
||||
assert(num_inputs <= 32);
|
||||
assert(num_outputs <= 32);
|
||||
assert(num_inputs >= num_outputs);
|
||||
|
||||
// bypass mode
|
||||
// bypass mode
|
||||
if (num_inputs == num_outputs) {
|
||||
for (uint32_t i = 0; i < num_inputs; ++i) {
|
||||
ReqIn.at(i).bind(&ReqOut.at(i));
|
||||
|
@ -508,7 +508,7 @@ public:
|
|||
// skip bypass mode
|
||||
if (I == O)
|
||||
return;
|
||||
|
||||
|
||||
for (uint32_t o = 0; o < O; ++o) {
|
||||
// process incoming responses
|
||||
if (!RspOut.at(o).empty()) {
|
||||
|
@ -517,10 +517,10 @@ public:
|
|||
if (lg_num_reqs_ != 0) {
|
||||
i = rsp.tag & (R-1);
|
||||
rsp.tag >>= lg_num_reqs_;
|
||||
}
|
||||
}
|
||||
DT(4, this->name() << "-" << rsp);
|
||||
uint32_t j = o * R + i;
|
||||
RspIn.at(j).push(rsp, 1);
|
||||
RspIn.at(j).push(rsp, 1);
|
||||
RspOut.at(o).pop();
|
||||
}
|
||||
|
||||
|
@ -530,7 +530,7 @@ public:
|
|||
uint32_t j = o * R + i;
|
||||
if (j >= I)
|
||||
continue;
|
||||
|
||||
|
||||
auto& req_in = ReqIn.at(j);
|
||||
if (!req_in.empty()) {
|
||||
auto& req = req_in.front();
|
||||
|
@ -538,7 +538,7 @@ public:
|
|||
req.tag = (req.tag << lg_num_reqs_) | i;
|
||||
}
|
||||
DT(4, this->name() << "-" << req);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
ReqOut.at(o).push(req, delay_);
|
||||
req_in.pop();
|
||||
this->update_cursor(o, i);
|
||||
break;
|
||||
|
@ -555,7 +555,7 @@ public:
|
|||
|
||||
private:
|
||||
ArbiterType type_;
|
||||
uint32_t delay_;
|
||||
uint32_t delay_;
|
||||
std::vector<uint32_t> cursors_;
|
||||
uint32_t lg_num_reqs_;
|
||||
};
|
||||
|
@ -576,8 +576,8 @@ public:
|
|||
SimPort<MemRsp> RspDC;
|
||||
|
||||
LocalMemDemux(
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
const SimContext& ctx,
|
||||
const char* name,
|
||||
uint32_t delay
|
||||
);
|
||||
|
||||
|
|
|
@ -46,6 +46,7 @@ CXXFLAGS += -I$(POCL_RT_PATH)/include
|
|||
# Debugigng
|
||||
ifdef DEBUG
|
||||
CXXFLAGS += -g -O0
|
||||
POCL_RT_FLAGS += POCL_DEBUG=all
|
||||
else
|
||||
CXXFLAGS += -O2 -DNDEBUG
|
||||
endif
|
||||
|
@ -72,7 +73,7 @@ kernel.cl: $(SRC_DIR)/kernel.cl
|
|||
cp $< $@
|
||||
|
||||
kernel.pocl: $(SRC_DIR)/kernel.cl
|
||||
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_KERNEL_CACHE=0 POCL_VORTEX_BINTOOL="OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
|
||||
LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_KERNEL_CACHE=0 POCL_VORTEX_BINTOOL="OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
|
||||
|
||||
%.cc.o: $(SRC_DIR)/%.cc
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
@ -106,19 +107,19 @@ run-gpu: $(PROJECT).host kernel.cl
|
|||
./$(PROJECT).host $(OPTS)
|
||||
|
||||
run-simx: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/simx:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-rtlsim: $(PROJECT) kernel.pocl
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/rtlsim:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-opae: $(PROJECT) kernel.pocl
|
||||
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/opae/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/opae/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/opae:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
|
||||
|
||||
run-xrt: $(PROJECT) kernel.pocl
|
||||
ifeq ($(TARGET), hw)
|
||||
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
|
||||
else
|
||||
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
|
||||
XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
|
||||
endif
|
||||
|
||||
.depend: $(SRCS)
|
||||
|
|
|
@ -10,8 +10,6 @@
|
|||
#include <algorithm>
|
||||
#include <numeric>
|
||||
|
||||
#define LOCAL_SIZE 16
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "parallelSum"
|
||||
|
@ -97,19 +95,23 @@ static void cleanup() {
|
|||
if (kernel_bin) free(kernel_bin);
|
||||
}
|
||||
|
||||
int size = 16;
|
||||
size_t size = 16;
|
||||
size_t local_size = 8;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
printf("Usage: [-n size] [-l local size] [-h: help]\n");
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "n:l:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 'l':
|
||||
local_size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
|
@ -126,14 +128,14 @@ int main (int argc, char **argv) {
|
|||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
printf("input size=%d\n", size);
|
||||
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
|
||||
printf("Error: input size must be a multiple of %d\n", LOCAL_SIZE);
|
||||
printf("input size=%ld, local size=%ld\n", size, local_size);
|
||||
if ((size / local_size) * local_size != size) {
|
||||
printf("Error: input size must be a multiple of %ld\n", local_size);
|
||||
return -1;
|
||||
}
|
||||
|
||||
uint32_t num_inputs = size;
|
||||
uint32_t num_outputs = size / LOCAL_SIZE;
|
||||
uint32_t num_outputs = size / local_size;
|
||||
|
||||
cl_platform_id platform_id;
|
||||
size_t kernel_size;
|
||||
|
@ -178,14 +180,11 @@ int main (int argc, char **argv) {
|
|||
// Create kernel
|
||||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t global_size[1] = {size};
|
||||
size_t local_size[1] = {LOCAL_SIZE};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&c_memobj));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(uint32_t), &size));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, local_size[0]*sizeof(float), NULL));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, local_size*sizeof(float), NULL));
|
||||
|
||||
// Allocate memories for input arrays and output arrays.
|
||||
std::vector<float> h_a(num_inputs);
|
||||
|
@ -204,7 +203,7 @@ int main (int argc, char **argv) {
|
|||
|
||||
printf("Execute the kernel\n");
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_size, local_size, 0, NULL, NULL));
|
||||
CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &size, &local_size, 0, NULL, NULL));
|
||||
CL_CHECK(clFinish(commandQueue));
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
#ifndef COMMON_H
|
||||
#define COMMON_H
|
||||
|
||||
#define LOCAL_SIZE 4
|
||||
#define TILE_SIZE 8
|
||||
|
||||
#ifndef TYPE
|
||||
#define TYPE float
|
||||
|
|
|
@ -11,13 +11,13 @@ __kernel void sgemm2(__global TYPE *A,
|
|||
int localCol = get_local_id(0);
|
||||
|
||||
// Static local memory declaration
|
||||
__local TYPE localA[LOCAL_SIZE][LOCAL_SIZE];
|
||||
__local TYPE localB[LOCAL_SIZE][LOCAL_SIZE];
|
||||
__local TYPE localA[TILE_SIZE][TILE_SIZE];
|
||||
__local TYPE localB[TILE_SIZE][TILE_SIZE];
|
||||
|
||||
TYPE sum = 0;
|
||||
|
||||
// Iterate over blocks
|
||||
for (int k = 0; k < N; k += LOCAL_SIZE) {
|
||||
for (int k = 0; k < N; k += TILE_SIZE) {
|
||||
// Load block of matrix A & B to local memory
|
||||
localA[localRow][localCol] = A[globalRow * N + (k + localCol)];
|
||||
localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
|
||||
|
@ -26,7 +26,7 @@ __kernel void sgemm2(__global TYPE *A,
|
|||
barrier(CLK_LOCAL_MEM_FENCE);
|
||||
|
||||
// Compute multiplication for this block
|
||||
for (int j = 0; j < LOCAL_SIZE; j++) {
|
||||
for (int j = 0; j < TILE_SIZE; j++) {
|
||||
sum += localA[localRow][j] * localB[j][localCol];
|
||||
}
|
||||
|
||||
|
|
|
@ -170,9 +170,9 @@ int main (int argc, char **argv) {
|
|||
|
||||
uint32_t size_sq = size * size;
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
|
||||
printf("Matrix size=%dx%d, tile size=%dx%d\n", size, size, TILE_SIZE, TILE_SIZE);
|
||||
if ((size / TILE_SIZE) * TILE_SIZE != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", TILE_SIZE);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -220,7 +220,7 @@ int main (int argc, char **argv) {
|
|||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t global_size[2] = {size, size};
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
size_t local_size[2] = {TILE_SIZE, TILE_SIZE};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
|
|
|
@ -9,8 +9,6 @@
|
|||
#include <vector>
|
||||
#include "common.h"
|
||||
|
||||
#define LOCAL_SIZE 4
|
||||
|
||||
#define FLOAT_ULP 6
|
||||
|
||||
#define KERNEL_NAME "sgemm3"
|
||||
|
@ -142,18 +140,22 @@ static void cleanup() {
|
|||
}
|
||||
|
||||
int size = 16;
|
||||
int tile_size = 8;
|
||||
|
||||
static void show_usage() {
|
||||
printf("Usage: [-n size] [-h: help]\n");
|
||||
printf("Usage: [-n size] [-t tile size] [-h: help]\n");
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "n:h?")) != -1) {
|
||||
while ((c = getopt(argc, argv, "n:t:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'n':
|
||||
size = atoi(optarg);
|
||||
break;
|
||||
case 't':
|
||||
tile_size = atoi(optarg);
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
|
@ -172,9 +174,9 @@ int main (int argc, char **argv) {
|
|||
|
||||
uint32_t size_sq = size * size;
|
||||
|
||||
printf("Matrix size=%d\n", size);
|
||||
if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
|
||||
printf("Matrix size=%dx%d, tile size=%dx%d\n", size, size, tile_size, tile_size);
|
||||
if ((size / tile_size) * tile_size != size) {
|
||||
printf("Error: matrix size must be a multiple of %d\n", tile_size);
|
||||
return -1;
|
||||
}
|
||||
|
||||
|
@ -222,7 +224,7 @@ int main (int argc, char **argv) {
|
|||
kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));
|
||||
|
||||
size_t global_size[2] = {size, size};
|
||||
size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
|
||||
size_t local_size[2] = {tile_size, tile_size};
|
||||
|
||||
// Set kernel arguments
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue