minor update

2025-04-23 21:39:10 -04:00 · 2024-05-12 20:21:23 -07:00 · 2024-05-12 20:21:23 -07:00 · 19beb0728e
commit 19beb0728e
parent 60107cf2b6
7 changed files with 87 additions and 85 deletions
--- a/sim/simx/types.h
+++ b/sim/simx/types.h
@ -1,10 +1,10 @@
 // Copyright © 2019-2023
-// 
+//
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
-// 
+//
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ -65,7 +65,7 @@ enum class RegType {
 inline std::ostream &operator<<(std::ostream &os, const RegType& type) {
  switch (type) {
  case RegType::None: break;
-  case RegType::Integer: os << "x"; break;  
+  case RegType::Integer: os << "x"; break;
  case RegType::Float:   os << "f"; break;
  default: assert(false);
  }
@ -142,14 +142,14 @@ enum class AddrType {
 };

 inline AddrType get_addr_type(uint64_t addr) {
-  if (LMEM_ENABLED) {
-    if (addr >= LMEM_BASE_ADDR && addr < (LMEM_BASE_ADDR + (1 << LMEM_LOG_SIZE))) {
-        return AddrType::Shared;
-    }
-  }
  if (addr >= IO_BASE_ADDR) {
     return AddrType::IO;
  }
+  if (LMEM_ENABLED) {
+    if (addr >= LMEM_BASE_ADDR && (addr-LMEM_BASE_ADDR) < (1 << LMEM_LOG_SIZE)) {
+        return AddrType::Shared;
+    }
+  }
  return AddrType::Global;
 }

@ -247,13 +247,13 @@ struct MemReq {
  bool     write;
  AddrType type;
  uint32_t tag;
-  uint32_t cid;    
+  uint32_t cid;
  uint64_t uuid;

-  MemReq(uint64_t _addr = 0, 
+  MemReq(uint64_t _addr = 0,
          bool _write = false,
          AddrType _type = AddrType::Global,
-          uint64_t _tag = 0, 
+          uint64_t _tag = 0,
          uint32_t _cid = 0,
          uint64_t _uuid = 0
  ) : addr(_addr)
@ -276,12 +276,12 @@ inline std::ostream &operator<<(std::ostream &os, const MemReq& req) {
 ///////////////////////////////////////////////////////////////////////////////

 struct MemRsp {
-  uint64_t tag;    
+  uint64_t tag;
  uint32_t cid;
  uint64_t uuid;
-  
+
  MemRsp(uint64_t _tag = 0, uint32_t _cid = 0, uint64_t _uuid = 0)
-    : tag (_tag) 
+    : tag (_tag)
    , cid(_cid)
    , uuid(_uuid)
  {}
@ -297,16 +297,16 @@ inline std::ostream &operator<<(std::ostream &os, const MemRsp& rsp) {

 template <typename T>
 class HashTable {
-public:    
+public:
  HashTable(uint32_t capacity)
    : entries_(capacity)
-    , size_(0) 
+    , size_(0)
  {}

  bool empty() const {
    return (0 == size_);
  }
-  
+
  bool full() const {
    return (size_ == entries_.size());
  }
@ -337,7 +337,7 @@ public:
      if (!entry.first) {
        entry.first = true;
        entry.second = value;
-        ++size_;              
+        ++size_;
        return i;
      }
    }
@ -374,13 +374,13 @@ public:
  std::vector<SimPort<Type>> Outputs;

  Mux(
-    const SimContext& ctx, 
-    const char* name, 
-    ArbiterType type, 
-    uint32_t num_inputs, 
+    const SimContext& ctx,
+    const char* name,
+    ArbiterType type,
+    uint32_t num_inputs,
    uint32_t num_outputs = 1,
    uint32_t delay = 1
-  ) : SimObject<Mux<Type>>(ctx, name)    
+  ) : SimObject<Mux<Type>>(ctx, name)
    , Inputs(num_inputs, this)
    , Outputs(num_outputs, this)
    , type_(type)
@ -388,13 +388,13 @@ public:
    , cursors_(num_outputs, 0)
    , num_reqs_(num_inputs / num_outputs)
  {
-    assert(delay != 0);    
+    assert(delay != 0);
    assert(num_inputs <= 32);
    assert(num_outputs <= 32);
    assert(num_inputs >= num_outputs);

    // bypass mode
-    if (num_inputs == num_outputs) {      
+    if (num_inputs == num_outputs) {
      for (uint32_t i = 0; i < num_inputs; ++i) {
        Inputs.at(i).bind(&Outputs.at(i));
      }
@ -415,20 +415,20 @@ public:
    // skip bypass mode
    if (I == O)
      return;
-        
-    // process inputs       
+
+    // process inputs
    for (uint32_t o = 0; o < O; ++o) {
      for (uint32_t r = 0; r < R; ++r) {
        uint32_t i = (cursors_.at(o) + r) & (R-1);
        uint32_t j = o * R + i;
        if (j >= I)
          continue;
-        
+
        auto& req_in = Inputs.at(j);
        if (!req_in.empty()) {
          auto& req = req_in.front();
          DT(4, this->name() << "-" << req);
-          Outputs.at(o).push(req, delay_);                
+          Outputs.at(o).push(req, delay_);
          req_in.pop();
          this->update_cursor(o, i);
          break;
@ -446,7 +446,7 @@ private:
  }

  ArbiterType type_;
-  uint32_t delay_;  
+  uint32_t delay_;
  std::vector<uint32_t> cursors_;
  uint32_t num_reqs_;
 };
@ -459,33 +459,33 @@ public:
  std::vector<SimPort<Req>>  ReqIn;
  std::vector<SimPort<Rsp>>  RspIn;

-  std::vector<SimPort<Req>>  ReqOut;  
+  std::vector<SimPort<Req>>  ReqOut;
  std::vector<SimPort<Rsp>>  RspOut;

  Switch(
-    const SimContext& ctx, 
-    const char* name, 
-    ArbiterType type, 
-    uint32_t num_inputs, 
+    const SimContext& ctx,
+    const char* name,
+    ArbiterType type,
+    uint32_t num_inputs,
    uint32_t num_outputs = 1,
    uint32_t delay = 1
-  ) 
-    : SimObject<Switch<Req, Rsp>>(ctx, name)    
+  )
+    : SimObject<Switch<Req, Rsp>>(ctx, name)
    , ReqIn(num_inputs, this)
    , RspIn(num_inputs, this)
-    , ReqOut(num_outputs, this)    
+    , ReqOut(num_outputs, this)
    , RspOut(num_outputs, this)
    , type_(type)
    , delay_(delay)
    , cursors_(num_outputs, 0)
    , lg_num_reqs_(log2ceil(num_inputs / num_outputs))
  {
-    assert(delay != 0);    
+    assert(delay != 0);
    assert(num_inputs <= 32);
    assert(num_outputs <= 32);
    assert(num_inputs >= num_outputs);

-    // bypass mode    
+    // bypass mode
    if (num_inputs == num_outputs) {
      for (uint32_t i = 0; i < num_inputs; ++i) {
        ReqIn.at(i).bind(&ReqOut.at(i));
@ -508,7 +508,7 @@ public:
    // skip bypass mode
    if (I == O)
      return;
-        
+
    for (uint32_t o = 0; o < O; ++o) {
      // process incoming responses
      if (!RspOut.at(o).empty()) {
@ -517,10 +517,10 @@ public:
        if (lg_num_reqs_ != 0) {
          i = rsp.tag & (R-1);
          rsp.tag >>= lg_num_reqs_;
-        }      
+        }
        DT(4, this->name() << "-" << rsp);
        uint32_t j = o * R + i;
-        RspIn.at(j).push(rsp, 1);      
+        RspIn.at(j).push(rsp, 1);
        RspOut.at(o).pop();
      }

@ -530,7 +530,7 @@ public:
        uint32_t j = o * R + i;
        if (j >= I)
          continue;
-        
+
        auto& req_in = ReqIn.at(j);
        if (!req_in.empty()) {
          auto& req = req_in.front();
@ -538,7 +538,7 @@ public:
            req.tag = (req.tag << lg_num_reqs_) | i;
          }
          DT(4, this->name() << "-" << req);
-          ReqOut.at(o).push(req, delay_);                
+          ReqOut.at(o).push(req, delay_);
          req_in.pop();
          this->update_cursor(o, i);
          break;
@ -555,7 +555,7 @@ public:

 private:
  ArbiterType type_;
-  uint32_t delay_;  
+  uint32_t delay_;
  std::vector<uint32_t> cursors_;
  uint32_t lg_num_reqs_;
 };
@ -576,8 +576,8 @@ public:
  SimPort<MemRsp> RspDC;

  LocalMemDemux(
-    const SimContext& ctx, 
-    const char* name, 
+    const SimContext& ctx,
+    const char* name,
    uint32_t delay
  );

--- a/tests/opencl/common.mk
+++ b/tests/opencl/common.mk
@ -46,6 +46,7 @@ CXXFLAGS += -I$(POCL_RT_PATH)/include
 # Debugigng
 ifdef DEBUG
 	CXXFLAGS += -g -O0
+	POCL_RT_FLAGS += POCL_DEBUG=all
 else
 	CXXFLAGS += -O2 -DNDEBUG
 endif
@ -72,7 +73,7 @@ kernel.cl: $(SRC_DIR)/kernel.cl
 	cp $< $@

 kernel.pocl: $(SRC_DIR)/kernel.cl
-	LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_DEBUG=all POCL_KERNEL_CACHE=0 POCL_VORTEX_BINTOOL="OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<
+	LD_LIBRARY_PATH=$(LLVM_POCL)/lib:$(POCL_CC_PATH)/lib:$(LLVM_VORTEX)/lib:$(LD_LIBRARY_PATH) LLVM_PREFIX=$(LLVM_VORTEX) POCL_KERNEL_CACHE=0 POCL_VORTEX_BINTOOL="OBJCOPY=$(LLVM_VORTEX)/bin/llvm-objcopy $(VORTEX_HOME)/kernel/scripts/vxbin.py" POCL_VORTEX_LLCFLAGS="$(VX_LLCFLAGS)" POCL_VORTEX_CFLAGS="$(VX_CFLAGS)" POCL_VORTEX_LDFLAGS="$(VX_LDFLAGS)" $(POCL_CC_PATH)/bin/poclcc -o $@ $<

 %.cc.o: $(SRC_DIR)/%.cc
 	$(CXX) $(CXXFLAGS) -c $< -o $@
@ -106,19 +107,19 @@ run-gpu: $(PROJECT).host kernel.cl
 	./$(PROJECT).host $(OPTS)

 run-simx: $(PROJECT) kernel.pocl
-	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/simx:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)

 run-rtlsim: $(PROJECT) kernel.pocl
-	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/rtlsim:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)

 run-opae: $(PROJECT) kernel.pocl
-	SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/opae/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	SCOPE_JSON_PATH=$(ROOT_DIR)/runtime/opae/scope.json OPAE_DRV_PATHS=$(OPAE_DRV_PATHS) LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/opae:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)

 run-xrt: $(PROJECT) kernel.pocl
 ifeq ($(TARGET), hw)
-	XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
 else
-	XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)
+	XCL_EMULATION_MODE=$(TARGET) XRT_INI_PATH=$(XRT_SYN_DIR)/xrt.ini EMCONFIG_PATH=$(FPGA_BIN_DIR) XRT_DEVICE_INDEX=$(XRT_DEVICE_INDEX) XRT_XCLBIN_PATH=$(FPGA_BIN_DIR)/vortex_afu.xclbin LD_LIBRARY_PATH=$(XILINX_XRT)/lib:$(POCL_RT_PATH)/lib:$(ROOT_DIR)/runtime/xrt:$(LD_LIBRARY_PATH) $(POCL_RT_FLAGS) ./$(PROJECT) $(OPTS)
 endif

 .depend: $(SRCS)
--- a/tests/opencl/psum/main.cc
+++ b/tests/opencl/psum/main.cc
@ -10,8 +10,6 @@
 #include <algorithm>
 #include <numeric>

-#define LOCAL_SIZE 16
-
 #define FLOAT_ULP 6

 #define KERNEL_NAME "parallelSum"
@ -97,19 +95,23 @@ static void cleanup() {
  if (kernel_bin) free(kernel_bin);
 }

-int size = 16;
+size_t size = 16;
+size_t local_size = 8;

 static void show_usage() {
-  printf("Usage: [-n size] [-h: help]\n");
+  printf("Usage: [-n size] [-l local size] [-h: help]\n");
 }

 static void parse_args(int argc, char **argv) {
  int c;
-  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:l:h?")) != -1) {
    switch (c) {
    case 'n':
      size = atoi(optarg);
      break;
+    case 'l':
+      local_size = atoi(optarg);
+      break;
    case 'h':
    case '?': {
      show_usage();
@ -126,14 +128,14 @@ int main (int argc, char **argv) {
  // parse command arguments
  parse_args(argc, argv);

-  printf("input size=%d\n", size);
-  if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
-    printf("Error: input size must be a multiple of %d\n", LOCAL_SIZE);
+  printf("input size=%ld, local size=%ld\n", size, local_size);
+  if ((size / local_size) * local_size != size) {
+    printf("Error: input size must be a multiple of %ld\n", local_size);
    return -1;
  }

  uint32_t num_inputs = size;
-  uint32_t num_outputs = size / LOCAL_SIZE;
+  uint32_t num_outputs = size / local_size;

  cl_platform_id platform_id;
  size_t kernel_size;
@ -178,14 +180,11 @@ int main (int argc, char **argv) {
  // Create kernel
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

-  size_t global_size[1] = {size};
-  size_t local_size[1] =  {LOCAL_SIZE};
-
  // Set kernel arguments
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
  CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&c_memobj));
  CL_CHECK(clSetKernelArg(kernel, 2, sizeof(uint32_t), &size));
-  CL_CHECK(clSetKernelArg(kernel, 3, local_size[0]*sizeof(float), NULL));
+  CL_CHECK(clSetKernelArg(kernel, 3, local_size*sizeof(float), NULL));

 // Allocate memories for input arrays and output arrays.
 std::vector<float> h_a(num_inputs);
@ -204,7 +203,7 @@ int main (int argc, char **argv) {

  printf("Execute the kernel\n");
  auto time_start = std::chrono::high_resolution_clock::now();
-  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_size, local_size, 0, NULL, NULL));
+  CL_CHECK(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &size, &local_size, 0, NULL, NULL));
  CL_CHECK(clFinish(commandQueue));
  auto time_end = std::chrono::high_resolution_clock::now();
  double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
--- a/tests/opencl/sgemm2/common.h
+++ b/tests/opencl/sgemm2/common.h
@ -1,7 +1,7 @@
 #ifndef COMMON_H
 #define COMMON_H

-#define LOCAL_SIZE 4
+#define TILE_SIZE 8

 #ifndef TYPE
 #define TYPE float
--- a/tests/opencl/sgemm2/kernel.cl
+++ b/tests/opencl/sgemm2/kernel.cl
@ -11,13 +11,13 @@ __kernel void sgemm2(__global TYPE *A,
    int localCol  = get_local_id(0);

    // Static local memory declaration
-    __local TYPE localA[LOCAL_SIZE][LOCAL_SIZE];
-    __local TYPE localB[LOCAL_SIZE][LOCAL_SIZE];
+    __local TYPE localA[TILE_SIZE][TILE_SIZE];
+    __local TYPE localB[TILE_SIZE][TILE_SIZE];

    TYPE sum = 0;

    // Iterate over blocks
-    for (int k = 0; k < N; k += LOCAL_SIZE) {
+    for (int k = 0; k < N; k += TILE_SIZE) {
        // Load block of matrix A & B to local memory
        localA[localRow][localCol] = A[globalRow * N + (k + localCol)];
        localB[localRow][localCol] = B[(k + localRow) * N + globalCol];
@ -26,7 +26,7 @@ __kernel void sgemm2(__global TYPE *A,
        barrier(CLK_LOCAL_MEM_FENCE);

        // Compute multiplication for this block
-        for (int j = 0; j < LOCAL_SIZE; j++) {
+        for (int j = 0; j < TILE_SIZE; j++) {
            sum += localA[localRow][j] * localB[j][localCol];
        }

--- a/tests/opencl/sgemm2/main.cc
+++ b/tests/opencl/sgemm2/main.cc
@ -170,9 +170,9 @@ int main (int argc, char **argv) {

  uint32_t size_sq = size * size;

-  printf("Matrix size=%d\n", size);
-  if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
-    printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
+  printf("Matrix size=%dx%d, tile size=%dx%d\n", size, size, TILE_SIZE, TILE_SIZE);
+  if ((size / TILE_SIZE) * TILE_SIZE != size) {
+    printf("Error: matrix size must be a multiple of %d\n", TILE_SIZE);
    return -1;
  }

@ -220,7 +220,7 @@ int main (int argc, char **argv) {
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  size_t global_size[2] = {size, size};
-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
+  size_t local_size[2] = {TILE_SIZE, TILE_SIZE};

  // Set kernel arguments
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));
--- a/tests/opencl/sgemm3/main.cc
+++ b/tests/opencl/sgemm3/main.cc
@ -9,8 +9,6 @@
 #include <vector>
 #include "common.h"

-#define LOCAL_SIZE 4
-
 #define FLOAT_ULP 6

 #define KERNEL_NAME "sgemm3"
@ -142,18 +140,22 @@ static void cleanup() {
 }

 int size = 16;
+int tile_size = 8;

 static void show_usage() {
-  printf("Usage: [-n size] [-h: help]\n");
+  printf("Usage: [-n size] [-t tile size] [-h: help]\n");
 }

 static void parse_args(int argc, char **argv) {
  int c;
-  while ((c = getopt(argc, argv, "n:h?")) != -1) {
+  while ((c = getopt(argc, argv, "n:t:h?")) != -1) {
    switch (c) {
    case 'n':
      size = atoi(optarg);
      break;
+    case 't':
+      tile_size = atoi(optarg);
+      break;
    case 'h':
    case '?': {
      show_usage();
@ -172,9 +174,9 @@ int main (int argc, char **argv) {

  uint32_t size_sq = size * size;

-  printf("Matrix size=%d\n", size);
-  if ((size / LOCAL_SIZE) * LOCAL_SIZE != size) {
-    printf("Error: matrix size must be a multiple of %d\n", LOCAL_SIZE);
+  printf("Matrix size=%dx%d, tile size=%dx%d\n", size, size, tile_size, tile_size);
+  if ((size / tile_size) * tile_size != size) {
+    printf("Error: matrix size must be a multiple of %d\n", tile_size);
    return -1;
  }

@ -222,7 +224,7 @@ int main (int argc, char **argv) {
  kernel = CL_CHECK2(clCreateKernel(program, KERNEL_NAME, &_err));

  size_t global_size[2] = {size, size};
-  size_t local_size[2] = {LOCAL_SIZE, LOCAL_SIZE};
+  size_t local_size[2] = {tile_size, tile_size};

  // Set kernel arguments
  CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_memobj));