Added debug print states for RTL

This commit is contained in:
Blaise Tine 2020-05-16 14:19:17 -04:00
parent 65c2da76cf
commit d6c87dbb0a
24 changed files with 7100 additions and 5980 deletions

View file

@ -3,11 +3,18 @@ CFLAGS += -std=c++11 -g -O0 -Wall -Wextra -Wfatal-errors
CFLAGS += -I../../include -I../../../hw/simulate -I../../../runtime
# control RTL debug print states
DBG_PRINT = -DDBG_PRINT_CORE_ICACHE \
-DDBG_PRINT_CORE_DCACHE \
-DDBG_PRINT_BANK \
-DDBG_PRINT_DRAM \
-DDBG_PRINT_SNP_FWD
#MULTICORE += -DNUM_CLUSTERS=2 -DNUM_CORES=2
#MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2
MULTICORE += -DNUM_CLUSTERS=1 -DNUM_CORES=2 -DL2_ENABLE=0
#DEBUG = 1
DEBUG = 1
CFLAGS += -fPIC
@ -31,7 +38,7 @@ VL_FLAGS += -DGLOBAL_BLOCK_SIZE=64
# Debugging
ifdef DEBUG
VL_FLAGS += --trace -DVL_DEBUG=1
VL_FLAGS += --trace -DVL_DEBUG=1 $(DBG_PRINT)
CFLAGS += -DVCD_OUTPUT
else
CFLAGS += -DNDEBUG

View file

@ -46,7 +46,7 @@ run-ase: $(PROJECT)
ASE_LOG=0 LD_LIBRARY_PATH=../../opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16
run-rtlsim: $(PROJECT)
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16
LD_LIBRARY_PATH=../../rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 4
run-simx: $(PROJECT)
LD_LIBRARY_PATH=../../simx:$(LD_LIBRARY_PATH) ./$(PROJECT) -f kernel.bin -n 16

View file

@ -4,8 +4,6 @@
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
struct kernel_arg_t {
uint32_t num_warps;
uint32_t num_threads;
uint32_t stride;
uint32_t src0_ptr;
uint32_t src1_ptr;

View file

@ -6,7 +6,7 @@
#define RT_CHECK(_expr) \
do { \
int _ret = _expr; \
int _ret = _expr; \
if (0 == _ret) \
break; \
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
@ -15,7 +15,7 @@
} while (false)
const char* program_file = "kernel.bin";
uint32_t data_stride = 0xffffffff;
uint32_t data_stride = 0;
static void show_usage() {
std::cout << "Vortex Driver Test." << std::endl;
@ -111,19 +111,22 @@ int main(int argc, char *argv[]) {
// parse command arguments
parse_args(argc, argv);
uint32_t block_size = vx_dev_caps(VX_CAPS_CACHE_LINESIZE);
uint32_t max_cores = vx_dev_caps(VX_CAPS_MAX_CORES);
uint32_t max_warps = vx_dev_caps(VX_CAPS_MAX_WARPS);
uint32_t max_threads = vx_dev_caps(VX_CAPS_MAX_THREADS);
if (data_stride == 0xffffffff) {
data_stride = block_size / sizeof(uint32_t);
if (data_stride == 0) {
data_stride = 1;
}
uint32_t num_points = max_cores * max_warps * max_threads * data_stride;
uint32_t buf_size = num_points * sizeof(uint32_t);
kernel_arg.stride = data_stride;
uint32_t num_points = max_cores * max_warps * max_threads;
uint32_t buf_size = num_points * data_stride * sizeof(uint32_t);
std::cout << "number of workitems: " << num_points << std::endl;
std::cout << "workitem size: " << data_stride * sizeof(uint32_t) << " bytes" << std::endl;
std::cout << "buffer size: " << buf_size << " bytes" << std::endl;
// open device connection
std::cout << "open device connection" << std::endl;
@ -167,10 +170,6 @@ int main(int argc, char *argv[]) {
// upload kernel argument
std::cout << "upload kernel argument" << std::endl;
{
kernel_arg.num_warps = max_warps;
kernel_arg.num_threads = max_threads;
kernel_arg.stride = data_stride;
auto buf_ptr = (int*)vx_host_ptr(buffer);
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));

Binary file not shown.

View file

@ -6,27 +6,24 @@
void kernel_body(void* arg) {
struct kernel_arg_t* _arg = (struct kernel_arg_t*)(arg);
int* x = (int*)_arg->src0_ptr;
int* y = (int*)_arg->src1_ptr;
int* z = (int*)_arg->dst_ptr;
int* src0_ptr = (int*)_arg->src0_ptr;
int* src1_ptr = (int*)_arg->src1_ptr;
int* dst_ptr = (int*)_arg->dst_ptr;
unsigned wid = vx_warp_gid();
unsigned tid = vx_thread_id();
unsigned offset = vx_thread_gid() * _arg->stride;
unsigned i = ((wid * _arg->num_threads) + tid) * _arg->stride;
for (unsigned j = 0; j < _arg->stride; ++j) {
z[i+j] = x[i+j] + y[i+j];
for (unsigned i = 0; i < _arg->stride; ++i) {
dst_ptr[offset+i] = src0_ptr[offset+i] + src1_ptr[offset+i];
}
}
void main() {
struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
/*printf("num_warps=%d\n", arg->num_warps);
printf("num_threads=%d\n", arg->num_threads);
printf("stride=%d\n", arg->stride);
printf("src0_ptr=0x%x\n", arg->src0_ptr);
printf("src1_ptr=0x%x\n", arg->src1_ptr);
printf("dst_ptr=0x%x\n", arg->dst_ptr);*/
vx_spawn_warps(arg->num_warps, arg->num_threads, kernel_body, arg);
/*printf("stride=%d\n", arg->stride);
printf("src0_ptr=0x%x\n", arg->src0_ptr);
printf("src1_ptr=0x%x\n", arg->src1_ptr);
printf("dst_ptr=0x%x\n", arg->dst_ptr);*/
int num_warps = vx_num_warps();
int num_threads = vx_num_threads();
vx_spawn_warps(num_warps, num_threads, kernel_body, arg);
}

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -62,13 +62,15 @@ module VX_icache_stage #(
end
end
/*always_comb begin
`ifdef DBG_PRINT_CORE_ICACHE
always_comb begin
if (1'($time & 1) && icache_req_if.core_req_ready && icache_req_if.core_req_valid) begin
$display("*** %t: I%01d$ req: tag=%0h, pc=%0h, warp=%0d", $time, CORE_ID, icache_req_if.core_req_tag, fe_inst_meta_fi.inst_pc, fe_inst_meta_fi.warp_num);
end
if (1'($time & 1) && icache_rsp_if.core_rsp_ready && icache_rsp_if.core_rsp_valid) begin
$display("*** %t: I%01d$ rsp: tag=%0h, pc=%0h, warp=%0d, instr=%0h", $time, CORE_ID, icache_rsp_if.core_rsp_tag, fe_inst_meta_id.inst_pc, fe_inst_meta_id.warp_num, fe_inst_meta_id.instruction);
end
end*/
end
`endif
endmodule

View file

@ -62,14 +62,16 @@ module VX_lsu_unit #(
assign dcache_rsp_if.core_rsp_ready = ~no_slot_mem;
assign {mem_wb_if.pc, mem_wb_if.wb, mem_wb_if.rd, mem_wb_if.warp_num} = dcache_rsp_if.core_rsp_tag;
/*always_comb begin
`ifdef DBG_PRINT_CORE_DCACHE
always_comb begin
if (1'($time & 1) && dcache_req_if.core_req_ready && (| dcache_req_if.core_req_valid)) begin
$display("*** %t: D%01d$ req: valid=%b, addr=%0h, tag=%0h, r=%0d, w=%0d, pc=%0h, rd=%0d, warp=%0d, data=%0h", $time, CORE_ID, use_valid, use_address, dcache_req_if.core_req_tag, use_mem_read, use_mem_write, use_pc, use_rd, use_warp_num, use_store_data);
end
if (1'($time & 1) && dcache_rsp_if.core_rsp_ready && (| dcache_rsp_if.core_rsp_valid)) begin
$display("*** %t: D%01d$ rsp: valid=%b, tag=%0h, pc=%0h, rd=%0d, warp=%0d, data=%0h", $time, CORE_ID, mem_wb_if.valid, dcache_rsp_if.core_rsp_tag, mem_wb_if.pc, mem_wb_if.rd, mem_wb_if.warp_num, mem_wb_if.data);
end
end*/
end
`endif
endmodule

View file

@ -328,13 +328,15 @@ module Vortex_Socket (
);
end
/*always_comb begin
`ifdef DBG_PRINT_DRAM
always_comb begin
if (1'($time & 1) && (dram_req_read || dram_req_write) && dram_req_ready) begin
$display("*** %t: DRAM req: w=%b addr=%0h, tag=%0h, data=%0h", $time, dram_req_write, {dram_req_addr, `CLOG2(`GLOBAL_BLOCK_SIZE)'(0)}, dram_req_tag, dram_req_data);
end
if (1'($time & 1) && dram_rsp_valid && dram_rsp_ready) begin
$display("*** %t: DRAM rsp: tag=%0h, data=%0h", $time, dram_rsp_tag, dram_rsp_data);
end
end*/
end
`endif
endmodule

View file

@ -627,4 +627,18 @@ module VX_bank #(
|| msrq_push_stall
|| dram_fill_req_stall;
`ifdef DBG_PRINT_BANK
always_comb begin
if (1'($time & 1) && dram_fill_req_valid && dram_fill_req_ready) begin
$display("*** %t: bank%02d:%01d dram_fill req: addr=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_req_addr, BANK_ID));
end
if (1'($time & 1) && dram_wb_req_valid && dram_wb_req_ready) begin
$display("*** %t: bank%02d:%01d dram_wb req: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_wb_req_addr, BANK_ID), dram_wb_req_data);
end
if (1'($time & 1) && dram_fill_rsp_valid && dram_fill_rsp_ready) begin
$display("*** %t: bank%02d:%01d dram_fill rsp: addr=%0h, data=%0h", $time, CACHE_ID, BANK_ID, `LINE_TO_BYTE_ADDR(dram_fill_rsp_addr, BANK_ID), dram_fill_rsp_data);
end
end
`endif
endmodule : VX_bank

View file

@ -70,6 +70,8 @@
`define DRAM_TO_LINE_ADDR(x) x[`DRAM_ADDR_WIDTH-1:`BANK_SELECT_BITS]
`define LINE_TO_DRAM_ADDR(x, i) {x, (`BANK_SELECT_BITS)'(i)};
`define LINE_TO_DRAM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
`define LINE_TO_BYTE_ADDR(x, i) {x, `BANK_SELECT_BITS'(i), `BASE_ADDR_BITS'(0)}
`endif

View file

@ -112,7 +112,8 @@ module VX_snp_forwarder #(
assign snp_fwdin_ready[i] = fwdin_ready && (fwdin_sel == `REQS_BITS'(i));
end
/*always_comb begin
`ifdef DBG_PRINT_SNP_FWD
always_comb begin
if (1'($time & 1) && snp_req_valid && snp_req_ready) begin
$display("*** %t: snp req: addr=%0h, tag=%0h", $time, snp_req_addr, snp_req_tag);
end
@ -125,6 +126,7 @@ module VX_snp_forwarder #(
if (1'($time & 1) && snp_rsp_valid && snp_rsp_ready) begin
$display("*** %t: snp rsp: addr=%0h, tag=%0h", $time, snp_rsp_addr, snp_rsp_tag);
end
end*/
end
`endif
endmodule

View file

@ -0,0 +1,25 @@
`include "VX_define.vh"
// Encodes a bit vector into the index of its lowest set bit.
//
// Parameters:
//   N      - width of the input vector.
// Ports:
//   onehot - input vector (presumably one-hot, per the module name; if
//            several bits are set, the lowest index is reported).
//   valid  - 1 when at least one bit of `onehot` tests true, else 0.
//   value  - index of the lowest set bit; all X when no bit is set.
module VX_encoder_onehot #(
    parameter N = 6
) (
    input wire [N-1:0] onehot,
    output reg valid,
    output reg [`LOG2UP(N)-1:0] value
);
    integer k;

    always @(*) begin
        // Defaults for the "no bit set" case.
        valid = 1'b0;
        value = {`LOG2UP(N){1'bx}};
        // Scan from the top index downward: the last assignment executed
        // belongs to the lowest set bit, so that index is the one reported.
        for (k = N - 1; k >= 0; k = k - 1) begin
            if (onehot[k]) begin
                valid = 1'b1;
                value = `LOG2UP(N)'(k);
            end
        end
    end

endmodule

View file

@ -3,54 +3,51 @@
module VX_matrix_arbiter #(
parameter N = 0
) (
input wire clk,
input wire reset,
input wire [N-1:0] inputs,
output wire [N-1:0] grant
input wire clk,
input wire reset,
input wire [N-1:0] requests,
output wire grant_valid,
output wire [N-1:0] grant_onehot,
output wire [`LOG2UP(N)-1:0] grant_index
);
reg [N-1:1][N-1:0] pri;
reg [N-1:0] state [0:N-1];
wire [N-1:0] dis [0:N-1];
always @(posedge clk) begin
if (reset) begin
integer i, j;
for (i = 0; i < N; ++i) begin
for (j = 0; j < N; ++j) begin
pri[i][j] <= 1;
genvar i, j;
for (i = 0; i < N; ++i) begin
for (j = i + 1; j < N; ++j) begin
always @(posedge clk) begin
if (reset) begin
state[i][j] <= 0;
end else begin
state[i][j] <= (state[i][j] || grant_onehot[j]) && ~grant_onehot[i];
end
end
end else begin
integer i, j;
for (i = 0; i < N; ++i) begin
if (grant[i]) begin
for (j = 0; j < N; ++j) begin
if (j > i)
pri[j][i] <= 1;
else if (j < i)
pri[i][j] <= 0;
end
end
end
end
end
genvar i, j;
end
for (i = 0; i < N; ++i) begin
wire [N-1:0] dis;
for (j = 0; j < N; ++j) begin
if (j > i) begin
assign dis[j] = inputs[j] & pri[j][i];
assign dis[j][i] = requests[i] & state[i][j];
end else if (j < i) begin
assign dis[j] = inputs[j] & ~pri[i][j];
assign dis[j][i] = requests[i] & ~state[j][i];
end else begin
assign dis[j] = 0;
assign dis[j][i] = 0;
end
end
assign grant[i] = inputs[i] & ~(| dis);
assign grant_onehot[i] = requests[i] & ~(| dis[i]);
end
VX_encoder_onehot #(
.N(N)
) encoder (
.onehot(grant_onehot),
.valid(grant_valid),
.value(grant_index)
);
endmodule

View file

@ -5,7 +5,7 @@
.type vx_wspawn, @function
.global vx_wspawn
vx_wspawn:
.word 0x00b5106b # wspawn a0(numWarps), a1(PC SPAWN)
.word 0x00b5106b # wspawn a0(num_warps), a1(func_ptr)
ret
.type vx_tmc, @function
@ -17,7 +17,7 @@ vx_tmc:
.type vx_barrier, @function
.global vx_barrier
vx_barrier:
.word 0x00b5406b # barrier a0(barrier id), a1(numWarps)
.word 0x00b5406b # barrier a0(barrier_id), a1(num_warps)
ret
.type vx_split, @function

View file

@ -1,20 +1,18 @@
#ifndef VX_INTRINSICS
#define VX_INTRINSICS
#ifndef VX_INTRINSICS_H
#define VX_INTRINSICS_H
#ifdef __cplusplus
extern "C" {
#endif
// Spawn warps
void vx_wspawn(int numWarps, int PC_spawn);
void vx_wspawn(int num_warps, unsigned func_ptr);
// Set thread mask
void vx_tmc(int numThreads);
void vx_tmc(int num_threads);
// Warp Barrier
void vx_barrier(int barriedID, int numWarps);
// barrier_id is passed in a0 and num_warps in a1, matching the operand
// names used by the vx_barrier assembly stub.
void vx_barrier(int barrier_id, int num_warps);
// Split on a predicate
void vx_split(int predicate);

View file

@ -1,5 +1,5 @@
#pragma once
#ifndef VX_IO_H
#define VX_IO_H
#include <stdbool.h>
@ -15,7 +15,8 @@ void vx_printf(const char *, unsigned);
void vx_print_str(const char *);
void vx_printc(unsigned, char c);
#ifdef __cplusplus
}
#endif
#endif

View file

@ -625,7 +625,7 @@ Disassembly of section .text:
8000083c: 28d1a023 sw a3,640(gp) # 80016a88 <global_argument_struct>
80000840: 26b1ae23 sw a1,636(gp) # 80016a84 <global_num_threads>
80000844: 00100793 li a5,1
80000848: 00a7fa63 bgeu a5,a0,8000085c <vx_spawn_warps+0x38>
80000848: 00a7da63 bge a5,a0,8000085c <vx_spawn_warps+0x38>
8000084c: 800005b7 lui a1,0x80000
80000850: 7a058593 addi a1,a1,1952 # 800007a0 <__BSS_END__+0xfffe9c78>
80000854: d55ff0ef jal ra,800005a8 <vx_wspawn>

View file

@ -132,7 +132,7 @@
:1008180013351500130101016FF01FD9130101FFF2
:100828002324810023229100232021012326110063
:1008380023A2C12823A0D12823AEB12693071000F4
:1008480063FAA700B70500809385057AEFF05FD5B6
:1008480063DAA700B70500809385057AEFF05FD5D6
:1008580083A5C12713850500EFF01FD503A501283F
:1008680083A74128E7800700EFF01FD60324810003
:100878008320C1008324410003290100133515009A

View file

@ -9,7 +9,7 @@ extern "C" {
func_t global_function_pointer;
void * global_argument_struct;
unsigned global_num_threads;
int global_num_threads;
void spawn_warp_runonce() {
// active all threads
@ -19,12 +19,12 @@ void spawn_warp_runonce() {
global_function_pointer(global_argument_struct);
// resume single-thread execution on exit
unsigned wid = vx_warp_id();
int wid = vx_warp_id();
unsigned tmask = (0 == wid) ? 0x1 : 0x0;
vx_tmc(tmask);
}
void vx_spawn_warps(unsigned numWarps, unsigned numThreads, func_t func_ptr, void * args) {
void vx_spawn_warps(int numWarps, int numThreads, func_t func_ptr, void * args) {
global_function_pointer = func_ptr;
global_argument_struct = args;
global_num_threads = numThreads;
@ -34,7 +34,7 @@ void vx_spawn_warps(unsigned numWarps, unsigned numThreads, func_t func_ptr, voi
spawn_warp_runonce();
}
unsigned pocl_threads;
int pocl_threads;
struct context_t * pocl_ctx;
vx_pocl_workgroup_func pocl_pfn;
const void * pocl_args;

View file

@ -1,6 +1,5 @@
#ifndef VX_API_
#define VX_API_
#ifndef VX_API_H
#define VX_API_H
#include <stdint.h>
#include <stdio.h>
@ -11,7 +10,7 @@ extern "C" {
typedef void (*func_t)(void *);
void vx_spawn_warps(unsigned numWarps, unsigned numThreads, func_t func_ptr , void * args);
void vx_spawn_warps(int num_warps, int num_threads, func_t func_ptr , void * args);
struct context_t {
uint32_t num_groups[3];