rebase master update
|
@ -54,6 +54,9 @@ CONFIGS=-DEXT_F_DISABLE make -C hw/simulate
|
|||
# disable shared memory
|
||||
CONFIGS=-DSM_ENABLE=0 make -C hw/simulate
|
||||
|
||||
# disabling tex extension
|
||||
CONFIGS=-DEXT_TEX_DISABLE make -C hw/simulate
|
||||
|
||||
# using Default FPU core
|
||||
FPU_CORE=FPU_DEFAULT ./ci/blackbox.sh --driver=rtlsim --cores=1 --app=dogfood
|
||||
|
||||
|
|
|
@ -16,6 +16,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
|||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
@ -37,7 +38,8 @@ SRCS = fpga.cpp opae_sim.cpp
|
|||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
RTL_INCLUDE += -I$(RTL_DIR)/afu -I$(RTL_DIR)/afu/ccip
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
|
||||
|
|
|
@ -16,6 +16,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
|||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
@ -37,7 +38,8 @@ SRCS = vortex.cpp ../common/vx_utils.cpp ../../hw/simulate/simulator.cpp
|
|||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE)
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
|
||||
VL_FLAGS += -O2 --language 1800-2009 --assert -Wall -Wpedantic
|
||||
VL_FLAGS += -Wno-DECLFILENAME -Wno-REDEFMACRO
|
||||
|
|
|
@ -154,7 +154,7 @@ module VX_alu_unit #(
|
|||
|
||||
assign mul_ready_out = ~stall_out;
|
||||
|
||||
assign result_valid = mul_valid_out | (alu_req_if.valid && ~is_mul_op);
|
||||
assign result_valid = mul_valid_out || (alu_req_if.valid && ~is_mul_op);
|
||||
assign result_wid = mul_valid_out ? mul_wid : alu_req_if.wid;
|
||||
assign result_tmask = mul_valid_out ? mul_tmask : alu_req_if.tmask;
|
||||
assign result_PC = mul_valid_out ? mul_PC : alu_req_if.PC;
|
||||
|
@ -165,7 +165,7 @@ module VX_alu_unit #(
|
|||
|
||||
`else
|
||||
|
||||
assign stall_in = 0;
|
||||
assign stall_in = stall_out;
|
||||
|
||||
assign result_valid = alu_req_if.valid;
|
||||
assign result_wid = alu_req_if.wid;
|
||||
|
|
159
hw/rtl/VX_cache_arb.v
Normal file
|
@ -0,0 +1,159 @@
|
|||
// VX_cache_arb: funnels NUM_REQS cache request ports (each LANES wide) into a
// single request port through a VX_stream_arbiter, and steers the single
// response port back to one of NUM_REQS response ports through a
// VX_stream_demux. The response destination is taken from LOG_NUM_REQS bits of
// the response tag starting at bit TAG_SEL_IDX. When NUM_REQS is not greater
// than 1 the module is a pure pass-through.
`include "VX_define.vh"
|
||||
|
||||
module VX_cache_arb #(
|
||||
// number of request/response ports on the "in" side
parameter NUM_REQS = 1,
|
||||
// number of lanes carried by each request port
parameter LANES = 1,
|
||||
// data size per lane; DATA_WIDTH below is 8 * DATA_SIZE and byteen is DATA_SIZE bits
parameter DATA_SIZE = 1,
|
||||
// tag width on the per-requester side
parameter TAG_IN_WIDTH = 1,
|
||||
// bit position (within the outgoing tag) of the requester-select field
parameter TAG_SEL_IDX = 0,
|
||||
// forwarded to the BUFFERED parameter of the request arbiter
parameter BUFFERED_REQ = 0,
|
||||
// forwarded to the BUFFERED parameter of the response demux
parameter BUFFERED_RSP = 0,
|
||||
// forwarded to the TYPE parameter of VX_stream_arbiter
// NOTE(review): presumably selects the arbitration policy ("R" = round-robin?) - confirm in VX_stream_arbiter
parameter TYPE = "R",
|
||||
|
||||
localparam ADDR_WIDTH = (32-`CLOG2(DATA_SIZE)),
|
||||
localparam DATA_WIDTH = (8 * DATA_SIZE),
|
||||
localparam LOG_NUM_REQS = `CLOG2(NUM_REQS),
|
||||
// outgoing tag = incoming tag widened by the requester-select field
localparam TAG_OUT_WIDTH = TAG_IN_WIDTH + LOG_NUM_REQS
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// input requests
|
||||
input wire [NUM_REQS-1:0][LANES-1:0] req_valid_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0] req_rw_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][DATA_SIZE-1:0] req_byteen_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][ADDR_WIDTH-1:0] req_addr_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] req_data_in,
|
||||
input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0] req_tag_in,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0] req_ready_in,
|
||||
|
||||
// output request
|
||||
output wire [LANES-1:0] req_valid_out,
|
||||
output wire [LANES-1:0] req_rw_out,
|
||||
output wire [LANES-1:0][DATA_SIZE-1:0] req_byteen_out,
|
||||
output wire [LANES-1:0][ADDR_WIDTH-1:0] req_addr_out,
|
||||
output wire [LANES-1:0][DATA_WIDTH-1:0] req_data_out,
|
||||
output wire [LANES-1:0][TAG_OUT_WIDTH-1:0] req_tag_out,
|
||||
input wire [LANES-1:0] req_ready_out,
|
||||
|
||||
// input response
// (one valid/tag/ready for the whole response; tmask and data are per lane)
|
||||
input wire rsp_valid_in,
|
||||
input wire [LANES-1:0] rsp_tmask_in,
|
||||
input wire [LANES-1:0][DATA_WIDTH-1:0] rsp_data_in,
|
||||
input wire [TAG_OUT_WIDTH-1:0] rsp_tag_in,
|
||||
output wire rsp_ready_in,
|
||||
|
||||
// output responses
|
||||
output wire [NUM_REQS-1:0] rsp_valid_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0] rsp_tmask_out,
|
||||
output wire [NUM_REQS-1:0][LANES-1:0][DATA_WIDTH-1:0] rsp_data_out,
|
||||
output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0] rsp_tag_out,
|
||||
input wire [NUM_REQS-1:0] rsp_ready_out
|
||||
);
|
||||
// per-lane request payload: {tag, addr, rw, byteen, data}
localparam REQ_DATAW = TAG_OUT_WIDTH + ADDR_WIDTH + 1 + DATA_SIZE + DATA_WIDTH;
|
||||
// whole-response payload: per-lane {tmask bit, data} plus one requester-side tag
localparam RSP_DATAW = LANES * (1 + DATA_WIDTH) + TAG_IN_WIDTH;
|
||||
|
||||
// arbitration logic is only generated when there is more than one requester
if (NUM_REQS > 1) begin
|
||||
|
||||
wire [NUM_REQS-1:0][LANES-1:0][REQ_DATAW-1:0] req_data_in_merged;
|
||||
wire [LANES-1:0][REQ_DATAW-1:0] req_data_out_merged;
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
for (genvar j = 0; j < LANES; ++j) begin
|
||||
wire [TAG_OUT_WIDTH-1:0] req_tag_in_w;
|
||||
|
||||
// widen the tag with the requester index i as the select field
// NOTE(review): assumes VX_bits_insert places sel_in at bit POS of data_out - confirm in that module
VX_bits_insert #(
|
||||
.N (TAG_IN_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_insert (
|
||||
.data_in (req_tag_in[i][j]),
|
||||
.sel_in (LOG_NUM_REQS'(i)),
|
||||
.data_out (req_tag_in_w)
|
||||
);
|
||||
|
||||
// pack the request fields; the unpack after the arbiter must use the same order
assign req_data_in_merged[i][j] = {req_tag_in_w, req_addr_in[i][j], req_rw_in[i][j], req_byteen_in[i][j], req_data_in[i][j]};
|
||||
end
|
||||
end
|
||||
|
||||
// select one requester's packed lanes and drive the single output port
VX_stream_arbiter #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LANES (LANES),
|
||||
.DATAW (REQ_DATAW),
|
||||
.BUFFERED (BUFFERED_REQ),
|
||||
.TYPE (TYPE)
|
||||
) req_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (req_valid_in),
|
||||
.data_in (req_data_in_merged),
|
||||
.ready_in (req_ready_in),
|
||||
.valid_out (req_valid_out),
|
||||
.data_out (req_data_out_merged),
|
||||
.ready_out (req_ready_out)
|
||||
);
|
||||
|
||||
// unpack each lane of the arbiter output back into the individual request fields
for (genvar i = 0; i < LANES; ++i) begin
|
||||
assign {req_tag_out[i], req_addr_out[i], req_rw_out[i], req_byteen_out[i], req_data_out[i]} = req_data_out_merged[i];
|
||||
end
|
||||
|
||||
///////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire [NUM_REQS-1:0][RSP_DATAW-1:0] rsp_data_out_merged;
|
||||
|
||||
// destination requester is read from the select field of the response tag
wire [LOG_NUM_REQS-1:0] rsp_sel = rsp_tag_in[TAG_SEL_IDX +: LOG_NUM_REQS];
|
||||
|
||||
wire [TAG_IN_WIDTH-1:0] rsp_tag_in_w;
|
||||
|
||||
// narrow the response tag back to the requester-side width
// NOTE(review): assumes VX_bits_remove drops the S bits at POS - confirm in that module
VX_bits_remove #(
|
||||
.N (TAG_OUT_WIDTH),
|
||||
.S (LOG_NUM_REQS),
|
||||
.POS (TAG_SEL_IDX)
|
||||
) bits_remove (
|
||||
.data_in (rsp_tag_in),
|
||||
.data_out (rsp_tag_in_w)
|
||||
);
|
||||
|
||||
// the whole response (all lanes) travels as one RSP_DATAW-wide item, hence LANES = 1
VX_stream_demux #(
|
||||
.NUM_REQS (NUM_REQS),
|
||||
.LANES (1),
|
||||
.DATAW (RSP_DATAW),
|
||||
.BUFFERED (BUFFERED_RSP)
|
||||
) rsp_demux (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.sel_in (rsp_sel),
|
||||
.valid_in (rsp_valid_in),
|
||||
.data_in ({rsp_tmask_in, rsp_tag_in_w, rsp_data_in}),
|
||||
.ready_in (rsp_ready_in),
|
||||
.valid_out (rsp_valid_out),
|
||||
.data_out (rsp_data_out_merged),
|
||||
.ready_out (rsp_ready_out)
|
||||
);
|
||||
|
||||
// unpack in the same {tmask, tag, data} order used for the demux data_in
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
assign {rsp_tmask_out[i], rsp_tag_out[i], rsp_data_out[i]} = rsp_data_out_merged[i];
|
||||
end
|
||||
|
||||
end else begin
|
||||
|
||||
// single requester: no state is needed, every signal is wired straight through
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
assign req_valid_out = req_valid_in;
|
||||
assign req_tag_out = req_tag_in;
|
||||
assign req_addr_out = req_addr_in;
|
||||
assign req_rw_out = req_rw_in;
|
||||
assign req_byteen_out = req_byteen_in;
|
||||
assign req_data_out = req_data_in;
|
||||
assign req_ready_in = req_ready_out;
|
||||
|
||||
assign rsp_valid_out = rsp_valid_in;
|
||||
assign rsp_tmask_out = rsp_tmask_in;
|
||||
assign rsp_tag_out = rsp_tag_in;
|
||||
assign rsp_data_out = rsp_data_in;
|
||||
assign rsp_ready_in = rsp_ready_out;
|
||||
|
||||
end
|
||||
|
||||
endmodule
|
|
@ -73,13 +73,14 @@ module VX_commit #(
|
|||
.ld_commit_if (ld_commit_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
.fpu_commit_if (fpu_commit_if),
|
||||
.gpu_commit_if (gpu_commit_if),
|
||||
|
||||
.writeback_if (writeback_if)
|
||||
);
|
||||
|
||||
// store and gpu commits don't writeback
|
||||
// store doesn't writeback
|
||||
assign st_commit_if.ready = 1'b1;
|
||||
assign gpu_commit_if.ready = 1'b1;
|
||||
// assign gpu_commit_if.ready = 1'b1;
|
||||
|
||||
`ifdef DBG_PRINT_PIPELINE
|
||||
always @(posedge clk) begin
|
||||
|
|
|
@ -77,6 +77,10 @@
|
|||
`define EXT_F_ENABLE
|
||||
`endif
|
||||
|
||||
`ifndef EXT_TEX_DISABLE
|
||||
`define EXT_TEX_ENABLE
|
||||
`endif
|
||||
|
||||
// Device identification
|
||||
`define VENDOR_ID 0
|
||||
`define ARCHITECTURE_ID 0
|
||||
|
@ -229,6 +233,21 @@
|
|||
`define CSR_NW 12'hFC1
|
||||
`define CSR_NC 12'hFC2
|
||||
|
||||
////////// Texture Units //////////////////////////////////////////////////////
|
||||
|
||||
`define NUM_TEX_UNITS 2
|
||||
|
||||
`define CSR_TEX_STATES 7
|
||||
`define CSR_TEX_BEGIN(x) (12'hFD0 + (x) * `CSR_TEX_STATES)
|
||||
|
||||
`define CSR_TEX_ADDR(x) (`CSR_TEX_BEGIN(x) + 12'h00)
|
||||
`define CSR_TEX_FORMAT(x) (`CSR_TEX_BEGIN(x) + 12'h01)
|
||||
`define CSR_TEX_WRAP(x) (`CSR_TEX_BEGIN(x) + 12'h02)
|
||||
`define CSR_TEX_FILTER(x) (`CSR_TEX_BEGIN(x) + 12'h03)
|
||||
`define CSR_TEX_MIPOFF(x) (`CSR_TEX_BEGIN(x) + 12'h04)
|
||||
`define CSR_TEX_WIDTH(x) (`CSR_TEX_BEGIN(x) + 12'h05)
|
||||
`define CSR_TEX_HEIGHT(x) (`CSR_TEX_BEGIN(x) + 12'h06)
|
||||
|
||||
// Pipeline Queues ////////////////////////////////////////////////////////////
|
||||
|
||||
// Size of LSU Request Queue
|
||||
|
|
|
@ -12,7 +12,11 @@ module VX_csr_data #(
|
|||
`endif
|
||||
|
||||
VX_cmt_to_csr_if cmt_to_csr_if,
|
||||
VX_fpu_to_csr_if fpu_to_csr_if,
|
||||
VX_fpu_to_csr_if fpu_to_csr_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if tex_csr_if,
|
||||
`endif
|
||||
|
||||
input wire read_enable,
|
||||
input wire[`CSR_ADDR_BITS-1:0] read_addr,
|
||||
|
@ -22,7 +26,7 @@ module VX_csr_data #(
|
|||
input wire write_enable,
|
||||
input wire[`CSR_ADDR_BITS-1:0] write_addr,
|
||||
input wire[`NW_BITS-1:0] write_wid,
|
||||
input wire[`CSR_WIDTH-1:0] write_data,
|
||||
input wire[31:0] write_data,
|
||||
|
||||
input wire busy
|
||||
);
|
||||
|
@ -57,26 +61,33 @@ module VX_csr_data #(
|
|||
`CSR_FRM: fcsr[write_wid][`FRM_BITS+`FFG_BITS-1:`FFG_BITS] <= write_data[`FRM_BITS-1:0];
|
||||
`CSR_FCSR: fcsr[write_wid] <= write_data[`FFG_BITS+`FRM_BITS-1:0];
|
||||
|
||||
`CSR_SATP: csr_satp <= write_data;
|
||||
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data;
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data;
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data;
|
||||
`CSR_MIE: csr_mie <= write_data;
|
||||
`CSR_MTVEC: csr_mtvec <= write_data;
|
||||
|
||||
`CSR_MEPC: csr_mepc <= write_data;
|
||||
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data;
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data;
|
||||
`CSR_SATP: csr_satp <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MSTATUS: csr_mstatus <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEDELEG: csr_medeleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIDELEG: csr_mideleg <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MIE: csr_mie <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MTVEC: csr_mtvec <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_MEPC: csr_mepc <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPCFG0: csr_pmpcfg[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
`CSR_PMPADDR0: csr_pmpaddr[0] <= write_data[`CSR_WIDTH-1:0];
|
||||
|
||||
default: begin
|
||||
assert(~write_enable) else $error("%t: invalid CSR write address: %0h", $time, write_addr);
|
||||
assert (write_addr >= `CSR_TEX_BEGIN(0) && write_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))
|
||||
else $error("%t: invalid CSR write address: %0h", $time, write_addr);
|
||||
end
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
`UNUSED_VAR (write_data)
|
||||
|
||||
// TEX CSRs
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
assign tex_csr_if.write_enable = write_enable;
|
||||
assign tex_csr_if.write_addr = write_addr;
|
||||
assign tex_csr_if.write_data = write_data;
|
||||
`endif
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
csr_cycle <= 0;
|
||||
|
@ -201,7 +212,8 @@ module VX_csr_data #(
|
|||
|
||||
default: begin
|
||||
if (!((read_addr >= `CSR_MPM_BASE && read_addr < (`CSR_MPM_BASE + 32))
|
||||
| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)))) begin
|
||||
|| (read_addr >= `CSR_MPM_BASE_H && read_addr < (`CSR_MPM_BASE_H + 32)
|
||||
|| (read_addr >= `CSR_TEX_BEGIN(0) && read_addr < `CSR_TEX_BEGIN(`CSR_TEX_STATES))))) begin
|
||||
read_addr_valid_r = 0;
|
||||
end
|
||||
end
|
||||
|
|
|
@ -12,7 +12,11 @@ module VX_csr_unit #(
|
|||
`endif
|
||||
|
||||
VX_cmt_to_csr_if cmt_to_csr_if,
|
||||
VX_fpu_to_csr_if fpu_to_csr_if,
|
||||
VX_fpu_to_csr_if fpu_to_csr_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if tex_csr_if,
|
||||
`endif
|
||||
|
||||
VX_csr_req_if csr_req_if,
|
||||
VX_commit_if csr_commit_if,
|
||||
|
@ -42,6 +46,9 @@ module VX_csr_unit #(
|
|||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.read_enable (csr_req_if.valid),
|
||||
.read_addr (csr_req_if.addr),
|
||||
.read_wid (csr_req_if.wid),
|
||||
|
@ -49,7 +56,7 @@ module VX_csr_unit #(
|
|||
.write_enable (write_enable),
|
||||
.write_addr (csr_addr_s1),
|
||||
.write_wid (csr_commit_if.wid),
|
||||
.write_data (csr_updated_data_s1[`CSR_WIDTH-1:0]),
|
||||
.write_data (csr_updated_data_s1),
|
||||
.busy (busy)
|
||||
);
|
||||
|
||||
|
|
|
@ -42,6 +42,7 @@ module VX_decode #(
|
|||
|
||||
wire [31:0] instr = ifetch_rsp_if.data;
|
||||
wire [6:0] opcode = instr[6:0];
|
||||
wire [1:0] func2 = instr[26:25];
|
||||
wire [2:0] func3 = instr[14:12];
|
||||
wire [6:0] func7 = instr[31:25];
|
||||
wire [11:0] u_12 = instr[31:20];
|
||||
|
@ -372,6 +373,16 @@ module VX_decode #(
|
|||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
end
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
3'h5: begin
|
||||
op_type = `OP_BITS'(`GPU_TEX);
|
||||
op_mod = `MOD_BITS'(func2);
|
||||
use_rd = 1;
|
||||
`USED_IREG (rs1);
|
||||
`USED_IREG (rs2);
|
||||
`USED_IREG (rs3);
|
||||
end
|
||||
`endif
|
||||
default:;
|
||||
endcase
|
||||
end
|
||||
|
@ -379,6 +390,8 @@ module VX_decode #(
|
|||
endcase
|
||||
end
|
||||
|
||||
`UNUSED_VAR (func2)
|
||||
|
||||
// disable write to integer register r0
|
||||
wire wb = use_rd && (| rd_r);
|
||||
|
||||
|
|
|
@ -16,6 +16,8 @@
|
|||
|
||||
`define REQS_BITS `LOG2UP(NUM_REQS)
|
||||
|
||||
`define NTEX_BITS `LOG2UP(`NUM_TEX_UNITS)
|
||||
|
||||
`ifdef EXT_F_ENABLE
|
||||
`define NUM_REGS 64
|
||||
`else
|
||||
|
@ -54,6 +56,8 @@
|
|||
|
||||
`define INST_GPU 7'b1101011
|
||||
|
||||
`define INST_TEX 7'b0101011
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define FRM_RNE 3'b000 // round to nearest even
|
||||
|
@ -185,6 +189,7 @@
|
|||
`define GPU_SPLIT 3'h2
|
||||
`define GPU_JOIN 3'h3
|
||||
`define GPU_BAR 3'h4
|
||||
`define GPU_TEX 3'h5
|
||||
`define GPU_OTHER 3'h7
|
||||
`define GPU_BITS 3
|
||||
`define GPU_OP(x) x[`GPU_BITS-1:0]
|
||||
|
@ -293,11 +298,18 @@
|
|||
// Core request address bits
|
||||
`define DCORE_ADDR_WIDTH (32-`CLOG2(`DWORD_SIZE))
|
||||
|
||||
// TAG sharing enable
|
||||
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
|
||||
// Core request tag bits
|
||||
`define LSUQ_ADDR_BITS `LOG2UP(`LSUQ_SIZE)
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
`define LSU_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE)
|
||||
`define TEX_TAG_ID_BITS (2)
|
||||
`define DCORE_TAG_ID_BITS (`MAX(`LSU_TAG_ID_BITS, `TEX_TAG_ID_BITS) + 1)
|
||||
`define LSU_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `LSU_TAG_ID_BITS)
|
||||
`define TEX_DCACHE_TAG_BITS (`DBG_CACHE_REQ_MDATAW + `TEX_TAG_ID_BITS)
|
||||
`define LSU_TEX_DCACHE_TAG_BITS `MAX(`LSU_DCACHE_TAG_BITS, `TEX_DCACHE_TAG_BITS)
|
||||
`else
|
||||
`define DCORE_TAG_ID_BITS (`LSUQ_ADDR_BITS + `NC_ADDR_BITS + `SM_ENABLE)
|
||||
|
||||
// Input request tag bits
|
||||
`endif
|
||||
`define DCORE_TAG_WIDTH (`DBG_CACHE_REQ_MDATAW + `DCORE_TAG_ID_BITS)
|
||||
|
||||
// Memory request data bits
|
||||
|
@ -409,6 +421,8 @@
|
|||
// Merged D-cache/I-cache memory tag
|
||||
`define XMEM_TAG_WIDTH (`DMEM_TAG_WIDTH + `CLOG2(2))
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`include "VX_types.vh"
|
||||
|
||||
`endif
|
||||
|
|
|
@ -39,9 +39,98 @@ module VX_execute #(
|
|||
|
||||
input wire busy
|
||||
);
|
||||
VX_fpu_to_csr_if fpu_to_csr_if();
|
||||
wire[`NUM_WARPS-1:0] csr_pending;
|
||||
wire[`NUM_WARPS-1:0] fpu_pending;
|
||||
VX_fpu_to_csr_if fpu_to_csr_if();
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`LSU_DCACHE_TAG_BITS)
|
||||
) lsu_dcache_rsp_if();
|
||||
|
||||
VX_dcache_req_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_req_if();
|
||||
|
||||
VX_dcache_rsp_if #(
|
||||
.NUM_REQS (`NUM_THREADS),
|
||||
.WORD_SIZE (4),
|
||||
.TAG_WIDTH (`TEX_DCACHE_TAG_BITS)
|
||||
) tex_dcache_rsp_if();
|
||||
|
||||
VX_tex_csr_if tex_csr_if();
|
||||
|
||||
wire [`NUM_THREADS-1:0][`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_in;
|
||||
wire [`LSU_TEX_DCACHE_TAG_BITS-1:0] tex_tag_out;
|
||||
|
||||
for (genvar i = 0; i < `NUM_THREADS; ++i) begin
|
||||
assign tex_tag_in[i][`LSUQ_ADDR_BITS-1:0] = `LSUQ_ADDR_BITS'(tex_dcache_req_if.tag[i][1:0]);
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign tex_tag_in[i][`LSUQ_ADDR_BITS+:`DBG_CACHE_REQ_MDATAW] = tex_dcache_req_if.tag[i][2+:`DBG_CACHE_REQ_MDATAW];
|
||||
`endif
|
||||
end
|
||||
assign tex_dcache_rsp_if.tag[1:0] = tex_tag_out[1:0];
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
assign tex_dcache_rsp_if.tag[2+:`DBG_CACHE_REQ_MDATAW] = tex_tag_out[`LSUQ_ADDR_BITS+:`DBG_CACHE_REQ_MDATAW];
|
||||
`endif
|
||||
`UNUSED_VAR (tex_tag_out)
|
||||
|
||||
VX_cache_arb #(
|
||||
.NUM_REQS (2),
|
||||
.LANES (`NUM_THREADS),
|
||||
.DATA_SIZE (4),
|
||||
.TAG_IN_WIDTH (`LSU_TEX_DCACHE_TAG_BITS),
|
||||
.TAG_SEL_IDX (2)
|
||||
) tex_lsu_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
|
||||
// Tex/LSU request
|
||||
.req_valid_in ({tex_dcache_req_if.valid, lsu_dcache_req_if.valid}),
|
||||
.req_rw_in ({tex_dcache_req_if.rw, lsu_dcache_req_if.rw}),
|
||||
.req_byteen_in ({tex_dcache_req_if.byteen, lsu_dcache_req_if.byteen}),
|
||||
.req_addr_in ({tex_dcache_req_if.addr, lsu_dcache_req_if.addr}),
|
||||
.req_data_in ({tex_dcache_req_if.data, lsu_dcache_req_if.data}),
|
||||
.req_tag_in ({tex_tag_in, lsu_dcache_req_if.tag}),
|
||||
.req_ready_in ({tex_dcache_req_if.ready, lsu_dcache_req_if.ready}),
|
||||
|
||||
// Dcache request
|
||||
.req_valid_out (dcache_req_if.valid),
|
||||
.req_rw_out (dcache_req_if.rw),
|
||||
.req_byteen_out (dcache_req_if.byteen),
|
||||
.req_addr_out (dcache_req_if.addr),
|
||||
.req_data_out (dcache_req_if.data),
|
||||
.req_tag_out (dcache_req_if.tag),
|
||||
.req_ready_out (dcache_req_if.ready),
|
||||
|
||||
// Dcache response
|
||||
.rsp_valid_in (dcache_rsp_if.valid),
|
||||
.rsp_tmask_in (dcache_rsp_if.tmask),
|
||||
.rsp_tag_in (dcache_rsp_if.tag),
|
||||
.rsp_data_in (dcache_rsp_if.data),
|
||||
.rsp_ready_in (dcache_rsp_if.ready),
|
||||
|
||||
// Tex/LSU response
|
||||
.rsp_valid_out ({tex_dcache_rsp_if.valid, lsu_dcache_rsp_if.valid}),
|
||||
.rsp_tmask_out ({tex_dcache_rsp_if.tmask, lsu_dcache_rsp_if.tmask}),
|
||||
.rsp_data_out ({tex_dcache_rsp_if.data, lsu_dcache_rsp_if.data}),
|
||||
.rsp_tag_out ({tex_tag_out, lsu_dcache_rsp_if.tag}),
|
||||
.rsp_ready_out ({tex_dcache_rsp_if.ready, lsu_dcache_rsp_if.ready})
|
||||
);
|
||||
|
||||
`endif
|
||||
|
||||
wire [`NUM_WARPS-1:0] csr_pending;
|
||||
wire [`NUM_WARPS-1:0] fpu_pending;
|
||||
|
||||
`RESET_RELAY (alu_reset);
|
||||
`RESET_RELAY (lsu_reset);
|
||||
|
@ -49,7 +138,7 @@ module VX_execute #(
|
|||
`RESET_RELAY (gpu_reset);
|
||||
|
||||
VX_alu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) alu_unit (
|
||||
.clk (clk),
|
||||
.reset (alu_reset),
|
||||
|
@ -59,29 +148,37 @@ module VX_execute #(
|
|||
);
|
||||
|
||||
VX_lsu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) lsu_unit (
|
||||
`SCOPE_BIND_VX_execute_lsu_unit
|
||||
.clk (clk),
|
||||
.reset (lsu_reset),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.dcache_req_if (lsu_dcache_req_if),
|
||||
.dcache_rsp_if (lsu_dcache_rsp_if),
|
||||
`else
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if),
|
||||
`endif
|
||||
.lsu_req_if (lsu_req_if),
|
||||
.ld_commit_if (ld_commit_if),
|
||||
.st_commit_if (st_commit_if)
|
||||
);
|
||||
|
||||
VX_csr_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) csr_unit (
|
||||
.clk (clk),
|
||||
.reset (csr_reset),
|
||||
`ifdef PERF_ENABLE
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_memsys_if (perf_memsys_if),
|
||||
.perf_pipeline_if (perf_pipeline_if),
|
||||
`endif
|
||||
.cmt_to_csr_if (cmt_to_csr_if),
|
||||
.fpu_to_csr_if (fpu_to_csr_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
`endif
|
||||
.csr_req_if (csr_req_if),
|
||||
.csr_commit_if (csr_commit_if),
|
||||
.fpu_pending (fpu_pending),
|
||||
|
@ -93,7 +190,7 @@ module VX_execute #(
|
|||
`RESET_RELAY (fpu_reset);
|
||||
|
||||
VX_fpu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) fpu_unit (
|
||||
.clk (clk),
|
||||
.reset (fpu_reset),
|
||||
|
@ -122,12 +219,17 @@ module VX_execute #(
|
|||
`endif
|
||||
|
||||
VX_gpu_unit #(
|
||||
.CORE_ID (CORE_ID)
|
||||
.CORE_ID(CORE_ID)
|
||||
) gpu_unit (
|
||||
`SCOPE_BIND_VX_execute_gpu_unit
|
||||
.clk (clk),
|
||||
.reset (gpu_reset),
|
||||
.gpu_req_if (gpu_req_if),
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.dcache_req_if (tex_dcache_req_if),
|
||||
.dcache_rsp_if (tex_dcache_rsp_if),
|
||||
`endif
|
||||
.warp_ctl_if (warp_ctl_if),
|
||||
.gpu_commit_if (gpu_commit_if)
|
||||
);
|
||||
|
@ -139,4 +241,4 @@ module VX_execute #(
|
|||
&& (`BR_OP(alu_req_if.op_type) == `BR_EBREAK
|
||||
|| `BR_OP(alu_req_if.op_type) == `BR_ECALL);
|
||||
|
||||
endmodule
|
||||
endmodule
|
|
@ -11,25 +11,43 @@ module VX_gpu_unit #(
|
|||
// Inputs
|
||||
VX_gpu_req_if gpu_req_if,
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
VX_tex_csr_if tex_csr_if,
|
||||
|
||||
VX_dcache_req_if dcache_req_if,
|
||||
VX_dcache_rsp_if dcache_rsp_if,
|
||||
`endif
|
||||
|
||||
// Outputs
|
||||
VX_warp_ctl_if warp_ctl_if,
|
||||
VX_commit_if gpu_commit_if
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
`UNUSED_VAR (clk)
|
||||
`UNUSED_VAR (reset)
|
||||
|
||||
wire rsp_valid;
|
||||
wire [`NW_BITS-1:0] rsp_wid;
|
||||
wire [`NUM_THREADS-1:0] rsp_tmask;
|
||||
wire [31:0] rsp_PC;
|
||||
wire [`NR_BITS-1:0] rsp_rd;
|
||||
wire rsp_wb;
|
||||
wire [`NUM_THREADS-1:0][31:0] rsp_data;
|
||||
|
||||
gpu_tmc_t tmc;
|
||||
gpu_wspawn_t wspawn;
|
||||
gpu_barrier_t barrier;
|
||||
gpu_split_t split;
|
||||
|
||||
wire [(`NUM_THREADS * 32)-1:0] warp_ctl_data;
|
||||
wire is_warp_ctl;
|
||||
|
||||
wire stall_in, stall_out;
|
||||
|
||||
wire is_wspawn = (gpu_req_if.op_type == `GPU_WSPAWN);
|
||||
wire is_tmc = (gpu_req_if.op_type == `GPU_TMC);
|
||||
wire is_split = (gpu_req_if.op_type == `GPU_SPLIT);
|
||||
wire is_bar = (gpu_req_if.op_type == `GPU_BAR);
|
||||
|
||||
|
||||
// tmc
|
||||
|
||||
wire [`NUM_THREADS-1:0] tmc_new_mask;
|
||||
|
@ -41,7 +59,7 @@ module VX_gpu_unit #(
|
|||
|
||||
// wspawn
|
||||
|
||||
wire [31:0] wspawn_pc = gpu_req_if.rs2_data;
|
||||
wire [31:0] wspawn_pc = gpu_req_if.rs2_data[0];
|
||||
wire [`NUM_WARPS-1:0] wspawn_wmask;
|
||||
for (genvar i = 0; i < `NUM_WARPS; i++) begin
|
||||
assign wspawn_wmask[i] = (i < gpu_req_if.rs1_data[0]);
|
||||
|
@ -71,37 +89,113 @@ module VX_gpu_unit #(
|
|||
|
||||
assign barrier.valid = is_bar;
|
||||
assign barrier.id = gpu_req_if.rs1_data[0][`NB_BITS-1:0];
|
||||
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data - 1);
|
||||
assign barrier.size_m1 = (`NW_BITS)'(gpu_req_if.rs2_data[0] - 1);
|
||||
|
||||
// pack warp ctl result
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
assign warp_ctl_data = {tmc, wspawn, barrier, split};
|
||||
`IGNORE_WARNINGS_END
|
||||
|
||||
// texture
|
||||
|
||||
`ifdef EXT_TEX_ENABLE
|
||||
|
||||
`UNUSED_VAR (gpu_req_if.op_mod)
|
||||
|
||||
VX_tex_req_if tex_req_if();
|
||||
VX_tex_rsp_if tex_rsp_if();
|
||||
|
||||
wire is_tex = (gpu_req_if.op_type == `GPU_TEX);
|
||||
|
||||
assign tex_req_if.valid = gpu_req_if.valid && is_tex;
|
||||
assign tex_req_if.wid = gpu_req_if.wid;
|
||||
assign tex_req_if.tmask = gpu_req_if.tmask;
|
||||
assign tex_req_if.PC = gpu_req_if.PC;
|
||||
assign tex_req_if.rd = gpu_req_if.rd;
|
||||
assign tex_req_if.wb = gpu_req_if.wb;
|
||||
|
||||
assign tex_req_if.unit = gpu_req_if.op_mod[`NTEX_BITS-1:0];
|
||||
assign tex_req_if.coords[0] = gpu_req_if.rs1_data;
|
||||
assign tex_req_if.coords[1] = gpu_req_if.rs2_data;
|
||||
assign tex_req_if.lod = gpu_req_if.rs3_data;
|
||||
|
||||
VX_tex_unit #(
|
||||
.CORE_ID(CORE_ID)
|
||||
) tex_unit (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.tex_req_if (tex_req_if),
|
||||
.tex_csr_if (tex_csr_if),
|
||||
.tex_rsp_if (tex_rsp_if),
|
||||
.dcache_req_if (dcache_req_if),
|
||||
.dcache_rsp_if (dcache_rsp_if)
|
||||
);
|
||||
|
||||
assign tex_rsp_if.ready = !stall_out;
|
||||
|
||||
assign stall_in = (is_tex && ~tex_req_if.ready)
|
||||
|| (~is_tex && (tex_rsp_if.valid || stall_out));
|
||||
|
||||
assign is_warp_ctl = !(is_tex || tex_rsp_if.valid);
|
||||
|
||||
assign rsp_valid = tex_rsp_if.valid || (gpu_req_if.valid && ~is_tex);
|
||||
assign rsp_wid = tex_rsp_if.valid ? tex_rsp_if.wid : gpu_req_if.wid;
|
||||
assign rsp_tmask = tex_rsp_if.valid ? tex_rsp_if.tmask : gpu_req_if.tmask;
|
||||
assign rsp_PC = tex_rsp_if.valid ? tex_rsp_if.PC : gpu_req_if.PC;
|
||||
assign rsp_rd = tex_rsp_if.rd;
|
||||
assign rsp_wb = tex_rsp_if.valid && tex_rsp_if.wb;
|
||||
assign rsp_data = tex_rsp_if.valid ? tex_rsp_if.data : warp_ctl_data;
|
||||
|
||||
`else
|
||||
|
||||
`UNUSED_VAR (gpu_req_if.op_mod)
|
||||
`UNUSED_VAR (gpu_req_if.rs2_data)
|
||||
`UNUSED_VAR (gpu_req_if.rs3_data)
|
||||
`UNUSED_VAR (gpu_req_if.wb)
|
||||
`UNUSED_VAR (gpu_req_if.rd)
|
||||
|
||||
assign stall_in = stall_out;
|
||||
assign is_warp_ctl = 1;
|
||||
|
||||
assign rsp_valid = gpu_req_if.valid;
|
||||
assign rsp_wid = gpu_req_if.wid;
|
||||
assign rsp_tmask = gpu_req_if.tmask;
|
||||
assign rsp_PC = gpu_req_if.PC;
|
||||
assign rsp_rd = 0;
|
||||
assign rsp_wb = 0;
|
||||
assign rsp_data = warp_ctl_data;
|
||||
|
||||
`endif
|
||||
|
||||
wire is_warp_ctl_r;
|
||||
|
||||
// output
|
||||
|
||||
wire stall = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
assign stall_out = ~gpu_commit_if.ready && gpu_commit_if.valid;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + `GPU_TMC_SIZE + `GPU_WSPAWN_SIZE + `GPU_SPLIT_SIZE + `GPU_BARRIER_SIZE),
|
||||
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32) + 1),
|
||||
.RESETW (1)
|
||||
) pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (!stall),
|
||||
.data_in ({gpu_req_if.valid, gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.rd, gpu_req_if.wb, tmc, wspawn, split, barrier}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.split, warp_ctl_if.barrier})
|
||||
.enable (!stall_out),
|
||||
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data, is_warp_ctl}),
|
||||
.data_out ({gpu_commit_if.valid, gpu_commit_if.wid, gpu_commit_if.tmask, gpu_commit_if.PC, gpu_commit_if.rd, gpu_commit_if.wb, gpu_commit_if.data, is_warp_ctl_r})
|
||||
);
|
||||
|
||||
assign gpu_commit_if.eop = 1'b1;
|
||||
|
||||
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
// warp control response
|
||||
|
||||
`IGNORE_WARNINGS_BEGIN
|
||||
assign {warp_ctl_if.tmc, warp_ctl_if.wspawn, warp_ctl_if.barrier, warp_ctl_if.split} = gpu_commit_if.data;
|
||||
`IGNORE_WARNINGS_END
|
||||
assign warp_ctl_if.valid = gpu_commit_if.valid && gpu_commit_if.ready && is_warp_ctl_r;
|
||||
assign warp_ctl_if.wid = gpu_commit_if.wid;
|
||||
|
||||
// can accept new request?
|
||||
assign gpu_req_if.ready = ~stall;
|
||||
assign gpu_req_if.ready = ~stall_in;
|
||||
|
||||
`SCOPE_ASSIGN (gpu_req_fire, gpu_req_if.valid && gpu_req_if.ready);
|
||||
`SCOPE_ASSIGN (gpu_req_wid, gpu_req_if.wid);
|
||||
`SCOPE_ASSIGN (gpu_req_tmask, gpu_req_if.tmask);
|
||||
`SCOPE_ASSIGN (gpu_req_op_type, gpu_req_if.op_type);
|
||||
`SCOPE_ASSIGN (gpu_req_rs1, gpu_req_if.rs1_data[0]);
|
||||
`SCOPE_ASSIGN (gpu_req_rs2, gpu_req_if.rs2_data);
|
||||
`SCOPE_ASSIGN (gpu_rsp_valid, warp_ctl_if.valid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_wid, warp_ctl_if.wid);
|
||||
`SCOPE_ASSIGN (gpu_rsp_tmc, warp_ctl_if.tmc);
|
||||
|
|
|
@ -112,14 +112,14 @@ module VX_instr_demux (
|
|||
wire gpu_req_valid = ibuffer_if.valid && (ibuffer_if.ex_type == `EX_GPU);
|
||||
|
||||
VX_skid_buffer #(
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `NR_BITS + 1 + (`NUM_THREADS * 32 + 32))
|
||||
.DATAW (`NW_BITS + `NUM_THREADS + 32 + 32 + `GPU_BITS + `MOD_BITS + `NR_BITS + 1 + (3 * `NUM_THREADS * 32)) //update number of bits
|
||||
) gpu_buffer (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in (gpu_req_valid),
|
||||
.ready_in (gpu_req_ready),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data[0]}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data}),
|
||||
.data_in ({ibuffer_if.wid, ibuffer_if.tmask, ibuffer_if.PC, next_PC, `GPU_OP(ibuffer_if.op_type), ibuffer_if.op_mod, ibuffer_if.rd, ibuffer_if.wb, gpr_rsp_if.rs1_data, gpr_rsp_if.rs2_data, gpr_rsp_if.rs3_data}),
|
||||
.data_out ({gpu_req_if.wid, gpu_req_if.tmask, gpu_req_if.PC, gpu_req_if.next_PC, gpu_req_if.op_type, gpu_req_if.op_mod, gpu_req_if.rd, gpu_req_if.wb, gpu_req_if.rs1_data, gpu_req_if.rs2_data, gpu_req_if.rs3_data}),
|
||||
.valid_out (gpu_req_if.valid),
|
||||
.ready_out (gpu_req_if.ready)
|
||||
);
|
||||
|
|
|
@ -219,6 +219,8 @@ module VX_issue #(
|
|||
`PRINT_ARRAY1D(gpu_req_if.rs1_data, `NUM_THREADS);
|
||||
$write(", rs2_data=");
|
||||
`PRINT_ARRAY1D(gpu_req_if.rs2_data, `NUM_THREADS);
|
||||
$write(", rs3_data=");
|
||||
`PRINT_ARRAY1D(gpu_req_if.rs3_data, `NUM_THREADS);
|
||||
$write("\n");
|
||||
end
|
||||
end
|
||||
|
|
|
@ -197,7 +197,7 @@ module VX_mem_unit # (
|
|||
.TAG_WIDTH (`DCORE_TAG_WIDTH-`SM_ENABLE)
|
||||
) smem_rsp_if();
|
||||
|
||||
VX_smem_arb #(
|
||||
VX_smem_arb #(
|
||||
.NUM_REQS (2),
|
||||
.LANES (`NUM_THREADS),
|
||||
.DATA_SIZE (4),
|
||||
|
|
|
@ -108,7 +108,10 @@ module VX_pipeline #(
|
|||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
VX_cmt_to_csr_if cmt_to_csr_if();
|
||||
VX_cmt_to_csr_if #(
|
||||
.SIZE ($clog2(3*`NUM_THREADS+1))
|
||||
) cmt_to_csr_if();
|
||||
|
||||
VX_decode_if decode_if();
|
||||
VX_branch_ctl_if branch_ctl_if();
|
||||
VX_warp_ctl_if warp_ctl_if();
|
||||
|
|
|
@ -85,11 +85,32 @@
|
|||
|
||||
`define UP(x) (((x) > 0) ? x : 1)
|
||||
|
||||
`define SAFE_RNG(h,l) `MAX(h,l) : l
|
||||
`define SAFE_RNG(h, l) `MAX(h,l) : l
|
||||
|
||||
`define RTRIM(x,s) x[$bits(x)-1:($bits(x)-s)]
|
||||
`define RTRIM(x, s) x[$bits(x)-1:($bits(x)-s)]
|
||||
|
||||
`define LTRIM(x,s) x[s-1:0]
|
||||
`define LTRIM(x, s) x[s-1:0]
|
||||
|
||||
`define PRINT_ARRAY1D(a, m) \
|
||||
$write("{"); \
|
||||
for (integer i = (m-1); i >= 0; --i) begin \
|
||||
if (i != (m-1)) $write(", "); \
|
||||
$write("0x%0h", a[i]); \
|
||||
end \
|
||||
$write("}"); \
|
||||
|
||||
`define PRINT_ARRAY2D(a, m, n) \
|
||||
$write("{"); \
|
||||
for (integer i = n-1; i >= 0; --i) begin \
|
||||
if (i != (n-1)) $write(", "); \
|
||||
$write("{"); \
|
||||
for (integer j = (m-1); j >= 0; --j) begin \
|
||||
if (j != (m-1)) $write(", "); \
|
||||
$write("0x%0h", a[i][j]); \
|
||||
end \
|
||||
$write("}"); \
|
||||
end \
|
||||
$write("}")
|
||||
|
||||
`define PRINT_ARRAY1D(a, m) \
|
||||
$write("{"); \
|
||||
|
|
|
@ -128,6 +128,7 @@ task print_ex_op (
|
|||
`GPU_SPLIT: $write("SPLIT");
|
||||
`GPU_JOIN: $write("JOIN");
|
||||
`GPU_BAR: $write("BAR");
|
||||
`GPU_TEX: $write("TEX");
|
||||
default: $write("?");
|
||||
endcase
|
||||
end
|
||||
|
|
|
@ -11,6 +11,7 @@ module VX_writeback #(
|
|||
VX_commit_if ld_commit_if,
|
||||
VX_commit_if csr_commit_if,
|
||||
VX_commit_if fpu_commit_if,
|
||||
VX_commit_if gpu_commit_if,
|
||||
|
||||
// outputs
|
||||
VX_writeback_if writeback_if
|
||||
|
@ -28,28 +29,30 @@ module VX_writeback #(
|
|||
wire [`NUM_THREADS-1:0][31:0] wb_data;
|
||||
wire wb_eop;
|
||||
|
||||
wire [3:0][DATAW-1:0] rsp_data;
|
||||
wire [3:0] rsp_ready;
|
||||
wire [4:0][DATAW-1:0] rsp_data;
|
||||
wire [4:0] rsp_ready;
|
||||
wire stall;
|
||||
|
||||
wire ld_valid = ld_commit_if.valid && ld_commit_if.wb;
|
||||
wire fpu_valid = fpu_commit_if.valid && fpu_commit_if.wb;
|
||||
wire csr_valid = csr_commit_if.valid && csr_commit_if.wb;
|
||||
wire alu_valid = alu_commit_if.valid && alu_commit_if.wb;
|
||||
wire gpu_valid = gpu_commit_if.valid && gpu_commit_if.wb;
|
||||
|
||||
assign rsp_data[0] = { ld_commit_if.wid, ld_commit_if.PC, ld_commit_if.tmask, ld_commit_if.rd, ld_commit_if.data, ld_commit_if.eop};
|
||||
assign rsp_data[1] = {fpu_commit_if.wid, fpu_commit_if.PC, fpu_commit_if.tmask, fpu_commit_if.rd, fpu_commit_if.data, fpu_commit_if.eop};
|
||||
assign rsp_data[2] = {csr_commit_if.wid, csr_commit_if.PC, csr_commit_if.tmask, csr_commit_if.rd, csr_commit_if.data, csr_commit_if.eop};
|
||||
assign rsp_data[3] = {alu_commit_if.wid, alu_commit_if.PC, alu_commit_if.tmask, alu_commit_if.rd, alu_commit_if.data, alu_commit_if.eop};
|
||||
assign rsp_data[4] = {gpu_commit_if.wid, gpu_commit_if.PC, gpu_commit_if.tmask, gpu_commit_if.rd, gpu_commit_if.data, gpu_commit_if.eop};
|
||||
|
||||
VX_stream_arbiter #(
|
||||
.NUM_REQS (4),
|
||||
.NUM_REQS (5),
|
||||
.DATAW (DATAW),
|
||||
.TYPE ("X")
|
||||
) rsp_arb (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.valid_in ({alu_valid, csr_valid, fpu_valid, ld_valid}),
|
||||
.valid_in ({gpu_valid, alu_valid, csr_valid, fpu_valid, ld_valid}),
|
||||
.data_in (rsp_data),
|
||||
.ready_in (rsp_ready),
|
||||
.valid_out (wb_valid),
|
||||
|
@ -61,6 +64,7 @@ module VX_writeback #(
|
|||
assign fpu_commit_if.ready = rsp_ready[1] || ~fpu_commit_if.wb;
|
||||
assign csr_commit_if.ready = rsp_ready[2] || ~csr_commit_if.wb;
|
||||
assign alu_commit_if.ready = rsp_ready[3] || ~alu_commit_if.wb;
|
||||
assign gpu_commit_if.ready = rsp_ready[4] || ~gpu_commit_if.wb;
|
||||
|
||||
assign stall = ~writeback_if.ready && writeback_if.valid;
|
||||
|
||||
|
|
|
@ -45,8 +45,10 @@ localparam CCI_DATA_WIDTH = $bits(t_ccip_clData);
|
|||
localparam CCI_DATA_SIZE = CCI_DATA_WIDTH / 8;
|
||||
localparam CCI_ADDR_WIDTH = 32 - $clog2(CCI_DATA_SIZE);
|
||||
|
||||
|
||||
localparam AVS_RD_QUEUE_SIZE = 4;
|
||||
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH));
|
||||
localparam _AVS_REQ_TAGW_VX = `VX_MEM_TAG_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(`VX_MEM_DATA_WIDTH);
|
||||
localparam AVS_REQ_TAGW_VX = `MAX(`VX_MEM_TAG_WIDTH, _AVS_REQ_TAGW_VX);
|
||||
localparam AVS_REQ_TAGW_CCI = `MAX(CCI_ADDR_WIDTH, CCI_ADDR_WIDTH + $clog2(LMEM_DATA_WIDTH) - $clog2(CCI_DATA_WIDTH));
|
||||
localparam AVS_REQ_TAGW = `MAX(AVS_REQ_TAGW_VX, AVS_REQ_TAGW_CCI);
|
||||
|
||||
|
|
4
hw/rtl/cache/VX_bank.v
vendored
|
@ -253,7 +253,7 @@ module VX_bank #(
|
|||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_pc_st0, debug_wid_st0} = tag_st0[CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS];
|
||||
assign {debug_pc_st0, debug_wid_st0} = tag_st0[`CACHE_REQ_INFO_RNG];
|
||||
end else begin
|
||||
assign {debug_pc_st0, debug_wid_st0} = 0;
|
||||
end
|
||||
|
@ -322,7 +322,7 @@ module VX_bank #(
|
|||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
if (CORE_TAG_WIDTH != CORE_TAG_ID_BITS && CORE_TAG_ID_BITS != 0) begin
|
||||
assign {debug_pc_st1, debug_wid_st1} = tag_st1[CORE_TAG_WIDTH-1:CORE_TAG_ID_BITS];
|
||||
assign {debug_pc_st1, debug_wid_st1} = tag_st1[`CACHE_REQ_INFO_RNG];
|
||||
end else begin
|
||||
assign {debug_pc_st1, debug_wid_st1} = 0;
|
||||
end
|
||||
|
|
12
hw/rtl/cache/VX_cache_define.vh
vendored
|
@ -1,5 +1,5 @@
|
|||
`ifndef VX_CACHE_CONFIG
|
||||
`define VX_CACHE_CONFIG
|
||||
`ifndef VX_CACHE_DEFINE
|
||||
`define VX_CACHE_DEFINE
|
||||
|
||||
`include "VX_platform.vh"
|
||||
|
||||
|
@ -51,17 +51,19 @@
|
|||
|
||||
`define LINE_TAG_ADDR(x) x[`LINE_ADDR_WIDTH-1 : `LINE_SELECT_BITS]
|
||||
|
||||
`define CACHE_REQ_INFO_RNG CORE_TAG_WIDTH-1:(CORE_TAG_WIDTH-`NW_BITS-32)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
`define CORE_RSP_TAGS ((CORE_TAG_ID_BITS != 0) ? 1 : NUM_REQS)
|
||||
|
||||
`define BANK_READY_COUNT ((SHARED_BANK_READY != 0) ? 1 : NUM_BANKS)
|
||||
|
||||
`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
|
||||
`define MEM_ADDR_BANK(x) x[`BANK_SELECT_BITS+BANK_ADDR_OFFSET-1 : BANK_ADDR_OFFSET]
|
||||
|
||||
`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
|
||||
`define MEM_TO_LINE_ADDR(x) x[`MEM_ADDR_WIDTH-1 : `BANK_SELECT_BITS]
|
||||
|
||||
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
||||
`define LINE_TO_MEM_ADDR(x, i) {x, `BANK_SELECT_BITS'(i)}
|
||||
|
||||
`define LINE_TO_BYTE_ADDR(x, i) {x, (32-$bits(x))'(i << (32-$bits(x)-`BANK_SELECT_BITS))}
|
||||
|
||||
|
|
2
hw/rtl/cache/VX_shared_mem.v
vendored
|
@ -328,7 +328,7 @@ module VX_shared_mem #(
|
|||
end else begin
|
||||
$display("%t: cache%0d:%0d core-rd-rsp: addr=%0h, tag=%0h, byteen=%b, data=%0h, wid=%0d, PC=%0h",
|
||||
$time, CACHE_ID, i, per_bank_core_req_addr[i], per_bank_core_req_tag[i], per_bank_core_req_byteen[i], per_bank_core_rsp_data[i],
|
||||
debug_wid_st1[i], debug_pc_st1[i]);
|
||||
debug_wid_st1[i], debug_pc_st1[i]);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -3,8 +3,7 @@
|
|||
`include "defs_div_sqrt_mvp.sv"
|
||||
|
||||
`TRACING_OFF
|
||||
module VX_fpu_fpnew
|
||||
#(
|
||||
module VX_fpu_fpnew #(
|
||||
parameter TAGW = 1,
|
||||
parameter FMULADD = 1,
|
||||
parameter FDIVSQRT = 1,
|
||||
|
|
|
@ -3,10 +3,12 @@
|
|||
|
||||
`include "VX_define.vh"
|
||||
|
||||
interface VX_cmt_to_csr_if ();
|
||||
interface VX_cmt_to_csr_if #(
|
||||
parameter SIZE
|
||||
)();
|
||||
|
||||
wire valid;
|
||||
wire [$clog2(3*`NUM_THREADS+1)-1:0] commit_size;
|
||||
wire valid;
|
||||
wire [SIZE-1:0] commit_size;
|
||||
|
||||
endinterface
|
||||
|
||||
|
|
|
@ -12,8 +12,10 @@ interface VX_gpu_req_if();
|
|||
wire [31:0] PC;
|
||||
wire [31:0] next_PC;
|
||||
wire [`GPU_BITS-1:0] op_type;
|
||||
wire [`MOD_BITS-1:0] op_mod;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs1_data;
|
||||
wire [31:0] rs2_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs2_data;
|
||||
wire [`NUM_THREADS-1:0][31:0] rs3_data;
|
||||
wire [`NR_BITS-1:0] rd;
|
||||
wire wb;
|
||||
|
||||
|
|
14
hw/rtl/interfaces/VX_tex_csr_if.v
Normal file
|
@ -0,0 +1,14 @@
|
|||
`ifndef VX_TEX_CSR_IF
|
||||
`define VX_TEX_CSR_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Texture CSR write interface: bundles one CSR write (enable, address, data).
// NOTE(review): presumably driven by the CSR unit and consumed by the texture
// unit - confirm against the instantiating modules.
interface VX_tex_csr_if ();
|
||||
|
||||
// write strobe; presumably write_addr/write_data are only meaningful while set - confirm
wire write_enable;
|
||||
// CSR address of the write (`CSR_ADDR_BITS wide)
wire [`CSR_ADDR_BITS-1:0] write_addr;
|
||||
// 32-bit value to write
wire [31:0] write_data;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
25
hw/rtl/interfaces/VX_tex_req_if.v
Normal file
|
@ -0,0 +1,25 @@
|
|||
`ifndef VX_TEX_REQ_IF
|
||||
`define VX_TEX_REQ_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Texture request interface: one valid/ready channel carrying a texture
// sampling request with per-thread operands.
interface VX_tex_req_if ();
|
||||
|
||||
// request qualifier, paired with 'ready' below
wire valid;
|
||||
// presumably the issuing warp id (`NW_BITS wide) - confirm
wire [`NW_BITS-1:0] wid;
|
||||
// one bit per thread (`NUM_THREADS wide); presumably the active-thread mask
wire [`NUM_THREADS-1:0] tmask;
|
||||
// 32-bit PC carried along with the request
wire [31:0] PC;
|
||||
// destination register index (`NR_BITS wide)
wire [`NR_BITS-1:0] rd;
|
||||
// presumably "writeback enable" for rd - confirm
wire wb;
|
||||
|
||||
// presumably selects which texture unit/stage state to use (`NTEX_BITS wide) - confirm
wire [`NTEX_BITS-1:0] unit;
|
||||
// two 32-bit coordinates per thread, indexed [dimension][thread]
wire [1:0][`NUM_THREADS-1:0][31:0] coords;
|
||||
// one 32-bit level-of-detail value per thread
wire [`NUM_THREADS-1:0][31:0] lod;
|
||||
|
||||
// back-pressure signal paired with 'valid'
wire ready;
|
||||
|
||||
endinterface
|
||||
`endif
|
||||
|
||||
|
||||
|
21
hw/rtl/interfaces/VX_tex_rsp_if.v
Normal file
|
@ -0,0 +1,21 @@
|
|||
`ifndef VX_TEX_RSP_IF
|
||||
`define VX_TEX_RSP_IF
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Texture response interface: one valid/ready channel carrying the sampled
// data for a request, together with the request's bookkeeping fields.
interface VX_tex_rsp_if ();
|
||||
|
||||
// response qualifier, paired with 'ready' below
wire valid;
|
||||
// presumably the warp id of the originating request - confirm
wire [`NW_BITS-1:0] wid;
|
||||
// one bit per thread (`NUM_THREADS wide); presumably the active-thread mask
wire [`NUM_THREADS-1:0] tmask;
|
||||
// 32-bit PC carried along with the response
wire [31:0] PC;
|
||||
// destination register index (`NR_BITS wide)
wire [`NR_BITS-1:0] rd;
|
||||
// presumably "writeback enable" for rd - confirm
wire wb;
|
||||
// one 32-bit result per thread
wire [`NUM_THREADS-1:0][31:0] data;
|
||||
// back-pressure signal paired with 'valid'
wire ready;
|
||||
|
||||
endinterface
|
||||
|
||||
`endif
|
||||
|
||||
|
151
hw/rtl/tex_unit/VX_tex_addr.v
Normal file
|
@ -0,0 +1,151 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Texture address generator.
//
// For every request lane this module:
//   stage 0: offsets each of the two coordinates by +/- half a texel when
//            filtering is enabled, applies the wrap mode (VX_tex_wrap), looks
//            up the texel stride for the format (VX_tex_stride) and computes
//            the mip-level base address (req_baseaddr + req_mipoffset);
//   stage 1: scales the wrapped fixed-point coordinates to integer texel
//            indices and produces four byte addresses (the lo/hi combinations
//            of the two coordinates) plus the per-dimension blend fractions.
//
// Both pipeline registers share one enable, so the whole unit stalls together
// while a valid response is not accepted (rsp_valid && ~rsp_ready).
module VX_tex_addr #(
    parameter CORE_ID        = 0,
    parameter REQ_INFO_WIDTH = 1,   // width of the opaque side-band info passed through
    parameter NUM_REQS       = 1    // number of request lanes
) (
    input wire clk,
    input wire reset,

    // inputs

    input wire                                          req_valid,
    input wire [NUM_REQS-1:0]                           req_tmask,
    input wire [1:0][NUM_REQS-1:0][31:0]                req_coords,     // [dimension][lane]
    input wire [`TEX_FORMAT_BITS-1:0]                   req_format,
    input wire [`TEX_FILTER_BITS-1:0]                   req_filter,
    input wire [1:0][`TEX_WRAP_BITS-1:0]                req_wraps,      // wrap mode per dimension
    input wire [`TEX_ADDR_BITS-1:0]                     req_baseaddr,
    input wire [NUM_REQS-1:0][`TEX_MIPOFF_BITS-1:0]     req_mipoffset,
    input wire [1:0][NUM_REQS-1:0][`TEX_DIM_BITS-1:0]   req_logdims,    // log2 of the size, [dimension][lane]
    input wire [REQ_INFO_WIDTH-1:0]                     req_info,
    output wire                                         req_ready,

    // outputs

    output wire                                         rsp_valid,
    output wire [NUM_REQS-1:0]                          rsp_tmask,
    output wire [`TEX_FILTER_BITS-1:0]                  rsp_filter,
    output wire [`TEX_STRIDE_BITS-1:0]                  rsp_stride,     // log2 of the texel size in bytes
    output wire [NUM_REQS-1:0][3:0][31:0]               rsp_addr,       // [lane][quad texel]
    output wire [1:0][NUM_REQS-1:0][`BLEND_FRAC-1:0]    rsp_blends,     // [dimension][lane]
    output wire [REQ_INFO_WIDTH-1:0]                    rsp_info,
    input wire                                          rsp_ready
);

    `UNUSED_PARAM (CORE_ID)

    wire                        valid_s0;
    wire [NUM_REQS-1:0]         tmask_s0;
    wire [`TEX_FILTER_BITS-1:0] filter_s0;
    wire [REQ_INFO_WIDTH-1:0]   req_info_s0;

    wire [1:0][NUM_REQS-1:0][`FIXED_FRAC-1:0]   clamped_lo, clamped_lo_s0;
    wire [1:0][NUM_REQS-1:0][`FIXED_FRAC-1:0]   clamped_hi, clamped_hi_s0;
    wire [`TEX_STRIDE_BITS-1:0]                 log_stride, log_stride_s0;
    wire [NUM_REQS-1:0][31:0]                   mip_addr, mip_addr_s0;
    wire [1:0][NUM_REQS-1:0][`TEX_DIM_BITS-1:0] log_dims_s0;

    wire stall_out;

    // stride

    VX_tex_stride #(
        .CORE_ID (CORE_ID)
    ) tex_stride (
        .format     (req_format),
        .log_stride (log_stride)
    );

    // addressing mode

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        for (genvar j = 0; j < 2; ++j) begin
            wire [31:0] coord_lo, coord_hi;

            // with filtering on, sample half a texel on either side of the coordinate
            assign coord_lo = req_coords[j][i] - (req_filter ? (`FIXED_HALF >> req_logdims[j][i]) : 0);
            assign coord_hi = req_coords[j][i] + (req_filter ? (`FIXED_HALF >> req_logdims[j][i]) : 0);

            VX_tex_wrap #(
                .CORE_ID (CORE_ID)
            ) tex_wrap_lo (
                .wrap_i  (req_wraps[j]),
                .coord_i (coord_lo),
                .coord_o (clamped_lo[j][i])
            );

            VX_tex_wrap #(
                .CORE_ID (CORE_ID)
            ) tex_wrap_hi (
                .wrap_i  (req_wraps[j]),
                .coord_i (coord_hi),
                .coord_o (clamped_hi[j][i])
            );
        end
        assign mip_addr[i] = req_baseaddr + 32'(req_mipoffset[i]);
    end

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + REQ_INFO_WIDTH + NUM_REQS * (2 * `TEX_DIM_BITS + 32 + 2 * 2 * `FIXED_FRAC)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({req_valid, req_tmask, req_filter, log_stride,    req_info,    req_logdims, mip_addr,    clamped_lo,    clamped_hi}),
        .data_out ({valid_s0,  tmask_s0,  filter_s0,  log_stride_s0, req_info_s0, log_dims_s0, mip_addr_s0, clamped_lo_s0, clamped_hi_s0})
    );

    // addresses generation

    wire [1:0][NUM_REQS-1:0][`FIXED_INT-1:0]  scaled_lo, scaled_hi;
    wire [1:0][NUM_REQS-1:0][`BLEND_FRAC-1:0] blends;
    wire [NUM_REQS-1:0][3:0][31:0]            addr;

    for (genvar i = 0; i < NUM_REQS; ++i) begin
        for (genvar j = 0; j < 2; ++j) begin
            // keep the top log_dims bits of the fraction: the integer texel index
            assign scaled_lo[j][i] = `FIXED_INT'(clamped_lo_s0[j][i] >> ((`FIXED_FRAC) - log_dims_s0[j][i]));
            assign scaled_hi[j][i] = `FIXED_INT'(clamped_hi_s0[j][i] >> ((`FIXED_FRAC) - log_dims_s0[j][i]));
            assign blends[j][i]    = filter_s0 ? clamped_lo_s0[j][i][`BLEND_FRAC-1:0] : `BLEND_FRAC'(0);
        end
    end

    // Byte address of each quad texel:
    //   mip base + ((index0 + (index1 << log2(size0))) << log2(texel bytes))
    // The texel offset must be parenthesized before adding the base: binary '+'
    // binds tighter than '<<', so without the outer parentheses the mip base
    // address itself would be shifted by the stride.
    for (genvar i = 0; i < NUM_REQS; ++i) begin
        assign addr[i][0] = mip_addr_s0[i] + ((32'(scaled_lo[0][i]) + (32'(scaled_lo[1][i]) << log_dims_s0[0][i])) << log_stride_s0);
        assign addr[i][1] = mip_addr_s0[i] + ((32'(scaled_hi[0][i]) + (32'(scaled_lo[1][i]) << log_dims_s0[0][i])) << log_stride_s0);
        assign addr[i][2] = mip_addr_s0[i] + ((32'(scaled_lo[0][i]) + (32'(scaled_hi[1][i]) << log_dims_s0[0][i])) << log_stride_s0);
        assign addr[i][3] = mip_addr_s0[i] + ((32'(scaled_hi[0][i]) + (32'(scaled_hi[1][i]) << log_dims_s0[0][i])) << log_stride_s0);
    end

    // hold both stages while the output is valid but not accepted
    assign stall_out = rsp_valid && ~rsp_ready;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (NUM_REQS * 4 * 32) + (2 * NUM_REQS * `BLEND_FRAC) + REQ_INFO_WIDTH),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({valid_s0,  tmask_s0,  filter_s0,  log_stride_s0, addr,     blends,     req_info_s0}),
        .data_out ({rsp_valid, rsp_tmask, rsp_filter, rsp_stride,    rsp_addr, rsp_blends, rsp_info})
    );

    assign req_ready = ~stall_out;

`ifdef DBG_PRINT_TEX
    // trace only: the low bits of rsp_info are decoded as {wid, PC}
    wire [`NW_BITS-1:0] rsp_wid;
    wire [31:0]         rsp_PC;
    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    always @(posedge clk) begin
        if (rsp_valid && rsp_ready) begin
            $write("%t: core%0d-tex-addr: wid=%0d, PC=%0h, tmask=%b, req_filter=%0d, stride=%0d, addr=",
                    $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask, rsp_filter, rsp_stride);
            `PRINT_ARRAY2D(rsp_addr, 4, NUM_REQS);
            $write("\n");
        end
    end
`endif

endmodule
|
42
hw/rtl/tex_unit/VX_tex_define.vh
Normal file
|
@ -0,0 +1,42 @@
|
|||
`ifndef VX_TEX_DEFINE
|
||||
`define VX_TEX_DEFINE
|
||||
|
||||
`include "VX_define.vh"
|
||||
|
||||
// Fixed-point layout of a 32-bit value: FIXED_FRAC fractional bits, the rest integer bits.
`define FIXED_FRAC 20
|
||||
`define FIXED_INT (32 - `FIXED_FRAC)
|
||||
// 1.0 in this fixed-point layout
`define FIXED_ONE (2 ** `FIXED_FRAC)
|
||||
// 0.5 in this fixed-point layout
`define FIXED_HALF (`FIXED_ONE >> 1)
|
||||
// mask selecting the fractional bits
`define FIXED_MASK (`FIXED_ONE - 1)
|
||||
|
||||
// Clamp x to [lo, hi].
// NOTE(review): the lower bound is compared as signed but the upper bound as
// unsigned, and the arguments are not parenthesized - confirm this is intended
// for every call site.
`define CLAMP(x,lo,hi) (($signed(x) < $signed(lo)) ? lo : ((x > hi) ? hi : x))
|
||||
|
||||
// Bit widths of the texture state fields
`define TEX_ADDR_BITS 32
|
||||
`define TEX_FORMAT_BITS 3
|
||||
`define TEX_WRAP_BITS 2
|
||||
`define TEX_DIM_BITS 4
|
||||
`define TEX_FILTER_BITS 1
|
||||
|
||||
`define TEX_MIPOFF_BITS (2*12+1)
|
||||
`define TEX_STRIDE_BITS 2
|
||||
|
||||
`define TEX_LOD_BITS 4
|
||||
`define TEX_MIP_BITS (`NTEX_BITS + `TEX_LOD_BITS)
|
||||
|
||||
// Wrap mode encodings
`define TEX_WRAP_CLAMP 0
|
||||
`define TEX_WRAP_REPEAT 1
|
||||
`define TEX_WRAP_MIRROR 2
|
||||
|
||||
// Bits per color channel
`define TEX_COLOR_BITS 8
|
||||
|
||||
// Blend factor: BLEND_FRAC fractional bits, BLEND_ONE represents 1.0
`define BLEND_FRAC 8
|
||||
`define BLEND_ONE (2 ** `BLEND_FRAC)
|
||||
|
||||
// Texel format encodings (TEX_FORMAT_BITS wide)
`define TEX_FORMAT_R8G8B8A8 `TEX_FORMAT_BITS'(0)
|
||||
`define TEX_FORMAT_R5G6B5 `TEX_FORMAT_BITS'(1)
|
||||
`define TEX_FORMAT_R4G4B4A4 `TEX_FORMAT_BITS'(2)
|
||||
`define TEX_FORMAT_L8A8 `TEX_FORMAT_BITS'(3)
|
||||
`define TEX_FORMAT_L8 `TEX_FORMAT_BITS'(4)
|
||||
`define TEX_FORMAT_A8 `TEX_FORMAT_BITS'(5)
|
||||
|
||||
`endif
|
58
hw/rtl/tex_unit/VX_tex_format.v
Normal file
|
@ -0,0 +1,58 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Texel format converter (purely combinational).
//
// Expands a texel stored in the source 'format' into four 8-bit channels
// packed in texel_out as bytes [7:0], [15:8], [23:16] and [31:24].
// Channels narrower than 8 bits are widened by replicating their upper bits
// into the low bits; formats without an alpha channel get [31:24] = all ones.
module VX_tex_format #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0]   format,     // one of the `TEX_FORMAT_* encodings
    input wire [31:0]                   texel_in,   // raw texel, right-aligned
    output wire [31:0]                  texel_out   // expanded 4 x 8-bit texel
);
    `UNUSED_PARAM (CORE_ID)

    reg [31:0] texel_out_r;

    always @(*) begin
        case (format)
        `TEX_FORMAT_R5G6B5: begin
            // 5/6/5-bit channels widened to 8 bits by appending their top bits
            texel_out_r[07:00] = `TEX_COLOR_BITS'({texel_in[15:11],texel_in[15:13]});
            texel_out_r[15:08] = `TEX_COLOR_BITS'({texel_in[10:5],texel_in[10:9]});
            texel_out_r[23:16] = `TEX_COLOR_BITS'({texel_in[4:0],texel_in[4:2]});
            texel_out_r[31:24] = {`TEX_COLOR_BITS{1'b1}};
        end
        `TEX_FORMAT_R4G4B4A4: begin
            // every 4-bit channel is widened by duplicating its own nibble;
            // byte [7:0] previously mixed in the alpha nibble texel_in[15:12]
            texel_out_r[07:00] = `TEX_COLOR_BITS'({2{texel_in[11:8]}});
            texel_out_r[15:08] = `TEX_COLOR_BITS'({2{texel_in[7:4]}});
            texel_out_r[23:16] = `TEX_COLOR_BITS'({2{texel_in[3:0]}});
            texel_out_r[31:24] = `TEX_COLOR_BITS'({2{texel_in[15:12]}});
        end
        `TEX_FORMAT_L8A8: begin
            // luminance replicated into the three color bytes, alpha from byte 1
            texel_out_r[07:00] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[15:08] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[23:16] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[31:24] = `TEX_COLOR_BITS'(texel_in[15:8]);
        end
        `TEX_FORMAT_A8: begin
            // alpha only: color bytes are zero
            texel_out_r[07:00] = `TEX_COLOR_BITS'(0);
            texel_out_r[15:08] = `TEX_COLOR_BITS'(0);
            texel_out_r[23:16] = `TEX_COLOR_BITS'(0);
            texel_out_r[31:24] = `TEX_COLOR_BITS'(texel_in[7:0]);
        end
        `TEX_FORMAT_L8: begin
            // luminance only: replicated into the color bytes, alpha all ones
            texel_out_r[07:00] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[15:08] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[23:16] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[31:24] = {`TEX_COLOR_BITS{1'b1}};
        end
        // `TEX_FORMAT_R8G8B8A8 (and any unknown encoding): pass through
        default: begin
            texel_out_r[07:00] = `TEX_COLOR_BITS'(texel_in[7:0]);
            texel_out_r[15:08] = `TEX_COLOR_BITS'(texel_in[15:8]);
            texel_out_r[23:16] = `TEX_COLOR_BITS'(texel_in[23:16]);
            texel_out_r[31:24] = `TEX_COLOR_BITS'(texel_in[31:24]);
        end
        endcase
    end

    assign texel_out = texel_out_r;

endmodule
|
17
hw/rtl/tex_unit/VX_tex_lerp.v
Normal file
|
@ -0,0 +1,17 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Per-channel linear interpolation between two packed 4 x 8-bit texels.
// For each byte: out = (in1 * (BLEND_ONE - blend) + in2 * blend) >> 8,
// i.e. blend == 0 selects in1 and larger blend values move towards in2.
module VX_tex_lerp #(
) (
    input wire [`BLEND_FRAC-1:0] blend,
    input wire [31:0]            in1,
    input wire [31:0]            in2,
    output wire [31:0]           out
);
    // weight applied to in1; 9 bits because it equals BLEND_ONE when blend is 0
    wire [8:0] inv_blend = `BLEND_ONE - blend;

    for (genvar ch = 0; ch < 4; ++ch) begin
        wire [7:0] lhs = in1[ch*8 +: 8];
        wire [7:0] rhs = in2[ch*8 +: 8];

        // weighted sum evaluated at 17 bits; only bits [15:8] are kept
        wire [16:0] acc = lhs * inv_blend + rhs * blend;
        `UNUSED_VAR (acc)

        assign out[ch*8 +: 8] = acc[15:8];
    end

endmodule
|
128
hw/rtl/tex_unit/VX_tex_lsu_arb.v
Normal file
|
@ -0,0 +1,128 @@
|
|||
`include "../cache/VX_cache_define.vh"
|
||||
|
||||
// Request/response arbiter that shares one multi-lane memory port between
// NUM_REQS requesters.
//
// Request path : a VX_rr_arbiter grants one requester; its whole lane bundle
//                is forwarded and the granted index is appended to the low
//                bits of every lane's tag.
// Response path: the low LOG_NUM_REQS bits of the response tag select which
//                requester receives the response; the remaining tag bits are
//                handed back as the requester's original tag.
// With NUM_REQS == 1 every signal is passed straight through.
module VX_tex_lsu_arb #(
    parameter NUM_REQS      = 1,
    parameter LANES         = 1,
    parameter WORD_SIZE     = 1,
    parameter TAG_IN_WIDTH  = 1,
    parameter TAG_OUT_WIDTH = 1,
    parameter LOG_NUM_REQS  = `CLOG2(NUM_REQS)
) (
    input wire clk,
    input wire reset,

    // input requests
    input wire [NUM_REQS-1:0][LANES-1:0]                        req_valid_in,
    input wire [NUM_REQS-1:0][LANES-1:0]                        req_rw_in,
    input wire [NUM_REQS-1:0][LANES-1:0][WORD_SIZE-1:0]         req_byteen_in,
    input wire [NUM_REQS-1:0][LANES-1:0][`WORD_ADDR_WIDTH-1:0]  req_addr_in,
    input wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0]       req_data_in,
    input wire [NUM_REQS-1:0][LANES-1:0][TAG_IN_WIDTH-1:0]      req_tag_in,
    output wire [NUM_REQS-1:0][LANES-1:0]                       req_ready_in,

    // output request
    output wire [LANES-1:0]                         req_valid_out,
    output wire [LANES-1:0]                         req_rw_out,
    output wire [LANES-1:0][WORD_SIZE-1:0]          req_byteen_out,
    output wire [LANES-1:0][`WORD_ADDR_WIDTH-1:0]   req_addr_out,
    output wire [LANES-1:0][`WORD_WIDTH-1:0]        req_data_out,
    output wire [LANES-1:0][TAG_OUT_WIDTH-1:0]      req_tag_out,
    input wire [LANES-1:0]                          req_ready_out,

    // input response
    input wire [LANES-1:0]                          rsp_valid_in,
    input wire [LANES-1:0][`WORD_WIDTH-1:0]         rsp_data_in,
    input wire [TAG_OUT_WIDTH-1:0]                  rsp_tag_in,
    output wire                                     rsp_ready_in,

    // output responses
    output wire [NUM_REQS-1:0][LANES-1:0]                   rsp_valid_out,
    output wire [NUM_REQS-1:0][LANES-1:0][`WORD_WIDTH-1:0]  rsp_data_out,
    output wire [NUM_REQS-1:0][TAG_IN_WIDTH-1:0]            rsp_tag_out,
    input wire [NUM_REQS-1:0]                               rsp_ready_out
);
    // width of one requester's flattened bundle: per lane {valid, tag, addr, rw, byteen, data}
    localparam REQ_DATAW = LANES * (1 + TAG_IN_WIDTH + `WORD_ADDR_WIDTH + 1 + WORD_SIZE + `WORD_WIDTH);

    if (NUM_REQS > 1) begin

        // arbiter outputs
        wire                    grant_valid;
        wire [LOG_NUM_REQS-1:0] grant_idx;
        wire [NUM_REQS-1:0]     grant_1hot;

        // per-requester flattened bundle and "any lane valid" flag
        wire [NUM_REQS-1:0][REQ_DATAW-1:0] packed_req;
        wire [NUM_REQS-1:0]                any_lane_valid;

        for (genvar r = 0; r < NUM_REQS; ++r) begin
            assign any_lane_valid[r] = (| req_valid_in[r]);
            assign packed_req[r]     = {req_valid_in[r], req_tag_in[r], req_addr_in[r], req_rw_in[r], req_byteen_in[r], req_data_in[r]};
            // only the granted requester sees the downstream ready bits
            assign req_ready_in[r]   = {LANES{grant_1hot[r]}} & req_ready_out;
        end

        // the arbiter is enabled whenever at least one output lane is ready
        wire arb_enable = (| req_ready_out);

        VX_rr_arbiter #(
            .NUM_REQS    (NUM_REQS),
            .LOCK_ENABLE (1)
        ) sel_arb (
            .clk          (clk),
            .reset        (reset),
            .requests     (any_lane_valid),
            .enable       (arb_enable),
            .grant_valid  (grant_valid),
            .grant_index  (grant_idx),
            .grant_onehot (grant_1hot)
        );

        // unpack the granted bundle onto the output port
        wire [LANES-1:0]                   granted_valid;
        wire [LANES-1:0][TAG_IN_WIDTH-1:0] granted_tag;

        assign {granted_valid, granted_tag, req_addr_out, req_rw_out, req_byteen_out, req_data_out} = packed_req[grant_idx];

        // lanes are valid only while the arbiter reports a grant
        assign req_valid_out = {LANES{grant_valid}} & granted_valid;

        // append the requester index below the original tag so the response can be routed back
        for (genvar l = 0; l < LANES; ++l) begin
            assign req_tag_out[l] = {granted_tag[l], grant_idx};
        end

        ///////////////////////////////////////////////////////////////////////

        // requester index recovered from the low bits of the response tag
        wire [LOG_NUM_REQS-1:0] rsp_owner = rsp_tag_in[LOG_NUM_REQS-1:0];

        // steer the lane-valid bits to the owning requester, zero for all others
        reg [NUM_REQS-1:0][LANES-1:0] rsp_valid_demux;
        always @(*) begin
            rsp_valid_demux            = '0;
            rsp_valid_demux[rsp_owner] = rsp_valid_in;
        end
        assign rsp_valid_out = rsp_valid_demux;

        assign rsp_ready_in = rsp_ready_out[rsp_owner];

        // data and the stripped tag are broadcast; rsp_valid_out qualifies them
        for (genvar r = 0; r < NUM_REQS; ++r) begin
            assign rsp_tag_out[r]  = rsp_tag_in[LOG_NUM_REQS +: TAG_IN_WIDTH];
            assign rsp_data_out[r] = rsp_data_in;
        end

    end else begin

        // single requester: no arbitration state is needed
        `UNUSED_VAR (clk)
        `UNUSED_VAR (reset)

        // request pass-through
        assign req_valid_out  = req_valid_in;
        assign req_rw_out     = req_rw_in;
        assign req_byteen_out = req_byteen_in;
        assign req_addr_out   = req_addr_in;
        assign req_data_out   = req_data_in;
        assign req_tag_out    = req_tag_in;
        assign req_ready_in   = req_ready_out;

        // response pass-through
        assign rsp_valid_out = rsp_valid_in;
        assign rsp_data_out  = rsp_data_in;
        assign rsp_tag_out   = rsp_tag_in;
        assign rsp_ready_in  = rsp_ready_out;

    end

endmodule
|
288
hw/rtl/tex_unit/VX_tex_memory.v
Normal file
|
@ -0,0 +1,288 @@
|
|||
`include "VX_tex_define.vh"
|
||||
module VX_tex_memory #(
|
||||
parameter CORE_ID = 0,
|
||||
parameter REQ_INFO_WIDTH = 1,
|
||||
parameter NUM_REQS = 1
|
||||
) (
|
||||
input wire clk,
|
||||
input wire reset,
|
||||
|
||||
// memory interface
|
||||
VX_dcache_req_if dcache_req_if,
|
||||
VX_dcache_rsp_if dcache_rsp_if,
|
||||
|
||||
// inputs
|
||||
input wire req_valid,
|
||||
input wire [NUM_REQS-1:0] req_tmask,
|
||||
input wire [`TEX_FILTER_BITS-1:0] req_filter,
|
||||
input wire [`TEX_STRIDE_BITS-1:0] req_stride,
|
||||
input wire [NUM_REQS-1:0][3:0][31:0] req_addr,
|
||||
input wire [REQ_INFO_WIDTH-1:0] req_info,
|
||||
output wire req_ready,
|
||||
|
||||
// outputs
|
||||
output wire rsp_valid,
|
||||
output wire [NUM_REQS-1:0] rsp_tmask,
|
||||
output wire [NUM_REQS-1:0][3:0][31:0] rsp_data,
|
||||
output wire [REQ_INFO_WIDTH-1:0] rsp_info,
|
||||
input wire rsp_ready
|
||||
);
|
||||
|
||||
`UNUSED_PARAM (CORE_ID)
|
||||
|
||||
localparam RSP_CTR_W = $clog2(NUM_REQS * 4 + 1);
|
||||
|
||||
wire [3:0] dup_reqs;
|
||||
wire [3:0][NUM_REQS-1:0][29:0] req_addr_w;
|
||||
wire [3:0][NUM_REQS-1:0][1:0] align_offs;
|
||||
|
||||
// reorder address into quads
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 4; ++j) begin
|
||||
assign req_addr_w[j][i] = req_addr[i][j][31:2];
|
||||
assign align_offs[j][i] = req_addr[i][j][1:0];
|
||||
end
|
||||
end
|
||||
|
||||
// find duplicate addresses
|
||||
|
||||
for (genvar i = 0; i < 4; ++i) begin
|
||||
wire [NUM_REQS-1:0] addr_matches;
|
||||
for (genvar j = 0; j < NUM_REQS; j++) begin
|
||||
assign addr_matches[j] = (req_addr_w[i][0] == req_addr_w[i][j]) || ~req_tmask[j];
|
||||
end
|
||||
assign dup_reqs[i] = req_tmask[0] && (& addr_matches);
|
||||
end
|
||||
|
||||
// save request addresses into fifo
|
||||
|
||||
wire reqq_push, reqq_pop, reqq_empty, reqq_full;
|
||||
|
||||
wire [3:0][NUM_REQS-1:0][29:0] q_req_addr;
|
||||
wire [NUM_REQS-1:0] q_req_tmask;
|
||||
wire [`TEX_FILTER_BITS-1:0] q_req_filter;
|
||||
wire [REQ_INFO_WIDTH-1:0] q_req_info;
|
||||
wire [`TEX_STRIDE_BITS-1:0] q_req_stride;
|
||||
wire [3:0][NUM_REQS-1:0][1:0] q_align_offs;
|
||||
wire [3:0] q_dup_reqs;
|
||||
|
||||
assign reqq_push = req_valid && req_ready;
|
||||
|
||||
VX_fifo_queue #(
|
||||
.DATAW ((NUM_REQS * 4 * 30) + NUM_REQS + REQ_INFO_WIDTH + `TEX_FILTER_BITS + `TEX_STRIDE_BITS + (4 * NUM_REQS * 2) + 4),
|
||||
.SIZE (`LSUQ_SIZE),
|
||||
.OUTPUT_REG (1)
|
||||
) req_queue (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.push (reqq_push),
|
||||
.pop (reqq_pop),
|
||||
.data_in ({req_addr_w, req_tmask, req_info, req_filter, req_stride, align_offs, dup_reqs}),
|
||||
.data_out ({q_req_addr, q_req_tmask, q_req_info, q_req_filter, q_req_stride, q_align_offs, q_dup_reqs}),
|
||||
.empty (reqq_empty),
|
||||
.full (reqq_full),
|
||||
`UNUSED_PIN (alm_full),
|
||||
`UNUSED_PIN (alm_empty),
|
||||
`UNUSED_PIN (size)
|
||||
);
|
||||
|
||||
// can take more requests?
|
||||
assign req_ready = ~reqq_full;
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////
|
||||
|
||||
wire req_texel_valid;
|
||||
wire sent_all_ready, last_texel_sent;
|
||||
wire req_texel_dup;
|
||||
wire [NUM_REQS-1:0][29:0] req_texel_addr;
|
||||
reg [1:0] req_texel_idx;
|
||||
reg req_texels_done;
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || last_texel_sent) begin
|
||||
req_texel_idx <= 0;
|
||||
end else if (req_texel_valid && sent_all_ready) begin
|
||||
req_texel_idx <= req_texel_idx + 1;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || reqq_pop) begin
|
||||
req_texels_done <= 0;
|
||||
end else if (last_texel_sent) begin
|
||||
req_texels_done <= 1;
|
||||
end
|
||||
end
|
||||
|
||||
assign req_texel_valid = ~reqq_empty && ~req_texels_done;
|
||||
assign req_texel_addr = q_req_addr[req_texel_idx];
|
||||
assign req_texel_dup = q_dup_reqs[req_texel_idx];
|
||||
|
||||
wire is_last_texel = (req_texel_idx == (q_req_filter ? 3 : 0));
|
||||
assign last_texel_sent = req_texel_valid && sent_all_ready && is_last_texel;
|
||||
|
||||
// DCache Request
|
||||
|
||||
reg [NUM_REQS-1:0] texel_sent_mask;
|
||||
wire [NUM_REQS-1:0] dcache_req_fire;
|
||||
wire [NUM_REQS-1:0] req_dup_mask;
|
||||
|
||||
assign dcache_req_fire = dcache_req_if.valid & dcache_req_if.ready;
|
||||
|
||||
assign sent_all_ready = (&(dcache_req_if.ready | texel_sent_mask | ~q_req_tmask))
|
||||
|| (req_texel_dup & dcache_req_if.ready[0]);
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || sent_all_ready) begin
|
||||
texel_sent_mask <= 0;
|
||||
end else begin
|
||||
texel_sent_mask <= texel_sent_mask | dcache_req_fire;
|
||||
end
|
||||
end
|
||||
|
||||
assign req_dup_mask = {{(NUM_REQS-1){~req_texel_dup}}, 1'b1};
|
||||
|
||||
assign dcache_req_if.valid = {NUM_REQS{req_texel_valid}} & q_req_tmask & req_dup_mask & ~texel_sent_mask;
|
||||
assign dcache_req_if.rw = {NUM_REQS{1'b0}};
|
||||
assign dcache_req_if.addr = req_texel_addr;
|
||||
assign dcache_req_if.byteen = {NUM_REQS{4'b1111}};
|
||||
assign dcache_req_if.data = 'x;
|
||||
|
||||
`ifdef DBG_CACHE_REQ_INFO
|
||||
wire [`NW_BITS-1:0] q_req_wid;
|
||||
wire [31:0] q_req_PC;
|
||||
assign {q_req_wid, q_req_PC} = q_req_info[`NW_BITS+32-1:0];
|
||||
assign dcache_req_if.tag = {NUM_REQS{q_req_PC, q_req_wid, req_texel_idx}};
|
||||
`else
|
||||
assign dcache_req_if.tag = {NUM_REQS{req_texel_idx}};
|
||||
`endif
|
||||
|
||||
// Dcache Response
|
||||
|
||||
reg [3:0][NUM_REQS-1:0][31:0] rsp_texels, rsp_texels_n;
|
||||
wire [NUM_REQS-1:0][3:0][31:0] rsp_texels_qual;
|
||||
reg [NUM_REQS-1:0][31:0] rsp_data_qual;
|
||||
reg [RSP_CTR_W-1:0] rsp_rem_ctr;
|
||||
wire [NUM_REQS-1:0] rsp_cur_tmask;
|
||||
wire [$clog2(NUM_REQS + 1)-1:0] rsp_cur_cnt;
|
||||
wire dcache_rsp_fire;
|
||||
wire [1:0] rsp_texel_idx;
|
||||
wire rsp_texel_dup;
|
||||
|
||||
assign rsp_texel_idx = dcache_rsp_if.tag[1:0];
|
||||
|
||||
assign rsp_texel_dup = q_dup_reqs[rsp_texel_idx];
|
||||
|
||||
assign dcache_rsp_fire = dcache_rsp_if.valid && dcache_rsp_if.ready;
|
||||
|
||||
assign rsp_cur_tmask = rsp_texel_dup ? q_req_tmask : dcache_rsp_if.tmask;
|
||||
|
||||
assign rsp_cur_cnt = $countones(rsp_cur_tmask);
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; i++) begin
|
||||
wire [31:0] src_mask = {32{dcache_rsp_if.tmask[i]}};
|
||||
wire [31:0] src_data = ((i == 0 || rsp_texel_dup) ? dcache_rsp_if.data[0] : (dcache_rsp_if.data[i]) & src_mask);
|
||||
|
||||
reg [31:0] rsp_data_shifted;
|
||||
always @(*) begin
|
||||
rsp_data_shifted[31:16] = src_data[31:16];
|
||||
rsp_data_shifted[15:0] = q_align_offs[rsp_texel_idx][i][1] ? src_data[31:16] : src_data[15:0];
|
||||
rsp_data_shifted[7:0] = q_align_offs[rsp_texel_idx][i][0] ? rsp_data_shifted[15:8] : rsp_data_shifted[7:0];
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
case (q_req_stride)
|
||||
0: rsp_data_qual[i] = 32'(rsp_data_shifted[7:0]);
|
||||
1: rsp_data_qual[i] = 32'(rsp_data_shifted[15:0]);
|
||||
default: rsp_data_qual[i] = rsp_data_shifted;
|
||||
endcase
|
||||
end
|
||||
end
|
||||
|
||||
always @(*) begin
|
||||
rsp_texels_n = rsp_texels;
|
||||
rsp_texels_n[rsp_texel_idx] |= rsp_data_qual;
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset || reqq_pop) begin
|
||||
rsp_texels <= '0;
|
||||
end else if (dcache_rsp_fire) begin
|
||||
rsp_texels <= rsp_texels_n;
|
||||
end
|
||||
end
|
||||
|
||||
always @(posedge clk) begin
|
||||
if (reset) begin
|
||||
rsp_rem_ctr <= 0;
|
||||
end else begin
|
||||
if ((| dcache_req_fire) && 0 == rsp_rem_ctr) begin
|
||||
rsp_rem_ctr <= q_req_filter ? {$countones(q_req_tmask), 2'b0} : {2'b0, $countones(q_req_tmask)};
|
||||
end else if (dcache_rsp_fire) begin
|
||||
rsp_rem_ctr <= rsp_rem_ctr - RSP_CTR_W'(rsp_cur_cnt);
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
for (genvar i = 0; i < NUM_REQS; ++i) begin
|
||||
for (genvar j = 0; j < 4; ++j) begin
|
||||
assign rsp_texels_qual[i][j] = rsp_texels_n[j][i];
|
||||
end
|
||||
end
|
||||
|
||||
wire stall_out = rsp_valid && ~rsp_ready;
|
||||
|
||||
wire rsp_texels_done = dcache_rsp_fire && (rsp_rem_ctr == RSP_CTR_W'(rsp_cur_cnt));
|
||||
|
||||
assign reqq_pop = rsp_texels_done && ~stall_out;
|
||||
|
||||
VX_pipe_register #(
|
||||
.DATAW (1 + NUM_REQS + REQ_INFO_WIDTH + (4 * NUM_REQS * 32)),
|
||||
.RESETW (1)
|
||||
) rsp_pipe_reg (
|
||||
.clk (clk),
|
||||
.reset (reset),
|
||||
.enable (~stall_out),
|
||||
.data_in ({rsp_texels_done, q_req_tmask, q_req_info, rsp_texels_qual}),
|
||||
.data_out ({rsp_valid, rsp_tmask, rsp_info, rsp_data})
|
||||
);
|
||||
|
||||
// Can accept new cache response?
|
||||
assign dcache_rsp_if.ready = ~stall_out || (rsp_rem_ctr != RSP_CTR_W'(rsp_cur_cnt));
|
||||
|
||||
`ifdef DBG_PRINT_TEX
|
||||
wire [`NW_BITS-1:0] req_wid, rsp_wid;
|
||||
wire [31:0] req_PC, rsp_PC;
|
||||
assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
|
||||
assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];
|
||||
|
||||
always @(posedge clk) begin
|
||||
if ((| dcache_req_fire)) begin
|
||||
$write("%t: core%0d-tex-cache-req: wid=%0d, PC=%0h, tmask=%b, tag=%0h, addr=",
|
||||
$time, CORE_ID, q_req_wid, q_req_PC, dcache_req_fire, dcache_req_if.tag);
|
||||
`PRINT_ARRAY1D(req_texel_addr, NUM_REQS);
|
||||
$write(", is_dup=%b\n", req_texel_dup);
|
||||
end
|
||||
if (dcache_rsp_fire) begin
|
||||
$write("%t: core%0d-tex-cache-rsp: wid=%0d, PC=%0h, tmask=%b, tag=%0h, data=",
|
||||
$time, CORE_ID, q_req_wid, q_req_PC, dcache_rsp_if.valid, dcache_rsp_if.tag);
|
||||
`PRINT_ARRAY1D(rsp_data_qual, NUM_REQS);
|
||||
$write("\n");
|
||||
end
|
||||
if (req_valid && req_ready) begin
|
||||
$write("%t: core%0d-tex-mem-req: wid=%0d, PC=%0h, tmask=%b, filter=%0d, stride=%0d, addr=",
|
||||
$time, CORE_ID, req_wid, req_PC, req_tmask, req_filter, req_stride);
|
||||
`PRINT_ARRAY2D(req_addr, 4, NUM_REQS);
|
||||
$write("\n");
|
||||
end
|
||||
if (rsp_valid && rsp_ready) begin
|
||||
$write("%t: core%0d-tex-mem-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
|
||||
$time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
|
||||
`PRINT_ARRAY2D(rsp_data, 4, NUM_REQS);
|
||||
$write("\n");
|
||||
end
|
||||
end
|
||||
`endif
|
||||
|
||||
endmodule
|
136
hw/rtl/tex_unit/VX_tex_sampler.v
Normal file
|
@ -0,0 +1,136 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Texture sampler: converts the four raw texels fetched for each request lane
// to the common texel format and blends them with a two-stage filter.
//
//   stage 0 : VX_tex_format on every texel, then two lerps per lane using
//             req_blends[0] (texels 0/1 -> texel_ul, texels 2/3 -> texel_uh)
//   stage 1 : one lerp per lane between texel_ul and texel_uh using
//             req_blends[1]
//
// Parameters
//   CORE_ID        : core identifier, forwarded to VX_tex_format and used in
//                    the debug traces
//   REQ_INFO_WIDTH : width of the opaque request tag carried through both stages
//   NUM_REQS       : number of parallel request lanes
//
// Handshake: req_valid/req_ready on the input side, rsp_valid/rsp_ready on the
// output side. Both pipeline registers freeze together while a valid response
// is not being accepted.
module VX_tex_sampler #(
    parameter CORE_ID        = 0,
    parameter REQ_INFO_WIDTH = 1,
    parameter NUM_REQS       = 1
) (
    input wire clk,
    input wire reset,

    // inputs
    input wire                                      req_valid,
    // The lane mask is sized by NUM_REQS so that it matches the NUM_REQS bits
    // reserved for it in DATAW of both pipeline registers below.
    input wire [NUM_REQS-1:0]                       req_tmask,
    input wire [`TEX_FORMAT_BITS-1:0]               req_format,
    input wire [1:0][NUM_REQS-1:0][`BLEND_FRAC-1:0] req_blends,
    input wire [NUM_REQS-1:0][3:0][31:0]            req_data,
    input wire [REQ_INFO_WIDTH-1:0]                 req_info,
    output wire                                     req_ready,

    // outputs
    output wire                                     rsp_valid,
    output wire [NUM_REQS-1:0]                      rsp_tmask,
    output wire [NUM_REQS-1:0][31:0]                rsp_data,
    output wire [REQ_INFO_WIDTH-1:0]                rsp_info,
    input wire                                      rsp_ready
);

    `UNUSED_PARAM (CORE_ID)

    // stage-0 register outputs
    wire                                  valid_s0;
    wire [NUM_REQS-1:0]                   tmask_s0;
    wire [REQ_INFO_WIDTH-1:0]             req_info_s0;
    wire [NUM_REQS-1:0][31:0]             texel_ul, texel_uh;
    wire [NUM_REQS-1:0][31:0]             texel_ul_s0, texel_uh_s0;
    wire [NUM_REQS-1:0][`BLEND_FRAC-1:0]  blend_v_s0;
    wire [NUM_REQS-1:0][31:0]             texel_v;

    wire stall_out;

    // stage 0: format conversion followed by the two first-axis lerps

    for (genvar i = 0; i < NUM_REQS; i++) begin

        wire [3:0][31:0] fmt_texels;

        for (genvar j = 0; j < 4; j++) begin
            VX_tex_format #(
                .CORE_ID (CORE_ID)
            ) tex_format (
                .format    (req_format),
                .texel_in  (req_data[i][j]),
                .texel_out (fmt_texels[j])
            );
        end

        VX_tex_lerp #(
        ) tex_lerp_ul (
            .blend (req_blends[0][i]),
            .in1   (fmt_texels[0]),
            .in2   (fmt_texels[1]),
            .out   (texel_ul[i])
        );

        VX_tex_lerp #(
        ) tex_lerp_uh (
            .blend (req_blends[0][i]),
            .in1   (fmt_texels[2]),
            .in2   (fmt_texels[3]),
            .out   (texel_uh[i])
        );
    end

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFO_WIDTH + (NUM_REQS * `BLEND_FRAC) + (2 * NUM_REQS * 32)),
        .RESETW (1)
    ) pipe_reg0 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({req_valid, req_tmask, req_info,    req_blends[1], texel_ul,    texel_uh}),
        .data_out ({valid_s0,  tmask_s0,  req_info_s0, blend_v_s0,    texel_ul_s0, texel_uh_s0})
    );

    // stage 1: second-axis lerp between the two stage-0 results

    for (genvar i = 0; i < NUM_REQS; i++) begin
        VX_tex_lerp #(
        ) tex_lerp_v (
            .blend (blend_v_s0[i]),
            .in1   (texel_ul_s0[i]),
            .in2   (texel_uh_s0[i]),
            .out   (texel_v[i])
        );
    end

    // back-pressure: hold both stages while the response is not consumed
    assign stall_out = rsp_valid && ~rsp_ready;

    VX_pipe_register #(
        .DATAW  (1 + NUM_REQS + REQ_INFO_WIDTH + (NUM_REQS * 32)),
        .RESETW (1)
    ) pipe_reg1 (
        .clk      (clk),
        .reset    (reset),
        .enable   (~stall_out),
        .data_in  ({valid_s0,  tmask_s0,  req_info_s0, texel_v}),
        .data_out ({rsp_valid, rsp_tmask, rsp_info,    rsp_data})
    );

    // can accept new request?
    assign req_ready = ~stall_out;

`ifdef DBG_PRINT_TEX

    // The low `NW_BITS+32 bits of the info tag are decoded as {wid, PC}
    // for tracing only.
    wire [`NW_BITS-1:0] req_wid, rsp_wid;
    wire [31:0]         req_PC, rsp_PC;

    assign {req_wid, req_PC} = req_info[`NW_BITS+32-1:0];
    assign {rsp_wid, rsp_PC} = rsp_info[`NW_BITS+32-1:0];

    always @(posedge clk) begin
        if (req_valid && req_ready) begin
            $write("%t: core%0d-tex-sampler-req: wid=%0d, PC=%0h, tmask=%b, format=%0d, data=",
                    $time, CORE_ID, req_wid, req_PC, req_tmask, req_format);
            `PRINT_ARRAY2D(req_data, 4, NUM_REQS);
            $write(", u0=");
            `PRINT_ARRAY1D(req_blends[0], NUM_REQS);
            $write(", v0=");
            `PRINT_ARRAY1D(req_blends[1], NUM_REQS);
            $write("\n");
        end
        if (rsp_valid && rsp_ready) begin
            $write("%t: core%0d-tex-sampler-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
                    $time, CORE_ID, rsp_wid, rsp_PC, rsp_tmask);
            `PRINT_ARRAY1D(rsp_data, NUM_REQS);
            $write("\n");
        end
    end
`endif

endmodule
|
27
hw/rtl/tex_unit/VX_tex_stride.v
Normal file
|
@ -0,0 +1,27 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Maps a texture format code to the log2 stride value consumed downstream:
// 0 for the A8/L8 formats, 1 for L8A8/R5G6B5/R4G4B4A4, 2 for everything else
// (the original comment names `TEX_FORMAT_R8G8B8A8 as the intended default).
// Purely combinational; CORE_ID is accepted for uniformity and unused.
module VX_tex_stride #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_FORMAT_BITS-1:0]   format,
    output wire [`TEX_STRIDE_BITS-1:0]  log_stride
);
    `UNUSED_PARAM (CORE_ID)

    reg [`TEX_STRIDE_BITS-1:0] stride_sel;

    always @(*) begin
        case (format)
            // group 0
            `TEX_FORMAT_A8,
            `TEX_FORMAT_L8:
                stride_sel = 0;
            // group 1
            `TEX_FORMAT_L8A8,
            `TEX_FORMAT_R5G6B5,
            `TEX_FORMAT_R4G4B4A4:
                stride_sel = 1;
            // `TEX_FORMAT_R8G8B8A8 and any unlisted encoding
            default:
                stride_sel = 2;
        endcase
    end

    assign log_stride = stride_sel;

endmodule
|
226
hw/rtl/tex_unit/VX_tex_unit.v
Normal file
|
@ -0,0 +1,226 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Texture unit top level.
//
// Stores the per-unit texture state written through tex_csr_if and chains
// three stages for each request received on tex_req_if:
//   1. VX_tex_addr    - produces per-thread texel addresses (rsp_addr) and
//                       blend factors (rsp_blends) from the request coordinates
//   2. VX_tex_memory  - issues the texel reads on dcache_req_if and collects
//                       the data from dcache_rsp_if
//   3. VX_tex_sampler - formats/filters the texels and drives tex_rsp_if
//
// Request metadata travels alongside the data as an opaque "info" tag that
// grows and shrinks between stages (see the REQ_INFO_WIDTH_* localparams).
module VX_tex_unit #(
    parameter CORE_ID = 0
) (
    input wire clk,
    input wire reset,

    // Texture unit <-> Memory Unit
    VX_dcache_req_if    dcache_req_if,
    VX_dcache_rsp_if    dcache_rsp_if,

    // Inputs
    VX_tex_req_if       tex_req_if,
    VX_tex_csr_if       tex_csr_if,

    // Outputs
    VX_tex_rsp_if       tex_rsp_if
);

    // Info tag seen by the sampler: {rd, wb, wid, PC}
    localparam REQ_INFO_WIDTH_S = `NR_BITS + 1 + `NW_BITS + 32;
    // Info tag out of the address stage: {format, sampler tag}
    localparam REQ_INFO_WIDTH_A = `TEX_FORMAT_BITS + REQ_INFO_WIDTH_S;
    // Info tag through the memory stage: {blends (2 axes), address-stage tag}
    localparam REQ_INFO_WIDTH_M = (2 * `NUM_THREADS * `BLEND_FRAC) + REQ_INFO_WIDTH_A;

    // Per-unit, per-mip-level state
    reg [`TEX_MIPOFF_BITS-1:0] tex_mipoff [`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];
    reg [`TEX_DIM_BITS-1:0]    tex_dims   [1:0][`NUM_TEX_UNITS-1:0][(1 << `TEX_LOD_BITS)-1:0];

    // Per-unit state
    reg [`TEX_ADDR_BITS-1:0]   tex_baddr  [`NUM_TEX_UNITS-1:0];
    reg [`TEX_FORMAT_BITS-1:0] tex_format [`NUM_TEX_UNITS-1:0];
    reg [`TEX_WRAP_BITS-1:0]   tex_wraps  [1:0][`NUM_TEX_UNITS-1:0];
    reg [`TEX_FILTER_BITS-1:0] tex_filter [`NUM_TEX_UNITS-1:0];

    // CSRs programming

    for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
        // For the per-level CSRs the target mip level is taken from
        // write_data[28 +: `TEX_LOD_BITS]; the value itself sits in the low bits.
        wire [`TEX_LOD_BITS-1:0] mip_level = tex_csr_if.write_data[28 +: `TEX_LOD_BITS];

        always @(posedge clk) begin
            if (tex_csr_if.write_enable) begin
                case (tex_csr_if.write_addr)
                    `CSR_TEX_ADDR(i) : begin
                        tex_baddr[i] <= tex_csr_if.write_data[`TEX_ADDR_BITS-1:0];
                    end
                    `CSR_TEX_FORMAT(i) : begin
                        tex_format[i] <= tex_csr_if.write_data[`TEX_FORMAT_BITS-1:0];
                    end
                    `CSR_TEX_WRAP(i) : begin
                        // one CSR carries both wrap modes: [0] in the low field, [1] above it
                        tex_wraps[0][i] <= tex_csr_if.write_data[0 +: `TEX_WRAP_BITS];
                        tex_wraps[1][i] <= tex_csr_if.write_data[`TEX_WRAP_BITS +: `TEX_WRAP_BITS];
                    end
                    `CSR_TEX_FILTER(i) : begin
                        tex_filter[i] <= tex_csr_if.write_data[`TEX_FILTER_BITS-1:0];
                    end
                    `CSR_TEX_MIPOFF(i) : begin
                        tex_mipoff[i][mip_level] <= tex_csr_if.write_data[`TEX_MIPOFF_BITS-1:0];
                    end
                    `CSR_TEX_WIDTH(i) : begin
                        tex_dims[0][i][mip_level] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
                    end
                    `CSR_TEX_HEIGHT(i) : begin
                        tex_dims[1][i][mip_level] <= tex_csr_if.write_data[`TEX_DIM_BITS-1:0];
                    end
                endcase
            end
        end
    end

    // mipmap attributes

    wire [`NUM_THREADS-1:0][`TEX_MIPOFF_BITS-1:0]    sel_mipoff;
    wire [1:0][`NUM_THREADS-1:0][`TEX_DIM_BITS-1:0]  sel_dims;

    // Each thread selects its own mip level from bits [20 +: `TEX_LOD_BITS]
    // of its lod operand; the texture unit index is shared by all threads.
    for (genvar i = 0; i < `NUM_THREADS; ++i) begin
        wire [`NTEX_BITS-1:0]    unit      = tex_req_if.unit[`NTEX_BITS-1:0];
        wire [`TEX_LOD_BITS-1:0] mip_level = tex_req_if.lod[i][20+:`TEX_LOD_BITS];
        assign sel_mipoff[i]  = tex_mipoff[unit][mip_level];
        assign sel_dims[0][i] = tex_dims[0][unit][mip_level];
        assign sel_dims[1][i] = tex_dims[1][unit][mip_level];
    end

    // address generation

    wire                                            mem_req_valid;
    wire [`NUM_THREADS-1:0]                         mem_req_tmask;
    wire [`TEX_FILTER_BITS-1:0]                     mem_req_filter;
    wire [`TEX_STRIDE_BITS-1:0]                     mem_req_stride;
    wire [1:0][`NUM_THREADS-1:0][`BLEND_FRAC-1:0]   mem_req_blends;
    wire [`NUM_THREADS-1:0][3:0][31:0]              mem_req_addr;
    wire [REQ_INFO_WIDTH_A-1:0]                     mem_req_info;
    wire                                            mem_req_ready;

    VX_tex_addr #(
        .CORE_ID        (CORE_ID),
        .REQ_INFO_WIDTH (REQ_INFO_WIDTH_A),
        .NUM_REQS       (`NUM_THREADS)
    ) tex_addr (
        .clk            (clk),
        .reset          (reset),

        .req_valid      (tex_req_if.valid),
        .req_tmask      (tex_req_if.tmask),
        .req_coords     (tex_req_if.coords),
        .req_format     (tex_format[tex_req_if.unit]),
        .req_filter     (tex_filter[tex_req_if.unit]),
        .req_wraps      ({tex_wraps[1][tex_req_if.unit], tex_wraps[0][tex_req_if.unit]}),
        .req_baseaddr   (tex_baddr[tex_req_if.unit]),
        .req_mipoffset  (sel_mipoff),
        .req_logdims    (sel_dims),
        // the format rides in the tag so the sampler can decode the texels later
        .req_info       ({tex_format[tex_req_if.unit], tex_req_if.rd, tex_req_if.wb, tex_req_if.wid, tex_req_if.PC}),
        .req_ready      (tex_req_if.ready),

        .rsp_valid      (mem_req_valid),
        .rsp_tmask      (mem_req_tmask),
        .rsp_filter     (mem_req_filter),
        .rsp_stride     (mem_req_stride),
        .rsp_addr       (mem_req_addr),
        .rsp_blends     (mem_req_blends),
        .rsp_info       (mem_req_info),
        .rsp_ready      (mem_req_ready)
    );

    // retrieve texel values from memory

    wire                                mem_rsp_valid;
    wire [`NUM_THREADS-1:0]             mem_rsp_tmask;
    wire [`NUM_THREADS-1:0][3:0][31:0]  mem_rsp_data;
    wire [REQ_INFO_WIDTH_M-1:0]         mem_rsp_info;
    wire                                mem_rsp_ready;

    VX_tex_memory #(
        .CORE_ID        (CORE_ID),
        .REQ_INFO_WIDTH (REQ_INFO_WIDTH_M),
        .NUM_REQS       (`NUM_THREADS)
    ) tex_memory (
        .clk           (clk),
        .reset         (reset),

        // memory interface
        .dcache_req_if (dcache_req_if),
        .dcache_rsp_if (dcache_rsp_if),

        // inputs
        .req_valid     (mem_req_valid),
        .req_tmask     (mem_req_tmask),
        .req_filter    (mem_req_filter),
        .req_stride    (mem_req_stride),
        .req_addr      (mem_req_addr),
        // the blends are not needed by the memory stage; they are appended to
        // the tag so they arrive at the sampler together with the texels
        .req_info      ({mem_req_blends, mem_req_info}),
        .req_ready     (mem_req_ready),

        // outputs
        .rsp_valid     (mem_rsp_valid),
        .rsp_tmask     (mem_rsp_tmask),
        .rsp_data      (mem_rsp_data),
        .rsp_info      (mem_rsp_info),
        .rsp_ready     (mem_rsp_ready)
    );

    // apply sampler

    // Same [axis][thread][frac] layout as mem_req_blends and the sampler's
    // req_blends port, so the unpacked tag is indexed the way it was packed.
    wire [1:0][`NUM_THREADS-1:0][`BLEND_FRAC-1:0] rsp_blends;
    wire [`TEX_FORMAT_BITS-1:0]                   rsp_format;
    wire [REQ_INFO_WIDTH_S-1:0]                   rsp_info;

    assign {rsp_blends, rsp_format, rsp_info} = mem_rsp_info;

    VX_tex_sampler #(
        .CORE_ID        (CORE_ID),
        .REQ_INFO_WIDTH (REQ_INFO_WIDTH_S),
        .NUM_REQS       (`NUM_THREADS)
    ) tex_sampler (
        .clk        (clk),
        .reset      (reset),

        // inputs
        .req_valid  (mem_rsp_valid),
        .req_tmask  (mem_rsp_tmask),
        .req_data   (mem_rsp_data),
        .req_format (rsp_format),
        .req_blends (rsp_blends),
        .req_info   (rsp_info),
        .req_ready  (mem_rsp_ready),

        // outputs
        .rsp_valid  (tex_rsp_if.valid),
        .rsp_tmask  (tex_rsp_if.tmask),
        .rsp_data   (tex_rsp_if.data),
        .rsp_info   ({tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.wid, tex_rsp_if.PC}),
        .rsp_ready  (tex_rsp_if.ready)
    );

`ifdef DBG_PRINT_TEX
    // NOTE(review): these prints sample the registers on the same clock edge
    // as the CSR write, so they presumably show the values from before the
    // write takes effect - confirm if the trace is used for debugging writes.
    for (genvar i = 0; i < `NUM_TEX_UNITS; ++i) begin
        always @(posedge clk) begin
            if (tex_csr_if.write_enable
             && (tex_csr_if.write_addr >= `CSR_TEX_BEGIN(i)
              && tex_csr_if.write_addr < `CSR_TEX_BEGIN(i+1))) begin
                $display("%t: core%0d-tex-csr: tex%0d_addr=%0h", $time, CORE_ID, i, tex_baddr[i]);
                $display("%t: core%0d-tex-csr: tex%0d_format=%0h", $time, CORE_ID, i, tex_format[i]);
                $display("%t: core%0d-tex-csr: tex%0d_wrap_u=%0h", $time, CORE_ID, i, tex_wraps[0][i]);
                $display("%t: core%0d-tex-csr: tex%0d_wrap_v=%0h", $time, CORE_ID, i, tex_wraps[1][i]);
                $display("%t: core%0d-tex-csr: tex%0d_filter=%0h", $time, CORE_ID, i, tex_filter[i]);
                $display("%t: core%0d-tex-csr: tex%0d_mipoff[0]=%0h", $time, CORE_ID, i, tex_mipoff[i][0]);
                $display("%t: core%0d-tex-csr: tex%0d_width[0]=%0h", $time, CORE_ID, i, tex_dims[0][i][0]);
                $display("%t: core%0d-tex-csr: tex%0d_height[0]=%0h", $time, CORE_ID, i, tex_dims[1][i][0]);
            end
        end
    end

    always @(posedge clk) begin
        if (tex_req_if.valid && tex_req_if.ready) begin
            // $write (not $display): the record is continued by the array
            // prints below and terminated by the explicit "\n".
            $write("%t: core%0d-tex-req: wid=%0d, PC=%0h, tmask=%b, unit=%0d, lod=%0h, u=",
                    $time, CORE_ID, tex_req_if.wid, tex_req_if.PC, tex_req_if.tmask, tex_req_if.unit, tex_req_if.lod);
            `PRINT_ARRAY1D(tex_req_if.coords[0], `NUM_THREADS);
            $write(", v=");
            `PRINT_ARRAY1D(tex_req_if.coords[1], `NUM_THREADS);
            $write("\n");
        end
        if (tex_rsp_if.valid && tex_rsp_if.ready) begin
            $write("%t: core%0d-tex-rsp: wid=%0d, PC=%0h, tmask=%b, data=",
                    $time, CORE_ID, tex_rsp_if.wid, tex_rsp_if.PC, tex_rsp_if.tmask);
            `PRINT_ARRAY1D(tex_rsp_if.data, `NUM_THREADS);
            $write("\n");
        end
    end
`endif

endmodule
|
32
hw/rtl/tex_unit/VX_tex_wrap.v
Normal file
|
@ -0,0 +1,32 @@
|
|||
`include "VX_tex_define.vh"
|
||||
|
||||
// Applies a texture wrap mode to one 32-bit coordinate and returns its
// `FIXED_FRAC-bit fractional part.
//   `TEX_WRAP_CLAMP  : low bits of `CLAMP(coord_i, 0, `FIXED_MASK)
//   `TEX_WRAP_MIRROR : fractional bits, inverted when bit `FIXED_FRAC of the
//                      coordinate is set
//   otherwise        : fractional bits unchanged (`TEX_WRAP_REPEAT)
// Purely combinational; CORE_ID is accepted for uniformity and unused.
module VX_tex_wrap #(
    parameter CORE_ID = 0
) (
    input wire [`TEX_WRAP_BITS-1:0]  wrap_i,
    input wire [31:0]                coord_i,
    output wire [`FIXED_FRAC-1:0]    coord_o
);

    `UNUSED_PARAM (CORE_ID)

    // clamp candidate; only its low `FIXED_FRAC bits are consumed
    wire [31:0] clamped;
    assign clamped = `CLAMP(coord_i, 0, `FIXED_MASK);

    `UNUSED_VAR (clamped)

    // fractional part of the coordinate and the mirror inversion mask
    wire [`FIXED_FRAC-1:0] frac_bits   = coord_i[`FIXED_FRAC-1:0];
    wire [`FIXED_FRAC-1:0] mirror_mask = {`FIXED_FRAC{coord_i[`FIXED_FRAC]}};

    reg [`FIXED_FRAC-1:0] wrapped;

    always @(*) begin
        case (wrap_i)
            `TEX_WRAP_CLAMP:  wrapped = clamped[`FIXED_FRAC-1:0];
            `TEX_WRAP_MIRROR: wrapped = frac_bits ^ mirror_mask;
            default:          wrapped = frac_bits; // `TEX_WRAP_REPEAT
        endcase
    end

    assign coord_o = wrapped;

endmodule
|
|
@ -17,6 +17,7 @@ DBG_PRINT_FLAGS += -DDBG_PRINT_MEM
|
|||
DBG_PRINT_FLAGS += -DDBG_PRINT_OPAE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_AVS
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_SCOPE
|
||||
DBG_PRINT_FLAGS += -DDBG_PRINT_TEX
|
||||
|
||||
DBG_FLAGS += $(DBG_PRINT_FLAGS)
|
||||
DBG_FLAGS += -DDBG_CACHE_REQ_INFO
|
||||
|
@ -36,7 +37,8 @@ RTL_DIR=../rtl
|
|||
DPI_DIR=../dpi
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(DPI_DIR) -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include -I$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src -I$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl -I$(RTL_DIR)/fp_cores/fpnew/src
|
||||
RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE)
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR)/ -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/simulate $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
|
||||
SRCS = simulator.cpp main.cpp
|
||||
SRCS += $(DPI_DIR)/util_dpi.cpp $(DPI_DIR)/float_dpi.cpp
|
||||
|
@ -96,4 +98,4 @@ build-mt: gen-mt
|
|||
make -j -C obj_dir -f VVortex.mk
|
||||
|
||||
clean:
|
||||
rm -rf obj_dir
|
||||
rm -rf obj_dir
|
|
@ -33,7 +33,8 @@ CONFIG32 := -DNUM_CLUSTERS=8 -DNUM_CORES=4 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS
|
|||
CONFIG64 := -DNUM_CLUSTERS=8 -DNUM_CORES=8 -DL2_ENABLE=0 -DL3_ENABLE=0 $(CONFIGS)
|
||||
|
||||
FPU_INCLUDE = -I$(RTL_DIR)/fp_cores -I$(RTL_DIR)/fp_cores/altera/$(DEVICE_FAMILY)
|
||||
RTL_INCLUDE = -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache $(FPU_INCLUDE) -I$(RTL_DIR) -I$(RTL_DIR)/afu
|
||||
TEX_INCLUDE = -I$(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = -I$(RTL_DIR) -I$(RTL_DIR)/libs -I$(RTL_DIR)/interfaces -I$(RTL_DIR)/cache -I$(RTL_DIR)/afu $(FPU_INCLUDE) $(TEX_INCLUDE)
|
||||
|
||||
CFLAGS += $(RTL_INCLUDE)
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,12 +12,12 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Part, Family
|
||||
FAMILY = "Arria 10"
|
||||
DEVICE = 10AX115N3F40E2SG
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Executable Configuration
|
||||
SYN_ARGS = --parallel --read_settings_files=on
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG
|
|||
FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ DEVICE = 1SX280HN2F43E2VG
|
|||
FPU_CORE_PATH=$(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(RTL_DIR)/afu;$(RTL_DIR)/afu/ccip;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
|
@ -12,7 +12,9 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(FPU_INCLUDE);$(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
# Executable Configuration
|
||||
|
|
|
@ -12,7 +12,8 @@ FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/arria10
|
|||
#FPU_CORE_PATH = $(RTL_DIR)/fp_cores/altera/stratix10
|
||||
|
||||
FPU_INCLUDE = $(RTL_DIR)/fp_cores;$(FPU_CORE_PATH);$(RTL_DIR)/fp_cores/fpnew/src;$(RTL_DIR)/fp_cores/fpnew/src/fpu_div_sqrt_mvp/hdl;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/include;$(RTL_DIR)/fp_cores/fpnew/src/common_cells/src
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE)
|
||||
TEX_INCLUDE = $(RTL_DIR)/tex_unit
|
||||
RTL_INCLUDE = $(RTL_DIR);$(RTL_DIR)/libs;$(RTL_DIR)/interfaces;$(RTL_DIR)/cache;$(FPU_INCLUDE);$(TEX_INCLUDE)
|
||||
|
||||
PROJECT_FILES = $(PROJECT).qpf $(PROJECT).qsf
|
||||
|
||||
|
|
20
hw/unit_tests/cache/Makefile
vendored
|
@ -1,16 +1,16 @@
|
|||
PARAM += -DCACHE_SIZE=4096 -DWORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DNUM_BANKS=4 -DCREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
|
||||
|
||||
TOP = VX_cache
|
||||
|
||||
PARAMS += -DCACHE_SIZE=4096 -DWORD_SIZE=4 -DCACHE_LINE_SIZE=16 -DNUM_BANKS=4 -DCREQ_SIZE=4 -DMRVQ_SIZE=16 -DDFPQ_SIZE=16 -DSNRQ_SIZE=16 -DCWBQ_SIZE=4 -DDWBQ_SIZE=4 -DFQQ_SIZE=4
|
||||
|
||||
# control RTL debug print states
|
||||
DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
|
||||
-DDBG_PRINT_CORE_DCACHE \
|
||||
-DDBG_PRINT_CACHE_BANK \
|
||||
-DDBG_PRINT_CACHE_SNP \
|
||||
-DDBG_PRINT_CACHE_MSHR \
|
||||
-DDBG_PRINT_CACHE_MSHR \
|
||||
-DDBG_PRINT_CACHE_TAG \
|
||||
-DDBG_PRINT_CACHE_DATA \
|
||||
-DDBG_PRINT_MEM \
|
||||
-DDBG_PRINT_MEM \
|
||||
-DDBG_PRINT_OPAE \
|
||||
-DDBG_PRINT_AVS
|
||||
|
||||
|
@ -18,29 +18,27 @@ DBG_PRINT_FLAGS = -DDBG_PRINT_CORE_ICACHE \
|
|||
|
||||
INCLUDE = -I../../rtl/ -I../../rtl/cache -I../../rtl/libs
|
||||
|
||||
|
||||
SRCS = cachesim.cpp testbench.cpp
|
||||
|
||||
all: build
|
||||
|
||||
CF += -std=c++11 -fms-extensions -I../..
|
||||
CF += $(PARAMS)
|
||||
|
||||
VF += --language 1800-2009 --assert -Wall --trace #-Wpedantic
|
||||
VF += -Wno-DECLFILENAME
|
||||
VF += --x-initial unique
|
||||
VF += -exe $(SRCS) $(INCLUDE)
|
||||
|
||||
DBG += -DVCD_OUTPUT $(DBG_PRINT)
|
||||
|
||||
VF += $(PARAMS)
|
||||
|
||||
gen:
|
||||
verilator $(VF) -DNDEBUG -cc VX_cache.v $(PARAM) -CFLAGS '$(CF) -DNDEBUG $(PARAM)' --exe $(SRCS)
|
||||
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
|
||||
|
||||
build: gen
|
||||
(cd obj_dir && make -j -f VVX_cache.mk)
|
||||
(cd obj_dir && make -j -f V$(TOP).mk)
|
||||
|
||||
run: build
|
||||
(cd obj_dir && ./VVX_cache)
|
||||
(cd obj_dir && ./V$(TOP))
|
||||
|
||||
clean:
|
||||
rm -rf obj_dir
|
||||
|
|
10
hw/unit_tests/cache/cachesim.cpp
vendored
|
@ -173,10 +173,10 @@ void CacheSim::stall_mem(){
|
|||
}
|
||||
|
||||
void CacheSim::send_snoop_req(){
|
||||
cache_->snp_req_valid = 1;
|
||||
/*cache_->snp_req_valid = 1;
|
||||
cache_->snp_req_addr = 0x12222222;
|
||||
cache_->snp_req_invalidate = 1;
|
||||
cache_->snp_req_tag = 0xff;
|
||||
cache_->snp_req_tag = 0xff; */
|
||||
}
|
||||
|
||||
void CacheSim::eval_mem_bus() {
|
||||
|
@ -274,9 +274,9 @@ bool CacheSim::assert_equal(unsigned int* data, unsigned int tag){
|
|||
//DEBUG
|
||||
|
||||
void CacheSim::display_miss(){
|
||||
int i = (unsigned int)cache_->miss_vec;
|
||||
std::bitset<8> x(i);
|
||||
if (i) std::cout << "Miss Vec " << x << std::endl;
|
||||
//int i = (unsigned int)cache_->miss_vec;
|
||||
//std::bitset<8> x(i);
|
||||
//if (i) std::cout << "Miss Vec " << x << std::endl;
|
||||
//std::cout << "Miss Vec 0" << cache_->miss_vec[0] << std::endl;
|
||||
}
|
||||
|
||||
|
|
|
@ -1,11 +1,30 @@
|
|||
all: testbench.iv
|
||||
TOP = VX_fifo_queue
|
||||
|
||||
testbench.iv: testbench.v
|
||||
iverilog testbench.v -o testbench.iv -I ../../rtl/
|
||||
PARAMS ?=
|
||||
|
||||
run: testbench.iv
|
||||
! vvp testbench.iv | grep 'ERROR' || false
|
||||
INCLUDE = -I../../rtl/ -I../../rtl/libs
|
||||
|
||||
SRCS = main.cpp
|
||||
|
||||
all: build
|
||||
|
||||
CF += -std=c++11 -fms-extensions -I../..
|
||||
VF += $(PARAMS)
|
||||
|
||||
VF += --language 1800-2009 --assert -Wall --trace
|
||||
VF += -Wno-DECLFILENAME
|
||||
VF += --x-initial unique
|
||||
VF += -exe $(SRCS) $(INCLUDE)
|
||||
VF += $(PARAMS)
|
||||
|
||||
gen:
|
||||
verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)
|
||||
|
||||
build: gen
|
||||
(cd obj_dir && make -j -f V$(TOP).mk)
|
||||
|
||||
run: build
|
||||
(cd obj_dir && ./V$(TOP))
|
||||
|
||||
clean:
|
||||
rm testbench.iv
|
||||
|
||||
rm -rf obj_dir
|
||||
|
|
93
hw/unit_tests/generic_queue/main.cpp
Normal file
|
@ -0,0 +1,93 @@
|
|||
#include "vl_simulator.h"
|
||||
#include "VVX_fifo_queue.h"
|
||||
#include <iostream>
|
||||
|
||||
#define MAX_TICKS 20
|
||||
|
||||
// Test assertion: when `x` evaluates to false, prints "FAILED: <expression>"
// to stdout and aborts the process; when it is true, does nothing.
#define CHECK(x)                                    \
  do {                                              \
    if (!(x)) {                                     \
      std::cout << "FAILED: " << #x << std::endl;   \
      std::abort();                                 \
    }                                               \
  } while (false)
|
||||
|
||||
// Current simulation time, counted in clock half-periods (one per clk toggle).
uint64_t ticks = 0;

// Reports the current tick count as the simulation time.
// NOTE(review): presumably the time hook the Verilator runtime looks up by
// this name - confirm against the Verilator version in use.
double sc_time_stamp() {
  return static_cast<double>(ticks);
}
|
||||
|
||||
using Device = VVX_fifo_queue;
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
// Initialize Verilators variables
|
||||
Verilated::commandArgs(argc, argv);
|
||||
|
||||
vl_simulator<Device> sim;
|
||||
|
||||
// run test
|
||||
ticks = sim.reset(0);
|
||||
while (ticks < MAX_TICKS) {
|
||||
switch (ticks) {
|
||||
case 0:
|
||||
// initial values
|
||||
sim->pop = 0;
|
||||
sim->push = 0;
|
||||
ticks = sim.step(ticks, 2);
|
||||
break;
|
||||
case 2:
|
||||
// Verify outputs
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x1);
|
||||
// push 0xa
|
||||
sim->pop = 0;
|
||||
sim->push = 1;
|
||||
sim->data_in = 0xa;
|
||||
break;
|
||||
case 4:
|
||||
// verify outputs
|
||||
CHECK(sim->data_out == 0xa);
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x0);
|
||||
// push 0xb
|
||||
sim->pop = 0;
|
||||
sim->push = 1;
|
||||
sim->data_in = 0xb;
|
||||
break;
|
||||
case 6:
|
||||
// verify outputs
|
||||
CHECK(sim->data_out == 0xa);
|
||||
CHECK(sim->full == 0x1);
|
||||
CHECK(sim->empty == 0x0);
|
||||
// pop
|
||||
sim->pop = 1;
|
||||
sim->push = 0;
|
||||
break;
|
||||
case 8:
|
||||
// verify outputs
|
||||
CHECK(sim->data_out == 0xb);
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x0);
|
||||
// pop
|
||||
sim->pop = 1;
|
||||
sim->push = 0;
|
||||
break;
|
||||
case 10:
|
||||
// verify outputs
|
||||
CHECK(sim->full == 0x0);
|
||||
CHECK(sim->empty == 0x1);
|
||||
sim->pop = 0;
|
||||
sim->push = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
// advance clock
|
||||
ticks = sim.step(ticks, 2);
|
||||
}
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
81
hw/unit_tests/generic_queue/vl_simulator.h
Normal file
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include "verilated.h"
|
||||
|
||||
#ifdef VM_TRACE
|
||||
#include <verilated_vcd_c.h> // Trace file format header
|
||||
#endif
|
||||
|
||||
// Thin wrapper that owns one Verilated model `T` and drives its `clk` and
// `reset` inputs. When VM_TRACE is defined it also records every evaluated
// tick to "trace.vcd".
template <typename T>
class vl_simulator {
public:
  // Starts with clk and reset low; opens the VCD trace when tracing is on.
  vl_simulator() {
    top_.clk = 0;
    top_.reset = 0;
#ifdef VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
#endif
  }

  // Closes the trace (if any) and then finalizes the model.
  ~vl_simulator() {
#ifdef VM_TRACE
    tfp_.close();
#endif
    top_.final();
  }

  // Holds reset high for two ticks starting at `ticks`, then releases it.
  // Returns the tick count after the reset sequence.
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    const uint64_t after_reset = step(ticks, 2);
    top_.reset = 0;
    return after_reset;
  }

  // Runs `count` ticks. Each tick evaluates the model with the current
  // inputs, dumps the trace (if enabled), and only then toggles clk, so the
  // new clock level is seen by the next tick's eval().
  // Returns `ticks + count`.
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    for (uint32_t done = 0; done < count; ++done) {
      top_.eval();
#ifdef VM_TRACE
      tfp_.dump(ticks);
#endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the model's ports: `sim->port`.
  T* operator->() {
    return &top_;
  }

private:
  T top_;
#ifdef VM_TRACE
  VerilatedVcdC tfp_;
#endif
};
|
||||
|
||||
// Writes each argument, converted to uint32_t, into consecutive words of
// `sig`: the first argument goes to sig[0], the second to sig[1], and so on.
// `sig` must have room for at least sizeof...(args) words.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof...(Args)> words{
      static_cast<uint32_t>(std::forward<Args>(args))...};
  uint32_t* dst = sig;
  for (const uint32_t word : words) {
    *dst++ = word;
  }
}
|
||||
|
||||
// Compares consecutive words of `sig` against the arguments (each converted
// to uint32_t), starting at sig[0]. The first differing word decides the
// result: -1 if the signal word is smaller, 1 if it is larger. Returns 0 when
// all sizeof...(args) words match (including when no arguments are given).
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof...(Args)> expected{
      static_cast<uint32_t>(std::forward<Args>(args))...};
  const uint32_t* cur = sig;
  for (const uint32_t want : expected) {
    const uint32_t got = *cur++;
    if (got != want) {
      return (got < want) ? -1 : 1;
    }
  }
  return 0;
}
|
30
hw/unit_tests/tex_unit/tex_sampler/Makefile
Normal file
|
@ -0,0 +1,30 @@
|
|||
# Verilator unit test build for the module named by TOP.
TOP = VX_tex_sampler

# Extra flags forwarded to Verilator; may be set on the make command line.
PARAMS ?=

INCLUDE = -I../../../rtl/ -I../../../rtl/libs -I../../../rtl/tex_unit

SRCS = main.cpp

# C++ flags handed to Verilator's generated makefile via -CFLAGS.
CF += -std=c++11 -fms-extensions -I../..

# Verilator flags. $(PARAMS) and the C++ sources used to be passed twice
# (here and again below / on the gen command line); once is enough.
VF += $(PARAMS)
VF += --language 1800-2009 --assert -Wall --trace
VF += -Wno-DECLFILENAME
VF += --x-initial unique
VF += $(INCLUDE)

# None of these targets produce a file of the same name.
.PHONY: all gen build run clean

all: build

# Generate the C++ model and its makefile into obj_dir/.
gen:
	verilator $(VF) -cc $(TOP).v -CFLAGS '$(CF)' --exe $(SRCS)

# Compile the generated model together with $(SRCS).
build: gen
	(cd obj_dir && $(MAKE) -j -f V$(TOP).mk)

run: build
	(cd obj_dir && ./V$(TOP))

clean:
	rm -rf obj_dir
|
215
hw/unit_tests/tex_unit/tex_sampler/main.cpp
Normal file
|
@ -0,0 +1,215 @@
|
|||
#include "vl_simulator.h"
#include "VVX_tex_sampler.h"
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <map>
#include <utility>

#define MAX_TICKS 20
#define MAX_UNIT_CYCLES 5

// Number of sampler threads. The original defined this macro with no value,
// which makes every `[NUM_THREADS]` array declaration below ill-formed.
// NOTE(review): 4 is an assumption - it must match the NUM_THREADS the RTL is
// built with; override with -DNUM_THREADS=<n> if it differs.
#ifndef NUM_THREADS
#define NUM_THREADS 4
#endif

// Test assertion: prints the failed expression and aborts when `x` is false.
#define CHECK(x)                                    \
  do {                                              \
    if (x)                                          \
      break;                                        \
    std::cout << "FAILED: " << #x << std::endl;     \
    std::abort();                                   \
  } while (false)

// Current simulation time in clock half-periods.
uint64_t ticks = 0;

// Cycle-scheduled testbench for a Verilated model `T` exposing the tex
// sampler request/response ports. Unit tests are flattened into two maps
// (cycle -> stimulus, cycle -> expected outputs) by generate_test_vectors()
// and replayed by run().
template <typename T>
class testbench {
public:
  // Request-side stimulus applied on one cycle.
  struct Input {
    int input_cycle;  // cycle, relative to the start of its unit test
    bool req_valid;
    unsigned int req_wid;
    unsigned int req_tmask;
    unsigned int req_PC;
    unsigned int req_rd;
    unsigned int req_wb;
    unsigned int req_filter;
    unsigned int req_format;
    unsigned int req_u[NUM_THREADS];
    unsigned int req_v[NUM_THREADS];
    unsigned int req_texels[NUM_THREADS][4];
    bool rsp_ready;
  };

  // Expected port values checked on one cycle.
  struct Output {
    int output_cycle;  // cycle, relative to the start of its unit test
    // outputs
    bool req_ready;
    bool rsp_valid;
    unsigned int rsp_wid;
    unsigned int rsp_tmask;
    unsigned int rsp_PC;
    unsigned int rsp_rd;
    bool rsp_wb;
    unsigned int rsp_data[NUM_THREADS];
  };

  // One unit test: up to MAX_UNIT_CYCLES stimulus and expectation records.
  struct UnitTest {
    bool use_reset;
    unsigned int num_cycles;        // length of the test window in cycles
    bool use_cmodel;                // derive outputs[0] from unittest_Cmodel()
    struct Output outputs[MAX_UNIT_CYCLES];
    struct Input inputs[MAX_UNIT_CYCLES];
    unsigned int num_output_check;  // number of valid entries in `outputs`
    unsigned int check_output_cycle[MAX_UNIT_CYCLES];
  };

  // Fills test->outputs[0] from test->inputs[0] using a software model.
  // Only req_filter == 0 is modelled: each thread's response is its texel 0.
  // Only rsp_data and output_cycle are written; the other expected fields
  // keep whatever the caller stored in outputs[0].
  //
  // Fixes over the original: the result is written in place instead of
  // assigning a leaked heap buffer to an array member, and thread i writes
  // rsp_data[i] rather than always rsp_data[0].
  void unittest_Cmodel(struct UnitTest * test){
    const Input& in = test->inputs[0];
    Output& out = test->outputs[0];

    if (in.req_filter == 0){
      for (int i = 0; i < NUM_THREADS; i++)
        out.rsp_data[i] = in.req_texels[i][0];
    } else {
      // Other filter modes are not modelled yet (the original only had a
      // commented-out sketch that split texels into 0x00ff00ff lanes).
    }

    out.output_cycle = 1;
    test->num_output_check = 1;
    // NOTE(review): the original forced num_cycles to 1, which ends the test
    // window before cycle 1 and so never schedules the expectation above.
    // Keep the window at least two cycles long - confirm intended latency.
    if (test->num_cycles < 2)
      test->num_cycles = 2;
  }

  // Flattens `num_tests` unit tests into the cycle-indexed input/output maps.
  // With is_pipe == false the next test starts where the previous window
  // ended; with is_pipe == true it starts one cycle after the previous
  // test's last scheduled input.
  void generate_test_vectors(struct UnitTest * tests, int num_tests, bool is_pipe){
    // for all unit tests create output test vectors (w w/o c-model)
    int prev_test_cycle = 0;

    for (int i = 0; i < num_tests; i++)
    {
      unsigned int op_counter = 0;
      unsigned int ip_counter = 0;

      int test_cycle = 0;
      int last_ip_cycle = 0;

      // Work on a copy so the C model does not modify the caller's test.
      struct UnitTest curr_test = tests[i];

      if (curr_test.use_cmodel){
        unittest_Cmodel(&curr_test);
      }

      for (unsigned int j = 0; j < curr_test.num_cycles; j++)
      {
        // Bounds checks keep the counters inside the fixed-size arrays.
        if (ip_counter < MAX_UNIT_CYCLES
         && curr_test.inputs[ip_counter].input_cycle == test_cycle){
          // Insert the record that was just matched (the original inserted
          // inputs[j], which is a different element once cycles are skipped).
          input_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test.inputs[ip_counter]));
          last_ip_cycle = prev_test_cycle + test_cycle;
          ip_counter++;
        }

        if (op_counter < MAX_UNIT_CYCLES
         && op_counter < curr_test.num_output_check
         && curr_test.outputs[op_counter].output_cycle == test_cycle){
          output_map.insert(std::make_pair(prev_test_cycle + test_cycle, curr_test.outputs[op_counter]));
          op_counter++;
        }

        test_cycle++;
      }

      if(!is_pipe){
        prev_test_cycle += (test_cycle - 1);
      }
      else{
        prev_test_cycle = last_ip_cycle + 1;
      }
    }
  }

  // Resets the model, then replays the scheduled stimulus and checks the
  // scheduled expectations, one clock cycle per loop iteration, until
  // MAX_TICKS. A failed CHECK aborts; otherwise prints the pass banner.
  void run(){
    ticks = sim.reset(0);
    int cycle = 0;

    while (ticks < MAX_TICKS) {

      auto input = input_map.find(cycle);
      auto output = output_map.find(cycle);

      if (input != input_map.end()){
        const Input& in = input->second;
        sim->req_valid  = in.req_valid;
        sim->req_wid    = in.req_wid;
        sim->req_tmask  = in.req_tmask;
        sim->req_PC     = in.req_PC;
        sim->req_rd     = in.req_rd;
        sim->req_wb     = in.req_wb;
        sim->req_filter = in.req_filter;
        sim->req_format = in.req_format;
        // req_u / req_v are not driven (they were commented out originally).
        // NOTE(review): assumes req_texels is exposed as a flat array of
        // 32-bit words with thread t's texel k at word t*4+k - confirm
        // against the generated VVX_tex_sampler.h.
        for (int t = 0; t < NUM_THREADS; t++){
          for (int k = 0; k < 4; k++){
            sim->req_texels[t * 4 + k] = in.req_texels[t][k];
          }
        }
        sim->rsp_ready = in.rsp_ready;
      } else{
        std::cout << "Warning! No Input on Cycle " << cycle << std::endl;
      }

      if(output != output_map.end()){
        const Output& out = output->second;
        CHECK(sim->req_ready == out.req_ready);
        CHECK(sim->rsp_valid == out.rsp_valid);
        CHECK(sim->rsp_wid == out.rsp_wid);
        CHECK(sim->rsp_tmask == out.rsp_tmask);
        CHECK(sim->rsp_PC == out.rsp_PC);
        CHECK(sim->rsp_rd == out.rsp_rd);
        CHECK(sim->rsp_wb == out.rsp_wb);
        // Word-by-word equality. The original CHECK(vl_cmpw(...)) was
        // inverted: vl_cmpw returns 0 when the words match.
        // NOTE(review): assumes rsp_data is one 32-bit word per thread.
        for (int t = 0; t < NUM_THREADS; t++){
          CHECK(sim->rsp_data[t] == out.rsp_data[t]);
        }
      }

      cycle++;
      ticks = sim.step(ticks,2);
    }

    std::cout << "PASSED!" << std::endl;
    std::cout << "Simulation time: " << std::dec << ticks/2 << " cycles" << std::endl;
  }

private:
  vl_simulator<T> sim;
  std::map<int, struct Input> input_map;
  std::map<int, struct Output> output_map;
};

// Reports the current tick count as the simulation time.
double sc_time_stamp() {
  return ticks;
}

int main(int argc, char **argv) {
  // Initialize Verilator's variables
  Verilated::commandArgs(argc, argv);

  using Bench = testbench<VVX_tex_sampler>;
  Bench sampler_testbench;

  // The original passed an undeclared `tests` array. Provide one
  // stimulus-only test: a single request on cycle 0 with req_filter == 0 and
  // every thread enabled. No expected outputs are registered
  // (num_output_check == 0), so this run only exercises the sampler.
  // TODO(review): add expected Output records (or enable use_cmodel) once the
  // sampler's response timing and handshake values are confirmed.
  Bench::UnitTest tests[1] = {};
  tests[0].num_cycles = 2;
  Bench::Input& req = tests[0].inputs[0];
  req.input_cycle = 0;
  req.req_valid = true;
  req.req_tmask = (1u << NUM_THREADS) - 1;
  req.req_wb = 1;
  req.req_filter = 0;
  req.rsp_ready = true;
  for (int t = 0; t < NUM_THREADS; t++){
    for (int k = 0; k < 4; k++){
      req.req_texels[t][k] = 0x01010101u * (t * 4 + k + 1);
    }
  }

  sampler_testbench.generate_test_vectors(tests, 1, false);
  sampler_testbench.run();

  return 0;
}
|
81
hw/unit_tests/tex_unit/tex_sampler/vl_simulator.h
Normal file
|
@ -0,0 +1,81 @@
|
|||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <cstdint>
|
||||
#include "verilated.h"
|
||||
|
||||
#ifdef VM_TRACE
|
||||
#include <verilated_vcd_c.h> // Trace file format header
|
||||
#endif
|
||||
|
||||
// Thin wrapper that owns one Verilated model `T` and drives its `clk` and
// `reset` inputs. When VM_TRACE is defined it also records every evaluated
// tick to "trace.vcd".
template <typename T>
class vl_simulator {
public:
  // Starts with clk and reset low; opens the VCD trace when tracing is on.
  vl_simulator() {
    top_.clk = 0;
    top_.reset = 0;
#ifdef VM_TRACE
    Verilated::traceEverOn(true);
    top_.trace(&tfp_, 99);
    tfp_.open("trace.vcd");
#endif
  }

  // Closes the trace (if any) and then finalizes the model.
  ~vl_simulator() {
#ifdef VM_TRACE
    tfp_.close();
#endif
    top_.final();
  }

  // Holds reset high for two ticks starting at `ticks`, then releases it.
  // Returns the tick count after the reset sequence.
  uint64_t reset(uint64_t ticks) {
    top_.reset = 1;
    const uint64_t after_reset = step(ticks, 2);
    top_.reset = 0;
    return after_reset;
  }

  // Runs `count` ticks. Each tick evaluates the model with the current
  // inputs, dumps the trace (if enabled), and only then toggles clk, so the
  // new clock level is seen by the next tick's eval().
  // Returns `ticks + count`.
  uint64_t step(uint64_t ticks, uint32_t count = 1) {
    for (uint32_t done = 0; done < count; ++done) {
      top_.eval();
#ifdef VM_TRACE
      tfp_.dump(ticks);
#endif
      top_.clk = !top_.clk;
      ++ticks;
    }
    return ticks;
  }

  // Direct access to the model's ports: `sim->port`.
  T* operator->() {
    return &top_;
  }

private:
  T top_;
#ifdef VM_TRACE
  VerilatedVcdC tfp_;
#endif
};
|
||||
|
||||
// Writes each argument, converted to uint32_t, into consecutive words of
// `sig`: the first argument goes to sig[0], the second to sig[1], and so on.
// `sig` must have room for at least sizeof...(args) words.
template <typename... Args>
void vl_setw(uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof...(Args)> words{
      static_cast<uint32_t>(std::forward<Args>(args))...};
  uint32_t* dst = sig;
  for (const uint32_t word : words) {
    *dst++ = word;
  }
}
|
||||
|
||||
// Compares consecutive words of `sig` against the arguments (each converted
// to uint32_t), starting at sig[0]. The first differing word decides the
// result: -1 if the signal word is smaller, 1 if it is larger. Returns 0 when
// all sizeof...(args) words match (including when no arguments are given).
template <typename... Args>
int vl_cmpw(const uint32_t* sig, Args&&... args) {
  const std::array<uint32_t, sizeof...(Args)> expected{
      static_cast<uint32_t>(std::forward<Args>(args))...};
  const uint32_t* cur = sig;
  for (const uint32_t want : expected) {
    const uint32_t got = *cur++;
    if (got != want) {
      return (got < want) ? -1 : 1;
    }
  }
  return 0;
}
|
|
@ -5,7 +5,62 @@
|
|||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
|
||||
#endif
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
#else
|
||||
#define __ASM_STR(x) #x
|
||||
#endif
|
||||
|
||||
#define vx_csr_swap(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrw %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_read(csr) ({ \
|
||||
register unsigned __v; \
|
||||
__asm__ __volatile__ ("csrr %0, " __ASM_STR(csr) : "=r" (__v) :: "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_write(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrw " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
#define vx_csr_read_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrs %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_set(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrs " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
#define vx_csr_read_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrrc %0, " __ASM_STR(csr) ", %1" : "=r" (__v) : "rK" (__v) : "memory"); \
|
||||
__v; \
|
||||
})
|
||||
|
||||
#define vx_csr_clear(csr, val) ({ \
|
||||
unsigned __v = (unsigned )(val); \
|
||||
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, l) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __u = u; \
|
||||
unsigned __v = v; \
|
||||
unsigned __l = l; \
|
||||
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
#ifdef __ASSEMBLY__
|
||||
#define __ASM_STR(x) x
|
||||
|
@ -52,6 +107,16 @@ extern "C" {
|
|||
__asm__ __volatile__ ("csrc " __ASM_STR(csr) ", %0" :: "rK" (__v) : "memory"); \
|
||||
})
|
||||
|
||||
// Texture load
|
||||
#define vx_tex(unit, u, v, l) ({ \
|
||||
unsigned __r; \
|
||||
unsigned __u = u; \
|
||||
unsigned __v = v; \
|
||||
unsigned __l = l; \
|
||||
__asm__ __volatile__ (".insn r4 0x6b, 5, " __ASM_STR(unit) ", %0, %1, %2, %3" : "=r"(__r) : "r"(__u), "r"(__v), "r"(__l)); \
|
||||
__r; \
|
||||
})
|
||||
|
||||
// Set thread mask
|
||||
inline void vx_tmc(unsigned num_threads) {
|
||||
asm volatile (".insn s 0x6b, 0, x0, 0(%0)" :: "r"(num_threads));
|
||||
|
@ -76,7 +141,7 @@ inline void vx_join() {
|
|||
|
||||
// Warp Barrier
|
||||
inline void vx_barrier(unsigned barried_id, unsigned num_warps) {
|
||||
asm volatile (".insn s 0x6b, 4, %1, 0cd (%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
asm volatile (".insn s 0x6b, 4, %1, 0(%0)" :: "r"(barried_id), "r"(num_warps));
|
||||
}
|
||||
|
||||
// Return active warp's thread id
|
||||
|
|
|
@ -555,20 +555,18 @@ void Warp::execute(const Instr &instr, Pipeline *pipeline) {
|
|||
|
||||
// FSGNJ.S, FSGNJN.S, FSGNJX.S
|
||||
case 0x10: {
|
||||
bool fsign1 = rsdata[0] & 0x80000000;
|
||||
bool fsign1 = (rsdata[0] >> 31);
|
||||
uint32_t fdata1 = rsdata[0] & 0x7FFFFFFF;
|
||||
bool fsign2 = rsdata[1] & 0x80000000;
|
||||
bool fsign2 = (rsdata[1] >> 31);
|
||||
switch (func3) {
|
||||
case 0: // FSGNJ.S
|
||||
rddata = (fsign2 << 31) | fdata1;
|
||||
break;
|
||||
case 1: // FSGNJN.S
|
||||
fsign2 = !fsign2;
|
||||
rddata = (fsign2 << 31) | fdata1;
|
||||
rddata = (!fsign2 << 31) | fdata1;
|
||||
break;
|
||||
case 2: { // FSGNJX.S
|
||||
bool sign = fsign1 ^ fsign2;
|
||||
rddata = (sign << 31) | fdata1;
|
||||
rddata = ((fsign1 ^ fsign2) << 31) | fdata1;
|
||||
} break;
|
||||
}
|
||||
} break;
|
||||
|
|
|
@ -7,6 +7,7 @@ all:
|
|||
$(MAKE) -C printf
|
||||
$(MAKE) -C diverge
|
||||
$(MAKE) -C fence
|
||||
$(MAKE) -C tex
|
||||
|
||||
run-simx:
|
||||
$(MAKE) -C basic run-simx
|
||||
|
@ -17,6 +18,7 @@ run-simx:
|
|||
$(MAKE) -C printf run-simx
|
||||
$(MAKE) -C diverge run-simx
|
||||
$(MAKE) -C fence run-simx
|
||||
$(MAKE) -C tex run-simx
|
||||
|
||||
run-vlsim:
|
||||
$(MAKE) -C basic run-vlsim
|
||||
|
@ -27,6 +29,7 @@ run-vlsim:
|
|||
$(MAKE) -C printf run-vlsim
|
||||
$(MAKE) -C diverge run-vlsim
|
||||
$(MAKE) -C fence run-vlsim
|
||||
$(MAKE) -C tex run-vlsim
|
||||
|
||||
clean:
|
||||
$(MAKE) -C basic clean
|
||||
|
@ -37,6 +40,7 @@ clean:
|
|||
$(MAKE) -C printf clean
|
||||
$(MAKE) -C diverge clean
|
||||
$(MAKE) -C fence clean
|
||||
$(MAKE) -C tex clean
|
||||
|
||||
clean-all:
|
||||
$(MAKE) -C basic clean-all
|
||||
|
@ -47,4 +51,5 @@ clean-all:
|
|||
$(MAKE) -C printf clean-all
|
||||
$(MAKE) -C diverge clean-all
|
||||
$(MAKE) -C fence clean-all
|
||||
$(MAKE) -C tex clean-all
|
||||
|
||||
|
|
70
tests/regression/tex/Makefile
Normal file
|
@ -0,0 +1,70 @@
|
|||
# Builds the `tex` regression test: a host program ($(PROJECT)) plus a RISC-V
# kernel image (kernel.bin) and its disassembly (kernel.dump).
RISCV_TOOLCHAIN_PATH ?= /opt/riscv-gnu-toolchain
VORTEX_DRV_PATH ?= $(realpath ../../../driver)
VORTEX_RT_PATH ?= $(wildcard ../../../runtime)

# Arguments passed to the host program by the run-* targets.
OPTS ?= -f1

VX_CC  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-gcc
VX_CXX = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-g++
VX_DP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objdump
VX_CP  = $(RISCV_TOOLCHAIN_PATH)/bin/riscv32-unknown-elf-objcopy

VX_CFLAGS += -march=rv32imf -mabi=ilp32f -O3 -Wstack-usage=1024 -ffreestanding -nostartfiles -fdata-sections -ffunction-sections
VX_CFLAGS += -I$(VORTEX_RT_PATH)/include -I$(VORTEX_RT_PATH)/../hw

VX_LDFLAGS += -Wl,-Bstatic,-T,$(VORTEX_RT_PATH)/linker/vx_link.ld -Wl,--gc-sections $(VORTEX_RT_PATH)/libvortexrt.a

VX_SRCS = kernel.c

#CXXFLAGS += -std=c++11 -O2 -Wall -Wextra -Wfatal-errors
CXXFLAGS += -std=c++11 -O0 -g -Wall -Wextra -Wfatal-errors

CXXFLAGS += -I$(VORTEX_DRV_PATH)/include

LDFLAGS += -L$(VORTEX_DRV_PATH)/stub -lvortex

PROJECT = tex

SRCS = main.cpp utils.cpp

# Targets that do not name a file they create.
.PHONY: all run-simx run-fpga run-asesim run-vlsim run-rtlsim clean clean-all

all: $(PROJECT) kernel.bin kernel.dump

kernel.dump: kernel.elf
	$(VX_DP) -D kernel.elf > kernel.dump

kernel.bin: kernel.elf
	$(VX_CP) -O binary kernel.elf kernel.bin

kernel.elf: $(VX_SRCS)
	$(VX_CC) $(VX_CFLAGS) $(VX_SRCS) $(VX_LDFLAGS) -o kernel.elf

$(PROJECT): $(SRCS)
	$(CXX) $(CXXFLAGS) $^ $(LDFLAGS) -o $@

# Each run-* target selects a driver backend through LD_LIBRARY_PATH.
run-simx: $(PROJECT) kernel.bin
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/simx:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-fpga: $(PROJECT) kernel.bin
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-asesim: $(PROJECT) kernel.bin
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/ase:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-vlsim: $(PROJECT) kernel.bin
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/opae/vlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

run-rtlsim: $(PROJECT) kernel.bin
	LD_LIBRARY_PATH=$(POCL_RT_PATH)/lib:$(VORTEX_DRV_PATH)/rtlsim:$(LD_LIBRARY_PATH) ./$(PROJECT) $(OPTS)

.depend: $(SRCS)
	$(CXX) $(CXXFLAGS) -MM $^ > .depend;

clean:
	rm -rf $(PROJECT) *.o .depend

clean-all: clean
	rm -rf *.elf *.bin *.dump

# Skip the dependency file for both cleaning goals: including it would make
# `make clean-all` regenerate .depend only to delete it again.
ifeq ($(filter clean clean-all,$(MAKECMDGOALS)),)
-include .depend
endif
|
260
tests/regression/tex/blitter.h
Normal file
|
@ -0,0 +1,260 @@
|
|||
#include "format.h"
|
||||
|
||||
struct SurfaceDesc {
|
||||
ePixelFormat Format;
|
||||
uint8_t *pBits;
|
||||
uint32_t Width;
|
||||
uint32_t Height;
|
||||
uint32_t Pitch;
|
||||
};
|
||||
|
||||
class BlitTable {
|
||||
public:
|
||||
typedef int (*PfnCopy)(const SurfaceDesc &dstDesc,
|
||||
uint32_t dstOffsetX,
|
||||
uint32_t dstOffsetY,
|
||||
uint32_t copyWidth,
|
||||
uint32_t copyHeight,
|
||||
const SurfaceDesc &srcDesc,
|
||||
uint32_t srcOffsetX,
|
||||
uint32_t srcOffsetY);
|
||||
|
||||
BlitTable() {
|
||||
for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) {
|
||||
for (uint32_t d = 0; d < FORMAT_COLOR_SIZE_; ++d) {
|
||||
copyFuncs_[s][d] = CopyInvalid;
|
||||
}
|
||||
}
|
||||
|
||||
for (uint32_t s = 0; s < FORMAT_COLOR_SIZE_; ++s) {
|
||||
switch (s) {
|
||||
case FORMAT_A8:
|
||||
case FORMAT_L8:
|
||||
copyFuncs_[s][s] = CopyFast<uint8_t>;
|
||||
break;
|
||||
|
||||
case FORMAT_A8L8:
|
||||
copyFuncs_[FORMAT_A8L8][FORMAT_A8] = Copy<FORMAT_A8L8, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_A8L8][FORMAT_A8L8] = CopyFast<uint16_t>;
|
||||
break;
|
||||
|
||||
case FORMAT_R5G6B5:
|
||||
copyFuncs_[FORMAT_R5G6B5][FORMAT_L8] = Copy<FORMAT_R5G6B5, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_R5G6B5][FORMAT_R5G6B5] = CopyFast<uint16_t>;
|
||||
copyFuncs_[FORMAT_R5G6B5][FORMAT_R8G8B8] =
|
||||
Copy<FORMAT_R5G6B5, FORMAT_R8G8B8>;
|
||||
copyFuncs_[FORMAT_R5G6B5][FORMAT_B8G8R8] =
|
||||
Copy<FORMAT_R5G6B5, FORMAT_B8G8R8>;
|
||||
copyFuncs_[FORMAT_R5G6B5][FORMAT_A8B8G8R8] =
|
||||
Copy<FORMAT_R5G6B5, FORMAT_A8B8G8R8>;
|
||||
copyFuncs_[FORMAT_R5G6B5][FORMAT_A8R8G8B8] =
|
||||
Copy<FORMAT_R5G6B5, FORMAT_A8R8G8B8>;
|
||||
break;
|
||||
|
||||
case FORMAT_A1R5G5B5:
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_L8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8L8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_A8L8>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R8G8B8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_R8G8B8>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8R8G8B8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_A8R8G8B8>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R5G5B5A1] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_R5G5B5A1>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_R4G4B4A4] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_R4G4B4A4>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_B8G8R8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_B8G8R8>;
|
||||
copyFuncs_[FORMAT_A1R5G5B5][FORMAT_A8B8G8R8] =
|
||||
Copy<FORMAT_A1R5G5B5, FORMAT_A8B8G8R8>;
|
||||
break;
|
||||
|
||||
case FORMAT_A4R4G4B4:
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_L8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8L8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_A8L8>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R8G8B8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_R8G8B8>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8R8G8B8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_A8R8G8B8>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R5G5B5A1] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_R5G5B5A1>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_R4G4B4A4] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_R4G4B4A4>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_B8G8R8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_B8G8R8>;
|
||||
copyFuncs_[FORMAT_A4R4G4B4][FORMAT_A8B8G8R8] =
|
||||
Copy<FORMAT_A4R4G4B4, FORMAT_A8B8G8R8>;
|
||||
break;
|
||||
|
||||
case FORMAT_R8G8B8:
|
||||
copyFuncs_[FORMAT_R8G8B8][FORMAT_L8] = Copy<FORMAT_R8G8B8, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_R8G8B8][FORMAT_R5G6B5] =
|
||||
Copy<FORMAT_R8G8B8, FORMAT_R5G6B5>;
|
||||
copyFuncs_[FORMAT_R8G8B8][FORMAT_R8G8B8] = CopyFast<uint24_t>;
|
||||
copyFuncs_[FORMAT_R8G8B8][FORMAT_B8G8R8] =
|
||||
Copy<FORMAT_R8G8B8, FORMAT_B8G8R8>;
|
||||
copyFuncs_[FORMAT_R8G8B8][FORMAT_A8B8G8R8] =
|
||||
Copy<FORMAT_R8G8B8, FORMAT_A8B8G8R8>;
|
||||
copyFuncs_[FORMAT_R8G8B8][FORMAT_A8R8G8B8] =
|
||||
Copy<FORMAT_R8G8B8, FORMAT_A8R8G8B8>;
|
||||
break;
|
||||
|
||||
case FORMAT_A8R8G8B8:
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_L8] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8L8] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_A8L8>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G6B5] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_R5G6B5>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R8G8B8] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_R8G8B8>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8R8G8B8] = CopyFast<uint32_t>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R5G5B5A1] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_R5G5B5A1>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_R4G4B4A4] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_R4G4B4A4>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_B8G8R8] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_B8G8R8>;
|
||||
copyFuncs_[FORMAT_A8R8G8B8][FORMAT_A8B8G8R8] =
|
||||
Copy<FORMAT_A8R8G8B8, FORMAT_A8B8G8R8>;
|
||||
break;
|
||||
|
||||
case FORMAT_R5G5B5A1:
|
||||
copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8] =
|
||||
Copy<FORMAT_R5G5B5A1, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_R5G5B5A1][FORMAT_L8] =
|
||||
Copy<FORMAT_R5G5B5A1, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_R5G5B5A1][FORMAT_A8L8] =
|
||||
Copy<FORMAT_R5G5B5A1, FORMAT_A8L8>;
|
||||
copyFuncs_[FORMAT_R5G5B5A1][FORMAT_RGB] =
|
||||
Copy<FORMAT_R5G5B5A1, FORMAT_RGB>;
|
||||
copyFuncs_[FORMAT_R5G5B5A1][FORMAT_ARGB] =
|
||||
Copy<FORMAT_R5G5B5A1, FORMAT_ARGB>;
|
||||
break;
|
||||
|
||||
case FORMAT_R4G4B4A4:
|
||||
copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8] =
|
||||
Copy<FORMAT_R4G4B4A4, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_R4G4B4A4][FORMAT_L8] =
|
||||
Copy<FORMAT_R4G4B4A4, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_R4G4B4A4][FORMAT_A8L8] =
|
||||
Copy<FORMAT_R4G4B4A4, FORMAT_A8L8>;
|
||||
copyFuncs_[FORMAT_R4G4B4A4][FORMAT_RGB] =
|
||||
Copy<FORMAT_R4G4B4A4, FORMAT_RGB>;
|
||||
copyFuncs_[FORMAT_R4G4B4A4][FORMAT_ARGB] =
|
||||
Copy<FORMAT_R4G4B4A4, FORMAT_ARGB>;
|
||||
break;
|
||||
|
||||
case FORMAT_B8G8R8:
|
||||
copyFuncs_[FORMAT_B8G8R8][FORMAT_L8] = Copy<FORMAT_B8G8R8, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_B8G8R8][FORMAT_RGB] = Copy<FORMAT_B8G8R8, FORMAT_RGB>;
|
||||
break;
|
||||
|
||||
case FORMAT_A8B8G8R8:
|
||||
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8] =
|
||||
Copy<FORMAT_A8B8G8R8, FORMAT_A8>;
|
||||
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_L8] =
|
||||
Copy<FORMAT_A8B8G8R8, FORMAT_L8>;
|
||||
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_A8L8] =
|
||||
Copy<FORMAT_A8B8G8R8, FORMAT_A8L8>;
|
||||
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_RGB] =
|
||||
Copy<FORMAT_A8B8G8R8, FORMAT_RGB>;
|
||||
copyFuncs_[FORMAT_A8B8G8R8][FORMAT_ARGB] =
|
||||
Copy<FORMAT_A8B8G8R8, FORMAT_ARGB>;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
PfnCopy get(uint32_t srcFormat, uint32_t dstFormat) const {
|
||||
assert(srcFormat < FORMAT_COLOR_SIZE_);
|
||||
assert(dstFormat < FORMAT_COLOR_SIZE_);
|
||||
return copyFuncs_[srcFormat][dstFormat];
|
||||
}
|
||||
|
||||
private:
|
||||
template <ePixelFormat SrcFormat, ePixelFormat DstFormat>
|
||||
static int Copy(const SurfaceDesc &dstDesc,
|
||||
uint32_t dstOffsetX,
|
||||
uint32_t dstOffsetY,
|
||||
uint32_t copyWidth,
|
||||
uint32_t copyHeight,
|
||||
const SurfaceDesc &srcDesc,
|
||||
uint32_t srcOffsetX,
|
||||
uint32_t srcOffsetY) {
|
||||
auto srcBPP = TFormatInfo<SrcFormat>::CBSIZE;
|
||||
auto dstBPP = TFormatInfo<DstFormat>::CBSIZE;
|
||||
auto srcNextLine = srcDesc.Pitch;
|
||||
auto dstNextLine = dstDesc.Pitch;
|
||||
|
||||
auto pbSrc = srcDesc.pBits + srcOffsetX * srcBPP + srcOffsetY * srcDesc.Pitch;
|
||||
auto pbDst = dstDesc.pBits + dstOffsetX * dstBPP + dstOffsetY * dstDesc.Pitch;
|
||||
|
||||
while (copyHeight--) {
|
||||
auto pSrc = reinterpret_cast<const typename TFormatInfo<SrcFormat>::TYPE *>(pbSrc);
|
||||
for (auto *pDst = reinterpret_cast<typename TFormatInfo<DstFormat>::TYPE *>(
|
||||
pbDst),
|
||||
*const pEnd = pDst + copyWidth;
|
||||
pDst != pEnd; ++pDst, ++pSrc) {
|
||||
auto tmp = Format::ConvertFrom<SrcFormat, true>(pSrc);
|
||||
Format::ConvertTo<DstFormat>(pDst, tmp);
|
||||
}
|
||||
|
||||
pbSrc += srcNextLine;
|
||||
pbDst += dstNextLine;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <typename Type>
|
||||
static int CopyFast(const SurfaceDesc &dstDesc,
|
||||
uint32_t dstOffsetX,
|
||||
uint32_t dstOffsetY,
|
||||
uint32_t copyWidth,
|
||||
uint32_t copyHeight,
|
||||
const SurfaceDesc &srcDesc,
|
||||
uint32_t srcOffsetX,
|
||||
uint32_t srcOffsetY) {
|
||||
auto nBPP = sizeof(Type);
|
||||
auto srcNextLine = srcDesc.Pitch;
|
||||
auto dstNextLine = dstDesc.Pitch;
|
||||
|
||||
auto pbSrc = srcDesc.pBits + srcOffsetX * nBPP + srcOffsetY * srcDesc.Pitch;
|
||||
auto pbDst = dstDesc.pBits + dstOffsetX * nBPP + dstOffsetY * dstDesc.Pitch;
|
||||
|
||||
while (copyHeight--) {
|
||||
auto pSrc = reinterpret_cast<const Type *>(pbSrc);
|
||||
for (auto *pDst = reinterpret_cast<Type *>(pbDst), *const pEnd = pDst + copyWidth;
|
||||
pDst != pEnd; ++pDst, ++pSrc) {
|
||||
*pDst = *pSrc;
|
||||
}
|
||||
pbSrc += srcNextLine;
|
||||
pbDst += dstNextLine;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int CopyInvalid(const SurfaceDesc & /*dstDesc*/,
|
||||
uint32_t /*dstOffsetX*/,
|
||||
uint32_t /*dstOffsetY*/,
|
||||
uint32_t /*copyWidth*/,
|
||||
uint32_t /*copyHeight*/,
|
||||
const SurfaceDesc & /*srcDesc*/,
|
||||
uint32_t /*srcOffsetX*/,
|
||||
uint32_t /*srcOffsetY*/)
|
||||
{
|
||||
std::cout << "Error: invalid format" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
PfnCopy copyFuncs_[FORMAT_COLOR_SIZE_][FORMAT_COLOR_SIZE_];
|
||||
};
|
68
tests/regression/tex/color.h
Normal file
|
@ -0,0 +1,68 @@
|
|||
//
|
||||
// Copyright (c) Blaise Tine. All rights reserved.
|
||||
//
|
||||
//
|
||||
// Use of this sample source code is subject to the terms of the Microsoft
|
||||
// license agreement under which you licensed this sample source code. If
|
||||
// you did not accept the terms of the license agreement, you are not
|
||||
// authorized to use this sample source code. For the terms of the license,
|
||||
// please see the license agreement between you and Microsoft or, if applicable,
|
||||
// see the LICENSE.RTF on your install media or the root of your tools
|
||||
// installation.
|
||||
// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR
|
||||
// INDEMNITIES.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
#include <assert.h>
|
||||
|
||||
// 32-bit colour stored as four 8-bit channels that alias a single uint32_t.
// In memory the channels are laid out in the order b, g, r, a (m[0]..m[3]).
struct ColorARGB {
  union {
    struct {
      uint32_t value;
    };
    struct {
      uint8_t b, g, r, a;
    };
    struct {
      uint8_t m[4];
    };
  };

  // Starts from a fully defined (all-zero) colour instead of indeterminate bytes.
  ColorARGB() {
    this->value = 0;
  }

  // Builds a colour from four channel values, each expected in [0, 255].
  ColorARGB(int a, int r, int g, int b) {
    assert((a >= 0) && (a <= 0xff));
    assert((r >= 0) && (r <= 0xff));
    assert((g >= 0) && (g <= 0xff));
    assert((b >= 0) && (b <= 0xff));

    this->b = static_cast<uint8_t>(b);
    this->g = static_cast<uint8_t>(g);
    this->r = static_cast<uint8_t>(r);
    this->a = static_cast<uint8_t>(a);
  }

  // Builds a colour from three channel values. Alpha is set to 0xff (opaque)
  // so that `value` is fully defined; it used to be left uninitialised.
  ColorARGB(int r, int g, int b) {
    assert((r >= 0) && (r <= 0xff));
    assert((g >= 0) && (g <= 0xff));
    assert((b >= 0) && (b <= 0xff));

    this->b = static_cast<uint8_t>(b);
    this->g = static_cast<uint8_t>(g);
    this->r = static_cast<uint8_t>(r);
    this->a = 0xff;
  }

  // Builds a colour from its packed 32-bit representation.
  // Intentionally implicit: existing callers convert from integers.
  ColorARGB(int value) {
    this->value = value;
  }

  ColorARGB(const ColorARGB &rhs) = default;

  // Returns *this so that assignments can be chained (it used to return void).
  ColorARGB &operator=(const ColorARGB &rhs) {
    this->value = rhs.value;
    return *this;
  }

  // Packed 32-bit representation.
  operator uint32_t() const {
    return this->value;
  }
};
|
25
tests/regression/tex/common.h
Normal file
|
@ -0,0 +1,25 @@
|
|||
#ifndef _COMMON_H_
|
||||
#define _COMMON_H_
|
||||
|
||||
#define KERNEL_ARG_DEV_MEM_ADDR 0x7ffff000
|
||||
|
||||
// Arguments that the host writes at KERNEL_ARG_DEV_MEM_ADDR and the device
// kernel reads back. Host and kernel must be built from the same definition:
// rebuild the kernel binary whenever this layout changes.
struct kernel_arg_t {
  uint32_t num_tasks;     // number of tasks spawned by the kernel
  uint8_t  format;        // texture format index
  uint8_t  filter;        // 0: point sampling, non-zero: filtered (2 selects the tex3 path)
  uint8_t  wrap;          // addressing mode, applied to both axes
  uint8_t  use_sw;        // non-zero: use the software sampler instead of vx_tex
  uint32_t lod;           // level-of-detail value passed to the sampler
  uint8_t  src_logWidth;  // log2 of the source width
  uint8_t  src_logHeight; // log2 of the source height
  uint8_t  src_stride;    // source bytes per pixel
  // Source bytes per row. Was uint8_t, which truncated bpp * width
  // (e.g. 4 * 64 = 256 became 0); now 32-bit like dst_pitch.
  uint32_t src_pitch;
  uint32_t src_ptr;       // device address of the source texels
  uint32_t dst_width;     // destination width in pixels
  uint32_t dst_height;    // destination height in pixels
  uint8_t  dst_stride;    // destination bytes per pixel
  uint32_t dst_pitch;     // destination bytes per row
  uint32_t dst_ptr;       // device address of the destination pixels
};
|
||||
|
||||
#endif
|
BIN
tests/regression/tex/earth.tga
Normal file
After Width: | Height: | Size: 48 MiB |
BIN
tests/regression/tex/flower.tga
Normal file
After Width: | Height: | Size: 16 MiB |
BIN
tests/regression/tex/football.tga
Normal file
After Width: | Height: | Size: 12 KiB |
1022
tests/regression/tex/format.h
Normal file
37
tests/regression/tex/int24.h
Normal file
|
@ -0,0 +1,37 @@
|
|||
//
|
||||
// Copyright (c) Blaise Tine. All rights reserved.
|
||||
//
|
||||
//
|
||||
// Use of this sample source code is subject to the terms of the Microsoft
|
||||
// license agreement under which you licensed this sample source code. If
|
||||
// you did not accept the terms of the license agreement, you are not
|
||||
// authorized to use this sample source code. For the terms of the license,
|
||||
// please see the license agreement between you and Microsoft or, if applicable,
|
||||
// see the LICENSE.RTF on your install media or the root of your tools
|
||||
// installation.
|
||||
// THE SAMPLE SOURCE CODE IS PROVIDED "AS IS", WITH NO WARRANTIES OR
|
||||
// INDEMNITIES.
|
||||
//
|
||||
#pragma once
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
// Three-byte unsigned integer; m[0] holds the least significant byte.
struct uint24_t {
  uint8_t m[3];

  // Keeps the low 24 bits of `value`.
  explicit uint24_t(uint32_t value)
    : m{static_cast<uint8_t>(value & 0xff),
        static_cast<uint8_t>((value >> 8) & 0xff),
        static_cast<uint8_t>((value >> 16) & 0xff)}
  {}

  // Builds the value from its three bytes, least significant first.
  explicit uint24_t(uint8_t x, uint8_t y, uint8_t z)
    : m{x, y, z}
  {}

  // Widens back to 32 bits (the top byte is zero).
  operator uint32_t() const {
    uint32_t packed = 0;
    for (int i = 2; i >= 0; --i) {
      packed = (packed << 8) | m[i];
    }
    return packed;
  }
};
|
BIN
tests/regression/tex/kernel.bin
Executable file
67
tests/regression/tex/kernel.c
Normal file
|
@ -0,0 +1,67 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include "common.h"
|
||||
#include "texsw.h"
|
||||
|
||||
#define ENABLE_SW
|
||||
|
||||
struct tile_arg_t {
|
||||
struct kernel_arg_t* state;
|
||||
uint32_t tile_width;
|
||||
uint32_t tile_height;
|
||||
float deltaX;
|
||||
float deltaY;
|
||||
};
|
||||
|
||||
void kernel_body(int task_id, void* arg) {
|
||||
struct tile_arg_t* _arg = (struct tile_arg_t*)(arg);
|
||||
struct kernel_arg_t* state = _arg->state;
|
||||
|
||||
uint32_t xoffset = 0;
|
||||
uint32_t yoffset = task_id * _arg->tile_height;
|
||||
uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
|
||||
|
||||
float fv = yoffset * _arg->deltaY;
|
||||
for (uint32_t y = 0; y < _arg->tile_height; ++y) {
|
||||
uint32_t* dst_row = (uint32_t*)dst_ptr;
|
||||
float fu = xoffset * _arg->deltaX;
|
||||
for (uint32_t x = 0; x < _arg->tile_width; ++x) {
|
||||
int32_t u = (int32_t)(fu * (1<<20));
|
||||
int32_t v = (int32_t)(fv * (1<<20));
|
||||
#ifdef ENABLE_SW
|
||||
if (state->use_sw) {
|
||||
dst_row[x] = (state->filter == 2) ? tex3_sw(state, 0, u, v, state->lod) : tex_sw(state, 0, u, v, state->lod);
|
||||
} else {
|
||||
#endif
|
||||
dst_row[x] = (state->filter == 2) ? vx_tex3(0, u, v, state->lod) : vx_tex(0, u, v, state->lod);
|
||||
#ifdef ENABLE_SW
|
||||
}
|
||||
#endif
|
||||
fu += _arg->deltaX;
|
||||
}
|
||||
dst_ptr += state->dst_pitch;
|
||||
fv += _arg->deltaY;
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
struct kernel_arg_t* arg = (struct kernel_arg_t*)KERNEL_ARG_DEV_MEM_ADDR;
|
||||
|
||||
// configure texture unit
|
||||
vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr);
|
||||
vx_csr_write(CSR_TEX_MIPOFF(0), 0);
|
||||
vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth);
|
||||
vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight);
|
||||
vx_csr_write(CSR_TEX_FORMAT(0), arg->format);
|
||||
vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap);
|
||||
vx_csr_write(CSR_TEX_FILTER(0), (arg->filter ? 1 : 0));
|
||||
|
||||
struct tile_arg_t targ;
|
||||
targ.state = arg;
|
||||
targ.tile_width = arg->dst_width;
|
||||
targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks;
|
||||
targ.deltaX = 1.0f / arg->dst_width;
|
||||
targ.deltaY = 1.0f / arg->dst_height;
|
||||
|
||||
vx_spawn_tasks(arg->num_tasks, kernel_body, &targ);
|
||||
}
|
1514
tests/regression/tex/kernel.dump
Normal file
BIN
tests/regression/tex/kernel.elf
Executable file
260
tests/regression/tex/main.cpp
Normal file
|
@ -0,0 +1,260 @@
|
|||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <chrono>
|
||||
#include <cmath>
|
||||
#include <assert.h>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
#include "utils.h"
|
||||
|
||||
#define RT_CHECK(_expr) \
|
||||
do { \
|
||||
int _ret = _expr; \
|
||||
if (0 == _ret) \
|
||||
break; \
|
||||
printf("Error: '%s' returned %d!\n", #_expr, (int)_ret); \
|
||||
cleanup(); \
|
||||
exit(-1); \
|
||||
} while (false)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
const char* kernel_file = "kernel.bin";
|
||||
const char* input_file = "palette64.tga";
|
||||
const char* output_file = "output.tga";
|
||||
int wrap = 0;
|
||||
int filter = 0;
|
||||
float scale = 1.0f;
|
||||
int format = 0;
|
||||
bool use_sw = false;
|
||||
ePixelFormat eformat = FORMAT_A8R8G8B8;
|
||||
|
||||
vx_device_h device = nullptr;
|
||||
vx_buffer_h buffer = nullptr;
|
||||
|
||||
static void show_usage() {
|
||||
std::cout << "Vortex Texture Test." << std::endl;
|
||||
std::cout << "Usage: [-k: kernel] [-i image] [-o image] [-s scale] [-w wrap] [-f format] [-g filter] [-z no_hw] [-h: help]" << std::endl;
|
||||
}
|
||||
|
||||
static void parse_args(int argc, char **argv) {
|
||||
int c;
|
||||
while ((c = getopt(argc, argv, "zi:o:k:w:f:g:h?")) != -1) {
|
||||
switch (c) {
|
||||
case 'i':
|
||||
input_file = optarg;
|
||||
break;
|
||||
case 'o':
|
||||
output_file = optarg;
|
||||
break;
|
||||
case 's':
|
||||
scale = std::stof(optarg, NULL);
|
||||
break;
|
||||
case 'w':
|
||||
wrap = std::atoi(optarg);
|
||||
break;
|
||||
case 'z':
|
||||
use_sw = true;
|
||||
break;
|
||||
case 'f': {
|
||||
format = std::atoi(optarg);
|
||||
switch (format) {
|
||||
case 0: eformat = FORMAT_A8R8G8B8; break;
|
||||
case 1: eformat = FORMAT_R5G6B5; break;
|
||||
case 2: eformat = FORMAT_R4G4B4A4; break;
|
||||
case 3: eformat = FORMAT_L8; break;
|
||||
case 4: eformat = FORMAT_A8; break;
|
||||
default:
|
||||
std::cout << "Error: invalid format: " << format << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
} break;
|
||||
case 'g':
|
||||
filter = std::atoi(optarg);
|
||||
break;
|
||||
case 'k':
|
||||
kernel_file = optarg;
|
||||
break;
|
||||
case 'h':
|
||||
case '?': {
|
||||
show_usage();
|
||||
exit(0);
|
||||
} break;
|
||||
default:
|
||||
show_usage();
|
||||
exit(-1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void cleanup() {
|
||||
if (buffer) {
|
||||
vx_buf_release(buffer);
|
||||
}
|
||||
if (device) {
|
||||
vx_dev_close(device);
|
||||
}
|
||||
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg,
|
||||
uint32_t buf_size,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t bpp) {
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
||||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, -1));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
RT_CHECK(vx_copy_from_dev(buffer, kernel_arg.dst_ptr, buf_size, 0));
|
||||
|
||||
std::vector<uint8_t> dst_pixels(buf_size);
|
||||
auto buf_ptr = (uint8_t*)vx_host_ptr(buffer);
|
||||
for (uint32_t i = 0; i < buf_size; ++i) {
|
||||
dst_pixels[i] = buf_ptr[i];
|
||||
}
|
||||
|
||||
// save output image
|
||||
std::cout << "save output image" << std::endl;
|
||||
//dump_image(dst_pixels, width, height, bpp);
|
||||
RT_CHECK(SaveTGA(output_file, dst_pixels, width, height, bpp));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
kernel_arg_t kernel_arg;
|
||||
std::vector<uint8_t> src_pixels;
|
||||
uint32_t src_width;
|
||||
uint32_t src_height;
|
||||
uint32_t src_bpp;
|
||||
|
||||
// parse command arguments
|
||||
parse_args(argc, argv);
|
||||
|
||||
std::vector<uint8_t> tmp_pixels;
|
||||
RT_CHECK(LoadTGA(input_file, tmp_pixels, &src_width, &src_height));
|
||||
|
||||
// check power of two support
|
||||
if (!ISPOW2(src_width) || !ISPOW2(src_height)) {
|
||||
std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
RT_CHECK(ConvertImage(src_pixels, tmp_pixels, src_width, src_height, FORMAT_A8R8G8B8, eformat));
|
||||
src_bpp = Format::GetInfo(eformat).BytePerPixel;
|
||||
|
||||
//dump_image(src_pixels, src_width, src_height, src_bpp);
|
||||
|
||||
uint32_t src_bufsize = src_bpp * src_width * src_height;
|
||||
|
||||
uint32_t dst_width = (uint32_t)(src_width * scale);
|
||||
uint32_t dst_height = (uint32_t)(src_height * scale);
|
||||
uint32_t dst_bpp = 4;
|
||||
uint32_t dst_bufsize = dst_bpp * dst_width * dst_height;
|
||||
|
||||
// open device connection
|
||||
std::cout << "open device connection" << std::endl;
|
||||
RT_CHECK(vx_dev_open(&device));
|
||||
|
||||
unsigned max_cores, max_warps, max_threads;
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_CORES, &max_cores));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_WARPS, &max_warps));
|
||||
RT_CHECK(vx_dev_caps(device, VX_CAPS_MAX_THREADS, &max_threads));
|
||||
|
||||
uint32_t num_tasks = max_cores * max_warps * max_threads;
|
||||
|
||||
std::cout << "number of tasks: " << std::dec << num_tasks << std::endl;
|
||||
std::cout << "source buffer: width=" << src_width << ", heigth=" << src_height << ", size=" << src_bufsize << " bytes" << std::endl;
|
||||
std::cout << "destination buffer: width=" << dst_width << ", heigth=" << dst_height << ", size=" << dst_bufsize << " bytes" << std::endl;
|
||||
|
||||
// upload program
|
||||
std::cout << "upload program" << std::endl;
|
||||
RT_CHECK(vx_upload_kernel_file(device, kernel_file));
|
||||
|
||||
// allocate device memory
|
||||
std::cout << "allocate device memory" << std::endl;
|
||||
size_t src_addr, dst_addr;
|
||||
RT_CHECK(vx_alloc_dev_mem(device, src_bufsize, &src_addr));
|
||||
RT_CHECK(vx_alloc_dev_mem(device, dst_bufsize, &dst_addr));
|
||||
|
||||
std::cout << "src_addr=0x" << std::hex << src_addr << std::endl;
|
||||
std::cout << "dst_addr=0x" << std::hex << dst_addr << std::endl;
|
||||
|
||||
// allocate staging shared memory
|
||||
std::cout << "allocate shared memory" << std::endl;
|
||||
uint32_t alloc_size = std::max<uint32_t>(sizeof(kernel_arg_t), std::max<uint32_t>(src_bufsize, dst_bufsize));
|
||||
RT_CHECK(vx_alloc_shared_mem(device, alloc_size, &buffer));
|
||||
|
||||
// upload kernel argument
|
||||
std::cout << "upload kernel argument" << std::endl;
|
||||
{
|
||||
kernel_arg.num_tasks = std::min<uint32_t>(num_tasks, dst_height);
|
||||
kernel_arg.format = format;
|
||||
kernel_arg.filter = filter;
|
||||
kernel_arg.wrap = wrap;
|
||||
kernel_arg.use_sw = use_sw;
|
||||
kernel_arg.lod = 0x0;
|
||||
|
||||
kernel_arg.src_logWidth = (uint32_t)std::log2(src_width);
|
||||
kernel_arg.src_logHeight = (uint32_t)std::log2(src_height);
|
||||
kernel_arg.src_stride = src_bpp;
|
||||
kernel_arg.src_pitch = src_bpp * src_width;
|
||||
kernel_arg.src_ptr = src_addr;
|
||||
|
||||
kernel_arg.dst_width = dst_width;
|
||||
kernel_arg.dst_height = dst_height;
|
||||
kernel_arg.dst_stride = dst_bpp;
|
||||
kernel_arg.dst_pitch = dst_bpp * dst_width;
|
||||
kernel_arg.dst_ptr = dst_addr;
|
||||
|
||||
auto buf_ptr = (int*)vx_host_ptr(buffer);
|
||||
memcpy(buf_ptr, &kernel_arg, sizeof(kernel_arg_t));
|
||||
RT_CHECK(vx_copy_to_dev(buffer, KERNEL_ARG_DEV_MEM_ADDR, sizeof(kernel_arg_t), 0));
|
||||
}
|
||||
|
||||
// upload source buffer
|
||||
std::cout << "upload source buffer" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int8_t*)vx_host_ptr(buffer);
|
||||
for (uint32_t i = 0; i < src_bufsize; ++i) {
|
||||
buf_ptr[i] = src_pixels[i];
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.src_ptr, src_bufsize, 0));
|
||||
}
|
||||
|
||||
// clear destination buffer
|
||||
std::cout << "clear destination buffer" << std::endl;
|
||||
{
|
||||
auto buf_ptr = (int32_t*)vx_host_ptr(buffer);
|
||||
for (uint32_t i = 0; i < (dst_bufsize/4); ++i) {
|
||||
buf_ptr[i] = 0xdeadbeef;
|
||||
}
|
||||
RT_CHECK(vx_copy_to_dev(buffer, kernel_arg.dst_ptr, dst_bufsize, 0));
|
||||
}
|
||||
|
||||
// run tests
|
||||
std::cout << "run tests" << std::endl;
|
||||
RT_CHECK(run_test(kernel_arg, dst_bufsize, dst_width, dst_height, dst_bpp));
|
||||
|
||||
// cleanup
|
||||
std::cout << "cleanup" << std::endl;
|
||||
cleanup();
|
||||
|
||||
std::cout << "PASSED!" << std::endl;
|
||||
|
||||
return 0;
|
||||
}
|
BIN
tests/regression/tex/output.tga
Normal file
After Width: | Height: | Size: 16 KiB |
BIN
tests/regression/tex/palette16.tga
Normal file
After Width: | Height: | Size: 1 KiB |
BIN
tests/regression/tex/palette4.tga
Normal file
After Width: | Height: | Size: 108 B |
BIN
tests/regression/tex/palette64.tga
Normal file
After Width: | Height: | Size: 16 KiB |
167
tests/regression/tex/texsw.h
Normal file
|
@ -0,0 +1,167 @@
|
|||
#ifndef _TEXSW_H_
// The guard macro was tested but never defined, so the guard had no effect.
#define _TEXSW_H_

#include "common.h"

#define TEX_LOD_MAX 11

// Arguments are fully parenthesised so that expressions such as
// MIN(a | b, c) compare the intended values.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))

#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
|
||||
// Maps a texture coordinate (fixed point, 20 fractional bits) into [0, 0xfffff].
//   wrap == 1 : keep the low 20 bits (repeat)
//   otherwise : clamp to [0, 0xfffff]
// Declared `static inline`: a plain C99 `inline` definition provides no
// external definition, so the link fails whenever the call is not inlined.
static inline int address(int wrap, int value) {
  if (wrap == 1)
    return value & 0xfffff;
  if (value < 0)
    return 0;
  return (value > 0xfffff) ? 0xfffff : value;
}
|
||||
|
||||
inline void unpack(int format, int value, int* l, int* h) {
|
||||
switch (format) {
|
||||
case 1:
|
||||
case 2:
|
||||
*l = value;
|
||||
*h = 0;
|
||||
break;
|
||||
case 3:
|
||||
*l = (value | (value << 8)) & 0x00ff00ff;
|
||||
*h = 0;
|
||||
break;
|
||||
case 4:
|
||||
*l = (value | (value << 16)) & 0x07e0f81f;
|
||||
*h = 0;
|
||||
break;
|
||||
case 5:
|
||||
*l = (value | (value << 12)) & 0x0f0f0f0f;
|
||||
*h = 0;
|
||||
break;
|
||||
default:
|
||||
case 0:
|
||||
*l = value & 0x00ff00ff;
|
||||
*h = (value >> 8) & 0x00ff00ff;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Interpolates two unpacked texels: result = a + (b - a) * frac / 256, applied
// to the 8-bit fields at bits 0-7 and 16-23 of each word.
//   (al, ah), (bl, bh) - unpacked low/high words of the two texels
//   frac               - blend weight in [0, 255]; 0 returns a
// The arithmetic is done in unsigned 32-bit so the multiply wraps instead of
// overflowing a signed int; the masked result bits are unaffected.
static inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) {
  const unsigned f = (unsigned)frac;
  *l = (int)(((unsigned)al + ((((unsigned)bl - (unsigned)al) * f) >> 8)) & 0x00ff00ffu);
  *h = (int)(((unsigned)ah + ((((unsigned)bh - (unsigned)ah) * f) >> 8)) & 0x00ff00ffu);
}
|
||||
|
||||
inline int pack(int format, int l, int h) {
|
||||
switch (format) {
|
||||
case 1:
|
||||
case 2:
|
||||
return l;
|
||||
case 3:
|
||||
return (l | (l >> 8)) & 0xffff;
|
||||
case 4:
|
||||
return (l | (l >> 16)) & 0xffff;
|
||||
case 5:
|
||||
return (l | (l >> 12)) & 0xffff;
|
||||
default:
|
||||
case 0:
|
||||
return (h << 8) | l;
|
||||
}
|
||||
}
|
||||
|
||||
inline int tex_sw(struct kernel_arg_t* state, int stage, int u, int v, int lod) {
|
||||
int base_addr = state->src_ptr;
|
||||
int mip_offset = 0;
|
||||
int log_width = state->src_logWidth;
|
||||
int log_height = state->src_logHeight;
|
||||
int format = state->format;
|
||||
int wrap = state->wrap;
|
||||
int filter = state->filter;
|
||||
|
||||
int32_t* pBits = ((uint32_t*)base_addr) + mip_offset;
|
||||
|
||||
if (filter) {
|
||||
int u0 = address(wrap, u - (0x80000 >> log_width));
|
||||
int v0 = address(wrap, v - (0x80000 >> log_height));
|
||||
int u1 = address(wrap, u + (0x80000 >> log_width));
|
||||
int v1 = address(wrap, v + (0x80000 >> log_height));
|
||||
|
||||
int x0 = u0 >> (20 - log_width);
|
||||
int y0 = v0 >> (20 - log_height);
|
||||
int x1 = u1 >> (20 - log_width);
|
||||
int y1 = v1 >> (20 - log_height);
|
||||
|
||||
// memory lookup
|
||||
|
||||
int c0 = pBits[x0 + (y0 << log_width)];
|
||||
int c1 = pBits[x1 + (y0 << log_width)];
|
||||
int c2 = pBits[x0 + (y1 << log_width)];
|
||||
int c3 = pBits[x1 + (y1 << log_width)];
|
||||
|
||||
// filtering
|
||||
|
||||
int alpha = x0 & 0xff;
|
||||
int beta = y0 & 0xff;
|
||||
|
||||
int c0a, c0b;
|
||||
int c1a, c1b;
|
||||
int c01a, c01b;
|
||||
|
||||
unpack(format, c0, &c0a, &c0b);
|
||||
unpack(format, c1, &c1a, &c1b);
|
||||
lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b);
|
||||
|
||||
int c2a, c2b;
|
||||
int c3a, c3b;
|
||||
int c23a, c23b;
|
||||
|
||||
unpack(format, c2, &c2a, &c2b);
|
||||
unpack(format, c3, &c3a, &c3b);
|
||||
lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b);
|
||||
|
||||
int c4a, c4b;
|
||||
lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b);
|
||||
return pack(format, c4a, c4b);
|
||||
} else {
|
||||
int u0 = address(wrap, u);
|
||||
int v0 = address(wrap, v);
|
||||
|
||||
int x0 = u0 >> (20 - log_width);
|
||||
int y0 = v0 >> (20 - log_height);
|
||||
|
||||
int c0 = pBits[x0 + (y0 <<log_width)];
|
||||
|
||||
int c0a, c0b;
|
||||
unpack(format, c0, &c0a, &c0b);
|
||||
return pack(format, c0a, c0b);
|
||||
}
|
||||
}
|
||||
|
||||
inline int vx_tex3(int stage, int u, int v, int lod) {
|
||||
int lodn = MIN(lod + 0x100000, TEX_LOD_MAX);
|
||||
int a = vx_tex(0, u, v, lod);
|
||||
int b = vx_tex(0, u, v, lodn);
|
||||
int al = a & 0x00ff00ff;
|
||||
int ah = (a >> 8) & 0x00ff00ff;
|
||||
int bl = b & 0x00ff00ff;
|
||||
int bh = (b >> 8) & 0x00ff00ff;
|
||||
int frac = (lod >> 12) & 0xff;
|
||||
int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
|
||||
int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
|
||||
int c = al | (ah << 8);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline int tex3_sw(struct kernel_arg_t* state, int stage, int u, int v, int lod) {
|
||||
int lodn = MIN(lod + 0x10000, TEX_LOD_MAX);
|
||||
int a = tex_sw(state, 0, u, v, lod);
|
||||
int b = tex_sw(state, 0, u, v, lodn);
|
||||
int al = a & 0x00ff00ff;
|
||||
int ah = (a >> 8) & 0x00ff00ff;
|
||||
|
||||
int bl = b & 0x00ff00ff;
|
||||
int bh = (b >> 8) & 0x00ff00ff;
|
||||
int frac = (lod >> 12) & 0xff;
|
||||
int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
|
||||
int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
|
||||
int c = al | (ah << 8);
|
||||
return c;
|
||||
}
|
||||
|
||||
#endif
|
BIN
tests/regression/tex/toad.tga
Normal file
After Width: | Height: | Size: 16 KiB |
217
tests/regression/tex/utils.cpp
Normal file
|
@ -0,0 +1,217 @@
|
|||
#include "utils.h"
|
||||
#include <fstream>
|
||||
#include <assert.h>
|
||||
#include "format.h"
|
||||
|
||||
// On-disk TGA file header (18 bytes, packed).
// All fields are unsigned: with the previous signed types an ID length of 128
// or more became negative (seeking backwards in LoadTGA) and dimensions above
// 32767 read back as negative numbers.
struct __attribute__((__packed__)) tga_header_t {
  uint8_t  idlength;        // size of the image ID field that follows the header
  uint8_t  colormaptype;
  uint8_t  imagetype;       // LoadTGA/SaveTGA only handle type 2
  uint16_t colormaporigin;
  uint16_t colormaplength;
  uint8_t  colormapdepth;
  uint16_t xoffset;
  uint16_t yoffset;
  uint16_t width;
  uint16_t height;
  uint8_t  bitsperpixel;
  uint8_t  imagedescriptor;
};
|
||||
|
||||
int LoadTGA(const char *filename,
|
||||
std::vector<uint8_t> &pixels,
|
||||
uint32_t *width,
|
||||
uint32_t *height) {
|
||||
std::ifstream ifs(filename, std::ios::in | std::ios::binary);
|
||||
if (!ifs.is_open()) {
|
||||
std::cerr << "couldn't open file: " << filename << "!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
tga_header_t header;
|
||||
ifs.read(reinterpret_cast<char *>(&header), sizeof(tga_header_t));
|
||||
if (ifs.fail()) {
|
||||
std::cerr << "invalid TGA file header!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (header.imagetype != 2) {
|
||||
std::cerr << "unsupported TGA encoding format!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
ifs.seekg(header.idlength, std::ios::cur); // skip string
|
||||
if (ifs.fail()) {
|
||||
std::cerr << "invalid TGA file!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
switch (header.bitsperpixel) {
|
||||
case 16:
|
||||
case 24:
|
||||
case 32: {
|
||||
auto stride = header.bitsperpixel / 8;
|
||||
std::vector<uint8_t> staging(stride * header.width * header.height);
|
||||
|
||||
// Read pixels data
|
||||
ifs.read((char*)staging.data(), staging.size());
|
||||
if (ifs.fail()) {
|
||||
std::cerr << "invalid TGA file!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
// format conversion to RGBA
|
||||
pixels.resize(4 * header.width * header.height);
|
||||
const uint8_t* src_bytes = staging.data();
|
||||
uint32_t* dst_bytes = (uint32_t*)pixels.data();
|
||||
for (const uint8_t* const src_end = src_bytes + staging.size();
|
||||
src_bytes != src_end;
|
||||
src_bytes += stride) {
|
||||
ColorARGB color;
|
||||
switch (stride) {
|
||||
case 2:
|
||||
color = Format::ConvertFrom<FORMAT_A1R5G5B5, true>(src_bytes);
|
||||
break;
|
||||
case 3:
|
||||
color = Format::ConvertFrom<FORMAT_R8G8B8, true>(src_bytes);
|
||||
break;
|
||||
case 4:
|
||||
color = Format::ConvertFrom<FORMAT_A8R8G8B8, true>(src_bytes);
|
||||
break;
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
*dst_bytes++ = color;
|
||||
}
|
||||
break;
|
||||
}
|
||||
default:
|
||||
std::cerr << "unsupported TGA bitsperpixel!" << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
*width = header.width;
|
||||
*height = header.height;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Writes `pixels` as an uncompressed true-colour TGA file.
//   filename - path of the file to create
//   pixels   - at least width * height * bpp bytes, row after row
//   width    - image width in pixels (must fit in 16 bits)
//   height   - image height in pixels (must fit in 16 bits)
//   bpp      - bytes per pixel, 2 to 4
// Rows are written last-to-first. Returns 0 on success and -1 on error (with
// a message on std::cerr). Arguments are validated before the file is
// created, so a rejected call no longer leaves an empty file behind.
int SaveTGA(const char *filename,
            const std::vector<uint8_t> &pixels,
            uint32_t width,
            uint32_t height,
            uint32_t bpp) {
  if (bpp < 2 || bpp > 4) {
    std::cerr << "unsupported pixel stride: " << bpp << "!" << std::endl;
    return -1;
  }

  // The header stores the dimensions in 16-bit fields.
  if (width > 0xffff || height > 0xffff) {
    std::cerr << "unsupported image size: " << width << "x" << height << "!" << std::endl;
    return -1;
  }

  // Never read past the end of the caller's buffer.
  const size_t pitch = static_cast<size_t>(bpp) * width;
  if (pixels.size() < pitch * height) {
    std::cerr << "pixel buffer is too small!" << std::endl;
    return -1;
  }

  std::ofstream ofs(filename, std::ios::out | std::ios::binary);
  if (!ofs.is_open()) {
    std::cerr << "couldn't create file: " << filename << "!" << std::endl;
    return -1;
  }

  tga_header_t header;
  header.idlength = 0;
  header.colormaptype = 0; // no palette
  header.imagetype = 2;    // uncompressed true-color data
  header.colormaporigin = 0;
  header.colormaplength = 0;
  header.colormapdepth = 0;
  header.xoffset = 0;
  header.yoffset = 0;
  header.width = static_cast<decltype(header.width)>(width);
  header.height = static_cast<decltype(header.height)>(height);
  header.bitsperpixel = static_cast<decltype(header.bitsperpixel)>(bpp * 8);
  header.imagedescriptor = 0;

  // write header
  ofs.write(reinterpret_cast<const char *>(&header), sizeof(tga_header_t));

  // write pixel data, one whole row per call, starting with the last row
  for (uint32_t y = height; y-- > 0;) {
    ofs.write(reinterpret_cast<const char *>(pixels.data() + y * pitch), pitch);
  }

  if (!ofs) {
    std::cerr << "couldn't write file: " << filename << "!" << std::endl;
    return -1;
  }

  return 0;
}
|
||||
|
||||
// Prints an image to std::cout, one text line per row, as comma-separated
// hexadecimal pixel values. The bytes of each pixel are combined with the
// first byte as the least significant.
//   pixels - exactly width * height * bpp bytes
//   bpp    - bytes per pixel, at most 4
// The formatting flags of std::cout are restored on return; previously the
// stream was left in hexadecimal mode for every later print.
void dump_image(const std::vector<uint8_t>& pixels, uint32_t width, uint32_t height, uint32_t bpp) {
  assert(width * height * bpp == pixels.size());
  assert(bpp <= 4); // a pixel is accumulated into 32 bits

  const std::ios_base::fmtflags saved_flags = std::cout.flags();
  std::cout << std::hex;

  const uint8_t* pixel_bytes = pixels.data();
  for (uint32_t y = 0; y < height; ++y) {
    for (uint32_t x = 0; x < width; ++x) {
      uint32_t pixel32 = 0;
      for (uint32_t b = 0; b < bpp; ++b) {
        uint32_t pixel8 = *pixel_bytes++;
        pixel32 |= pixel8 << (b * 8);
      }
      if (x) std::cout << ", ";
      std::cout << pixel32;
    }
    std::cout << std::endl;
  }

  std::cout.flags(saved_flags);
}
|
||||
|
||||
// Copies a rectangle from srcDesc to dstDesc, converting between their pixel
// formats through the blit table.
//   dstOffsetX/Y, srcOffsetX/Y - top-left corner of the rectangle on each surface
//   copyWidth, copyHeight      - requested size; clamped to what fits on both
//                                surfaces from the given offsets
// Returns -1 if an offset is negative or outside its surface, otherwise the
// result of the selected copy routine.
int CopyBuffers(SurfaceDesc &dstDesc,
                int32_t dstOffsetX,
                int32_t dstOffsetY,
                uint32_t copyWidth,
                uint32_t copyHeight,
                const SurfaceDesc &srcDesc,
                int32_t srcOffsetX,
                int32_t srcOffsetY) {

  static const BlitTable s_blitTable;

  // Negative offsets would wrap to huge values when handed to the copy
  // routines, which take unsigned offsets.
  if ((srcOffsetX < 0) || (srcOffsetY < 0) || (dstOffsetX < 0) || (dstOffsetY < 0)) {
    return -1;
  }

  if ((srcOffsetX >= (int32_t)srcDesc.Width) || (srcOffsetY >= (int32_t)srcDesc.Height) ||
      (dstOffsetX >= (int32_t)dstDesc.Width) || (dstOffsetY >= (int32_t)dstDesc.Height)) {
    return -1;
  }

  // Space left on each surface starting at its offset. Clamping against this
  // (rather than the full surface size) keeps an offset copy inside both buffers.
  const uint32_t dstMaxWidth  = static_cast<uint32_t>(dstDesc.Width)  - static_cast<uint32_t>(dstOffsetX);
  const uint32_t srcMaxWidth  = static_cast<uint32_t>(srcDesc.Width)  - static_cast<uint32_t>(srcOffsetX);
  const uint32_t dstMaxHeight = static_cast<uint32_t>(dstDesc.Height) - static_cast<uint32_t>(dstOffsetY);
  const uint32_t srcMaxHeight = static_cast<uint32_t>(srcDesc.Height) - static_cast<uint32_t>(srcOffsetY);

  if (copyWidth > dstMaxWidth) {
    copyWidth = dstMaxWidth;
  }

  if (copyWidth > srcMaxWidth) {
    copyWidth = srcMaxWidth;
  }

  if (copyHeight > dstMaxHeight) {
    copyHeight = dstMaxHeight;
  }

  if (copyHeight > srcMaxHeight) {
    copyHeight = srcMaxHeight;
  }

  return s_blitTable.get(srcDesc.Format, dstDesc.Format)(
      dstDesc, dstOffsetX, dstOffsetY, copyWidth, copyHeight, srcDesc,
      srcOffsetX, srcOffsetY);
}
|
||||
|
||||
int ConvertImage(std::vector<uint8_t>& dst_pixels,
|
||||
const std::vector<uint8_t>& src_pixels,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
ePixelFormat src_format,
|
||||
ePixelFormat dst_format) {
|
||||
|
||||
uint32_t src_pitch = Format::GetInfo(src_format).BytePerPixel * width;
|
||||
uint32_t dst_pitch = Format::GetInfo(dst_format).BytePerPixel * width;
|
||||
|
||||
dst_pixels.resize(dst_pitch * height);
|
||||
|
||||
SurfaceDesc srcDesc{src_format, (uint8_t*)src_pixels.data(), width, height, src_pitch};
|
||||
SurfaceDesc dstDesc{dst_format, dst_pixels.data(), width, height, dst_pitch};
|
||||
|
||||
return CopyBuffers(dstDesc, 0, 0, width, height, srcDesc, 0, 0);
|
||||
}
|
42
tests/regression/tex/utils.h
Normal file
|
@ -0,0 +1,42 @@
|
|||
#include <cstdint>
|
||||
#include <vector>
|
||||
#include <iostream>
|
||||
#include "blitter.h"
|
||||
|
||||
#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
|
||||
|
||||
// Returns ceil(log2(value)): the smallest n such that (1 << n) >= value.
// Returns 0 for value 0 or 1.
// The previous form paired __builtin_clzl (which counts over `unsigned long`,
// 64 bits on LP64 targets) with a 32-bit width constant, and `value << 1`
// overflowed for inputs of 0x80000000 and above.
inline uint32_t ilog2 (uint32_t value) {
  if (value <= 1) {
    return 0;
  }
  // For value >= 2, ceil(log2(value)) is the bit width of (value - 1).
  return 32u - static_cast<uint32_t>(__builtin_clz(value - 1));
}
|
||||
|
||||
int LoadTGA(const char *filename,
|
||||
std::vector<uint8_t> &pixels,
|
||||
uint32_t *width,
|
||||
uint32_t *height);
|
||||
|
||||
int SaveTGA(const char *filename,
|
||||
const std::vector<uint8_t> &pixels,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t bpp);
|
||||
|
||||
int CopyBuffers(SurfaceDesc &dstDesc,
|
||||
int32_t dstOffsetX,
|
||||
int32_t dstOffsetY,
|
||||
uint32_t copyWidth,
|
||||
uint32_t copyHeight,
|
||||
const SurfaceDesc &srcDesc,
|
||||
int32_t srcOffsetX,
|
||||
int32_t srcOffsetY);
|
||||
|
||||
int ConvertImage(std::vector<uint8_t>& dst_pixels,
|
||||
const std::vector<uint8_t>& src_pixels,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
ePixelFormat src_format,
|
||||
ePixelFormat dst_format);
|
||||
|
||||
void dump_image(const std::vector<uint8_t>& pixels,
|
||||
uint32_t width,
|
||||
uint32_t height,
|
||||
uint32_t bpp);
|