Merge branch 'graphics' of https://github.com/vortexgpgpu/vortex-dev into graphics

This commit is contained in:
Blaise Tine 2021-03-20 19:22:11 -04:00
commit 8c0c4e2b6e
4 changed files with 533 additions and 31 deletions

432
hw/VX_config.h Normal file
View file

@ -0,0 +1,432 @@
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2021-03-20 18:29:13.211392
// User-configuration section (presumably translated from a separate
// user-config source -- confirm against gen_config.py). In this snapshot it
// contains only its own guard macro, so it sets no overrides.
#ifndef VX_USER_CONFIG
#define VX_USER_CONFIG
#endif
// auto-generated by gen_config.py. DO NOT EDIT
// Generated at 2021-03-20 18:29:13.214396
// Translated from VX_config.vh:
#ifndef VX_CONFIG
#define VX_CONFIG
// Helper macros required by derived defaults further down in this header
// (DDRSQ_SIZE, L2NUM_BANKS, L2DRSQ_SIZE, L3NUM_BANKS, L3DRSQ_SIZE all expand
// to MIN(...) or MAX(...)). They are guarded so that an includer which already
// provides MIN/MAX keeps its own definitions.
// Each argument may be evaluated twice: pass side-effect-free expressions only.
// NOTE(review): this file is generated; the same definitions should also be
// emitted by gen_config.py, otherwise the next regeneration drops them.
#ifndef MIN
#define MIN(x, y) (((x) < (y)) ? (x) : (y))
#endif
#ifndef MAX
#define MAX(x, y) (((x) > (y)) ? (x) : (y))
#endif
// Topology knobs. Every value below is only a default: define the macro
// before this point to override it.
#ifndef NUM_CLUSTERS
#define NUM_CLUSTERS 1
#endif
#ifndef NUM_CORES
#define NUM_CORES 1
#endif
#ifndef NUM_WARPS
#define NUM_WARPS 4
#endif
#ifndef NUM_THREADS
#define NUM_THREADS 4
#endif
#ifndef NUM_BARRIERS
#define NUM_BARRIERS 4
#endif
// Memory-hierarchy switches (0 or 1). By default L2 and L3 are off and the
// shared memory (SM) is on.
#ifndef L2_ENABLE
#define L2_ENABLE 0
#endif
#ifndef L3_ENABLE
#define L3_ENABLE 0
#endif
#ifndef SM_ENABLE
#define SM_ENABLE 1
#endif
// Block sizes (presumably in bytes -- confirm against VX_config.vh).
#ifndef GLOBAL_BLOCK_SIZE
#define GLOBAL_BLOCK_SIZE 64
#endif
// Scales with the thread count: 4 units per thread, i.e. 16 with the default
// NUM_THREADS of 4.
#ifndef L1_BLOCK_SIZE
#define L1_BLOCK_SIZE (NUM_THREADS * 4)
#endif
// Address map defaults. Each guarded value can be overridden by defining the
// macro before this point.
#ifndef STARTUP_ADDR
#define STARTUP_ADDR 0x80000000
#endif
#ifndef IO_BUS_BASE_ADDR
#define IO_BUS_BASE_ADDR 0xFF000000
#endif
// Defaults to the IO bus base, so the two track each other unless overridden.
#ifndef SHARED_MEM_BASE_ADDR
#define SHARED_MEM_BASE_ADDR IO_BUS_BASE_ADDR
#endif
#ifndef SHARED_MEM_BASE_ADDR_ALIGN
#define SHARED_MEM_BASE_ADDR_ALIGN 64
#endif
#ifndef IO_BUS_ADDR_COUT
#define IO_BUS_ADDR_COUT 0xFFFFFFFC
#endif
// NOTE(review): this default has the same value as IO_BUS_BASE_ADDR (and
// therefore as the default SHARED_MEM_BASE_ADDR) -- confirm the overlap of
// the three regions is intended.
#ifndef FRAME_BUFFER_BASE_ADDR
#define FRAME_BUFFER_BASE_ADDR 0xFF000000
#endif
#ifndef FRAME_BUFFER_WIDTH
#define FRAME_BUFFER_WIDTH 1920
#endif
#ifndef FRAME_BUFFER_HEIGHT
#define FRAME_BUFFER_HEIGHT 1080
#endif
// Width times height, i.e. an element count; no bytes-per-pixel factor is
// applied here. Unlike its neighbours this macro is not wrapped in #ifndef,
// so it is always (re)defined by this header.
#define FRAME_BUFFER_SIZE (FRAME_BUFFER_WIDTH * FRAME_BUFFER_HEIGHT)
// ISA extension switches: each EXT_*_ENABLE is defined unless the matching
// EXT_*_DISABLE was defined beforehand (opt-out rather than opt-in).
#ifndef EXT_M_DISABLE
#define EXT_M_ENABLE
#endif
#ifndef EXT_F_DISABLE
#define EXT_F_ENABLE
#endif
#ifndef EXT_TEX_DISABLE
#define EXT_TEX_ENABLE
#endif
// Device identification (all zero; not overridable from outside)
#define VENDOR_ID 0
#define ARCHITECTURE_ID 0
#define IMPLEMENTATION_ID 0
///////////////////////////////////////////////////////////////////////////////
// Functional-unit latency defaults (presumably in clock cycles -- confirm
// against the RTL that consumes them). Each can be overridden by defining the
// macro before this point.
#ifndef LATENCY_IMUL
#define LATENCY_IMUL 3
#endif
#ifndef LATENCY_FNCP
#define LATENCY_FNCP 2
#endif
#ifndef LATENCY_FMA
#define LATENCY_FMA 4
#endif
// FDIV and FSQRT use a different default when ALTERA_S10 is defined.
#ifndef LATENCY_FDIV
#ifdef ALTERA_S10
#define LATENCY_FDIV 34
#else
#define LATENCY_FDIV 15
#endif
#endif
#ifndef LATENCY_FSQRT
#ifdef ALTERA_S10
#define LATENCY_FSQRT 25
#else
#define LATENCY_FSQRT 10
#endif
#endif
#ifndef LATENCY_FDIVSQRT
#define LATENCY_FDIVSQRT 32
#endif
#ifndef LATENCY_FCVT
#define LATENCY_FCVT 4
#endif
// CSR Addresses //////////////////////////////////////////////////////////////
// All values are CSR numbers. Throughout this section a counter NAME has a
// companion NAME_H located exactly 0x80 above it (presumably the upper 32
// bits of a 64-bit counter -- confirm against the CSR unit).
// User Floating-Point CSRs
#define CSR_FFLAGS 0x001
#define CSR_FRM 0x002
#define CSR_FCSR 0x003
// Address-translation, memory-protection and machine trap-setup CSRs
// (the numbers look like the standard RISC-V privileged CSR map -- verify
// against the spec before relying on it).
#define CSR_SATP 0x180
#define CSR_PMPCFG0 0x3A0
#define CSR_PMPADDR0 0x3B0
#define CSR_MSTATUS 0x300
#define CSR_MISA 0x301
#define CSR_MEDELEG 0x302
#define CSR_MIDELEG 0x303
#define CSR_MIE 0x304
#define CSR_MTVEC 0x305
#define CSR_MEPC 0x341
// Machine Counter/Timers
#define CSR_CYCLE 0xC00
#define CSR_CYCLE_H 0xC80
#define CSR_INSTRET 0xC02
#define CSR_INSTRET_H 0xC82
// Machine Performance-monitoring counters
// Numbered contiguously from 0xB03 to 0xB1C (high halves 0xB83 to 0xB9C).
// PERF: pipeline
#define CSR_MPM_IBUF_ST 0xB03
#define CSR_MPM_IBUF_ST_H 0xB83
#define CSR_MPM_SCRB_ST 0xB04
#define CSR_MPM_SCRB_ST_H 0xB84
#define CSR_MPM_ALU_ST 0xB05
#define CSR_MPM_ALU_ST_H 0xB85
#define CSR_MPM_LSU_ST 0xB06
#define CSR_MPM_LSU_ST_H 0xB86
#define CSR_MPM_CSR_ST 0xB07
#define CSR_MPM_CSR_ST_H 0xB87
#define CSR_MPM_FPU_ST 0xB08
#define CSR_MPM_FPU_ST_H 0xB88
#define CSR_MPM_GPU_ST 0xB09
#define CSR_MPM_GPU_ST_H 0xB89
// PERF: icache
#define CSR_MPM_ICACHE_READS 0xB0A // total reads
#define CSR_MPM_ICACHE_READS_H 0xB8A
#define CSR_MPM_ICACHE_MISS_R 0xB0B // total misses
#define CSR_MPM_ICACHE_MISS_R_H 0xB8B
#define CSR_MPM_ICACHE_PIPE_ST 0xB0C // pipeline stalls
#define CSR_MPM_ICACHE_PIPE_ST_H 0xB8C
#define CSR_MPM_ICACHE_CRSP_ST 0xB0D // core response stalls
#define CSR_MPM_ICACHE_CRSP_ST_H 0xB8D
// PERF: dcache
#define CSR_MPM_DCACHE_READS 0xB0E // total reads
#define CSR_MPM_DCACHE_READS_H 0xB8E
#define CSR_MPM_DCACHE_WRITES 0xB0F // total writes
#define CSR_MPM_DCACHE_WRITES_H 0xB8F
#define CSR_MPM_DCACHE_MISS_R 0xB10 // read misses
#define CSR_MPM_DCACHE_MISS_R_H 0xB90
#define CSR_MPM_DCACHE_MISS_W 0xB11 // write misses
#define CSR_MPM_DCACHE_MISS_W_H 0xB91
#define CSR_MPM_DCACHE_BANK_ST 0xB12 // bank conflicts stalls
#define CSR_MPM_DCACHE_BANK_ST_H 0xB92
#define CSR_MPM_DCACHE_MSHR_ST 0xB13 // MSHR stalls
#define CSR_MPM_DCACHE_MSHR_ST_H 0xB93
#define CSR_MPM_DCACHE_PIPE_ST 0xB14 // pipeline stalls
#define CSR_MPM_DCACHE_PIPE_ST_H 0xB94
#define CSR_MPM_DCACHE_CRSP_ST 0xB15 // core response stalls
#define CSR_MPM_DCACHE_CRSP_ST_H 0xB95
// PERF: smem
#define CSR_MPM_SMEM_READS 0xB16 // total reads
#define CSR_MPM_SMEM_READS_H 0xB96
#define CSR_MPM_SMEM_WRITES 0xB17 // total writes
#define CSR_MPM_SMEM_WRITES_H 0xB97
#define CSR_MPM_SMEM_BANK_ST 0xB18 // bank conflicts stalls
#define CSR_MPM_SMEM_BANK_ST_H 0xB98
// PERF: memory
#define CSR_MPM_DRAM_READS 0xB19 // dram reads
#define CSR_MPM_DRAM_READS_H 0xB99
#define CSR_MPM_DRAM_WRITES 0xB1A // dram writes
#define CSR_MPM_DRAM_WRITES_H 0xB9A
#define CSR_MPM_DRAM_ST 0xB1B // dram request stalls
#define CSR_MPM_DRAM_ST_H 0xB9B
#define CSR_MPM_DRAM_LAT 0xB1C // dram latency (total)
#define CSR_MPM_DRAM_LAT_H 0xB9C
// Machine Information Registers
#define CSR_MVENDORID 0xF11
#define CSR_MARCHID 0xF12
#define CSR_MIMPID 0xF13
#define CSR_MHARTID 0xF14
// User SIMT CSRs
// CSR_GWID is an alias of CSR_MHARTID (0xF14) rather than a 0xCCx number,
// which is why 0xCC4 is unused in this group.
#define CSR_WTID 0xCC0
#define CSR_LTID 0xCC1
#define CSR_GTID 0xCC2
#define CSR_LWID 0xCC3
#define CSR_GWID CSR_MHARTID
#define CSR_GCID 0xCC5
// Machine SIMT CSRs
#define CSR_NT 0xFC0
#define CSR_NW 0xFC1
#define CSR_NC 0xFC2
////////// Texture Units //////////////////////////////////////////////////////
// Each texture unit owns a contiguous window of CSR_TEX_STATES (8) CSRs
// starting at 0xFD0: unit 0 covers 0xFD0-0xFD7, unit 1 covers 0xFD8-0xFDF.
// The argument x of every macro below is the texture unit index; it is
// parenthesised in CSR_TEX_BEGIN, so expressions are safe to pass.
#define NUM_TEX_UNITS 2
#define CSR_TEX_STATES 8
// First CSR number of unit x.
#define CSR_TEX_BEGIN(x) (0xFD0 + (x) * CSR_TEX_STATES)
// Per-unit state registers, at offsets 0..7 inside the unit's window.
#define CSR_TEX_ADDR(x) (CSR_TEX_BEGIN(x) + 0x00)
#define CSR_TEX_FORMAT(x) (CSR_TEX_BEGIN(x) + 0x01)
#define CSR_TEX_WIDTH(x) (CSR_TEX_BEGIN(x) + 0x02)
#define CSR_TEX_HEIGHT(x) (CSR_TEX_BEGIN(x) + 0x03)
#define CSR_TEX_STRIDE(x) (CSR_TEX_BEGIN(x) + 0x04)
#define CSR_TEX_WRAP_U(x) (CSR_TEX_BEGIN(x) + 0x05)
#define CSR_TEX_WRAP_V(x) (CSR_TEX_BEGIN(x) + 0x06)
#define CSR_TEX_FILTER(x) (CSR_TEX_BEGIN(x) + 0x07)
// Pipeline Queues ////////////////////////////////////////////////////////////
// Every knob in this section is a default that can be overridden by defining
// the macro before this point.
// Size of LSU Request Queue
#ifndef LSUQ_SIZE
#define LSUQ_SIZE 8
#endif
// Size of FPU Request Queue
#ifndef FPUQ_SIZE
#define FPUQ_SIZE 8
#endif
// Icache Configurable Knobs //////////////////////////////////////////////////
// Size of cache in bytes
#ifndef ICACHE_SIZE
#define ICACHE_SIZE 16384
#endif
// Core Request Queue Size
#ifndef ICREQ_SIZE
#define ICREQ_SIZE 4
#endif
// Miss Handling Register Size (defaults to one entry per warp: NUM_WARPS)
#ifndef IMSHR_SIZE
#define IMSHR_SIZE NUM_WARPS
#endif
// DRAM Request Queue Size
#ifndef IDREQ_SIZE
#define IDREQ_SIZE 4
#endif
// DRAM Response Queue Size
#ifndef IDRSQ_SIZE
#define IDRSQ_SIZE 4
#endif
// Dcache Configurable Knobs //////////////////////////////////////////////////
// Every knob in this section is a default that can be overridden by defining
// the macro before this point.
// Size of cache in bytes
#ifndef DCACHE_SIZE
#define DCACHE_SIZE 16384
#endif
// Number of banks (defaults to one bank per thread: NUM_THREADS)
#ifndef DNUM_BANKS
#define DNUM_BANKS NUM_THREADS
#endif
// Number of bank ports
#ifndef DNUM_PORTS
#define DNUM_PORTS 1
#endif
// Core Request Queue Size
#ifndef DCREQ_SIZE
#define DCREQ_SIZE 4
#endif
// Miss Handling Register Size (defaults to the LSU request queue depth)
#ifndef DMSHR_SIZE
#define DMSHR_SIZE LSUQ_SIZE
#endif
// DRAM Request Queue Size
#ifndef DDREQ_SIZE
#define DDREQ_SIZE 4
#endif
// DRAM Response Queue Size: two entries per bank, but never fewer than 4
// (8 with the default DNUM_BANKS of 4). The expansion relies on a
// MAX(a, b) macro being in scope wherever DDRSQ_SIZE is used.
#ifndef DDRSQ_SIZE
#define DDRSQ_SIZE MAX(4, (DNUM_BANKS * 2))
#endif
// SM Configurable Knobs //////////////////////////////////////////////////////
// Every knob in this section is a default that can be overridden by defining
// the macro before this point.
// per thread stack size
#ifndef STACK_SIZE
#define STACK_SIZE 1024
#endif
// Size of shared memory in bytes: one STACK_SIZE stack for every thread of
// every warp (1024 * 4 * 4 = 16384 with the defaults above).
#ifndef SMEM_SIZE
#define SMEM_SIZE (STACK_SIZE * NUM_WARPS * NUM_THREADS)
#endif
// Number of banks (defaults to one bank per thread: NUM_THREADS)
#ifndef SNUM_BANKS
#define SNUM_BANKS NUM_THREADS
#endif
// Core Request Queue Size
#ifndef SCREQ_SIZE
#define SCREQ_SIZE 4
#endif
// L2cache Configurable Knobs /////////////////////////////////////////////////
// Every knob in this section is a default that can be overridden by defining
// the macro before this point. L2NUM_BANKS and L2DRSQ_SIZE expand to MIN(...)
// and MAX(...), so both macros must be in scope wherever they are used.
// Size of cache in bytes
#ifndef L2CACHE_SIZE
#define L2CACHE_SIZE 65536
#endif
// Number of banks: one per core, capped at 4
#ifndef L2NUM_BANKS
#define L2NUM_BANKS MIN(NUM_CORES, 4)
#endif
// Core Request Queue Size
#ifndef L2CREQ_SIZE
#define L2CREQ_SIZE 4
#endif
// Miss Handling Register Size
#ifndef L2MSHR_SIZE
#define L2MSHR_SIZE 16
#endif
// DRAM Request Queue Size
#ifndef L2DREQ_SIZE
#define L2DREQ_SIZE 4
#endif
// DRAM Response Queue Size: two entries per bank, but never fewer than 4
#ifndef L2DRSQ_SIZE
#define L2DRSQ_SIZE MAX(4, (L2NUM_BANKS * 2))
#endif
// L3cache Configurable Knobs /////////////////////////////////////////////////
// Every knob in this section is a default that can be overridden by defining
// the macro before this point. L3NUM_BANKS and L3DRSQ_SIZE expand to MIN(...)
// and MAX(...), so both macros must be in scope wherever they are used.
// Size of cache in bytes
#ifndef L3CACHE_SIZE
#define L3CACHE_SIZE 131072
#endif
// Number of banks: one per cluster, capped at 4
#ifndef L3NUM_BANKS
#define L3NUM_BANKS MIN(NUM_CLUSTERS, 4)
#endif
// Core Request Queue Size
#ifndef L3CREQ_SIZE
#define L3CREQ_SIZE 4
#endif
// Miss Handling Register Size
#ifndef L3MSHR_SIZE
#define L3MSHR_SIZE 16
#endif
// DRAM Request Queue Size
#ifndef L3DREQ_SIZE
#define L3DREQ_SIZE 4
#endif
// DRAM Response Queue Size: two entries per bank, but never fewer than 4
#ifndef L3DRSQ_SIZE
#define L3DRSQ_SIZE MAX(4, (L3NUM_BANKS * 2))
#endif
#endif

View file

@ -23,4 +23,16 @@
// Texture wrap-mode encodings (presumably the values held by the per-unit
// wrap U/V state -- confirm against the address generator).
`define TEX_WRAP_CLAMP 1
`define TEX_WRAP_MIRROR 2
// Widest colour channel, in bits.
// NOTE(review): VX_tex_format sizes its channel ports with `MAX_COLOR_BITS,
// not with this macro -- confirm which of the two names is intended.
`define MAX_COLOR_WIDTH 8
// Number of colour channels; VX_tex_format uses it as the width of its
// color_enable output.
`define NUM_COLOR_CHANNEL 4
// Texel format codes, sized to `TEX_FORMAT_BITS. VX_tex_format selects its
// unpacking on these values; any other code falls into its default arm.
`define R5G6B5 `TEX_FORMAT_BITS'h1
`define R8G8B8 `TEX_FORMAT_BITS'h2
`define R8G8B8A8 `TEX_FORMAT_BITS'h3
// LSB position of each 8-bit channel inside a packed 32-bit output word
// (R = [31:24], G = [23:16], B = [15:8], A = [7:0]); the sampler indexes
// its result with [`xBEGIN +: 8].
`define RBEGIN 24
`define GBEGIN 16
`define BBEGIN 8
`define ABEGIN 0
`endif

View file

@ -3,10 +3,59 @@
// VX_tex_format: purely combinational texel unpacker.
//
// Splits one packed 32-bit texel into separate R, G, B and A channels
// according to `format`.
//
// Parameters:
//   CORE_ID      - accepted for interface uniformity; not used by the logic.
// Inputs:
//   texel_data   - packed texel; which bits are meaningful depends on format.
//   format       - one of `R5G6B5, `R8G8B8, `R8G8B8A8. Any other code is
//                  unpacked exactly like `R8G8B8.
// Outputs:
//   color_enable - 4'b1111 when the format carries alpha, 4'b1110 otherwise
//                  (presumably ordered {R, G, B, A} -- confirm with consumers).
//   R, G, B, A   - unpacked channels. Channels narrower than the port are
//                  zero-extended by the size cast, not rescaled to the full
//                  range. A is all zeros for formats without alpha.
//
// NOTE(review): the channel ports are sized with `MAX_COLOR_BITS; the visible
// part of the texture define header only shows `MAX_COLOR_WIDTH. Confirm that
// `MAX_COLOR_BITS is defined elsewhere.
module VX_tex_format #(
    parameter CORE_ID = 0
) (
    input wire [31:0]                    texel_data,
    input wire [`TEX_FORMAT_BITS-1:0]    format,

    output wire [`NUM_COLOR_CHANNEL-1:0] color_enable,
    output wire [`MAX_COLOR_BITS-1:0]    R,
    output wire [`MAX_COLOR_BITS-1:0]    G,
    output wire [`MAX_COLOR_BITS-1:0]    B,
    output wire [`MAX_COLOR_BITS-1:0]    A
);
    `UNUSED_PARAM (CORE_ID)

    reg [`NUM_COLOR_CHANNEL-1:0] color_enable_r;
    reg [`MAX_COLOR_BITS-1:0]    R_r;
    reg [`MAX_COLOR_BITS-1:0]    G_r;
    reg [`MAX_COLOR_BITS-1:0]    B_r;
    reg [`MAX_COLOR_BITS-1:0]    A_r;

    // Every arm assigns all five registers, so no latches are inferred.
    // A case item accepts a single statement, hence the begin/end around
    // each arm (the arms were previously unwrapped, which does not parse).
    always @(*) begin
        case (format)
            `R5G6B5: begin
                // 16-bit texel: R = [15:11], G = [10:5], B = [4:0], no alpha
                R_r = `MAX_COLOR_BITS'(texel_data[15:11]);
                G_r = `MAX_COLOR_BITS'(texel_data[10:5]);
                B_r = `MAX_COLOR_BITS'(texel_data[4:0]);
                A_r = {`MAX_COLOR_BITS{1'b0}};
                color_enable_r = 4'b1110;
            end
            `R8G8B8: begin
                // 24-bit texel: R = [23:16], G = [15:8], B = [7:0], no alpha
                R_r = `MAX_COLOR_BITS'(texel_data[23:16]);
                G_r = `MAX_COLOR_BITS'(texel_data[15:8]);
                B_r = `MAX_COLOR_BITS'(texel_data[7:0]);
                A_r = {`MAX_COLOR_BITS{1'b0}};
                color_enable_r = 4'b1110;
            end
            `R8G8B8A8: begin
                // 32-bit texel: R = [31:24], G = [23:16], B = [15:8], A = [7:0]
                R_r = `MAX_COLOR_BITS'(texel_data[31:24]);
                G_r = `MAX_COLOR_BITS'(texel_data[23:16]);
                B_r = `MAX_COLOR_BITS'(texel_data[15:8]);
                A_r = `MAX_COLOR_BITS'(texel_data[7:0]);
                color_enable_r = 4'b1111;
            end
            default: begin
                // Unknown format code: same unpacking as `R8G8B8
                R_r = `MAX_COLOR_BITS'(texel_data[23:16]);
                G_r = `MAX_COLOR_BITS'(texel_data[15:8]);
                B_r = `MAX_COLOR_BITS'(texel_data[7:0]);
                A_r = {`MAX_COLOR_BITS{1'b0}};
                color_enable_r = 4'b1110;
            end
        endcase
    end

    assign color_enable = color_enable_r;
    assign R = R_r;
    assign G = G_r;
    assign B = B_r;
    assign A = A_r;

endmodule

View file

@ -15,6 +15,8 @@ module VX_tex_sampler #(
input wire req_wb,
input wire [`TEX_FILTER_BITS-1:0] req_filter,
input wire [`TEX_FORMAT_BITS-1:0] req_format,
input wire [3:0][`FIXED_FRAC-1:0] req_ufrac,
input wire [3:0][`FIXED_FRAC-1:0] req_vfrac,
input wire [`NUM_THREADS-1:0][3:0][31:0] req_texels,
output wire req_ready,
@ -30,41 +32,48 @@ module VX_tex_sampler #(
);
`UNUSED_PARAM (CORE_ID)
if (req_filter == 0) begin // point sampling
/*
assign tex_req_if.ready = (& pt_addr_ready);
wire [31:0] req_data [`NUM_THREADS-1:0];
assign lsu_req_if.valid = (& pt_addr_valid);
for (genvar i = 0; i<`NUM_THREADS ;i++ ) begin
VX_tex_format #(
.CORE_ID (CORE_ID)
) tex_format_point (
.texel_data (req_texels[i]),
.format (req_format),
assign lsu_req_if.wid = tex_req_if.wid;
assign lsu_req_if.tmask = tex_req_if.tmask;
assign lsu_req_if.PC = tex_req_if.PC;
assign lsu_req_if.rd = tex_req_if.rd;
assign lsu_req_if.wb = tex_req_if.wb;
assign lsu_req_if.offset = 32'h0000;
assign lsu_req_if.op_type = `OP_BITS'({1'b0, 3'b000}); //func3 for word load??
assign lsu_req_if.store_data = {`NUM_THREADS{32'h0000}};
.color_enable (),
.R(req_data[i][`RBEGIN +: 8]),
.G(req_data[i][`GBEGIN +: 8]),
.B(req_data[i][`BBEGIN +: 8]),
.A(req_data[i][`ABEGIN +: 8])
);
// wait buffer for fragments / replace with cache/state fragment fifo for bilerp
// no filtering for point sampling -> directly from dcache to output response
end
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data}),
.data_out ({tex_rsp_if.valid, tex_rsp_if.wid, tex_rsp_if.tmask, tex_rsp_if.PC, tex_rsp_if.rd, tex_rsp_if.wb, tex_rsp_if.data})
);
VX_pipe_register #(
.DATAW (1 + `NW_BITS + `NUM_THREADS + 32 + `NR_BITS + 1 + (`NUM_THREADS * 32)),
.RESETW (1)
) pipe_reg (
.clk (clk),
.reset (reset),
.enable (~stall_out),
.data_in ({req_valid, req_wid, req_tmask, req_PC, req_rd, req_wb, req_data}),
.data_out ({rsp_valid, rsp_wid, rsp_tmask, rsp_PC, rsp_rd, rsp_wb, rsp_data})
);
// output
assign stall_out = ~tex_rsp_if.ready && tex_rsp_if.valid;
// output
assign stall_out = ~rsp_ready;
assign req_ready = rsp_ready;
end else begin // bilinear sampling
// TO DO
end
// can accept new request?
assign stall_in = stall_out;
assign ld_commit_if.ready = ~stall_in;*/
endmodule