mirror of
https://github.com/vortexgpgpu/vortex.git
synced 2025-04-23 21:39:10 -04:00
text sw emulation
This commit is contained in:
parent
66ff74eb97
commit
a9f91b7acd
8 changed files with 749 additions and 4836 deletions
|
@ -8,8 +8,8 @@ struct kernel_arg_t {
|
|||
uint32_t format;
|
||||
uint32_t filter;
|
||||
uint32_t wrap;
|
||||
uint32_t src_width;
|
||||
uint32_t src_height;
|
||||
uint32_t src_logWidth;
|
||||
uint32_t src_logHeight;
|
||||
uint32_t src_stride;
|
||||
uint32_t src_pitch;
|
||||
uint8_t src_ptr;
|
||||
|
|
Binary file not shown.
|
@ -1,12 +1,14 @@
|
|||
#include <stdint.h>
|
||||
#include <vx_intrinsics.h>
|
||||
#include "common.h"
|
||||
#include "texsw.h"
|
||||
|
||||
// Returns ceil(log2(value)) for value in [1, 2^31].
//
// (value << 1) - 1 has its highest set bit at position ceil(log2(value)), so
// the result is that bit's index. The bit width used in the formula is taken
// from the same type __builtin_clz operates on (unsigned int); the previous
// __builtin_clzl variant counted zeros in an unsigned long, which gives
// wrong results wherever long is wider than 32 bits.
//
// value == 0 wraps to an all-ones operand and yields 31; values above 2^31
// overflow the 32-bit shift and are not supported.
uint32_t ilog2 (uint32_t value) {
  const unsigned int span = (value << 1) - 1u; // odd or all-ones, never zero
  return (uint32_t)(sizeof(unsigned int) * 8u) - (uint32_t)__builtin_clz(span) - 1;
}
|
||||
|
||||
struct tile_arg_t {
|
||||
struct kernel_arg_t karg;
|
||||
struct kernel_arg_t* state;
|
||||
uint32_t tile_width;
|
||||
uint32_t tile_height;
|
||||
float deltaX;
|
||||
|
@ -15,10 +17,11 @@ struct tile_arg_t {
|
|||
|
||||
void kernel_body(int task_id, void* arg) {
|
||||
struct tile_arg_t* _arg = (struct tile_arg_t*)(arg);
|
||||
struct kernel_arg_t* state = _arg->state;
|
||||
|
||||
uint32_t xoffset = 0;
|
||||
uint32_t yoffset = task_id * _arg->tile_height;
|
||||
uint8_t* dst_ptr = (uint8_t*)(_arg->karg.dst_ptr + xoffset * _arg->karg.dst_stride + yoffset * _arg->karg.dst_pitch);
|
||||
uint8_t* dst_ptr = (uint8_t*)(state->dst_ptr + xoffset * state->dst_stride + yoffset * state->dst_pitch);
|
||||
|
||||
float fv = yoffset * _arg->deltaY;
|
||||
for (uint32_t y = 0; y < _arg->tile_height; ++y) {
|
||||
|
@ -27,10 +30,11 @@ void kernel_body(int task_id, void* arg) {
|
|||
for (uint32_t x = 0; x < _arg->tile_width; ++x) {
|
||||
int32_t u = (int32_t)(fu * (1<<20));
|
||||
int32_t v = (int32_t)(fv * (1<<20));
|
||||
//dst_row[x] = tex_sw(state, 0, u, v, 0x0);
|
||||
dst_row[x] = vx_tex(0, u, v, 0x0);
|
||||
fu += _arg->deltaX;
|
||||
}
|
||||
dst_ptr += _arg->karg.dst_pitch;
|
||||
dst_ptr += state->dst_pitch;
|
||||
fv += _arg->deltaY;
|
||||
}
|
||||
}
|
||||
|
@ -41,14 +45,14 @@ int main() {
|
|||
// configure texture unit
|
||||
vx_csr_write(CSR_TEX_ADDR(0), arg->src_ptr);
|
||||
vx_csr_write(CSR_TEX_MIPOFF(0), 0);
|
||||
vx_csr_write(CSR_TEX_WIDTH(0), ilog2(arg->src_width));
|
||||
vx_csr_write(CSR_TEX_HEIGHT(0), ilog2(arg->src_height));
|
||||
vx_csr_write(CSR_TEX_WIDTH(0), arg->src_logWidth);
|
||||
vx_csr_write(CSR_TEX_HEIGHT(0), arg->src_logHeight);
|
||||
vx_csr_write(CSR_TEX_FORMAT(0), arg->format);
|
||||
vx_csr_write(CSR_TEX_WRAP(0), (arg->wrap << 2) | arg->wrap);
|
||||
vx_csr_write(CSR_TEX_FILTER(0), arg->filter);
|
||||
|
||||
struct tile_arg_t targ;
|
||||
targ.karg = *arg;
|
||||
targ.state = arg;
|
||||
targ.tile_width = arg->dst_width;
|
||||
targ.tile_height = (arg->dst_height + arg->num_tasks - 1) / arg->num_tasks;
|
||||
targ.deltaX = 1.0f / arg->dst_width;
|
||||
|
|
File diff suppressed because it is too large
Load diff
Binary file not shown.
|
@ -2,6 +2,7 @@
|
|||
#include <vector>
|
||||
#include <unistd.h>
|
||||
#include <string.h>
|
||||
#include <chrono>
|
||||
#include <assert.h>
|
||||
#include <vortex.h>
|
||||
#include "common.h"
|
||||
|
@ -78,6 +79,7 @@ void cleanup() {
|
|||
}
|
||||
|
||||
int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t width, uint32_t height, uint32_t bpp) {
|
||||
auto time_start = std::chrono::high_resolution_clock::now();
|
||||
// start device
|
||||
std::cout << "start device" << std::endl;
|
||||
RT_CHECK(vx_start(device));
|
||||
|
@ -85,6 +87,10 @@ int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t width,
|
|||
// wait for completion
|
||||
std::cout << "wait for completion" << std::endl;
|
||||
RT_CHECK(vx_ready_wait(device, -1));
|
||||
|
||||
auto time_end = std::chrono::high_resolution_clock::now();
|
||||
double elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(time_end - time_start).count();
|
||||
printf("Elapsed time: %lg ms\n", elapsed);
|
||||
|
||||
// download destination buffer
|
||||
std::cout << "download destination buffer" << std::endl;
|
||||
|
@ -98,7 +104,7 @@ int run_test(const kernel_arg_t& kernel_arg, uint32_t buf_size, uint32_t width,
|
|||
|
||||
// save output image
|
||||
std::cout << "save output image" << std::endl;
|
||||
dump_image(dst_pixels, width, height, bpp);
|
||||
//dump_image(dst_pixels, width, height, bpp);
|
||||
RT_CHECK(SaveTGA(output_file, dst_pixels, width, height, bpp));
|
||||
|
||||
return 0;
|
||||
|
@ -115,7 +121,14 @@ int main(int argc, char *argv[]) {
|
|||
parse_args(argc, argv);
|
||||
|
||||
RT_CHECK(LoadTGA(input_file, src_pixels, &src_width, &src_height, &src_bpp));
|
||||
dump_image(src_pixels, src_width, src_height, src_bpp);
|
||||
|
||||
// check power of two support
|
||||
if (!ISPOW2(src_width) || !ISPOW2(src_height)) {
|
||||
std::cout << "Error: only power of two textures supported: width=" << src_width << ", heigth=" << src_height << std::endl;
|
||||
return -1;
|
||||
}
|
||||
|
||||
//dump_image(src_pixels, src_width, src_height, src_bpp);
|
||||
uint32_t src_bufsize = src_bpp * src_width * src_height;
|
||||
|
||||
uint32_t dst_width = (uint32_t)(src_width * scale);
|
||||
|
@ -164,8 +177,8 @@ int main(int argc, char *argv[]) {
|
|||
kernel_arg.filter = filter;
|
||||
kernel_arg.wrap = wrap;
|
||||
|
||||
kernel_arg.src_width = src_width;
|
||||
kernel_arg.src_height = src_height;
|
||||
kernel_arg.src_logWidth = ilog2(src_width);
|
||||
kernel_arg.src_logHeight = ilog2(src_height);
|
||||
kernel_arg.src_stride = src_bpp;
|
||||
kernel_arg.src_pitch = src_bpp * src_width;
|
||||
kernel_arg.src_ptr = src_addr;
|
||||
|
|
162
driver/tests/tex_demo/texsw.h
Normal file
162
driver/tests/tex_demo/texsw.h
Normal file
|
@ -0,0 +1,162 @@
|
|||
// Include guard: the macro must be defined here, otherwise a second
// inclusion of this header redefines every function below.
#ifndef _TEXSW_H_
#define _TEXSW_H_
|
||||
|
||||
#include "common.h"
|
||||
|
||||
// Upper bound used when clamping texture level-of-detail values.
#define TEX_LOD_MAX 11

// Two-argument minimum / maximum.
// Every use of an argument is parenthesized so that operands containing
// lower-precedence operators (e.g. MIN(a | b, c)) are compared as a whole.
// Each argument is evaluated twice: do not pass expressions with side effects.
#define MIN(x, y) (((x) < (y)) ? (x) : (y))

#define MAX(x, y) (((x) > (y)) ? (x) : (y))
|
||||
|
||||
// Applies the texture wrap mode to a texture coordinate.
//
//   wrap == 1 : keep only the low 20 bits of the coordinate (repeat).
//   otherwise : clamp the coordinate to [0, 0xfffff].
//
// Declared static inline: a plain `inline` definition in a C header provides
// no external definition, so any call the compiler chooses not to inline
// would fail to link.
static inline int address(int wrap, int value) {
  if (wrap == 1) {
    return value & 0xfffff;
  }
  if (value < 0) {
    return 0;
  }
  return (value > 0xfffff) ? 0xfffff : value;
}
|
||||
|
||||
// Splits a raw texel word into the two words (*l, *h) that lerp() and pack()
// operate on.
//
//   format 1, 2 : *l = value unchanged, *h = 0
//   format 3    : *l = (value | value << 8)  & 0x00ff00ff, *h = 0
//   format 4    : *l = (value | value << 16) & 0x07e0f81f, *h = 0
//   format 5    : *l = (value | value << 12) & 0x0f0f0f0f, *h = 0
//   otherwise   : *l = bytes 0 and 2, *h = bytes 1 and 3, each moved into the
//                 0x00ff00ff lanes
//
// The bit manipulation is done on an unsigned copy of the texel so that left
// shifts of words with high bits set do not overflow a signed int.
static inline void unpack(int format, int value, int* l, int* h) {
  const unsigned int bits = (unsigned int)value;
  unsigned int lo;
  unsigned int hi = 0;

  switch (format) {
  case 1:
  case 2:
    lo = bits;
    break;
  case 3:
    lo = (bits | (bits << 8)) & 0x00ff00ffu;
    break;
  case 4:
    lo = (bits | (bits << 16)) & 0x07e0f81fu;
    break;
  case 5:
    lo = (bits | (bits << 12)) & 0x0f0f0f0fu;
    break;
  default:
  case 0:
    lo = bits & 0x00ff00ffu;
    hi = (bits >> 8) & 0x00ff00ffu;
    break;
  }

  *l = (int)lo;
  *h = (int)hi;
}
|
||||
|
||||
// Linearly interpolates two unpacked texels, two 8-bit lanes per word.
//
//   al/ah : low and high word of the first texel
//   bl/bh : low and high word of the second texel
//   frac  : blend weight; 0 selects the first texel, the result is
//           a + ((b - a) * frac >> 8) per lane
//   l/h   : receive the blended words, masked to the 0x00ff00ff lanes
//
// The arithmetic is carried out modulo 2^32 in unsigned int. Only bits 0-23
// of the intermediate sum survive the final mask, and those bits are the same
// as in exact arithmetic, so this avoids the signed overflow of
// (b - a) * frac without changing any result.
static inline void lerp(int al, int ah, int bl, int bh, int frac, int* l, int* h) {
  const unsigned int lanes  = 0x00ff00ffu;
  const unsigned int weight = (unsigned int)frac;

  const unsigned int a_lo = (unsigned int)al;
  const unsigned int a_hi = (unsigned int)ah;
  const unsigned int d_lo = (unsigned int)bl - a_lo;
  const unsigned int d_hi = (unsigned int)bh - a_hi;

  *l = (int)((a_lo + ((d_lo * weight) >> 8)) & lanes);
  *h = (int)((a_hi + ((d_hi * weight) >> 8)) & lanes);
}
|
||||
|
||||
// Reassembles a texel word from the two words produced by unpack()/lerp().
//
//   format 1, 2 : l unchanged
//   format 3    : (l | l >> 8)  & 0xffff
//   format 4    : (l | l >> 16) & 0xffff
//   format 5    : (l | l >> 12) & 0xffff
//   otherwise   : (h << 8) | l
//
// Shifts are performed on unsigned copies: h may be as large as 0x00ff00ff,
// and shifting that left by 8 as a signed int would overflow.
static inline int pack(int format, int l, int h) {
  const unsigned int lo = (unsigned int)l;
  const unsigned int hi = (unsigned int)h;

  switch (format) {
  case 1:
  case 2:
    return l;
  case 3:
    return (int)((lo | (lo >> 8)) & 0xffffu);
  case 4:
    return (int)((lo | (lo >> 16)) & 0xffffu);
  case 5:
    return (int)((lo | (lo >> 12)) & 0xffffu);
  default:
  case 0:
    return (int)((hi << 8) | lo);
  }
}
|
||||
|
||||
inline int tex_sw(struct kernel_arg_t* state, int stage, int u, int v, int lod) {
|
||||
int base_addr = state->src_ptr;//vx_csr_read(CSR_TEX_ADDR(0));
|
||||
int mip_offset = 0;//vx_csr_read(CSR_TEX_MIPOFF(0));
|
||||
int log_width = state->src_logWidth;//vx_csr_read(CSR_TEX_WIDTH(0));
|
||||
int log_height = state->src_logHeight;//vx_csr_read(CSR_TEX_HEIGHT(0));
|
||||
int format = state->format;//vx_csr_read(CSR_TEX_FORMAT(0));
|
||||
int wrap = state->wrap;//vx_csr_read(CSR_TEX_WRAP(0));
|
||||
int filter = state->filter;//vx_csr_read(CSR_TEX_FILTER(0));
|
||||
|
||||
int32_t* pBits = ((uint32_t*)base_addr) + mip_offset;
|
||||
|
||||
int u0 = address(wrap, u - (0x80000 >> log_width));
|
||||
int v0 = address(wrap, v - (0x80000 >> log_height));
|
||||
|
||||
int x0 = u0 >> (20 - log_width);
|
||||
int y0 = v0 >> (20 - log_height);
|
||||
|
||||
if (filter) {
|
||||
int u1 = address(wrap, u + (0x80000 >> log_width));
|
||||
int v1 = address(wrap, v + (0x80000 >> log_height));
|
||||
|
||||
int x1 = u1 >> (20 - log_width);
|
||||
int y1 = v1 >> (20 - log_height);
|
||||
|
||||
// memory lookup
|
||||
|
||||
int c0 = pBits[x0 + (y0 << log_width)];
|
||||
int c1 = pBits[x1 + (y0 << log_width)];
|
||||
int c2 = pBits[x0 + (y1 << log_width)];
|
||||
int c3 = pBits[x1 + (y1 << log_width)];
|
||||
|
||||
// filtering
|
||||
|
||||
int alpha = x0 & 0xff;
|
||||
int beta = y0 & 0xff;
|
||||
|
||||
int c0a, c0b;
|
||||
int c1a, c1b;
|
||||
int c01a, c01b;
|
||||
|
||||
unpack(format, c0, &c0a, &c0b);
|
||||
unpack(format, c1, &c1a, &c1b);
|
||||
lerp(c0a, c0b, c1a, c1b, alpha, &c01a, &c01b);
|
||||
|
||||
int c2a, c2b;
|
||||
int c3a, c3b;
|
||||
int c23a, c23b;
|
||||
|
||||
unpack(format, c2, &c2a, &c2b);
|
||||
unpack(format, c3, &c3a, &c3b);
|
||||
lerp(c2a, c2b, c3a, c3b, alpha, &c23a, &c23b);
|
||||
|
||||
int c4a, c4b;
|
||||
lerp(c01a, c01b, c23a, c23b, beta, &c4a, &c4b);
|
||||
return pack(format, c4a, c4b);
|
||||
} else {
|
||||
int c0 = pBits[x0 + (y0 <<log_width)];
|
||||
|
||||
int c0a, c0b;
|
||||
unpack(format, c0, &c0a, &c0b);
|
||||
return pack(format, c0a, c0b);
|
||||
}
|
||||
}
|
||||
|
||||
inline int vx_tex3(int stage, int u, int v, int lod) {
|
||||
int lodn = MIN(lod + 0x100000, TEX_LOD_MAX);
|
||||
int a = vx_tex(0, u, v, lod);
|
||||
int b = vx_tex(0, u, v, lodn);
|
||||
int al = a & 0x00ff00ff;
|
||||
int ah = (a >> 8) & 0x00ff00ff;
|
||||
int bl = b & 0x00ff00ff;
|
||||
int bh = (b >> 8) & 0x00ff00ff;
|
||||
int frac = (lod >> 12) & 0xff;
|
||||
int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
|
||||
int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
|
||||
int c = al | (ah << 8);
|
||||
return c;
|
||||
}
|
||||
|
||||
inline int tex3_sw(struct kernel_arg_t* state, int stage, int u, int v, int lod) {
|
||||
int lodn = MIN(lod + 0x10000, TEX_LOD_MAX);
|
||||
int a = tex_sw(state, 0, u, v, lod);
|
||||
int b = tex_sw(state, 0, u, v, lodn);
|
||||
int al = a & 0x00ff00ff;
|
||||
int ah = (a >> 8) & 0x00ff00ff;
|
||||
int bl = b & 0x00ff00ff;
|
||||
int bh = (b >> 8) & 0x00ff00ff;
|
||||
int frac = lod & 0xffff;
|
||||
int cl = (al + (((bl - al) * frac) >> 8)) & 0x00ff00ff;
|
||||
int ch = (ah + (((bh - ah) * frac) >> 8)) & 0x00ff00ff;
|
||||
int c = al | (ah << 8);
|
||||
return c;
|
||||
}
|
||||
|
||||
#endif
|
|
@ -3,6 +3,12 @@
|
|||
#include <iostream>
|
||||
#include "blitter.h"
|
||||
|
||||
#define ISPOW2(x) (((x) != 0) && (0 == ((x) & ((x) - 1))))
|
||||
|
||||
// Returns ceil(log2(value)) for value in [1, 2^31].
//
// (value << 1) - 1 has its highest set bit at position ceil(log2(value)), so
// the result is that bit's index. The bit width in the formula is taken from
// the type __builtin_clz operates on (unsigned int). The previous
// __builtin_clzl variant counted leading zeros of an unsigned long while
// assuming 32 bits, which returns wrapped-around garbage on hosts where long
// is 64-bit (e.g. ilog2(8) == 0xFFFFFFE3).
//
// value == 0 wraps to an all-ones operand and yields 31; values above 2^31
// overflow the 32-bit shift and are not supported.
inline uint32_t ilog2 (uint32_t value) {
  const unsigned int span = (value << 1) - 1u; // odd or all-ones, never zero
  return (uint32_t)(sizeof(unsigned int) * 8u) - (uint32_t)__builtin_clz(span) - 1;
}
|
||||
|
||||
int LoadTGA(const char *filename,
|
||||
std::vector<uint8_t> &pixels,
|
||||
uint32_t *width,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue