mirror of https://github.com/nmlgc/ReC98.git
[Platform] [PC-98] Generic byte-aligned sprite blitter
The fact that every sprite format comes with its own blitter is one of the major sources of bloat in PC-98 Touhou, and of TH01 in particular. So how about writing a single decently optimized blitter, and calling into that from the entire game? Especially because generating distinct blitting functions for every width is a much better use of all that memory: It eliminates horizontal loops, and ensures that we use the optimal MOV variant for each sprite size. Removing any checks for empty bytes (which will turn out to never have been a good idea for any PC-98 model ever) and unrolling the main blitting loop using Duff's Device already gets us something that, depending on the PC-98 model, is easily 2-4× faster than the typical naive C implementation you'd find in TH01. With master.lib not being that much faster… Making more use of C++ templates would have been fancy, but horizontal sprite clipping can change the blit width depending on runtime values. So, we're back to X macro code generation after all. Part of P0233, funded by [Anonymous].
This commit is contained in:
parent
abeaf851a4
commit
aa0aad8141
|
@ -0,0 +1,128 @@
|
|||
#include "platform.h"
|
||||
#include "x86real.h"
|
||||
#include "pc98.h"
|
||||
#include "planar.h"
|
||||
#include "platform/x86real/pc98/blitter.hpp"
|
||||
|
||||
// Shared blitter state. Written by the blitter_init_*() functions below, and
// read by the blitter_body() expansions inside the generated blitters.
blit_state_t blit_state;
|
||||
|
||||
// Supported widths
|
||||
// ----------------
|
||||
// Refer to blitter_body() for the register allocation.
|
||||
|
||||
// X macro list of every sprite width for which a pair of blitting functions
// is generated. Expanded with a caller-defined X(width); the order of entries
// determines the order of the corresponding BLITTER_FUNCS elements.
#define FOREACH_WIDTH \
|
||||
X(8) \
|
||||
X(16) \
|
||||
|
||||
// We want to use a pseudoregister for optimal code generation, but Turbo C++
|
||||
// 4.0J insists on seeing the template type in the function arguments. So, we
|
||||
// just pass a dummy value we never actually use.
|
||||
// Copies a single RowDots value from the offset in SI (read through a __ds
// pointer) to the offset in DI (written through an __es pointer), replacing
// whatever was stored at the destination. The pointer parameter is never
// read; it only exists to carry the template type.
template <class RowDots> inline void single_write(RowDots *) {
|
||||
*((RowDots __es *)(_DI)) = *((RowDots __ds *)(_SI));
|
||||
}
|
||||
|
||||
// ORs a single RowDots value from the offset in SI (read through a __ds
// pointer) into the value at the offset in DI (accessed through an __es
// pointer), keeping all bits that were already set at the destination. The
// pointer parameter is never read; it only exists to carry the template type.
template <class RowDots> inline void single_or(RowDots *) {
|
||||
*((RowDots __es *)(_DI)) |= *((RowDots __ds *)(_SI));
|
||||
}
|
||||
// ----------------
|
||||
|
||||
// Row blitters
|
||||
// ------------
|
||||
|
||||
// Blits one row and moves on to the next one: Calls [func_single] with a
// null dummy pointer of the given [type], then advances SI by DX (which
// blitter_body() loads with the sprite width) and DI by ROW_SIZE.
#define row(func_single, type) \
|
||||
func_single(reinterpret_cast<type *>(0)); _SI += _DX; _DI += ROW_SIZE;
|
||||
|
||||
// Generates the two blitting functions for every entry of FOREACH_WIDTH:
// • write_<width>(), which expands blitter_body() with single_write()
// • or_<width>(), which expands blitter_body() with single_or()
// Both operate on rows of type dots<width>_t and take their remaining
// parameters from [blit_state].
#define X(width) \
|
||||
void write_##width##(seg_t plane_seg, const void far* sprite) \
|
||||
{ \
|
||||
blitter_body(plane_seg, sprite, row, single_write, dots##width##_t); \
|
||||
} \
|
||||
\
|
||||
void or_##width##(seg_t plane_seg, const void far* sprite) \
|
||||
{ \
|
||||
blitter_body(plane_seg, sprite, row, single_or, dots##width##_t); \
|
||||
} \
|
||||
|
||||
FOREACH_WIDTH
|
||||
#undef X
|
||||
|
||||
// Table of all generated blitters. Element 0 is a null dummy, followed by one
// { write, or } pair per FOREACH_WIDTH entry in list order. The
// initialization functions index this table with the blitted width in bytes.
Blitter BLITTER_FUNCS[] = {
|
||||
{ nullptr, nullptr }, // We want this array to be 1-based
|
||||
#define X(width) \
|
||||
{ write_##width, or_##width },
|
||||
|
||||
FOREACH_WIDTH
|
||||
#undef X
|
||||
};
|
||||
// ------------
|
||||
|
||||
// Initialization
|
||||
// --------------
|
||||
|
||||
// Bottom-edge clipping, written to be used as `rows = clip_b(rows, …);`.
// Leaves the number of visible rows in [rows] (at most [h]), returns a
// `nullptr` from the *calling* function if no row is visible, and points
// [blit_state] to the first byte of the sprite and to the VRAM offset of
// ([left], [top]).
#define clip_b(rows, top, h, left) \
	/* The expansion starts with a bare expression, which completes the */ \
	/* assignment written at the call site. Everything after that is a */ \
	/* regular sequence of statements. */ \
	(RES_Y - top); \
	if(rows <= 0) { \
		return nullptr; \
	} \
	if(rows > h) { \
		rows = h; \
	} \
	blit_state.vo = (vram_offset_shift(0, top) + left); \
	blit_state.sprite_offset = 0;
|
||||
|
||||
// Stores the original sprite width [w] together with the loop counters for
// blitting [rows] rows via the unrolled loop in blitter_body(). [rows] must
// be greater than 0.
#define init_wh(w, rows) { \
	blit_state.sprite_w = (w); \
	\
	/* The unrolled loop is a do-while that is entered in the middle of its */ \
	/* body if [rows] is not a multiple of UNROLL_H. That partial first */ \
	/* pass counts as a loop iteration as well, so the loop count has to be */ \
	/* rounded up. Rounding down would drop all full passes that should */ \
	/* follow a partial one, e.g. blitting only 1 row for a height of 9. */ \
	blit_state.loops_unrolled = (((rows) + (UNROLL_H - 1)) / UNROLL_H); \
	\
	/* Number of rows in the partial first pass; 0 means a full pass. */ \
	blit_state.loops_remainder = ((rows) & (UNROLL_H - 1)); \
}
|
||||
|
||||
// Sets up [blit_state] for blitting a ([w]*8)×[h]-pixel sprite whose top-left
// corner is at byte column [left] and row [top], clipping it at all four
// edges of VRAM.
// Returns the blitter pair for the width that remains visible after
// horizontal clipping, or a `nullptr` if the sprite lies entirely outside of
// VRAM. In the latter case, [blit_state] is left in an invalid state.
const Blitter __ds* blitter_init_clip_lrtb(
	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
)
{
	const Blitter __ds* ret;
	pixel_t rows;

	// Top and bottom edges
	if(top < 0) {
		if(top <= -h) {
			return nullptr;
		}
		// Skip the rows above the top edge of VRAM, and start blitting at
		// the first VRAM row.
		blit_state.sprite_offset = (-top * w);
		rows = (h + top);
		blit_state.vo = 0;
	} else {
		rows = clip_b(rows, top, h, 0);
	}

	// Left and right edges
	if(left < 0) {
		if(left <= -w) {
			return nullptr;
		}
		// Skip the -[left] bytes of every row that lie beyond the left edge.
		// What remains visible are the other ([w] + [left]) bytes, *not* the
		// number of bytes that were cut off.
		blit_state.sprite_offset -= left;
		ret = &BLITTER_FUNCS[w + left];
	} else if(left > (ROW_SIZE - w)) {
		if(left >= ROW_SIZE) {
			return nullptr;
		}
		// Only the bytes up to the right edge remain visible.
		blit_state.vo += left;
		ret = &BLITTER_FUNCS[ROW_SIZE - left];
	} else {
		blit_state.vo += left;
		ret = &BLITTER_FUNCS[w];
	}

	// The row stride within the sprite is always its original width,
	// regardless of any horizontal clipping.
	init_wh(w, rows);
	return ret;
}
|
||||
|
||||
// Sets up [blit_state] for blitting a ([w]*8)×[h]-pixel sprite at byte column
// [left] and row [top], clipping it only at the bottom edge of VRAM.
// Returns the blitter pair for the unclipped width [w], or a `nullptr` (via
// clip_b()) if [top] lies at or below the bottom edge.
const Blitter __ds* blitter_init_clip_b(
	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
)
{
	const Blitter __ds* ret;
	pixel_t rows;

	rows = clip_b(rows, top, h, left);
	init_wh(w, rows);
	ret = &BLITTER_FUNCS[w];
	return ret;
}
|
||||
// --------------
|
|
@ -0,0 +1,83 @@
|
|||
/// Optimized byte-aligned sprite blitter
|
||||
/// -------------------------------------
|
||||
|
||||
// Signature of a generated blitting function. Blits [sprite] to the VRAM
// plane at segment [plane_seg], taking all further parameters from
// [blit_state].
typedef void (* blit_func_t)(seg_t plane_seg, const void far* sprite);
|
||||
|
||||
// The pair of blitting functions generated for a single width.
struct Blitter {
|
||||
// Overwrites the destination with the sprite data.
blit_func_t write;
|
||||
// ORs the sprite data into the destination.
blit_func_t or;
|
||||
};
|
||||
|
||||
// Internals
|
||||
// ---------
|
||||
|
||||
// Number of rows blitted by one full pass through the unrolled loop in
// blitter_body(), which static_asserts this exact value.
static const upixel_t UNROLL_H = 8;
|
||||
|
||||
// Parameters for the next call to a blitting function, set by the
// initialization functions.
struct blit_state_t {
|
||||
// VRAM offset of the first blitted byte. Loaded into DI by blitter_body().
vram_offset_t vo;
|
||||
|
||||
// First blitted byte within the sprite. Can be nonzero if the sprite was
|
||||
// clipped at the left or top edge of VRAM.
|
||||
uint16_t sprite_offset;
|
||||
|
||||
// Always set to the original width of the sprite. Can be larger than the
|
||||
// blitted width if the sprite is clipped.
|
||||
vram_byte_amount_t sprite_w;
|
||||
|
||||
// 16-bit because it gets loaded into BX anyway.
|
||||
pixel_t loops_remainder;
|
||||
|
||||
// Initial value of the do-while counter in blitter_body().
int16_t loops_unrolled;
|
||||
};
|
||||
|
||||
// The single global instance read by every blitter_body() expansion.
extern blit_state_t blit_state;
|
||||
|
||||
// Body of a blitting function. Blits the rows of [sprite] to the VRAM plane
// at [plane_seg] by repeatedly expanding func_row(row_p1, row_p2).
//
// Register allocation, as set up below:
// • DS:SI: Current source position; starts at [sprite] plus
//   [blit_state.sprite_offset]
// • ES:DI: Current destination position; starts at [plane_seg]:[blit_state.vo]
// • DX: [blit_state.sprite_w]
// • BX: [blit_state.loops_remainder], only used to select the entry point
//   into the unrolled loop
//
// The loop is a Duff's Device: The first pass through the do-while body
// starts at `case _BX` and therefore blits _BX rows (or all UNROLL_H rows if
// _BX is 0), and every further pass blits UNROLL_H rows. Since the counter is
// decremented and checked at the end of the body, the body runs once if
// [blit_state.loops_unrolled] is 1 or less, and [blit_state.loops_unrolled]
// times otherwise.
// NOTE(review): For the blitted row count to match the sprite height,
// [blit_state.loops_unrolled] therefore has to include a partial first pass,
// i.e., be the height divided by UNROLL_H, rounded up.
#define blitter_body(plane_seg, sprite, func_row, row_p1, row_p2) { \
|
||||
register int16_t loops_unrolled = blit_state.loops_unrolled; \
|
||||
_SI = FP_OFF(sprite); \
|
||||
_SI += blit_state.sprite_offset; \
|
||||
_DI = blit_state.vo; \
|
||||
_DX = blit_state.sprite_w; \
|
||||
_BX = blit_state.loops_remainder; \
|
||||
\
|
||||
/* Turbo C++ 4.0J does not back up DS if the function mutates it. */ \
|
||||
/* [blit_state] can't be accessed anymore beyond this point! */ \
|
||||
_asm { push ds; } \
|
||||
_DS = FP_SEG(sprite); \
|
||||
_ES = plane_seg; \
|
||||
\
|
||||
static_assert(UNROLL_H == 8); \
|
||||
switch(_BX) { \
|
||||
case 0: do { func_row(row_p1, row_p2) \
|
||||
case 7: func_row(row_p1, row_p2) \
|
||||
case 6: func_row(row_p1, row_p2) \
|
||||
case 5: func_row(row_p1, row_p2) \
|
||||
case 4: func_row(row_p1, row_p2) \
|
||||
case 3: func_row(row_p1, row_p2) \
|
||||
case 2: func_row(row_p1, row_p2) \
|
||||
case 1: func_row(row_p1, row_p2) \
|
||||
/* */} while(--loops_unrolled > 0); \
|
||||
} \
|
||||
\
|
||||
_asm { pop ds; } \
|
||||
}
|
||||
// ---------
|
||||
|
||||
// Initialization
|
||||
// --------------
|
||||
// All of these set up blitting of a ([w]*8)×[h]-pixel sprite at the given VRAM
|
||||
// offset, cutting it at the respectively checked VRAM boundaries and assuming
|
||||
// that it does not touch the others. If the sprite would be cut to a width or
|
||||
// height of 0, they return a `nullptr` and leave the blitter in an invalid
|
||||
// state.
|
||||
|
||||
// Checks all 4 edges of VRAM. [left] and [w] are measured in bytes, [top] and
// [h] in pixel rows.
|
||||
const Blitter __ds* blitter_init_clip_lrtb(
|
||||
vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
|
||||
);
|
||||
|
||||
// Checks the bottom edge of VRAM, assumes that the sprite does not intersect
// any of the other three edges.
|
||||
const Blitter __ds* blitter_init_clip_b(
|
||||
vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
|
||||
);
|
||||
// --------------
|
Loading…
Reference in New Issue