[Platform] [PC-98] Generic byte-aligned sprite blitter

The fact that every sprite format comes with its own blitter is one of
the major sources of bloat in PC-98 Touhou, and of TH01 in particular.
So how about writing a single decently optimized blitter, and calling
into that from the entire game?

Especially because generating distinct blitting functions for every
width is a much better use of all that memory: It eliminates horizontal
loops, and ensures that we use the optimal MOV variant for each sprite
size. Removing any checks for empty bytes (which will turn out to never
have been a good idea for any PC-98 model ever) and unrolling the main
blitting loop using Duff's Device already gets us something that,
depending on the PC-98 model, is easily 2-4× faster than the typical
naive C implementation you'd find in TH01. With master.lib not being
that much faster…

Making more use of C++ templates would have been fancy, but horizontal
sprite clipping can change the blit width depending on runtime values.
So, we're back to X macro code generation after all.

Part of P0233, funded by [Anonymous].
This commit is contained in:
nmlgc 2023-02-18 22:35:10 +01:00
parent abeaf851a4
commit aa0aad8141
2 changed files with 211 additions and 0 deletions

View File

@ -0,0 +1,128 @@
#include "platform.h"
#include "x86real.h"
#include "pc98.h"
#include "planar.h"
#include "platform/x86real/pc98/blitter.hpp"
// Parameters of the upcoming blit. Written by the blitter_init_*() functions,
// and read by blitter_body() at the start of every generated blitting
// function.
blit_state_t blit_state;
// Supported widths
// ----------------
// Refer to blitter_body() for the register allocation.
// Invokes X() once for every supported sprite width, in pixels. The last
// entry deliberately carries no trailing line continuation, so that the macro
// ends here regardless of what follows it in the file.
#define FOREACH_WIDTH \
	X(8) \
	X(16)
// Copies a single RowDots value from DS:SI to ES:DI, overwriting whatever was
// at the destination.
//
// We want to use a pseudoregister for optimal code generation, but Turbo C++
// 4.0J insists on seeing the template type in the function arguments. So, we
// just pass a dummy value we never actually use.
template <class RowDots> inline void single_write(RowDots *) {
*((RowDots __es *)(_DI)) = *((RowDots __ds *)(_SI));
}
// ORs a single RowDots value read from DS:SI into the one at ES:DI. The
// unnamed pointer parameter is never read; it only exists to carry the
// template type.
template <class RowDots> inline void single_or(RowDots *) {
*((RowDots __es *)(_DI)) |= *((RowDots __ds *)(_SI));
}
// ----------------
// Row blitters
// ------------
// Blits a single row via func_single<type>(), then advances SI by DX bytes
// (blitter_body() loads DX with the sprite width) and DI by ROW_SIZE bytes.
// The null pointer is only the dummy argument that selects the template type.
#define row(func_single, type) \
func_single(reinterpret_cast<type *>(0)); _SI += _DX; _DI += ROW_SIZE;
#define X(width) \
void write_##width##(seg_t plane_seg, const void far* sprite) \
{ \
blitter_body(plane_seg, sprite, row, single_write, dots##width##_t); \
} \
\
void or_##width##(seg_t plane_seg, const void far* sprite) \
{ \
blitter_body(plane_seg, sprite, row, single_or, dots##width##_t); \
} \
FOREACH_WIDTH
#undef X
// Blitting function pairs, looked up by the blitted width in VRAM bytes.
Blitter BLITTER_FUNCS[] = {
	// Dummy entry for width 0, which makes the array 1-based.
	{ nullptr, nullptr },

#define X(w) { write_##w, or_##w },
	FOREACH_WIDTH
#undef X
};
// ------------
// Initialization
// --------------
// Bottom-edge clipping. Must be used as the right-hand side of an assignment
// to (or initialization of) [rows], inside a function that returns a pointer:
//
// 	rows = clip_b(rows, top, h, left);
//
// This first sets [rows] to the number of rows between [top] and the bottom of
// the screen (RES_Y). The statements that follow then
// • return `nullptr` *from the calling function* if no row is left,
// • limit [rows] to the sprite height [h],
// • and initialize [blit_state.sprite_offset] to 0 and [blit_state.vo] to the
//   VRAM offset of ([left], [top]).
// All arguments are parenthesized so that expressions can be passed safely.
#define clip_b(rows, top, h, left) \
	/* Sneaky! That's how we can pretend this is an actual function that */ \
	/* returns a value. */ \
	(RES_Y - (top)); \
	if((rows) <= 0) { \
		return nullptr; \
	} else if((rows) > (h)) { \
		(rows) = (h); \
	} \
	blit_state.sprite_offset = 0; \
	blit_state.vo = (vram_offset_shift(0, (top)) + (left));
// Stores the sprite width and splits the number of blitted [rows] into the two
// values consumed by the Duff's Device in blitter_body():
// • [loops_remainder] (rows % UNROLL_H) selects the entry point into the
//   unrolled loop, i.e., the number of rows blitted during the first pass.
//   0 stands for a full pass of UNROLL_H rows.
// • [loops_unrolled] is the total number of passes, *including* that first
//   and possibly partial one. It therefore has to be rounded up: rounding down
//   would drop UNROLL_H rows from every sprite whose height is larger than,
//   but not a multiple of, UNROLL_H.
// [rows] must be positive.
#define init_wh(w, rows) { \
	blit_state.sprite_w = (w); \
	blit_state.loops_unrolled = (((rows) + (UNROLL_H - 1)) / UNROLL_H); \
	blit_state.loops_remainder = ((rows) & (UNROLL_H - 1)); \
}
// Sets up [blit_state] for blitting a sprite that is [w] bytes wide and [h]
// rows high, with its top-left corner at byte column [left] and row [top],
// and clips it at the edges of VRAM.
//
// Returns a pointer to the blitting functions for the width that remains
// after clipping, or `nullptr` if nothing of the sprite remains visible. In
// the `nullptr` case, [blit_state] may have been partially written and must
// not be used.
//
// If the sprite is cut at the top edge, its bottom edge is not checked in
// addition.
const Blitter __ds* blitter_init_clip_lrtb(
	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
)
{
	const Blitter __ds* ret;
	pixel_t rows;

	// Top and bottom edges
	if(top < 0) {
		if(top <= -h) {
			return nullptr;
		}
		// Skip the (-top) sprite rows above the top edge, and start blitting
		// at the first row of VRAM.
		blit_state.sprite_offset = (-top * w);
		rows = (h + top);
		blit_state.vo = 0;
	} else {
		// Can return `nullptr` from this function; see clip_b().
		rows = clip_b(rows, top, h, 0);
	}

	// Left and right edges
	if(left < 0) {
		if(left <= -w) {
			return nullptr;
		}
		// Skip the (-left) sprite bytes left of the edge. [vo] stays at the
		// start of the row.
		blit_state.sprite_offset -= left;

		// The blitted width is what remains of the sprite, (w - -left), not
		// the number of bytes that were cut off.
		ret = &BLITTER_FUNCS[w + left];
	} else if(left > (ROW_SIZE - w)) {
		if(left >= ROW_SIZE) {
			return nullptr;
		}
		blit_state.vo += left;
		ret = &BLITTER_FUNCS[ROW_SIZE - left];
	} else {
		blit_state.vo += left;
		ret = &BLITTER_FUNCS[w];
	}

	// [sprite_w] remains the unclipped width, as it is used to step from one
	// sprite row to the next.
	init_wh(w, rows);
	return ret;
}
// Sets up [blit_state] for a [w]-byte wide, [h]-row high sprite at byte column
// [left] and row [top], clipping it only at the bottom edge of VRAM. Returns
// the blitting functions for width [w], or `nullptr` if [top] lies at or below
// the bottom edge.
const Blitter __ds* blitter_init_clip_b(
	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
)
{
	pixel_t rows;

	// Not a function call: clip_b() expands to the assigned value plus
	// trailing statements, one of which is the `nullptr` return.
	rows = clip_b(rows, top, h, left);

	init_wh(w, rows);
	return (BLITTER_FUNCS + w);
}
// --------------

View File

@ -0,0 +1,83 @@
/// Optimized byte-aligned sprite blitter
/// -------------------------------------
// Signature shared by all blitting functions. [plane_seg] is loaded into ES
// as the destination segment, [sprite] points to the source data. All other
// parameters are taken from [blit_state].
typedef void (* blit_func_t)(seg_t plane_seg, const void far* sprite);
// The blitting functions available for a single blit width.
struct Blitter {
// Overwrites the destination with the sprite data.
blit_func_t write;
// ORs the sprite data into the destination.
blit_func_t or;
};
// Internals
// ---------
// Number of rows blitted per full pass of the unrolled loop in
// blitter_body(). init_wh() uses (UNROLL_H - 1) as a bit mask, and
// blitter_body() hardcodes one `case` per row, so this can't be changed on its
// own.
static const upixel_t UNROLL_H = 8;
// Blit parameters handed from the blitter_init_*() functions to the blitting
// functions.
struct blit_state_t {
// Offset of the first blitted byte within the destination segment. Loaded
// into DI.
vram_offset_t vo;
// First blitted byte within the sprite. Can be nonzero if the sprite was
// clipped at the left or top edge of VRAM.
uint16_t sprite_offset;
// Always set to the original width of the sprite. Can be larger than the
// blitted width if the sprite is clipped.
vram_byte_amount_t sprite_w;
// Selects the `case` label at which blitter_body() enters its unrolled loop.
// 16-bit because it gets loaded into BX anyway.
pixel_t loops_remainder;
// Initial value of the counter for the do-while loop in blitter_body().
int16_t loops_unrolled;
};
// The single global instance of the blit parameters.
extern blit_state_t blit_state;
// Function body shared by all generated blitting functions. Loads the blit
// parameters from [blit_state] into registers, points DS:SI to the sprite and
// ES:DI to the destination, and then blits all rows by invoking
// 	func_row(row_p1, row_p2)
// once per row.
//
// Register allocation:
// • SI: Offset of [sprite], plus [blit_state.sprite_offset]
// • DI: [blit_state.vo]
// • DX: [blit_state.sprite_w]
// • BX: [blit_state.loops_remainder]
// • DS: Segment of [sprite] (the original DS is saved on the stack and
//       restored at the end)
// • ES: [plane_seg]
//
// The row loop is a Duff's Device: The `switch` jumps into the middle of the
// do-while body, so that the first pass only blits BX rows (or all 8 for
// BX = 0). Every further pass blits 8 rows. That first pass is already counted
// by the `--loops_unrolled` check at the bottom of the loop, and the body
// always runs at least once.
#define blitter_body(plane_seg, sprite, func_row, row_p1, row_p2) { \
register int16_t loops_unrolled = blit_state.loops_unrolled; \
_SI = FP_OFF(sprite); \
_SI += blit_state.sprite_offset; \
_DI = blit_state.vo; \
_DX = blit_state.sprite_w; \
_BX = blit_state.loops_remainder; \
\
/* Turbo C++ 4.0J does not back up DS if the function mutates it. */ \
/* [blit_state] can't be accessed anymore beyond this point! */ \
_asm { push ds; } \
_DS = FP_SEG(sprite); \
_ES = plane_seg; \
\
static_assert(UNROLL_H == 8); \
switch(_BX) { \
case 0: do { func_row(row_p1, row_p2) \
case 7: func_row(row_p1, row_p2) \
case 6: func_row(row_p1, row_p2) \
case 5: func_row(row_p1, row_p2) \
case 4: func_row(row_p1, row_p2) \
case 3: func_row(row_p1, row_p2) \
case 2: func_row(row_p1, row_p2) \
case 1: func_row(row_p1, row_p2) \
/* */} while(--loops_unrolled > 0); \
} \
\
_asm { pop ds; } \
}
// ---------
// Initialization
// --------------
// All of these set up blitting of a ([w]*8)×[h]-pixel sprite at the given VRAM
// offset, cutting it at the respectively checked VRAM boundaries and assuming
// that it does not touch the others. If the sprite would be cut to a width or
// height of 0, they return a `nullptr` and leave the blitter in an invalid
// state.
// Checks all 4 edges of VRAM. If the sprite is cut at the top edge, the bottom
// edge is not checked in addition. [left] and [w] are given in bytes, [top]
// and [h] in rows.
const Blitter __ds* blitter_init_clip_lrtb(
vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
);
// Checks the bottom edge of VRAM, assumes that the sprite does not intersect
// the others. [left] and [w] are given in bytes, [top] and [h] in rows.
const Blitter __ds* blitter_init_clip_b(
vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
);
// --------------