From aa0aad814163a62adb73f946da2269496141698e Mon Sep 17 00:00:00 2001
From: nmlgc
Date: Sat, 18 Feb 2023 22:35:10 +0100
Subject: [PATCH] [Platform] [PC-98] Generic byte-aligned sprite blitter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The fact that every sprite format comes with its own blitter is one of the
major sources of bloat in PC-98 Touhou, and of TH01 in particular. So how
about writing a single decently optimized blitter, and calling into that from
the entire game? Especially because generating distinct blitting functions for
every width is a much better use of all that memory: It eliminates horizontal
loops, and ensures that we use the optimal MOV variant for each sprite size.

Removing any checks for empty bytes (which will turn out to never have been a
good idea for any PC-98 model ever) and unrolling the main blitting loop using
Duff's Device already gets us something that, depending on the PC-98 model, is
easily 2-4× faster than the typical naive C implementation you'd find in TH01.
With master.lib not being that much faster…

Making more use of C++ templates would have been fancy, but horizontal sprite
clipping can change the blit width depending on runtime values. So, we're back
to X macro code generation after all.

Part of P0233, funded by [Anonymous].
---
 platform/x86real/pc98/blitter.cpp | 139 ++++++++++++++++++++++++++++++
 platform/x86real/pc98/blitter.hpp |  92 ++++++++++++++++++++
 2 files changed, 231 insertions(+)
 create mode 100644 platform/x86real/pc98/blitter.cpp
 create mode 100644 platform/x86real/pc98/blitter.hpp

diff --git a/platform/x86real/pc98/blitter.cpp b/platform/x86real/pc98/blitter.cpp
new file mode 100644
index 00000000..8a67a318
--- /dev/null
+++ b/platform/x86real/pc98/blitter.cpp
@@ -0,0 +1,139 @@
+#include "platform.h"
+#include "x86real.h"
+#include "pc98.h"
+#include "planar.h"
+#include "platform/x86real/pc98/blitter.hpp"
+
+blit_state_t blit_state;
+
+// Supported widths
+// ----------------
+// Refer to blitter_body() for the register allocation.
+
+#define FOREACH_WIDTH \
+	X(8) \
+	X(16) \
+
+// We want to use a pseudoregister for optimal code generation, but Turbo C++
+// 4.0J insists on seeing the template type in the function arguments. So, we
+// just pass a dummy value we never actually use.
+template <class RowDots> inline void single_write(RowDots *) {
+	*((RowDots __es *)(_DI)) = *((RowDots __ds *)(_SI));
+}
+
+template <class RowDots> inline void single_or(RowDots *) {
+	*((RowDots __es *)(_DI)) |= *((RowDots __ds *)(_SI));
+}
+// ----------------
+
+// Row blitters
+// ------------
+
+// Blits a single row via [func_single], then advances SI to the next sprite
+// row and DI to the next VRAM row.
+#define row(func_single, type) \
+	func_single(reinterpret_cast<type *>(0)); _SI += _DX; _DI += ROW_SIZE;
+
+#define X(width) \
+	void write_##width##(seg_t plane_seg, const void far* sprite) \
+	{ \
+		blitter_body(plane_seg, sprite, row, single_write, dots##width##_t); \
+	} \
+	\
+	void or_##width##(seg_t plane_seg, const void far* sprite) \
+	{ \
+		blitter_body(plane_seg, sprite, row, single_or, dots##width##_t); \
+	} \
+
+	FOREACH_WIDTH
+#undef X
+
+// Indexed with the blitted width in bytes.
+Blitter BLITTER_FUNCS[] = {
+	{ nullptr, nullptr }, // We want this array to be 1-based
+	#define X(width) \
+		{ write_##width, or_##width },
+
+	FOREACH_WIDTH
+	#undef X
+};
+// ------------
+
+// Initialization
+// --------------
+
+// Clips the bottom edge of a sprite that is not clipped at the top. Must be
+// used as the initializer or right-hand side of an assignment to [rows], and
+// returns `nullptr` from the calling function if no row remains visible.
+#define clip_b(rows, top, h, left) \
+	/* Sneaky! That's how we can pretend this is an actual function that */ \
+	/* returns a value. */ \
+	(RES_Y - top); \
+	if(rows <= 0) { \
+		return nullptr; \
+	} else if(rows > h) { \
+		rows = h; \
+	} \
+	blit_state.sprite_offset = 0; \
+	blit_state.vo = (vram_offset_shift(0, top) + left);
+
+// [loops_unrolled] counts every pass through the unrolled loop in
+// blitter_body(), including a partial first pass for the remainder, and
+// therefore has to be rounded up. Rounding down would blit UNROLL_H rows too
+// few for any [rows] above UNROLL_H that is not a multiple of it.
+#define init_wh(w, rows) { \
+	blit_state.sprite_w = w; \
+	blit_state.loops_unrolled = ((rows + (UNROLL_H - 1)) / UNROLL_H); \
+	blit_state.loops_remainder = (rows & (UNROLL_H - 1)); \
+}
+
+const Blitter __ds* blitter_init_clip_lrtb(
+	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
+)
+{
+	const Blitter __ds* ret;
+	pixel_t rows;
+
+	// Top and bottom edges
+	if(top < 0) {
+		if(top <= -h) {
+			return nullptr;
+		}
+		blit_state.sprite_offset = (-top * w);
+		rows = (h + top);
+		blit_state.vo = 0;
+	} else {
+		rows = clip_b(rows, top, h, 0);
+	}
+
+	// Left and right edges
+	if(left < 0) {
+		if(left <= -w) {
+			return nullptr;
+		}
+		blit_state.sprite_offset -= left;
+		// [-left] bytes are cut off, so ([w] + [left]) bytes remain visible.
+		ret = &BLITTER_FUNCS[w + left];
+	} else if(left > (ROW_SIZE - w)) {
+		if(left >= ROW_SIZE) {
+			return nullptr;
+		}
+		blit_state.vo += left;
+		ret = &BLITTER_FUNCS[ROW_SIZE - left];
+	} else {
+		blit_state.vo += left;
+		ret = &BLITTER_FUNCS[w];
+	}
+	init_wh(w, rows);
+	return ret;
+}
+
+const Blitter __ds* blitter_init_clip_b(
+	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
+)
+{
+	pixel_t rows = clip_b(rows, top, h, left);
+	init_wh(w, rows);
+	return &BLITTER_FUNCS[w];
+}
+// --------------
diff --git a/platform/x86real/pc98/blitter.hpp b/platform/x86real/pc98/blitter.hpp
new file mode 100644
index 00000000..1d973034
--- /dev/null
+++ b/platform/x86real/pc98/blitter.hpp
@@ -0,0 +1,92 @@
+/// Optimized byte-aligned sprite blitter
+/// -------------------------------------
+
+// Blits [sprite] to the VRAM plane at [plane_seg], using the parameters from
+// the last successful blitter_init_*() call.
+typedef void (* blit_func_t)(seg_t plane_seg, const void far* sprite);
+
+struct Blitter {
+	blit_func_t write; // Replaces the VRAM bytes with the sprite bytes.
+	blit_func_t or; // ORs the sprite bytes onto the VRAM bytes.
+};
+
+// Internals
+// ---------
+
+static const upixel_t UNROLL_H = 8;
+
+struct blit_state_t {
+	vram_offset_t vo; // First blitted byte within the VRAM plane.
+
+	// First blitted byte within the sprite. Can be nonzero if the sprite was
+	// clipped at the left or top edge of VRAM.
+	uint16_t sprite_offset;
+
+	// Always set to the original width of the sprite. Can be larger than the
+	// blitted width if the sprite is clipped.
+	vram_byte_amount_t sprite_w;
+
+	// 16-bit because it gets loaded into BX anyway.
+	pixel_t loops_remainder;
+
+	// Number of passes through the unrolled loop, including a partial first
+	// one for [loops_remainder].
+	int16_t loops_unrolled;
+};
+
+extern blit_state_t blit_state;
+
+// Register allocation:
+// * DS:SI: Current sprite row
+// * ES:DI: Current VRAM row
+// * DX: Distance between sprite rows in bytes ([blit_state.sprite_w])
+// * BX: Entry point into the unrolled loop ([blit_state.loops_remainder])
+#define blitter_body(plane_seg, sprite, func_row, row_p1, row_p2) { \
+	register int16_t loops_unrolled = blit_state.loops_unrolled; \
+	_SI = FP_OFF(sprite); \
+	_SI += blit_state.sprite_offset; \
+	_DI = blit_state.vo; \
+	_DX = blit_state.sprite_w; \
+	_BX = blit_state.loops_remainder; \
+	\
+	/* Turbo C++ 4.0J does not back up DS if the function mutates it. */ \
+	/* [blit_state] can't be accessed anymore beyond this point! */ \
+	_asm { push ds; } \
+	_DS = FP_SEG(sprite); \
+	_ES = plane_seg; \
+	\
+	static_assert(UNROLL_H == 8); \
+	switch(_BX) { \
+	case 0: do { func_row(row_p1, row_p2) \
+	case 7:      func_row(row_p1, row_p2) \
+	case 6:      func_row(row_p1, row_p2) \
+	case 5:      func_row(row_p1, row_p2) \
+	case 4:      func_row(row_p1, row_p2) \
+	case 3:      func_row(row_p1, row_p2) \
+	case 2:      func_row(row_p1, row_p2) \
+	case 1:      func_row(row_p1, row_p2) \
+	/*    */} while(--loops_unrolled > 0); \
+	} \
+	\
+	_asm { pop ds; } \
+}
+// ---------
+
+// Initialization
+// --------------
+// All of these set up blitting of a ([w]*8)×[h]-pixel sprite at the given VRAM
+// offset, cutting it at the respectively checked VRAM boundaries and assuming
+// that it does not touch the others. If the sprite would be cut to a width or
+// height of 0, they return a `nullptr` and leave the blitter in an invalid
+// state.
+
+// Checks all 4 edges of VRAM.
+const Blitter __ds* blitter_init_clip_lrtb(
+	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
+);
+
+// Checks only the bottom edge of VRAM; the sprite must not touch the others.
+const Blitter __ds* blitter_init_clip_b(
+	vram_x_t left, vram_y_t top, vram_byte_amount_t w, pixel_t h
+);
+// --------------