#pragma option -O- #include "th01/end/pic.hpp" void pascal end_pics_load_palette_show(const char *fn) { graph_accesspage_func(1); grp_put_palette_show(fn); } // Avoid symbol duplication... #define egc_start_copy egc_start_copy_1 #include "th01/hardware/egcstart.cpp" void end_pic_show(int quarter) { egc_start_copy(); pixel_t src_left = ((quarter % 2) * PIC_W); pixel_t src_top = ((quarter / 2) * PIC_H); uvram_offset_t vram_offset_src = vram_offset_shift(src_left, src_top); uvram_offset_t vram_offset_dst = vram_offset_shift(PIC_LEFT, PIC_TOP); vram_word_amount_t vram_x; pixel_t y; // ZUN quirk: This EGC-"accelerated" copy operation ends up performing a // total of ((320 / 16) × 200 × 2) = 8000 VRAM page switches, which are // everything but instant. Even the optimal assembly instructions for a // *single* page switch, `MOV AL, (0|1)` followed by `OUT 0xA6, AL`, take // 12 cycles on a 386 and 17 cycles on a 486, and ZUN adds the bloat of a // standard function call on top of even that. // Optimizations aside, using the EGC can't give you a better algorithm, // as its tile registers are limited to 16 dots. Expanding to at least 32 // dots would have really been nice for ≥386 CPUs... for(y = 0; y < PIC_H; y++) { for(vram_x = 0; vram_x < (PIC_VRAM_W / EGC_REGISTER_SIZE); vram_x++) { egc_temp_t d; graph_accesspage_func(1); d = egc_chunk(vram_offset_src); graph_accesspage_func(0); egc_chunk(vram_offset_dst) = d; vram_offset_src += EGC_REGISTER_SIZE; vram_offset_dst += EGC_REGISTER_SIZE; } vram_offset_src += (ROW_SIZE - PIC_VRAM_W); vram_offset_dst += (ROW_SIZE - PIC_VRAM_W); } egc_off(); } #pragma option -O.