ReC98/th01/end/pic.cpp

#pragma option -O-

#include "th01/end/pic.hpp"

// Calls graph_accesspage_func(1), then hands [fn] to grp_put_palette_show().
// Judging by the callee names, this presumably makes VRAM page 1 the accessed
// page so that the image in [fn] is placed there while its palette is applied
// (TODO: confirm against the two callees, which are not defined in this file).
void pascal end_pics_load_palette_show(const char *fn)
{
	graph_accesspage_func(1);
	grp_put_palette_show(fn);
}

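// Avoid symbol duplication: from here on, including inside the #include
// below, every `egc_start_copy` token is renamed to `egc_start_copy_1`.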
#define egc_start_copy egc_start_copy_1
#include "th01/hardware/egcstart.cpp"

void end_pic_show(int quarter)
{
	egc_start_copy();

	pixel_t src_left = ((quarter % 2) * PIC_W);
	pixel_t src_top  = ((quarter / 2) * PIC_H);
	uvram_offset_t vram_offset_src = vram_offset_shift(src_left, src_top);
	uvram_offset_t vram_offset_dst = vram_offset_shift(PIC_LEFT, PIC_TOP);
	vram_word_amount_t vram_x;
	pixel_t y;

	// ZUN quirk: This EGC-"accelerated" copy operation ends up performing a
	// total of ((320 / 16) × 200 × 2) = 8000 VRAM page switches, which are
	// everything but instant. Even the optimal assembly instructions for a
	// *single* page switch, `MOV AL, (0|1)` followed by `OUT 0xA6, AL`, take
	// 12 cycles on a 386 and 17 cycles on a 486, and ZUN adds the bloat of a
	// standard function call on top of even that.
	// Optimizations aside, using the EGC can't give you a better algorithm,
	// as its tile registers are limited to 16 dots. Expanding to at least 32
	// dots would have really been nice for ≥386 CPUs...
	for(y = 0; y < PIC_H; y++) {
		for(vram_x = 0; vram_x < (PIC_VRAM_W / EGC_REGISTER_SIZE); vram_x++) {
			egc_temp_t d;

			graph_accesspage_func(1);	d = egc_chunk(vram_offset_src);
			graph_accesspage_func(0);	egc_chunk(vram_offset_dst) = d;

			vram_offset_src += EGC_REGISTER_SIZE;
			vram_offset_dst += EGC_REGISTER_SIZE;
		}
		vram_offset_src += (ROW_SIZE - PIC_VRAM_W);
		vram_offset_dst += (ROW_SIZE - PIC_VRAM_W);
	}
	egc_off();
}

#pragma option -O.