From 8b0165738a1fd66f4083b0e9818ff50869fa61f4 Mon Sep 17 00:00:00 2001
From: nmlgc <nmlgc@nmlgc.net>
Date: Sun, 8 Nov 2020 21:04:36 +0100
Subject: [PATCH] [Decompilation] [th03] .MRS: Byte-aligned, opaque blitting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Containing not one, but two decompilation innovations, one of which
works around a compiler bug using C++ template functions…

Completes P0126, funded by [Anonymous] and Blue Bolt.
---
 Research/Borland C++ decompilation.md |  13 ++
 decomp.h                              |  64 +++++++-
 th03/formats/mrs.cpp                  | 105 ++++++++++++
 th03/formats/mrs.hpp                  |   6 +
 th03_main.asm                         | 223 ++++++++------------------
 5 files changed, 250 insertions(+), 161 deletions(-)

diff --git a/Research/Borland C++ decompilation.md b/Research/Borland C++ decompilation.md
index 97ac8018..5e23ed79 100644
--- a/Research/Borland C++ decompilation.md	
+++ b/Research/Borland C++ decompilation.md	
@@ -105,6 +105,10 @@ case it's part of an arithmetic expression that was promoted to `int`.
 
 ## Assignments
 
+| | |
+|-|-|
+| `MOV ???, [SI+????]` | Only achievable through pointer arithmetic? |
+
 * When assigning to a array element at a variable or non-0 index, the array
   element address is typically evaluated before the expression to be assigned.
   But when assigning
@@ -452,3 +456,12 @@ contains one of the following:
 
   **Certainty:** Confirmed through reverse-engineering `TCC.EXE`, no way
   around it.
+
+### Compiler bugs
+
+* Dereferencing a `far` pointer constructed from the `_FS` and `_GS`
+  pseudoregisters emits wrong segment prefix opcodes – 0x46 (`INC SI`) and
+  0x4E (`DEC SI`) rather than the correct 0x64 and 0x65, respectively.
+
+  **Workaround:** Does not occur when compiling via TASM (`-B` on the command
+  line, or `#pragma inline`).
diff --git a/decomp.h b/decomp.h
index 773c3559..74143f79 100644
--- a/decomp.h
+++ b/decomp.h
@@ -9,6 +9,7 @@
 //	if(FLAGS_*) { goto some_label; | return; }
 // these assemble into the single given instruction. Apply the ! operator to
 // get the N versions.
+#define FLAGS_CARRY (_FLAGS & 0x01) /* JC / JB / JNAE */
 #define FLAGS_ZERO (_FLAGS & 0x40) /* JZ */
 #define FLAGS_SIGN (_FLAGS & 0x80) /* JS */
 // ----------------
@@ -26,5 +27,66 @@
 	out dx, ax; \
 }
 
-// Versions that actually inline with pseudoregisters
+// poke() versions that actually inline with pseudoregisters
+// ---------------------------------------------------------
 #define pokew(sgm, off, val) { *(uint16_t far *)(MK_FP(sgm, off)) = val; }
+
+// Turbo C++ 4.0 generates wrong segment prefix opcodes for the _FS and _GS
+// pseudoregisters - 0x46 (INC SI) and 0x4E (DEC SI) rather than the correct
+// 0x64 and 0x65, respectively. These prefixes are also not supported in
+// inline assembly, which is limited to pre-386 anyway. Compiling via assembly
+// (`#pragma inline`) would work and generate the correct instructions here,
+// but that would incur yet another dependency on a 16-bit TASM, for something
+// honestly quite insignificant.
+//
+// So, can we somehow work around this issue while retaining the readability
+// of the usage code and pretending that this bug doesn't exist? Comparisons
+// with segment registers unfortunately don't inline, so something like
+// 	if(sgm == _FS)
+// wouldn't work, even inside a macro that replaces [sgm] with _FS. But since
+// __emit__() *does* inline, we can use function templates! The default
+// versions provide the regularly intended C code for all other registers,
+// while explicit specializations for _FS and _GS __emit__() the correct
+// instruction opcodes for all offset registers needed. Then, we only need to
+// somehow move the pseudoregisters up into the type system... which can
+// simply be done by turning them into class names via preprocessor token
+// pasting. Sure, this limits this approach to raw registers with no immediate
+// offsets, but let's hope we won't ever need those...
+//
+// Also, hey, no need for the MK_FP() macro if we directly return the correct
+// types.
+#ifdef __cplusplus
+}
+	struct Decomp_FS { void  __seg* value() { return (void  __seg *)(_FS); } };
+	struct Decomp_GS { void  __seg* value() { return (void  __seg *)(_GS); } };
+	struct Decomp_DI { void __near* value() { return (void __near *)(_DI); } };
+
+	// Writes the 32-bit [val] to [sgm]:[off]. Passing [val] via _EAX rather
+	// than as a parameter of the functions below perfects the inlining.
+	#define poked(sgm, off, val) { \
+		_EAX = (val); \
+		poked_eax((Decomp##sgm *)NULL, (Decomp##off *)NULL); }
+
+	template <class Segment, class Offset> inline void poked_eax(
+		Segment *sgm, Offset *off
+	) {
+		*reinterpret_cast<uint32_t far *>(sgm->value() + off->value()) = _EAX;
+	}
+
+	inline void poked_eax(Decomp_FS *sgm, Decomp_DI *off) {
+		__emit__(0x66, 0x64, 0x89, 0x05); // MOV FS:[DI], EAX
+	}
+
+	inline void poked_eax(Decomp_GS *sgm, Decomp_DI *off) {
+		__emit__(0x66, 0x65, 0x89, 0x05); // MOV GS:[DI], EAX
+	}
+
+extern "C" {
+#endif
+// ---------------------------------------------------------
+
+// 32-bit ASM instructions not supported by Turbo C++ 4.0J's built-in
+// assembler. Makes no sense to compile with `#pragma inline` (and thus,
+// require a 16-bit TASM) just for those.
+#define MOVSD	__emit__(0x66, 0xA5);
+#define REP  	__emit__(0xF3);
diff --git a/th03/formats/mrs.cpp b/th03/formats/mrs.cpp
index 4b472ee7..fe749f5b 100644
--- a/th03/formats/mrs.cpp
+++ b/th03/formats/mrs.cpp
@@ -1,9 +1,15 @@
+#pragma option -3
 #pragma codeseg SHARED
 
+extern "C" {
+#include <stddef.h>
 #include "platform.h"
 #include "pc98.h"
 #include "planar.h"
+#include "decomp.h"
 #include "th03/formats/hfliplut.h"
+}
+
 #include "th03/formats/mrs.hpp"
 
 static const vram_byte_amount_t MRS_BYTE_W = (MRS_W / BYTE_DOTS);
@@ -21,6 +27,9 @@ struct mrs_t {
 
 extern mrs_t far *mrs_images[MRS_SLOT_COUNT];
 
+// Decompilation workarounds
+// -------------------------
+
 // Points [reg_sgm]:[reg_off] to the alpha plane of the .MRS image in the
 // given [slot].
 #define mrs_slot_assign(reg_sgm, reg_off, slot) { \
@@ -29,6 +38,102 @@ extern mrs_t far *mrs_images[MRS_SLOT_COUNT];
 	__asm { l##reg_sgm reg_off, mrs_images[bx]; } \
 }
 
+// Runs [body] for every row of a .MRS image, from bottom to top, with _CX
+// set to [row_dword_w] 32-dot units. _DI must start at the bottom left target
+// position, and [body] must advance it by [MRS_BYTE_W]. Ends on _DI underflow.
+#define mrs_put_rows(row_dword_w, body) \
+	do { \
+		_CX = row_dword_w; \
+		body \
+		_DI -= (ROW_SIZE + MRS_BYTE_W); \
+	} while(!FLAGS_CARRY);
+
+// ZUN optimized mrs_put_noalpha_8() to blit 3 out of the 4 bitplanes within a
+// single loop. Annoyingly, he does this by first moving the source pointer to
+// the beginning of the G plane within a mrs_t instance, and then accesses the
+// earlier planes with *negative* offsets, rather than, y'know, just using
+// positive ones like a sane person.
+// These offsets are encoded as immediates within the instructions that read
+// the dot patterns. Subtracting the raw values wouldn't decompile correctly,
+// but thankfully, pointer arithmetic does, and is also a lot cleaner...
+// conceptually, at least. It also inlines perfectly, allowing us to give some
+// meaningful names to these horrifying expressions.
+struct mrs_at_G_t : public mrs_plane_t {
+	dots32_t dots_from_alpha(void) const { return *(*((this - 3)->dots)); }
+	dots32_t dots_from_B(void) const     { return *(*((this - 2)->dots)); }
+	dots32_t dots_from_R(void) const     { return *(*((this - 1)->dots)); }
+};
+
+static inline mrs_at_G_t near* mrs_at_G(void) {
+	return reinterpret_cast<mrs_at_G_t near *>(offsetof(mrs_t, planes.G));
+}
+// -------------------------
+
+inline uint16_t to_bottom_left_8(const screen_x_t &left) {
+	return ((left >> 3) + ((MRS_H - 1) * ROW_SIZE));
+}
+
+inline seg_t to_segment(const uscreen_y_t &top) {
+	_AX = (top / 2); // screen_y_t -> vram_y_t...
+	_DX = _AX;
+	return ((_AX << 2) + _DX); // ... and -> segment
+}
+
+void pascal mrs_put_noalpha_8(
+	screen_x_t left, uscreen_y_t top, int slot, bool altered_colors
+)
+{
+	#define _SI	reinterpret_cast<mrs_at_G_t near *>(_SI)
+	#define at_bottom_left	_DX // *Not* rooted at (0, 0)!
+
+	__asm { push ds; }
+	_DI = to_bottom_left_8(left);
+	_AX = to_segment(top);
+	mrs_slot_assign(ds, si, slot);
+	_SI = mrs_at_G();
+
+	// "I've spent good money on that Intel 386 CPU, so let's actually use
+	// *all* its segment registers!" :zunpet: :zunpet: :zunpet:
+	_FS = (_AX += SEG_PLANE_B);       	// = B
+	_GS = (_AX += SEG_PLANE_DIST_BRG);	// = R
+	_ES = (_AX += SEG_PLANE_DIST_BRG);	// = G
+	// At this point though, we're out of segment registers. That's why this
+	// approach of not changing destination segments within a blitting loop
+	// only works for 3 out of the 4 bitplanes, and why we need a second loop
+	// for the final one after all.
+	_BX = (_AX += SEG_PLANE_DIST_E);  	// = E
+	at_bottom_left = _DI;
+	if(altered_colors) {
+		mrs_put_rows(MRS_DWORD_W, { put_altered:
+			poked(_FS, _DI, (~_SI->dots_from_alpha() | _SI->dots_from_B()));
+			poked(_GS, _DI, _SI->dots_from_R());
+			MOVSD;
+			__asm { loop put_altered; }
+		});
+		// SI is now at the beginning of the E plane. Blit it in its own loop
+		_DI = at_bottom_left;
+		_ES = _BX;
+		mrs_put_rows(MRS_DWORD_W, REP MOVSD);
+	} else {
+		mrs_put_rows(MRS_DWORD_W, { put_regular:
+			poked(_FS, _DI, _SI->dots_from_B());
+			poked(_GS, _DI, _SI->dots_from_R());
+			MOVSD;
+			__asm { loop put_regular; }
+		});
+		// SI is now at the beginning of the E plane. Blit it in its own loop
+		_DI = at_bottom_left;
+		_ES = _BX;
+		mrs_put_rows(MRS_DWORD_W, REP MOVSD);
+	}
+	__asm { pop	ds; }
+
+	#undef at_bottom_left
+	#undef _SI
+}
+
+#pragma codestring "\x90"
+
 void pascal mrs_hflip(int slot)
 {
 	_CX = sizeof(mrs_t);
diff --git a/th03/formats/mrs.hpp b/th03/formats/mrs.hpp
index ad9e19d4..3a8fe0db 100644
--- a/th03/formats/mrs.hpp
+++ b/th03/formats/mrs.hpp
@@ -7,6 +7,12 @@ static const int MRS_SLOT_COUNT = 8;
 static const pixel_t MRS_W = 288;
 static const pixel_t MRS_H = 184;
 
+// Displays the .MRS image in the given [slot] at (⌊left/8⌋*8, top),
+// disregarding its alpha plane, and optionally altering its colors slightly.
+void pascal mrs_put_noalpha_8(
+	screen_x_t left, uscreen_y_t top, int slot, bool altered_colors
+);
+
 // Persistently flips the image in [slot] horizontally, using the [hflip_lut].
 void pascal mrs_hflip(int slot);
 /// ---------------------------------------------------------------------------
diff --git a/th03_main.asm b/th03_main.asm
index c2670e35..2552e23e 100644
--- a/th03_main.asm
+++ b/th03_main.asm
@@ -9052,104 +9052,7 @@ sub_EF46	endp
 
 ; ---------------------------------------------------------------------------
 		nop
-
-; =============== S U B	R O U T	I N E =======================================
-
-; Attributes: bp-based frame
-
-sub_EFF4	proc far
-
-arg_0		= byte ptr  6
-arg_2		= word ptr  8
-arg_4		= word ptr  0Ah
-arg_6		= word ptr  0Ch
-
-		push	bp
-		mov	bp, sp
-		push	si
-		push	di
-		push	ds
-		mov	ax, [bp+arg_6]
-		sar	ax, 3
-		add	ax, 3930h
-		mov	di, ax
-		mov	ax, [bp+arg_4]
-		shr	ax, 1
-		mov	dx, ax
-		shl	ax, 2
-		add	ax, dx
-		mov	bx, [bp+arg_2]
-		shl	bx, 2
-		lds	si, _mrs_images[bx]
-		mov	si, 4DA0h
-		add	ax, 0A800h
-		mov	fs, ax
-		add	ax, 800h
-		mov	gs, ax
-		add	ax, 800h
-		mov	es, ax
-		add	ax, 2800h
-		mov	bx, ax
-		mov	dx, di
-		cmp	[bp+arg_0], 0
-		jz	short loc_F071
-
-loc_F03A:
-		mov	cx, 9
-
-loc_F03D:
-		mov	eax, [si-4DA0h]
-		not	eax
-		or	eax, [si-33C0h]
-		mov	fs:[di], eax
-		mov	eax, [si-19E0h]
-		mov	gs:[di], eax
-		movsd
-		loop	loc_F03D
-		sub	di, 74h	; 't'
-		jnb	short loc_F03A
-		mov	di, dx
-		mov	es, bx
-
-loc_F064:
-		mov	cx, 9
-		rep movsd
-		sub	di, 74h	; 't'
-		jnb	short loc_F064
-		jmp	short loc_F09E
-; ---------------------------------------------------------------------------
-
-loc_F071:
-		mov	cx, 9
-
-loc_F074:
-		mov	eax, [si-33C0h]
-		mov	fs:[di], eax
-		mov	eax, [si-19E0h]
-		mov	gs:[di], eax
-		movsd
-		loop	loc_F074
-		sub	di, 74h	; 't'
-		jnb	short loc_F071
-		mov	di, dx
-		mov	es, bx
-
-loc_F093:
-		mov	cx, 9
-		rep movsd
-		sub	di, 74h	; 't'
-		jnb	short loc_F093
-
-loc_F09E:
-		pop	ds
-		pop	di
-		pop	si
-		pop	bp
-		retf	8
-sub_EFF4	endp
-
-; ---------------------------------------------------------------------------
-		nop
+	extern @MRS_PUT_NOALPHA_8$QIUIIC:proc
 	extern @MRS_HFLIP$QI:proc
 	SPRITE16_SPRITES_COMMIT procdesc pascal far
 	SPRITE16_PUT procdesc pascal far \
@@ -19177,19 +19080,19 @@ loc_1494A:
 loc_1495C:
 		cmp	_pid_current, 0
 		jz	short loc_14967
-		add	si, 140h
+		add	si, PLAYFIELD_W_BORDERED
 
 loc_14967:
-		push	si
-		push	10h
+		push	si	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		mov	al, [bp+@@frame]
 		mov	ah, 0
 		mov	bx, 8
@@ -20313,18 +20216,18 @@ loc_152D7:
 		mov	_palette_changed, 1
 		mov	al, _pid_current
 		mov	ah, 0
-		imul	ax, 140h
-		add	ax, 10h
-		push	ax
-		push	10h
+		imul	ax, PLAYFIELD_W_BORDERED
+		add	ax, PLAYFIELD_LEFT
+		push	ax	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		jmp	short loc_15323
 ; ---------------------------------------------------------------------------
 
@@ -21393,18 +21296,18 @@ loc_15C23:
 loc_15C32:
 		mov	al, _pid_current
 		mov	ah, 0
-		imul	ax, 140h
-		add	ax, 10h
-		push	ax
-		push	10h
+		imul	ax, PLAYFIELD_W_BORDERED
+		add	ax, PLAYFIELD_LEFT
+		push	ax	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		jmp	short loc_15CB0
 ; ---------------------------------------------------------------------------
 
@@ -24121,23 +24024,23 @@ loc_172F0:
 loc_172FF:
 		mov	al, _pid_current
 		mov	ah, 0
-		imul	ax, 140h
+		imul	ax, PLAYFIELD_W_BORDERED
 		mov	dl, _pid_current
 		mov	dh, 0
 		add	dx, dx
 		mov	bx, dx
 		add	ax, _playfield_fg_shift_x[bx]
-		add	ax, 10h
-		push	ax
-		push	10h
+		add	ax, PLAYFIELD_LEFT
+		push	ax	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		jmp	short loc_1737D
 ; ---------------------------------------------------------------------------
 
@@ -26392,22 +26295,22 @@ loc_184A5:
 		call	sub_CE5B
 
 loc_18518:
-		mov	si, 10h
+		mov	si, PLAYFIELD_LEFT
 		cmp	_pid_current, 0
 		jz	short loc_18526
-		add	si, 140h
+		add	si, PLAYFIELD_W_BORDERED
 
 loc_18526:
-		push	si
-		push	10h
+		push	si	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		jmp	short loc_185A3
 ; ---------------------------------------------------------------------------
 
@@ -26781,22 +26684,22 @@ loc_18895:
 		mov	_playfield_fg_shift_x[bx], -4
 
 loc_188A4:
-		mov	si, 10h
+		mov	si, PLAYFIELD_LEFT
 		cmp	_pid_current, 0
 		jz	short loc_188B2
-		add	si, 140h
+		add	si, PLAYFIELD_W_BORDERED
 
 loc_188B2:
-		push	si
-		push	10h
+		push	si	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		mov	al, [bp+@@frame]
 		mov	ah, 0
 		mov	bx, 4
@@ -27027,22 +26930,22 @@ loc_18A68:
 		mov	angle_1FBD4, al
 
 loc_18B2F:
-		mov	si, 10h
+		mov	si, PLAYFIELD_LEFT
 		cmp	_pid_current, 0
 		jz	short loc_18B3D
-		add	si, 140h
+		add	si, PLAYFIELD_W_BORDERED
 
 loc_18B3D:
-		push	si
-		push	10h
+		push	si	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		call	egc_on
 		jmp	short @@ret
 ; ---------------------------------------------------------------------------
@@ -27247,22 +27150,22 @@ loc_18D28:
 		sub	word_1FE56, 2E0h
 
 loc_18D34:
-		mov	si, 10h
+		mov	si, PLAYFIELD_LEFT
 		cmp	_pid_current, 0
 		jz	short loc_18D42
-		add	si, 140h
+		add	si, PLAYFIELD_W_BORDERED
 
 loc_18D42:
-		push	si
-		push	10h
+		push	si	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		jmp	short loc_18DBF
 ; ---------------------------------------------------------------------------
 
@@ -27384,22 +27287,22 @@ loc_18E5F:
 		mov	_playfield_fg_shift_x[bx], -4
 
 loc_18E6E:
-		mov	si, 10h
+		mov	si, PLAYFIELD_LEFT
 		cmp	_pid_current, 0
 		jz	short loc_18E7C
-		add	si, 140h
+		add	si, PLAYFIELD_W_BORDERED
 
 loc_18E7C:
-		push	si
-		push	10h
+		push	si	; left
+		push	PLAYFIELD_TOP	; top
 		mov	al, _pid_current
 		mov	ah, 0
 		add	ax, 2
-		push	ax
+		push	ax	; slot
 		mov	al, _pid_current
 		mov	ah, 0
-		push	ax
-		call	sub_EFF4
+		push	ax	; altered_colors
+		call	@mrs_put_noalpha_8$qiuiic
 		call	grcg_setcolor pascal, (GC_RMW shl 16) + 15
 		mov	ax, 900h
 		sub	ax, word_220EC