From 8b0165738a1fd66f4083b0e9818ff50869fa61f4 Mon Sep 17 00:00:00 2001 From: nmlgc Date: Sun, 8 Nov 2020 21:04:36 +0100 Subject: [PATCH] [Decompilation] [th03] .MRS: Byte-aligned, opaque blitting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Containing not one, but two decompilation innovations, one of which works around a compiler bug using C++ template functions… Completes P0126, funded by [Anonymous] and Blue Bolt. --- Research/Borland C++ decompilation.md | 13 ++ decomp.h | 64 +++++++- th03/formats/mrs.cpp | 105 ++++++++++++ th03/formats/mrs.hpp | 6 + th03_main.asm | 223 ++++++++------------------ 5 files changed, 250 insertions(+), 161 deletions(-) diff --git a/Research/Borland C++ decompilation.md b/Research/Borland C++ decompilation.md index 97ac8018..5e23ed79 100644 --- a/Research/Borland C++ decompilation.md +++ b/Research/Borland C++ decompilation.md @@ -105,6 +105,10 @@ case it's part of an arithmetic expression that was promoted to `int`. ## Assignments +| | | +|-|-| +| `MOV ???, [SI+????]` | Only achievable through pointer arithmetic? | + * When assigning to a array element at a variable or non-0 index, the array element address is typically evaluated before the expression to be assigned. But when assigning @@ -452,3 +456,12 @@ contains one of the following: **Certainty:** Confirmed through reverse-engineering `TCC.EXE`, no way around it. + +### Compiler bugs + +* Dereferencing a `far` pointer constructed from the `_FS` and `_GS` + pseudoregisters emits wrong segment prefix opcodes – 0x46 (`INC SI`) and + 0x4E (`DEC SI`) rather than the correct 0x64 and 0x65, respectively. + + **Workaround**: Not happening when compiling via TASM (`-B` on the command + line, or `#pragma inline`). diff --git a/decomp.h b/decomp.h index 773c3559..74143f79 100644 --- a/decomp.h +++ b/decomp.h @@ -9,6 +9,7 @@ // if(FLAGS_*) { goto some_label; | return; } // these assemble into the single given instruction. Apply the ! 
operator to // get the N versions. +#define FLAGS_CARRY (_FLAGS & 0x01) /* JC / JAE / JB */ #define FLAGS_ZERO (_FLAGS & 0x40) /* JZ */ #define FLAGS_SIGN (_FLAGS & 0x80) /* JS */ // ---------------- @@ -26,5 +27,66 @@ out dx, ax; \ } -// Versions that actually inline with pseudoregisters +// poke() versions that actually inline with pseudoregisters +// --------------------------------------------------------- #define pokew(sgm, off, val) { *(uint16_t far *)(MK_FP(sgm, off)) = val; } + +// Turbo C++ 4.0 generates wrong segment prefix opcodes for the _FS and _GS +// pseudoregisters - 0x46 (INC SI) and 0x4E (DEC SI) rather than the correct +// 0x64 and 0x65, respectively. These prefixes are also not supported in +// inline assembly, which is limited to pre-386 anyway. Compiling via assembly +// (`#pragma inline`) would work and generate the correct instructions here, +// but that would incur yet another dependency on a 16-bit TASM, for something +// honestly quite insignificant. +// +// So, can we somehow work around this issue while retaining the readability +// of the usage code and pretending that this bug doesn't exist? Comparisons +// with segment registers unfortunately don't inline, so something like +// if(sgm == _FS) +// wouldn't work, even inside a macro that replaces [sgm] with _FS. But since +// __emit__() *does* inline, we can use function templates! The default +// versions provide the regularly intended C code for all other registers, +// while explicit specializations for _FS and _GS __emit__() the correct +// instruction opcodes for all offset registers needed. Then, we only need to +// somehow move the pseudoregisters up into the type system... which can +// simply be done by turning them into class names via preprocessor token +// pasting. Sure, this limits this approach to raw registers with no immediate +// offsets, but let's hope we won't ever need those... 
+// +// Also, hey, no need for the MK_FP() macro if we directly return the correct +// types. +#ifdef __cplusplus +} + struct Decomp_FS { void __seg* value() { return (void __seg *)(_FS); } }; + struct Decomp_GS { void __seg* value() { return (void __seg *)(_GS); } }; + struct Decomp_DI { void __near* value() { return (void __near *)(_DI); } }; + + // Removing [val] from the parameter lists of the template functions below + // perfects the inlining. + #define poked(sgm, off, val) \ + _EAX = val; \ + poked_eax((Decomp##sgm *)NULL, (Decomp##off *)NULL); + + template <class Segment, class Offset> inline void poked_eax( + Segment *sgm, Offset *off + ) { + *reinterpret_cast<uint32_t far *>(sgm->value() + off->value()) = _EAX; + } + + inline void poked_eax(Decomp_FS *sgm, Decomp_DI *off) { + __emit__(0x66, 0x64, 0x89, 0x05); // MOV FS:[DI], EAX + } + + inline void poked_eax(Decomp_GS *sgm, Decomp_DI *off) { + __emit__(0x66, 0x65, 0x89, 0x05); // MOV GS:[DI], EAX + } + +extern "C" { +#endif +// --------------------------------------------------------- + +// 32-bit ASM instructions not supported by Turbo C++ 4.0J's built-in +// assembler. Makes no sense to compile with `#pragma inline` (and thus, +// require a 16-bit TASM) just for those. +#define MOVSD __emit__(0x66, 0xA5); +#define REP __emit__(0xF3); diff --git a/th03/formats/mrs.cpp b/th03/formats/mrs.cpp index 4b472ee7..fe749f5b 100644 --- a/th03/formats/mrs.cpp +++ b/th03/formats/mrs.cpp @@ -1,9 +1,15 @@ +#pragma option -3 #pragma codeseg SHARED +extern "C" { +#include <stddef.h> #include "platform.h" #include "pc98.h" #include "planar.h" +#include "decomp.h" #include "th03/formats/hfliplut.h" +} + #include "th03/formats/mrs.hpp" static const vram_byte_amount_t MRS_BYTE_W = (MRS_W / BYTE_DOTS); @@ -21,6 +27,9 @@ struct mrs_t { extern mrs_t far *mrs_images[MRS_SLOT_COUNT]; +// Decompilation workarounds +// ------------------------- + // Points [reg_sgm]:[reg_off] to the alpha plane of the .MRS image in the // given [slot]. 
#define mrs_slot_assign(reg_sgm, reg_off, slot) { \ @@ -29,6 +38,102 @@ extern mrs_t far *mrs_images[MRS_SLOT_COUNT]; __asm { l##reg_sgm reg_off, mrs_images[bx]; } \ } +// Single iteration across [row_dword_w] 32-dot units of a .MRS image, from +// bottom to top. _DI is assumed to point at the bottom left target position, +// while [body] is responsible to increment _DI by [MRS_BYTE_W]. +#define mrs_put_rows(row_dword_w, body) \ + do { \ + _CX = row_dword_w; \ + body \ + _DI -= (ROW_SIZE + MRS_BYTE_W); \ + } while(!FLAGS_CARRY); + +// ZUN optimized mrs_put_noalpha_8() to blit 3 out of the 4 bitplanes within a +// single loop. Annoyingly, he does this by first moving the source pointer to +// the beginning of the G plane within a mrs_t instance, and then accesses the +// earlier planes with *negative* offsets, rather than, y'know, just using +// positive ones like a sane person. +// These offsets are encoded as immediates within the instructions that read +// the dot patterns. Subtracting the raw values wouldn't decompile correctly, +// but thankfully, pointer arithmetic does, and is also a lot cleaner... +// conceptually, at least. It also inlines perfectly, allowing us to give some +// meaningful names to these horrifying expressions. +struct mrs_at_G_t : public mrs_plane_t { + dots32_t dots_from_alpha(void) const { return *(*((this - 3)->dots)); } + dots32_t dots_from_B(void) const { return *(*((this - 2)->dots)); } + dots32_t dots_from_R(void) const { return *(*((this - 1)->dots)); } +}; + +static inline mrs_at_G_t near* mrs_at_G(void) { + return reinterpret_cast<mrs_at_G_t near *>(offsetof(mrs_t, planes.G)); +} +// ------------------------- + +inline uint16_t to_bottom_left_8(const screen_x_t &left) { + return ((left >> 3) + ((MRS_H - 1) * ROW_SIZE)); +} + +inline seg_t to_segment(const uscreen_y_t &top) { + _AX = (top / 2); // screen_y_t -> vram_y_t... + _DX = _AX; + return ((_AX << 2) + _DX); // ... 
and -> segment +} + +void pascal mrs_put_noalpha_8( + screen_x_t left, uscreen_y_t top, int slot, bool altered_colors +) +{ + #define _SI reinterpret_cast<mrs_at_G_t near *>(_SI) + #define at_bottom_left _DX // *Not* rooted at (0, 0)! + + __asm { push ds; } + _DI = to_bottom_left_8(left); + _AX = to_segment(top); + mrs_slot_assign(ds, si, slot); + _SI = mrs_at_G(); + + // "I've spent good money on that Intel 386 CPU, so let's actually use + // *all* its segment registers!" :zunpet: :zunpet: :zunpet: + _FS = (_AX += SEG_PLANE_B); // = B + _GS = (_AX += SEG_PLANE_DIST_BRG); // = R + _ES = (_AX += SEG_PLANE_DIST_BRG); // = G + // At this point though, we're out of segment registers. That's why this + // approach of not changing destination segments within a blitting loop + // only works for 3 out of the 4 bitplanes, and why we need a second loop + // for the final one after all. + _BX = (_AX += SEG_PLANE_DIST_E); // = E + at_bottom_left = _DI; + if(altered_colors) { + mrs_put_rows(MRS_DWORD_W, { put_altered: + poked(_FS, _DI, (~_SI->dots_from_alpha() | _SI->dots_from_B())); + poked(_GS, _DI, _SI->dots_from_R()); + MOVSD; + __asm { loop put_altered; } + }); + // SI is now at the beginning of the E plane. Blit it in its own loop + _DI = at_bottom_left; + _ES = _BX; + mrs_put_rows(MRS_DWORD_W, REP MOVSD); + } else { + mrs_put_rows(MRS_DWORD_W, { put_regular: + poked(_FS, _DI, _SI->dots_from_B()); + poked(_GS, _DI, _SI->dots_from_R()); + MOVSD; + _asm { loop put_regular; } + }); + // SI is now at the beginning of the E plane. 
Blit it in its own loop + _DI = at_bottom_left; + _ES = _BX; + mrs_put_rows(MRS_DWORD_W, REP MOVSD); + } + __asm { pop ds; } + + #undef at_bottom_left + #undef _SI +} + +#pragma codestring "\x90" + void pascal mrs_hflip(int slot) { _CX = sizeof(mrs_t); diff --git a/th03/formats/mrs.hpp b/th03/formats/mrs.hpp index ad9e19d4..3a8fe0db 100644 --- a/th03/formats/mrs.hpp +++ b/th03/formats/mrs.hpp @@ -7,6 +7,12 @@ static const int MRS_SLOT_COUNT = 8; static const pixel_t MRS_W = 288; static const pixel_t MRS_H = 184; +// Displays the .MRS image in the given [slot] at (⌊left/8⌋*8, top), +// disregarding its alpha plane, and optionally altering its colors slightly. +void pascal mrs_put_noalpha_8( + screen_x_t left, uscreen_y_t top, int slot, bool altered_colors +); + // Persistently flips the image in [slot] horizontally, using the [hflip_lut]. void pascal mrs_hflip(int slot); /// --------------------------------------------------------------------------- diff --git a/th03_main.asm b/th03_main.asm index c2670e35..2552e23e 100644 --- a/th03_main.asm +++ b/th03_main.asm @@ -9052,104 +9052,7 @@ sub_EF46 endp ; --------------------------------------------------------------------------- nop - -; =============== S U B R O U T I N E ======================================= - -; Attributes: bp-based frame - -sub_EFF4 proc far - -arg_0 = byte ptr 6 -arg_2 = word ptr 8 -arg_4 = word ptr 0Ah -arg_6 = word ptr 0Ch - - push bp - mov bp, sp - push si - push di - push ds - mov ax, [bp+arg_6] - sar ax, 3 - add ax, 3930h - mov di, ax - mov ax, [bp+arg_4] - shr ax, 1 - mov dx, ax - shl ax, 2 - add ax, dx - mov bx, [bp+arg_2] - shl bx, 2 - lds si, _mrs_images[bx] - mov si, 4DA0h - add ax, 0A800h - mov fs, ax - add ax, 800h - mov gs, ax - add ax, 800h - mov es, ax - add ax, 2800h - mov bx, ax - mov dx, di - cmp [bp+arg_0], 0 - jz short loc_F071 - -loc_F03A: - mov cx, 9 - -loc_F03D: - mov eax, [si-4DA0h] - not eax - or eax, [si-33C0h] - mov fs:[di], eax - mov eax, [si-19E0h] - mov gs:[di], eax 
- movsd - loop loc_F03D - sub di, 74h ; 't' - jnb short loc_F03A - mov di, dx - mov es, bx - -loc_F064: - mov cx, 9 - rep movsd - sub di, 74h ; 't' - jnb short loc_F064 - jmp short loc_F09E -; --------------------------------------------------------------------------- - -loc_F071: - mov cx, 9 - -loc_F074: - mov eax, [si-33C0h] - mov fs:[di], eax - mov eax, [si-19E0h] - mov gs:[di], eax - movsd - loop loc_F074 - sub di, 74h ; 't' - jnb short loc_F071 - mov di, dx - mov es, bx - -loc_F093: - mov cx, 9 - rep movsd - sub di, 74h ; 't' - jnb short loc_F093 - -loc_F09E: - pop ds - pop di - pop si - pop bp - retf 8 -sub_EFF4 endp - -; --------------------------------------------------------------------------- - nop + extern @MRS_PUT_NOALPHA_8$QIUIIC:proc extern @MRS_HFLIP$QI:proc SPRITE16_SPRITES_COMMIT procdesc pascal far SPRITE16_PUT procdesc pascal far \ @@ -19177,19 +19080,19 @@ loc_1494A: loc_1495C: cmp _pid_current, 0 jz short loc_14967 - add si, 140h + add si, PLAYFIELD_W_BORDERED loc_14967: - push si - push 10h + push si ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic mov al, [bp+@@frame] mov ah, 0 mov bx, 8 @@ -20313,18 +20216,18 @@ loc_152D7: mov _palette_changed, 1 mov al, _pid_current mov ah, 0 - imul ax, 140h - add ax, 10h - push ax - push 10h + imul ax, PLAYFIELD_W_BORDERED + add ax, PLAYFIELD_LEFT + push ax ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic jmp short loc_15323 ; --------------------------------------------------------------------------- @@ -21393,18 +21296,18 @@ loc_15C23: loc_15C32: mov al, _pid_current mov ah, 0 - imul ax, 140h - add ax, 10h - push ax - push 10h + imul ax, PLAYFIELD_W_BORDERED + add ax, 
PLAYFIELD_LEFT + push ax ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic jmp short loc_15CB0 ; --------------------------------------------------------------------------- @@ -24121,23 +24024,23 @@ loc_172F0: loc_172FF: mov al, _pid_current mov ah, 0 - imul ax, 140h + imul ax, PLAYFIELD_W_BORDERED mov dl, _pid_current mov dh, 0 add dx, dx mov bx, dx add ax, _playfield_fg_shift_x[bx] - add ax, 10h - push ax - push 10h + add ax, PLAYFIELD_LEFT + push ax ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic jmp short loc_1737D ; --------------------------------------------------------------------------- @@ -26392,22 +26295,22 @@ loc_184A5: call sub_CE5B loc_18518: - mov si, 10h + mov si, PLAYFIELD_LEFT cmp _pid_current, 0 jz short loc_18526 - add si, 140h + add si, PLAYFIELD_W_BORDERED loc_18526: - push si - push 10h + push si ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic jmp short loc_185A3 ; --------------------------------------------------------------------------- @@ -26781,22 +26684,22 @@ loc_18895: mov _playfield_fg_shift_x[bx], -4 loc_188A4: - mov si, 10h + mov si, PLAYFIELD_LEFT cmp _pid_current, 0 jz short loc_188B2 - add si, 140h + add si, PLAYFIELD_W_BORDERED loc_188B2: - push si - push 10h + push si ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic mov al, [bp+@@frame] mov ah, 0 mov bx, 4 @@ 
-27027,22 +26930,22 @@ loc_18A68: mov angle_1FBD4, al loc_18B2F: - mov si, 10h + mov si, PLAYFIELD_LEFT cmp _pid_current, 0 jz short loc_18B3D - add si, 140h + add si, PLAYFIELD_W_BORDERED loc_18B3D: - push si - push 10h + push si ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic call egc_on jmp short @@ret ; --------------------------------------------------------------------------- @@ -27247,22 +27150,22 @@ loc_18D28: sub word_1FE56, 2E0h loc_18D34: - mov si, 10h + mov si, PLAYFIELD_LEFT cmp _pid_current, 0 jz short loc_18D42 - add si, 140h + add si, PLAYFIELD_W_BORDERED loc_18D42: - push si - push 10h + push si ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic jmp short loc_18DBF ; --------------------------------------------------------------------------- @@ -27384,22 +27287,22 @@ loc_18E5F: mov _playfield_fg_shift_x[bx], -4 loc_18E6E: - mov si, 10h + mov si, PLAYFIELD_LEFT cmp _pid_current, 0 jz short loc_18E7C - add si, 140h + add si, PLAYFIELD_W_BORDERED loc_18E7C: - push si - push 10h + push si ; left + push PLAYFIELD_TOP ; top mov al, _pid_current mov ah, 0 add ax, 2 - push ax + push ax ; slot mov al, _pid_current mov ah, 0 - push ax - call sub_EFF4 + push ax ; altered_colors + call @mrs_put_noalpha_8$qiuiic call grcg_setcolor pascal, (GC_RMW shl 16) + 15 mov ax, 900h sub ax, word_220EC