Duke07 - another MS-DOS port of Duke3D

Reply 40 of 44, by Darkcrafter07

Posted on 2025-10-08, 15:21

Darkcrafter07 Offline

Rank Newbie

Rank: Newbie
Posts: 27
Joined: 2024-06-06, 09:38

analog_programmer wrote on 2025-08-12, 07:28:

Thanks for the clarification! If I understand correctly, when "ydetail" variable in "ENG386.C" is set to 3, the produced executable will be for even lower details.

I have a suggestion. For easier compiling and linking of different executable versions you can separate make-files and altered .C and .H files in separate folders like "D07_SRC", "D07_SRC_LQ2" and "D07_SRC_LQ3".

Not yet sir, let it be like that.

What I've been trying to do for more than 4 months is get a non-FPU version of the slope drawing to work. There's a version that finally doesn't crash and does perspective correction; it is faster on a non-FPU SX processor, yet it still looks awful and comes with a lot of geometric distortions.

a part from A.asm:

1BITSOFPRECISNLQ equ 3                ; log2 of BITSOFPRECISNLQPOW
2BITSOFPRECISNLQPOW equ 8             ; 1 shl 3: cap on pixels drawn per inner-loop run in slopevlin2_
3
4ALIGN 16
5PUBLIC setupslopevlin2_
6setupslopevlin2_:                    ; Patches operand fields of the slopmach*b instructions in slopevlin2_ (self-modifying code) and copies _asm1 to _asm2. Reads al, ah, ebx, ecx; overwrites eax, ecx, edx.
7	mov dword ptr [slopmach3b+3], ebx    ;ptr: becomes the 88888888h base in the byte load at slopmach3b
8	mov dword ptr [slopmach5b+2], ecx    ;pinc: becomes the constant added to ebp at slopmach5b
9	neg ecx
10	mov dword ptr [slopmach6b+2], ecx    ;-pinc: becomes the displacement of the lea at slopmach6b
11
12	mov edx, 1
13	mov cl, al
14	shl edx, cl
15	dec edx                              ; edx = (1 shl al) - 1
16	mov cl, ah
17	shl edx, cl                          ; edx = ((1 shl al) - 1) shl ah
18	mov dword ptr [slopmach7b+2], edx    ; becomes the AND mask at slopmach7b
19
20	neg ah
21	mov byte ptr [slopmach2b+2], ah      ; shift count at slopmach2b = -ah (byte arithmetic)
22
23	sub ah, al
24	mov byte ptr [slopmach1b+2], ah      ; shift count at slopmach1b = -ah - al (byte arithmetic)
25
26	; FPU removal: Convert floating-point init to integer scaling
27	; Original: fild _asm1 + fstp _asm2
28	mov eax, [_asm1]
29	;shl eax, 16                  ; not needed now
30	mov [_asm2], eax                     ; _asm2 = _asm1, copied as a raw integer (no int-to-float conversion any more)
31
32	ret
33
34ALIGN 16
35PUBLIC slopevlin2_
36slopevlin2_:                            ; Integer-only span loop. Reads eax, ebx, ecx, edx, esi, edi on entry; borrows esp and ebp as data pointers and restores them before ret.
37	mov _ebpbak, ebp                        ; ebp and esp are repurposed below; keep copies for the exit path
38	mov _espbak, esp
39
40	sub ecx, esp
41	mov dword ptr [slopmach4b+3], ecx       ; patch slopmach4b so that [esp+disp] initially addresses the incoming ecx
42
43	mov ebp, eax                            ; Remove FPU load (ebp is overwritten by the lea on the next line)
44slopmach6b: lea ebp, [eax+88888888h]      ; 88888888h is a placeholder; setupslopevlin2_ patches it with -pinc
45	add ebx, dword ptr _asm2                 ; Replace FPU add with integer op
46
47	mov _asm1, ebx                          ; _asm1 = incoming ebx + _asm2
48	shl ebx, 3
49
50	mov eax, _globalx3
51	mov ecx, _globaly3
52	imul eax, ebx
53	imul ecx, ebx
54	add esi, eax                            ; esi += _globalx3 * (_asm1 shl 3)
55	add edi, ecx                            ; edi += _globaly3 * (_asm1 shl 3)
56
57	mov ebx, edx                            ; ebx = incoming edx
58	jmp short bigslopeloopb
59ALIGN 16
60bigslopeloopb:

…Show last 58 lines

61	mov dword ptr _fpuasm, ebx              ; Replace FPU store
62
63	mov eax, ebx                            ; Modified from original FPU path
64	add eax, eax                            ; eax = ebx shl 1; carry = former bit 31 of ebx
65	sbb edx, edx                            ; edx = 0 or 0FFFFFFFFh depending on that carry
66	mov ecx, eax
67	shr ecx, 24                             ; ecx = top 8 bits of the doubled value
68	and eax, 00ffe000h
69	shr eax, 11                             ; eax = 11-bit field scaled by 4: a byte offset into a dword table
70	sub cl, 2
71	mov eax, dword ptr _reciptable[eax]
72	shr eax, cl                             ; scale the table entry by the count derived from the top bits
73	xor eax, edx                            ; flips all bits of eax when the sign mask is set
74	mov edx, _asm1                          ; edx = previous _asm1
75	mov ecx, _globalx3
76	mov _asm1, eax                          ; _asm1 = newly computed value
77	sub eax, edx                            ; eax = new - previous
78	mov edx, _globaly3
79	imul ecx, eax                           ; ecx = _globalx3 * (new - previous)
80	imul eax, edx                           ; eax = _globaly3 * (new - previous)
81
82	add ebx, dword ptr _asm2                ; Replace FPU add with integer op
83
84	cmp ebx, BITSOFPRECISNLQPOW
85	mov _asm4, ebx                          ; saved here, reloaded after the inner loop
86	mov cl, bl                              ; cl = inner-loop count; replaces the low byte of the ecx product above
87	jl short slopeskipminb                  ; flags still come from the cmp (mov does not alter flags)
88	mov cl, BITSOFPRECISNLQPOW              ; cap the count at BITSOFPRECISNLQPOW
89slopeskipminb:
90
91	mov ebx, esi
92	mov edx, edi
93
94beginnerslopeloopb:
95slopmach1b: shr ebx, 20                   ; shift count is patched by setupslopevlin2_
96	add esi, ecx                            ; step esi by ecx
97slopmach2b: shr edx, 26                   ; shift count is patched by setupslopevlin2_
98slopmach7b: and ebx, 88888888h            ; mask is patched by setupslopevlin2_
99	add edi, eax                            ; step edi by eax
100slopmach5b: add ebp, 88888888h           ; placeholder patched with pinc: advances the store address
101slopmach3b: mov dl, byte ptr [ebx+edx+88888888h] ; byte fetch; base placeholder is patched with ptr
102slopmach4b: mov ebx, dword ptr [esp+88888888h]   ; dword fetch relative to esp (displacement patched at function entry)
103	sub esp, 4                              ; esp moves down one dword per pixel
104	dec cl                                  ; sets ZF for the jnz below; the intervening movs leave flags intact
105	mov al, byte ptr [ebx+edx]              ; second byte lookup, indexed by the fetched dword plus edx
106	mov ebx, esi
107	mov [ebp], al                           ; store the resulting byte at ebp
108	mov edx, edi
109	jnz short beginnerslopeloopb
110
111	mov ebx, _asm4
112	sub ebx, BITSOFPRECISNLQPOW
113	jg short bigslopeloopb                  ; repeat while the reduced value is still positive (signed)
114
115	mov esp, _espbak                        ; undo the borrowing of esp and ebp
116	mov ebp, _ebpbak
117	ret

the parts of engine.c:

1
2long reciptable[2048], fpuasm, fpuasmnonfpu;
3long reciptablenonfpu[2048], deltaztable[2048];
4
5#define FIX16_SHIFT 14          // Matches Build's 30-bit fixed-point. NOTE(review): unused in the code shown; 14 vs. "30-bit" needs confirming
6#define FIX16_FACTOR 0x40000000 // 1<<30 in 32-bit (1073741824); only referenced from a commented-out line below
7#define RECIP_TABLE_OFFSET 2048
8
9#define F1_0	0x10000         // 16.16 fixed-point scaling
10
11loadtables()                    // One-time setup: fills the reciprocal tables and reads the built-in tables from "tables.dat"
12{
13    long i, fil;
14    float z, dz, nextz, deltaz;
15
16    if (tablesloaded == 0)      // everything below runs only on the first call
17    {
18        initksqrt();
19
20        // Generate original FPU-based reciptable at all times (it's int too)
21            for(i = 0; i < 2048; i++)
22            { 
23                reciptable[i] = divscale30(2048L, i+2048); // presumably (2048<<30)/(i+2048); divscale30 is defined elsewhere
24            }
25        if ( (use_fpu) == 0)
26        {
27            // Generate non-FPU reciptable if nofpu parameter was passed
28            reciptablenonfpu[0] = F1_0 / RECIP_TABLE_OFFSET; // entry 0 is special-cased (65536/2048 = 32); the loop below would divide by zero for i == 0
29            for(i = 1; i < 2048; i++)
30            {
31                //reciptablenonfpu[i] = FIX16_FACTOR / (i + RECIP_TABLE_OFFSET);
32                reciptablenonfpu[i] = F1_0 / i;              // 65536 / i, truncated integer division
33            }
34
35            z = 1.0f;  // initial Z (matches original FPU setup)
36            dz = 0.01f; // step size (adjust based on your needs)
37            for (i = 0; i < 2048; i++)
38            {
39                // match max line height
40                nextz = z + dz;
41                // hyperbolic step
42                deltaz = (1.0f/z) - (1.0f/nextz);
43                // convert to fixed-point
44                deltaztable[i] = (long)(deltaz * (1 << 16)); // scaled by 65536 and truncated toward zero
45                z = nextz;
46            }
47        }
48
49
50        // Load built-in tables (sintable, radar angles, fonts)
51        if ((fil = kopen4load("tables.dat", 0)) != -1)
52        {
53            // Reciptable is NOT loaded from file - generated above
54            kread(fil, sintable, 2048 * 2);
55            kread(fil, radarang, 640 * 2);
56            for(i = 0; i < 640; i++) radarang[1279 - i] = -radarang[i]; // mirror the first 640 entries, negated, into the upper half
57            kread(fil, textfont, 1024);
58            kread(fil, smalltextfont, 1024);
59            kread(fil, britable, 1024);
60            kclose(fil);

…Show last 6 lines

61        }
62
63        tablesloaded = 1;       // NOTE(review): set even when "tables.dat" could not be opened, so the read is never retried
64    }
65}

I think the biggest challenge is this line: add ebx, dword ptr _asm2
because as soon as the FPU addition is removed, it breaks the "hyperbolic continuity", as the AI put it. There are two lines like this in the asm code, and I even tried to simulate it with deltaztable, yet without much success.

Maybe somebody knows it better.

Reply 41 of 44, by Darkcrafter07

Posted on 2025-10-24, 17:43

Darkcrafter07 Offline

Rank Newbie

Rank: Newbie
Posts: 27
Joined: 2024-06-06, 09:38

So we're almost there on non-FPU slopes. I think it still looks meh, but it is much closer to the result I'd like to have. You can download the compiled exe to see how it's going. My next attempt will be to try decreasing the "amplitude" of the ebx/_asm2 additions. It's slower than the low-detail degraded version but much faster than the fully detailed FPU version on a 486SX. Later we could combine both approaches and do fully integer slopes with a 2x detail loss instead of 8x, or even keep it as is — provided it goes well.

Part of A.asm:

1EXTRN _surfx : dword                 ; per-column multipliers used by slopevlin2_ below; written by the C side (engine.c)
2EXTRN _surfy : dword
3EXTRN _slopedastat : byte            ; 0 = ceiling, non-zero = floor; tested at the top of slopevlin2_
4
5BITSOFPRECISNLQ equ 3                ; log2 of BITSOFPRECISNLQPOW
6BITSOFPRECISNLQPOW equ 8             ; 1 shl 3: cap on pixels drawn per inner-loop run in slopevlin2_
7
8ALIGN 16
9PUBLIC setupslopevlin2_
10setupslopevlin2_:                   ; Patches operand fields of the slopmach*b instructions in slopevlin2_ (self-modifying code) and copies _asm1 to _asm2. Reads al, ah, ebx, ecx; overwrites eax, ecx, edx.
11	mov dword ptr [slopmach3b+3], ebx    ;ptr: becomes the 88888888h base in the byte load at slopmach3b
12	mov dword ptr [slopmach5b+2], ecx    ;pinc: becomes the constant added to ebp at slopmach5b
13	neg ecx
14	mov dword ptr [slopmach6b+2], ecx    ;-pinc: becomes the displacement of the lea at slopmach6b
15
16	mov edx, 1
17	mov cl, al
18	shl edx, cl
19	dec edx                              ; edx = (1 shl al) - 1
20	mov cl, ah
21	shl edx, cl                          ; edx = ((1 shl al) - 1) shl ah
22	mov dword ptr [slopmach7b+2], edx    ; becomes the AND mask at slopmach7b
23
24	neg ah
25	mov byte ptr [slopmach2b+2], ah      ; shift count at slopmach2b = -ah (byte arithmetic)
26
27	sub ah, al
28	mov byte ptr [slopmach1b+2], ah      ; shift count at slopmach1b = -ah - al (byte arithmetic)
29
30	mov eax, [_asm1]
31	mov [_asm2], eax                     ; _asm2 = _asm1, copied as a raw integer
32
33	ret
34
35ALIGN 16
36PUBLIC slopevlin2_
37slopevlin2_:                            ; Integer-only span loop. Reads eax, ebx, ecx, edx, esi, edi on entry; borrows esp and ebp as data pointers and restores them before ret.
38	; Save critical registers
39	push ebp
40	push esi
41	push edi
42	push ebx
43
44	; Surface type check (ceiling(0) vs floor(1))
45	cmp byte ptr [_slopedastat], 0
46	jz slp2drawasusual                      ; Skip inversion for ceilings
47
48	; Only invert for floors (dastat=1)
49	neg dword ptr [_asm3]                   ; these three negations modify the variables in memory, so the change outlives this call
50	neg dword ptr [_globalx3]
51	neg dword ptr [_globaly3]
52
53slp2drawasusual:
54	mov _ebpbak, ebp
55	mov _espbak, esp                        ; captured after the four pushes, so the pops at the end line up once esp is restored
56
57	sub ecx, esp
58	mov dword ptr [slopmach4b+3], ecx       ; patch slopmach4b so that [esp+disp] initially addresses the incoming ecx
59
60	mov ebp, eax                            ; Remove FPU load (ebp is overwritten by the lea on the next line)

…Show last 81 lines

61slopmach6b: lea ebp, [eax+88888888h]      ; 88888888h is a placeholder; setupslopevlin2_ patches it with -pinc
62	add ebx, dword ptr _asm2                ; Replace FPU add with integer op
63
64	mov _asm1, ebx                          ; _asm1 = incoming ebx + _asm2
65	shl ebx, 3
66
67	mov eax, [_globalx3]                    ; Proper dereference
68	mov ecx, [_globaly3]                    ; Proper dereference
69	imul eax, ebx
70	imul ecx, ebx
71	add esi, eax                            ; esi += _globalx3 * (_asm1 shl 3)
72	add edi, ecx                            ; edi += _globaly3 * (_asm1 shl 3)
73
74	mov ebx, edx                            ; ebx = incoming edx
75	jmp short bigslopeloopb
76ALIGN 16
77bigslopeloopb:
78	mov dword ptr _fpuasm, ebx              ; Replace FPU store
79
80	mov eax, ebx                            ; Modified from original FPU path
81	add eax, eax                            ; eax = ebx shl 1; carry = former bit 31 of ebx
82	sbb edx, edx                            ; edx = 0 or 0FFFFFFFFh depending on that carry
83	mov ecx, eax
84	shr ecx, 24                             ; ecx = top 8 bits of the doubled value
85	and eax, 00ffe000h
86	shr eax, 11                             ; eax = 11-bit field scaled by 4: a byte offset into a dword table
87	sub cl, 2
88	mov eax, dword ptr _reciptable[eax]
89	shr eax, cl                             ; scale the table entry by the count derived from the top bits
90	xor eax, edx                            ; flips all bits of eax when the sign mask is set
91	mov edx, _asm1                          ; edx = previous _asm1
92	mov ecx, _surfx                         ; this loop multiplies by _surfx/_surfy; the setup above uses _globalx3/_globaly3
93	mov _asm1, eax                          ; _asm1 = newly computed value
94	sub eax, edx                            ; eax = new - previous
95	mov edx, _surfy
96	imul ecx, eax                           ; ecx = _surfx * (new - previous)
97	imul eax, edx                           ; eax = _surfy * (new - previous)
98
99	add ebx, dword ptr _asm2                ; Replace FPU add with integer op
100	add ebx, 1                             ; extra +1 per outer iteration
101
102
103	cmp ebx, BITSOFPRECISNLQPOW
104	mov _asm4, ebx                         ; saved here, reloaded after the inner loop
105	mov cl, bl                             ; cl = inner-loop count; replaces the low byte of the ecx product above
106	jl short slopeskipminb                 ; flags still come from the cmp (mov does not alter flags)
107	mov cl, BITSOFPRECISNLQPOW             ; cap the count at BITSOFPRECISNLQPOW
108slopeskipminb:
109
110	mov ebx, esi
111	mov edx, edi
112
113beginnerslopeloopb:
114slopmach1b: shr ebx, 20                  ; shift count is patched by setupslopevlin2_
115	add esi, ecx                           ; step esi by ecx
116slopmach2b: shr edx, 26                  ; shift count is patched by setupslopevlin2_
117slopmach7b: and ebx, 88888888h           ; mask is patched by setupslopevlin2_
118	add edi, eax                           ; step edi by eax
119slopmach5b: add ebp, 88888888h           ; placeholder patched with pinc: advances the store address
120slopmach3b: mov dl, byte ptr [ebx+edx+88888888h] ; byte fetch; base placeholder is patched with ptr
121slopmach4b: mov ebx, dword ptr [esp+88888888h]   ; dword fetch relative to esp (displacement patched at function entry)
122	sub esp, 4                             ; esp moves down one dword per pixel
123	dec cl                                 ; sets ZF for the jnz below; the intervening movs leave flags intact
124	mov al, byte ptr [ebx+edx]             ; second byte lookup, indexed by the fetched dword plus edx
125	mov ebx, esi
126	mov [ebp], al                          ; store the resulting byte at ebp
127	mov edx, edi
128	jnz short beginnerslopeloopb
129
130	mov ebx, _asm4
131	sub ebx, BITSOFPRECISNLQPOW
132	jg short bigslopeloopb                 ; repeat while the reduced value is still positive (signed)
133
134	mov esp, _espbak                       ; restore esp first so the pops below read the values pushed on entry
135	mov ebp, _ebpbak
136	pop ebx                                ; pops mirror the entry pushes in reverse order
137	pop edi
138	pop esi
139	pop ebp
140	ret

Part of engine.c:

1#define BITSOFPRECISNLQ 3   // same value as the equ of the same name in A.asm
2long SCALEFACTOR = 15; // extra precision; used below as a plain integer divisor, not as a shift count
3long surfx, surfy, surfstepx, surfstepy, surfz, surfstepz; // surfx/surfy are read by slopevlin2_ as _surfx/_surfy
4long surfx_prev, surfy_prev; // not referenced in the code shown
5char slopedastat;            // copy of dastat; read by slopevlin2_ as _slopedastat
6grouscan_nonfpu (long dax1, long dax2, long sectnum, char dastat) // Sets up and issues one slopevlin2() call per screen column x in dax1..dax2 for the sloped ceiling (dastat == 0) or floor (otherwise) of sector sectnum
7{
8	long i, j, k, l, m, n, x, y, dx, dy, wx, wy, x1, y1, x2, y2, daz;
9	long daslope, dasqr;
10	long dashade, shoffs, shinc, m1, m2, *mptr1, *mptr2, *nptr1, *nptr2;
11	walltype *wal;
12	sectortype *sec;
13
14	sec = &sector[sectnum];
15
16	slopedastat = dastat; // pass to asm to draw ceilings-floors differently
17
18	if (dastat == 0)
19	{
20		if (globalposz <= getceilzofslope(sectnum,globalposx,globalposy))
21			return;  //Back-face culling
22		globalorientation = sec->ceilingstat;
23		globalpicnum = sec->ceilingpicnum;
24		globalshade = sec->ceilingshade;
25		globalpal = sec->ceilingpal;
26		daslope = sec->ceilingheinum;
27		daz = sec->ceilingz;
28	}
29	else
30	{
31		if (globalposz >= getflorzofslope(sectnum,globalposx,globalposy))
32			return;  //Back-face culling
33		globalorientation = sec->floorstat;
34		globalpicnum = sec->floorpicnum;
35		globalshade = sec->floorshade;
36		globalpal = sec->floorpal;
37		daslope = sec->floorheinum;
38		daz = sec->floorz;
39	}
40
41	if ((picanm[globalpicnum]&192) != 0) globalpicnum += animateoffs(globalpicnum,sectnum);
42	setgotpic(globalpicnum);
43	if ((tilesizx[globalpicnum] <= 0) || (tilesizy[globalpicnum] <= 0)) return;
44	if (waloff[globalpicnum] == 0) loadtile(globalpicnum);
45
46	wal = &wall[sec->wallptr];           // first wall of the sector
47	wx = wall[wal->point2].x - wal->x;   // (wx,wy) = vector along that first wall
48	wy = wall[wal->point2].y - wal->y;
49	dasqr = krecipasm(nsqrtasm(wx*wx+wy*wy));
50	i = mulscale21(daslope,dasqr);
51	wx *= i; wy *= i;
52
53	globalx = -mulscale19(singlobalang,xdimenrecip);
54	globaly = mulscale19(cosglobalang,xdimenrecip);
55	globalx1 = (globalposx<<8);
56	globaly1 = -(globalposy<<8);
57	i = (dax1-halfxdimen)*xdimenrecip;
58	globalx2 = mulscale16(cosglobalang<<4,viewingrangerecip) - mulscale27(singlobalang,i);
59	globaly2 = mulscale16(singlobalang<<4,viewingrangerecip) + mulscale27(cosglobalang,i);
60	globalzd = (xdimscale<<9);

…Show last 131 lines

61	globalzx = -dmulscale17(wx,globaly2,-wy,globalx2) + mulscale10(1-globalhoriz,globalzd);
62	globalz = -dmulscale25(wx,globaly,-wy,globalx);
63
64	if (globalorientation&64)  //Relative alignment
65	{
66		dx = mulscale14(wall[wal->point2].x-wal->x,dasqr);
67		dy = mulscale14(wall[wal->point2].y-wal->y,dasqr);
68
69		i = nsqrtasm(daslope*daslope+16777216);
70
71		x = globalx; y = globaly;
72		globalx = dmulscale16(x,dx,y,dy);
73		globaly = mulscale12(dmulscale16(-y,dx,x,dy),i);
74
75		x = ((wal->x-globalposx)<<8); y = ((wal->y-globalposy)<<8);
76		globalx1 = dmulscale16(-x,dx,-y,dy);
77		globaly1 = mulscale12(dmulscale16(-y,dx,x,dy),i);
78
79		x = globalx2; y = globaly2;
80		globalx2 = dmulscale16(x,dx,y,dy);
81		globaly2 = mulscale12(dmulscale16(-y,dx,x,dy),i);
82	}
83	if (globalorientation&0x4)
84	{
85		i = globalx; globalx = -globaly; globaly = -i;
86		i = globalx1; globalx1 = globaly1; globaly1 = i;
87		i = globalx2; globalx2 = -globaly2; globaly2 = -i;
88	}
89	if (globalorientation&0x10) { globalx1 = -globalx1, globalx2 = -globalx2, globalx = -globalx; }
90	if (globalorientation&0x20) { globaly1 = -globaly1, globaly2 = -globaly2, globaly = -globaly; }
91
92      //isn't "daz" a texture scale for the whole function here?
93	daz = dmulscale9(wx,globalposy-wal->y,-wy,globalposx-wal->x) + ((daz-globalposz)<<8);
94	globalx2 = mulscale20(globalx2,daz); globalx = mulscale28(globalx,daz);
95	globaly2 = mulscale20(globaly2,-daz); globaly = mulscale28(globaly,-daz);
96
97	i = 8-(picsiz[globalpicnum]&15); j = 8-(picsiz[globalpicnum]>>4);
98	if (globalorientation&8) { i++; j++; }
99	globalx1 <<= (i+12); globalx2 <<= i; globalx <<= i;
100	globaly1 <<= (j+12); globaly2 <<= j; globaly <<= j;
101
102	if (dastat == 0)
103	{
104		globalx1 += (((long)sec->ceilingxpanning)<<24);
105		globaly1 += (((long)sec->ceilingypanning)<<24);
106	}
107	else
108	{
109		globalx1 += (((long)sec->floorxpanning)<<24);
110		globaly1 += (((long)sec->floorypanning)<<24);
111	}
112
113	asm1 = -(globalzd>>(16-BITSOFPRECISNLQ)); // i.e. -(globalzd>>13); setupslopevlin2_ copies asm1 into asm2
114
115	globvis = globalvisibility;
116	if (sec->visibility != 0) globvis = mulscale4(globvis,(long)((unsigned char)(sec->visibility+16)));
117	globvis = mulscale13(globvis,daz);
118	globvis = mulscale16(globvis,xdimscale);
119	j = FP_OFF(palookup[globalpal]);
120
121	setupslopevlin2(((long)(picsiz[globalpicnum]&15))+(((long)(picsiz[globalpicnum]>>4))<<8),waloff[globalpicnum],-ylookup[1]); // arg 1 packs picsiz&15 in the low byte and picsiz>>4 in the next byte
122
123	l = (globalzd>>16);
124
125	shinc = mulscale16(globalz,xdimenscale);
126	if (shinc > 0) shoffs = (4<<15); else shoffs = ((2044-ydimen)<<15);
127	if (dastat == 0) y1 = umost[dax1]; else y1 = max(umost[dax1],dplc[dax1]);
128	m1 = mulscale16(y1,globalzd) + (globalzx>>6);
129		//Avoid visibility overflow by crossing horizon
130	if (globalzd > 0) m1 += (globalzd>>16); else m1 -= (globalzd>>16);
131	m2 = m1+l;
132	mptr1 = (long *)&slopalookup[y1+(shoffs>>15)]; mptr2 = mptr1+1;
133
134	for(x=dax1;x<=dax2;x++)
135	{
136		if (dastat == 0) { y1 = umost[x]; y2 = min(dmost[x],uplc[x])-1; }
137				else { y1 = max(umost[x],dplc[x]); y2 = dmost[x]-1; }
138
139		if (y1 <= y2)
140		{
141			nptr1 = (long *)&slopalookup[y1+(shoffs>>15)];
142			nptr2 = (long *)&slopalookup[y2+(shoffs>>15)];
143			while (nptr1 <= mptr1) // extend the filled part of slopalookup downwards until it covers nptr1
144			{
145				*mptr1-- = j + (getpalookup((long)mulscale24(krecipasm(m1),globvis),globalshade)<<8);
146				m1 -= l;
147			}
148			while (nptr2 >= mptr2) // extend the filled part of slopalookup upwards until it covers nptr2
149			{
150				*mptr2++ = j + (getpalookup((long)mulscale24(krecipasm(m2),globvis),globalshade)<<8);
151				m2 += l;
152			}
153
154			if ( (dastat) == 0)
155			{
156			    //ceilings
157			    globalx3 = (globalx2>>10);
158			    globaly3 = (globaly2>>10);
159			} else {
160			    //floors
161			    globalx3 = -(globalx2>>10); // NOTE(review): slopevlin2_ in the A.asm listing also negates _globalx3/_globaly3 when slopedastat != 0 — confirm the double negation is intended
162			    globaly3 = -(globaly2>>10);
163			}
164
165			    // Replace globalx3/globaly3 with scaled versions
166			    surfx = globalx3 / SCALEFACTOR;
167			    surfy = globaly3 / SCALEFACTOR;
168			    surfz = globalz / SCALEFACTOR;
169			    
170			    // Calculate stepping parameters
171			    surfstepx = globalx / SCALEFACTOR;
172			    surfstepy = globaly / SCALEFACTOR;
173			    surfstepz = globalz / SCALEFACTOR; // same expression as surfz above
174
175			asm3 = mulscale16(y2,globalzd) + (globalzx>>6);
176			slopevlin2(ylookup[y2]+x+frameoffset,krecipasm(asm3>>3),(long)nptr2,y2-y1+1,globalx1,globaly1); // six args; presumably passed in eax, ebx, ecx, edx, esi, edi via a #pragma aux not shown here
177
178			if ((x&15) == 0) faketimerhandler();
179		}
180		globalx2 += globalx + (globalx >> 16);
181		globaly2 += globaly + (globaly >> 16);
182		globalzx += globalz;
183
184		surfx += surfstepx; // NOTE(review): surfx/surfy/surfz are reassigned inside the if-block above before each slopevlin2() call, so these increments are overwritten before being read in the code shown
185		surfy += surfstepy;
186		surfz += surfstepz;
187
188		shoffs += shinc;
189	}
190}

Last edited by Darkcrafter07 on 2025-10-24, 19:26. Edited 1 time in total.

Reply 42 of 44, by marxveix

Posted on 2025-10-24, 18:07

marxveix Offline

Rank Oldbie

Rank: Oldbie
Posts: 641
Joined: 2018-03-05, 21:46

Thank you! I try it later, but not today.

Best ATi Rage3 drivers for 3DCIF / Direct3D / OpenGL / DVD : ATi RagePro drivers and software
30+MiniGL / OpenGL Win 9x dll files for all ATi Rage3 cards : Re: ATi RagePro OpenGL files

Reply 43 of 44, by Darkcrafter07

Posted on 2025-10-24, 18:12

Darkcrafter07 Offline

Rank Newbie

Rank: Newbie
Posts: 27
Joined: 2024-06-06, 09:38

marxveix wrote on 2025-10-24, 18:07:

Thank you! I try it later, but not today.

You're welcome, but don't hurry: this version is really buggy and may make your computer freeze, as there are still illegal memory writes going on.
Upd... fixed

Reply 44 of 44, by marxveix

Posted on Yesterday, 10:45

marxveix Offline

Rank Oldbie

Rank: Oldbie
Posts: 641
Joined: 2018-03-05, 21:46

Darkcrafter07 wrote on 2025-10-24, 18:12:

marxveix wrote on 2025-10-24, 18:07:

Thank you! I try it later, but not today.

You're welcome but don't hurry, this version is really buggy and may make your computer freeze, there are illegal memory writes are still going.
Upd... fixed

Now that it's bug-fixed, is it better to try this version?

Best ATi Rage3 drivers for 3DCIF / Direct3D / OpenGL / DVD : ATi RagePro drivers and software
30+MiniGL / OpenGL Win 9x dll files for all ATi Rage3 cards : Re: ATi RagePro OpenGL files

Main menu