VOGONS


Duke07 - another MS-DOS port of Duke3D

Topic actions

Reply 40 of 44, by Darkcrafter07

User metadata
Rank Newbie
Rank
Newbie
analog_programmer wrote on 2025-08-12, 07:28:

Thanks for the clarification! If I understand correctly, when "ydetail" variable in "ENG386.C" is set to 3, the produced executable will be for even lower details.

I have a suggestion. For easier compiling and linking of different executable versions you can separate make-files and altered .C and .H files in separate folders like "D07_SRC", "D07_SRC_LQ2" and "D07_SRC_LQ3".

Not yet sir, let it be like that.

What I've been trying to do for more than 4 months is getting a non-fpu version of slopes drawing to work. There's a version that finally doesn't crash and does perspective correction, yet faster on a nonfpu SX processor, it still looks awful and coming with a lot of geometric distortions.

a part from A.asm:

BITSOFPRECISNLQ equ 3
BITSOFPRECISNLQPOW equ 8

ALIGN 16
PUBLIC setupslopevlin2_
setupslopevlin2_:
mov dword ptr [slopmach3b+3], ebx ;ptr
mov dword ptr [slopmach5b+2], ecx ;pinc
neg ecx
mov dword ptr [slopmach6b+2], ecx ;-pinc

mov edx, 1
mov cl, al
shl edx, cl
dec edx
mov cl, ah
shl edx, cl
mov dword ptr [slopmach7b+2], edx

neg ah
mov byte ptr [slopmach2b+2], ah

sub ah, al
mov byte ptr [slopmach1b+2], ah

; FPU removal: Convert floating-point init to integer scaling
; Original: fild _asm1 + fstp _asm2
mov eax, [_asm1]
;shl eax, 16 ; not needed now
mov [_asm2], eax

ret

ALIGN 16
PUBLIC slopevlin2_
slopevlin2_:
mov _ebpbak, ebp
mov _espbak, esp

sub ecx, esp
mov dword ptr [slopmach4b+3], ecx

mov ebp, eax ; Remove FPU load
slopmach6b: lea ebp, [eax+88888888h]
add ebx, dword ptr _asm2 ; Replace FPU add with integer op

mov _asm1, ebx
shl ebx, 3

mov eax, _globalx3
mov ecx, _globaly3
imul eax, ebx
imul ecx, ebx
add esi, eax
add edi, ecx

mov ebx, edx
jmp short bigslopeloopb
ALIGN 16
bigslopeloopb:
Show last 58 lines
	mov dword ptr _fpuasm, ebx              ; Replace FPU store

mov eax, ebx ; Modified from original FPU path
add eax, eax
sbb edx, edx
mov ecx, eax
shr ecx, 24
and eax, 00ffe000h
shr eax, 11
sub cl, 2
mov eax, dword ptr _reciptable[eax]
shr eax, cl
xor eax, edx
mov edx, _asm1
mov ecx, _globalx3
mov _asm1, eax
sub eax, edx
mov edx, _globaly3
imul ecx, eax
imul eax, edx

add ebx, dword ptr _asm2 ; Replace FPU add with integer op

cmp ebx, BITSOFPRECISNLQPOW
mov _asm4, ebx
mov cl, bl
jl short slopeskipminb
mov cl, BITSOFPRECISNLQPOW
slopeskipminb:

mov ebx, esi
mov edx, edi

beginnerslopeloopb:
slopmach1b: shr ebx, 20
add esi, ecx
slopmach2b: shr edx, 26
slopmach7b: and ebx, 88888888h
add edi, eax
slopmach5b: add ebp, 88888888h
slopmach3b: mov dl, byte ptr [ebx+edx+88888888h]
slopmach4b: mov ebx, dword ptr [esp+88888888h]
sub esp, 4
dec cl
mov al, byte ptr [ebx+edx]
mov ebx, esi
mov [ebp], al
mov edx, edi
jnz short beginnerslopeloopb

mov ebx, _asm4
sub ebx, BITSOFPRECISNLQPOW
jg short bigslopeloopb

mov esp, _espbak
mov ebp, _ebpbak
ret

the parts of engine.c:


long reciptable[2048], fpuasm, fpuasmnonfpu;
long reciptablenonfpu[2048], deltaztable[2048];

#define FIX16_SHIFT 14 // Matches Build's 30-bit fixed-point
#define FIX16_FACTOR 0x40000000 // 1<<30 in 32-bit (1073741824)
#define RECIP_TABLE_OFFSET 2048

#define F1_0 0x10000 // 16.16 fixed-point scaling

loadtables()
{
long i, fil;
float z, dz, nextz, deltaz;

if (tablesloaded == 0)
{
initksqrt();

// Generate original FPU-based reciptable at all times (it's int too)
for(i = 0; i < 2048; i++)
{
reciptable[i] = divscale30(2048L, i+2048);
}
if ( (use_fpu) == 0)
{
// Generate non-FPU reciptable if nofpu parameter was passed
reciptablenonfpu[0] = F1_0 / RECIP_TABLE_OFFSET;
for(i = 1; i < 2048; i++)
{
//reciptablenonfpu[i] = FIX16_FACTOR / (i + RECIP_TABLE_OFFSET);
reciptablenonfpu[i] = F1_0 / i;
}

z = 1.0f; // initial Z (matches original FPU setup)
dz = 0.01f; // step size (adjust based on your needs)
for (i = 0; i < 2048; i++)
{
// match max line height
nextz = z + dz;
// hyperbolic step
deltaz = (1.0f/z) - (1.0f/nextz);
// convert to fixed-point
deltaztable[i] = (long)(deltaz * (1 << 16));
z = nextz;
}
}


// Load built-in tables (sintable, radar angles, fonts)
if ((fil = kopen4load("tables.dat", 0)) != -1)
{
// Reciptable is NOT loaded from file - generated above
kread(fil, sintable, 2048 * 2);
kread(fil, radarang, 640 * 2);
for(i = 0; i < 640; i++) radarang[1279 - i] = -radarang[i];
kread(fil, textfont, 1024);
kread(fil, smalltextfont, 1024);
kread(fil, britable, 1024);
kclose(fil);
Show last 6 lines
        }

tablesloaded = 1;
}
}

I think the biggest challenge is this line: add ebx, dword ptr _asm2
because as soon as FPU addition is removed it breaks the "hyperbolical continuity" as AI said. There are two lines like this in the asm code and I even tried to simulate it with deltaztable yet without much success.

Maybe somebody knows it better.

Reply 41 of 44, by Darkcrafter07

User metadata
Rank Newbie
Rank
Newbie

So we're almost there on non-fpu slopes, I think it still looks meh but much closer to the result I'd desire to have. You can download the compiled exe to see how it's going. My further attempt would be to try decreasing "amplitude" of ebx _asm2 additions. It's slower than low detail degraded version but much faster than fully detailed fpu version on 486sx. Later we could combine both approaches and do fully integer slopes in 2x detail loss instead of 8x or even keep it as is. If it goes fine for sure.

Part of A.asm:

EXTRN _surfx : dword
EXTRN _surfy : dword
EXTRN _slopedastat : byte

BITSOFPRECISNLQ equ 3
BITSOFPRECISNLQPOW equ 8

ALIGN 16
PUBLIC setupslopevlin2_
setupslopevlin2_:
mov dword ptr [slopmach3b+3], ebx ;ptr
mov dword ptr [slopmach5b+2], ecx ;pinc
neg ecx
mov dword ptr [slopmach6b+2], ecx ;-pinc

mov edx, 1
mov cl, al
shl edx, cl
dec edx
mov cl, ah
shl edx, cl
mov dword ptr [slopmach7b+2], edx

neg ah
mov byte ptr [slopmach2b+2], ah

sub ah, al
mov byte ptr [slopmach1b+2], ah

mov eax, [_asm1]
mov [_asm2], eax

ret

ALIGN 16
PUBLIC slopevlin2_
slopevlin2_:
; Save critical registers
push ebp
push esi
push edi
push ebx

; Surface type check (ceiling(0) vs floor(1))
cmp byte ptr [_slopedastat], 0
jz slp2drawasusual ; Skip inversion for ceilings

; Only invert for floors (dastat=1)
neg dword ptr [_asm3]
neg dword ptr [_globalx3]
neg dword ptr [_globaly3]

slp2drawasusual:
mov _ebpbak, ebp
mov _espbak, esp

sub ecx, esp
mov dword ptr [slopmach4b+3], ecx

mov ebp, eax ; Remove FPU load
Show last 81 lines
slopmach6b: lea ebp, [eax+88888888h]
add ebx, dword ptr _asm2 ; Replace FPU add with integer op

mov _asm1, ebx
shl ebx, 3

mov eax, [_globalx3] ; Proper dereference
mov ecx, [_globaly3] ; Proper dereference
imul eax, ebx
imul ecx, ebx
add esi, eax
add edi, ecx

mov ebx, edx
jmp short bigslopeloopb
ALIGN 16
bigslopeloopb:
mov dword ptr _fpuasm, ebx ; Replace FPU store

mov eax, ebx ; Modified from original FPU path
add eax, eax
sbb edx, edx
mov ecx, eax
shr ecx, 24
and eax, 00ffe000h
shr eax, 11
sub cl, 2
mov eax, dword ptr _reciptable[eax]
shr eax, cl
xor eax, edx
mov edx, _asm1
mov ecx, _surfx
mov _asm1, eax
sub eax, edx
mov edx, _surfy
imul ecx, eax
imul eax, edx

add ebx, dword ptr _asm2 ; Replace FPU add with integer op
add ebx, 1


cmp ebx, BITSOFPRECISNLQPOW
mov _asm4, ebx
mov cl, bl
jl short slopeskipminb
mov cl, BITSOFPRECISNLQPOW
slopeskipminb:

mov ebx, esi
mov edx, edi

beginnerslopeloopb:
slopmach1b: shr ebx, 20
add esi, ecx
slopmach2b: shr edx, 26
slopmach7b: and ebx, 88888888h
add edi, eax
slopmach5b: add ebp, 88888888h
slopmach3b: mov dl, byte ptr [ebx+edx+88888888h]
slopmach4b: mov ebx, dword ptr [esp+88888888h]
sub esp, 4
dec cl
mov al, byte ptr [ebx+edx]
mov ebx, esi
mov [ebp], al
mov edx, edi
jnz short beginnerslopeloopb

mov ebx, _asm4
sub ebx, BITSOFPRECISNLQPOW
jg short bigslopeloopb

mov esp, _espbak
mov ebp, _ebpbak
pop ebx
pop edi
pop esi
pop ebp
ret

Part of engine.c:

#define BITSOFPRECISNLQ 3
long SCALEFACTOR = 15; // extra precision
long surfx, surfy, surfstepx, surfstepy, surfz, surfstepz;
long surfx_prev, surfy_prev;
char slopedastat;
grouscan_nonfpu (long dax1, long dax2, long sectnum, char dastat)
{
long i, j, k, l, m, n, x, y, dx, dy, wx, wy, x1, y1, x2, y2, daz;
long daslope, dasqr;
long dashade, shoffs, shinc, m1, m2, *mptr1, *mptr2, *nptr1, *nptr2;
walltype *wal;
sectortype *sec;

sec = &sector[sectnum];

slopedastat = dastat; // pass to asm to draw ceilings-floors differently

if (dastat == 0)
{
if (globalposz <= getceilzofslope(sectnum,globalposx,globalposy))
return; //Back-face culling
globalorientation = sec->ceilingstat;
globalpicnum = sec->ceilingpicnum;
globalshade = sec->ceilingshade;
globalpal = sec->ceilingpal;
daslope = sec->ceilingheinum;
daz = sec->ceilingz;
}
else
{
if (globalposz >= getflorzofslope(sectnum,globalposx,globalposy))
return; //Back-face culling
globalorientation = sec->floorstat;
globalpicnum = sec->floorpicnum;
globalshade = sec->floorshade;
globalpal = sec->floorpal;
daslope = sec->floorheinum;
daz = sec->floorz;
}

if ((picanm[globalpicnum]&192) != 0) globalpicnum += animateoffs(globalpicnum,sectnum);
setgotpic(globalpicnum);
if ((tilesizx[globalpicnum] <= 0) || (tilesizy[globalpicnum] <= 0)) return;
if (waloff[globalpicnum] == 0) loadtile(globalpicnum);

wal = &wall[sec->wallptr];
wx = wall[wal->point2].x - wal->x;
wy = wall[wal->point2].y - wal->y;
dasqr = krecipasm(nsqrtasm(wx*wx+wy*wy));
i = mulscale21(daslope,dasqr);
wx *= i; wy *= i;

globalx = -mulscale19(singlobalang,xdimenrecip);
globaly = mulscale19(cosglobalang,xdimenrecip);
globalx1 = (globalposx<<8);
globaly1 = -(globalposy<<8);
i = (dax1-halfxdimen)*xdimenrecip;
globalx2 = mulscale16(cosglobalang<<4,viewingrangerecip) - mulscale27(singlobalang,i);
globaly2 = mulscale16(singlobalang<<4,viewingrangerecip) + mulscale27(cosglobalang,i);
globalzd = (xdimscale<<9);
Show last 131 lines
	globalzx = -dmulscale17(wx,globaly2,-wy,globalx2) + mulscale10(1-globalhoriz,globalzd);
globalz = -dmulscale25(wx,globaly,-wy,globalx);

if (globalorientation&64) //Relative alignment
{
dx = mulscale14(wall[wal->point2].x-wal->x,dasqr);
dy = mulscale14(wall[wal->point2].y-wal->y,dasqr);

i = nsqrtasm(daslope*daslope+16777216);

x = globalx; y = globaly;
globalx = dmulscale16(x,dx,y,dy);
globaly = mulscale12(dmulscale16(-y,dx,x,dy),i);

x = ((wal->x-globalposx)<<8); y = ((wal->y-globalposy)<<8);
globalx1 = dmulscale16(-x,dx,-y,dy);
globaly1 = mulscale12(dmulscale16(-y,dx,x,dy),i);

x = globalx2; y = globaly2;
globalx2 = dmulscale16(x,dx,y,dy);
globaly2 = mulscale12(dmulscale16(-y,dx,x,dy),i);
}
if (globalorientation&0x4)
{
i = globalx; globalx = -globaly; globaly = -i;
i = globalx1; globalx1 = globaly1; globaly1 = i;
i = globalx2; globalx2 = -globaly2; globaly2 = -i;
}
if (globalorientation&0x10) { globalx1 = -globalx1, globalx2 = -globalx2, globalx = -globalx; }
if (globalorientation&0x20) { globaly1 = -globaly1, globaly2 = -globaly2, globaly = -globaly; }

//isn't "daz" a texture scale for the whole function here?
daz = dmulscale9(wx,globalposy-wal->y,-wy,globalposx-wal->x) + ((daz-globalposz)<<8);
globalx2 = mulscale20(globalx2,daz); globalx = mulscale28(globalx,daz);
globaly2 = mulscale20(globaly2,-daz); globaly = mulscale28(globaly,-daz);

i = 8-(picsiz[globalpicnum]&15); j = 8-(picsiz[globalpicnum]>>4);
if (globalorientation&8) { i++; j++; }
globalx1 <<= (i+12); globalx2 <<= i; globalx <<= i;
globaly1 <<= (j+12); globaly2 <<= j; globaly <<= j;

if (dastat == 0)
{
globalx1 += (((long)sec->ceilingxpanning)<<24);
globaly1 += (((long)sec->ceilingypanning)<<24);
}
else
{
globalx1 += (((long)sec->floorxpanning)<<24);
globaly1 += (((long)sec->floorypanning)<<24);
}

asm1 = -(globalzd>>(16-BITSOFPRECISNLQ));

globvis = globalvisibility;
if (sec->visibility != 0) globvis = mulscale4(globvis,(long)((unsigned char)(sec->visibility+16)));
globvis = mulscale13(globvis,daz);
globvis = mulscale16(globvis,xdimscale);
j = FP_OFF(palookup[globalpal]);

setupslopevlin2(((long)(picsiz[globalpicnum]&15))+(((long)(picsiz[globalpicnum]>>4))<<8),waloff[globalpicnum],-ylookup[1]);

l = (globalzd>>16);

shinc = mulscale16(globalz,xdimenscale);
if (shinc > 0) shoffs = (4<<15); else shoffs = ((2044-ydimen)<<15);
if (dastat == 0) y1 = umost[dax1]; else y1 = max(umost[dax1],dplc[dax1]);
m1 = mulscale16(y1,globalzd) + (globalzx>>6);
//Avoid visibility overflow by crossing horizon
if (globalzd > 0) m1 += (globalzd>>16); else m1 -= (globalzd>>16);
m2 = m1+l;
mptr1 = (long *)&slopalookup[y1+(shoffs>>15)]; mptr2 = mptr1+1;

for(x=dax1;x<=dax2;x++)
{
if (dastat == 0) { y1 = umost[x]; y2 = min(dmost[x],uplc[x])-1; }
else { y1 = max(umost[x],dplc[x]); y2 = dmost[x]-1; }

if (y1 <= y2)
{
nptr1 = (long *)&slopalookup[y1+(shoffs>>15)];
nptr2 = (long *)&slopalookup[y2+(shoffs>>15)];
while (nptr1 <= mptr1)
{
*mptr1-- = j + (getpalookup((long)mulscale24(krecipasm(m1),globvis),globalshade)<<8);
m1 -= l;
}
while (nptr2 >= mptr2)
{
*mptr2++ = j + (getpalookup((long)mulscale24(krecipasm(m2),globvis),globalshade)<<8);
m2 += l;
}

if ( (dastat) == 0)
{
//ceilings
globalx3 = (globalx2>>10);
globaly3 = (globaly2>>10);
} else {
//floors
globalx3 = -(globalx2>>10);
globaly3 = -(globaly2>>10);
}

// Replace globalx3/globaly3 with scaled versions
surfx = globalx3 / SCALEFACTOR;
surfy = globaly3 / SCALEFACTOR;
surfz = globalz / SCALEFACTOR;

// Calculate stepping parameters
surfstepx = globalx / SCALEFACTOR;
surfstepy = globaly / SCALEFACTOR;
surfstepz = globalz / SCALEFACTOR;

asm3 = mulscale16(y2,globalzd) + (globalzx>>6);
slopevlin2(ylookup[y2]+x+frameoffset,krecipasm(asm3>>3),(long)nptr2,y2-y1+1,globalx1,globaly1);

if ((x&15) == 0) faketimerhandler();
}
globalx2 += globalx + (globalx >> 16);
globaly2 += globaly + (globaly >> 16);
globalzx += globalz;

surfx += surfstepx;
surfy += surfstepy;
surfz += surfstepz;

shoffs += shinc;
}
}
Last edited by Darkcrafter07 on 2025-10-24, 19:26. Edited 1 time in total.

Reply 43 of 44, by Darkcrafter07

User metadata
Rank Newbie
Rank
Newbie
marxveix wrote on 2025-10-24, 18:07:

Thank you! I try it later, but not today.

You're welcome but don't hurry, this version is really buggy and may make your computer freeze, there are illegal memory writes are still going.
Upd... fixed

Reply 44 of 44, by marxveix

User metadata
Rank Oldbie
Rank
Oldbie
Darkcrafter07 wrote on 2025-10-24, 18:12:
marxveix wrote on 2025-10-24, 18:07:

Thank you! I try it later, but not today.

You're welcome but don't hurry, this version is really buggy and may make your computer freeze, there are illegal memory writes are still going.
Upd... fixed

Now its bugfixed, better to try with it?

Best ATi Rage3 drivers for 3DCIF / Direct3D / OpenGL / DVD : ATi RagePro drivers and software
30+MiniGL / OpenGL Win 9x dll files for all ATi Rage3 cards : Re: ATi RagePro OpenGL files