CGA Graphics library

Reply 60 of 85, by wbhart

Posted on 2019-10-10, 21:22

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

Actually, I have an idea how to fix these problems. I can switch the roles of al and ch. Then al will not be used in the non-a section and can be stored to immediate in CS so that the whole of ax is available as an accumulator in the non-a section. Then al can be restored from the immediate before the end of the non-a section.

I'm not sure if this would be faster, but it's worth a try to possibly shave a couple of cycles off.

YouTube Channel - PCRetroTech

Reply 61 of 85, by wbhart

Posted on 2019-11-01, 13:20

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I finally found some more time to work on the horizontalish part of the ellipse code. Not all the same tricks worked, so I ended up using SS and ES to store regs temporarily.

With the following register assignments:

1   ; di, di+bx offsets of points above and below axis, ax: pixels
2   ; dl: deltax (lo8), sp: deltax (hi16), bp: deltay (hi16),
3   ; ch: D (lo8), cl: deltay (lo8), si: D (hi16)
4   ; es: deltay (hi16) temp, ss: yinc

here is what the code looks like:

; ellipse1_h4: one step of the unrolled "horizontalish" loop. It plots the
; pixel held in bits 0-1 of two screen bytes, one at [di] and one at [di+bx].
; Register roles as listed in the post: ax = pixel bytes, dl/sp = deltax
; (lo8/hi16), cl/bp = deltay (lo8/hi16), ch/si = D (lo8/hi16),
; es = saved copy of deltay hi16, ss = yinc (offset added to di per y step).
; The ellipse1_patchNN labels tag instructions whose immediates (0303h, 012h,
; 01234h) look like placeholders that are self-modified at set-up -- TODO confirm.
1ellipse1_h4:
; fetch both screen bytes into ax, then clear bits 0-1 of each
2   mov ah, [di+bx]
3   mov al, [di]
4   and ax, 0fcfch
5ellipse1_patch37:
; set bits 0-1 of both bytes (presumably the colour, patched in)
6   or ax, 0303h
7
; 24-bit update: deltax -= s^2 (dl low byte, sp high word), then D += deltax
8ellipse1_patch38:
9   sub dl, 012h         ; dx -= s^2
10ellipse1_patch39:
11   sbb sp, 01234h
12   add ch, dl           ; D += dx
13   adc si, sp                     
14
; only the high 16 bits are compared; bp is halved in place here
15   shr bp, 1            ; if dy/2 < D, increment y
16   cmp bp, si
17   jge ellipse1_skip_y4
18   
; y changes: flush the finished bytes before moving to another line
19   mov [di+bx], ah
20   mov [di], al
21
; restore the un-halved deltay high word from es
22   mov bp, es
23   sub ch, cl           ; D -= dy
24   sbb si, bp
25ellipse1_patch40:
26   add cl, 012h         ; dy += 2r^2
27ellipse1_patch41:
28   adc bp, 01234h
29   
; ss holds the current line step; it is added to di and then toggled with the
; xor so the next step uses the other value of the odd/even pair
30   mov ax, ss           ; update offset
31   add di, ax
32   xor ax, 0ffb0h       ; update offset update for odd<->even
33   mov ss, ax
34   sub bx, 80           ; decrement/increment y lines 
35
; reload the bytes at the new addresses
36   mov ah, [di+bx]
37   mov al, [di]
38ellipse1_skip_y4:
; NOTE(review): when the jge above is taken, bp still holds the value halved
; by `shr bp, 1`, so this stores dy/2 into es rather than the full deltay high
; word -- confirm whether a `mov bp, es` restore is missing on the skip path.
39   mov es, bp          
40
; second deltax -= s^2 of this step; the sbb leaves the flags tested by jl
41ellipse1_patch42:
42   sub dl, 012h         ; dx -= s^2
43ellipse1_patch43:
44   sbb sp, 01234h
; this jl is also the landing point of the `jl ellipse1_doneh1_check` in the
; h3 section; flags are unchanged by that jump, so the exit is relayed from here
45ellipse1_doneh1_check:
46   jl ellipse1_doneh1
47
48
; ellipse1_h3: same step as ellipse1_h4, but for the pixel in bits 2-3 of the
; two bytes already held in ax (al = byte at [di], ah = byte at [di+bx]).
; It falls through from the h4 section, so ax is not reloaded on entry.
49ellipse1_h3:
; clear bits 2-3 of both bytes, then set them from the patched immediate
50   and ax, 0f3f3h
51ellipse1_patch44:
52   or ax, 0c0ch
53
; deltax -= s^2 (dl low byte, sp high word), then D += deltax (ch low, si high)
54ellipse1_patch45:
55   sub dl, 012h         ; dx -= s^2
56ellipse1_patch46:
57   sbb sp, 01234h
58   add ch, dl           ; D += dx
59   adc si, sp                     
60

…Show last 125 lines

; bp is halved in place; the comparison uses the high 16 bits only
61   shr bp, 1            ; if dy/2 < D, increment y
62   cmp bp, si
63   jge ellipse1_skip_y3
64   
; y changes: write the bytes back before moving to another line
65   mov [di+bx], ah
66   mov [di], al
67
; restore the un-halved deltay high word from es
68   mov bp, es
69   sub ch, cl           ; D -= dy
70   sbb si, bp
71ellipse1_patch47:
72   add cl, 012h         ; dy += 2r^2
73ellipse1_patch48:
74   adc bp, 01234h
75
; add the line step kept in ss to di, then toggle it for the next step
76   mov ax, ss           ; update offset
77   add di, ax
78   xor ax, 0ffb0h       ; update offset update for odd<->even
79   mov ss, ax
80   sub bx, 80           ; decrement/increment y lines 
81
; reload the bytes at the new addresses
82   mov ah, [di+bx]
83   mov al, [di]
84ellipse1_skip_y3:
; NOTE(review): on the skip path bp is still the halved value from
; `shr bp, 1`, so es receives dy/2 -- confirm whether a restore is missing.
85   mov es, bp
86 
87ellipse1_patch49:
88   sub dl, 012h         ; dx -= s^2
89ellipse1_patch50:
90   sbb sp, 01234h
; exit is relayed through the jl at ellipse1_doneh1_check in the h4 section;
; the post explains the sections are too long for every short jump to reach
; its final target directly
91   jl ellipse1_doneh1_check
92
93
; ellipse1_h2: same step as ellipse1_h4/h3, but for the pixel in bits 4-5 of
; the two bytes held in ax (al = byte at [di], ah = byte at [di+bx]).
; It falls through from the h3 section, so ax is not reloaded on entry.
94ellipse1_h2:
; clear bits 4-5 of both bytes, then set them from the patched immediate
95   and ax, 0cfcfh
96ellipse1_patch51:
97   or ax, 03030h
98
; deltax -= s^2 (dl low byte, sp high word), then D += deltax (ch low, si high)
99ellipse1_patch52:
100   sub dl, 012h         ; dx -= s^2
101ellipse1_patch53:
102   sbb sp, 01234h
103   add ch, dl           ; D += dx
104   adc si, sp
105
; bp is halved in place; the comparison uses the high 16 bits only
106   shr bp, 1            ; if dy/2 < D, increment y
107   cmp bp, si
108   jge ellipse1_skip_y2
109   
; y changes: write the bytes back before moving to another line
110   mov [di+bx], ah
111   mov [di], al
112   
; restore the un-halved deltay high word from es
113   mov bp, es
114   sub ch, cl           ; D -= dy
115   sbb si, bp
116ellipse1_patch54:
117   add cl, 012h         ; dy += 2r^2
118ellipse1_patch55:
119   adc bp, 01234h
120
; add the line step kept in ss to di, then toggle it for the next step
121   mov ax, ss           ; update offset
122   add di, ax
123   xor ax, 0ffb0h       ; update offset update for odd<->even
124   mov ss, ax
125   sub bx, 80           ; decrement/increment y lines
126
; reload the bytes at the new addresses
127   mov ah, [di+bx]
128   mov al, [di]
129ellipse1_skip_y2:
; NOTE(review): on the skip path bp is still the halved value from
; `shr bp, 1`, so es receives dy/2 -- confirm whether a restore is missing.
130   mov es, bp
131 
132ellipse1_patch56:
133   sub dl, 012h         ; dx -= s^2
134ellipse1_patch57:
135   sbb sp, 01234h
; loop exit, taken on the signed "less than" result of the sbb above
136   jl ellipse1_doneh2
137
138
; ellipse1_h1: last step of the unrolled loop, for the pixel in bits 6-7 of
; the two bytes held in ax. Unlike the other three sections, the bytes are
; always written back here and di moves on by one byte (dec di) before control
; returns to ellipse1_h4, which reloads ax.
139ellipse1_h1:
; clear bits 6-7 of both bytes, then set them from the patched immediate
140   and ax, 03f3fh
141ellipse1_patch58:
142   or ax, 0c0c0h
143
; deltax -= s^2 (dl low byte, sp high word), then D += deltax (ch low, si high)
144ellipse1_patch59:
145   sub dl, 012h         ; dx -= s^2
146ellipse1_patch60:
147   sbb sp, 01234h
148   add ch, dl           ; D += dx
149   adc si, sp
150
; both bytes are complete: store them unconditionally, then step to the
; neighbouring byte (mov and dec leave the flags of the adc irrelevant here;
; the cmp below sets fresh flags)
151   mov [di+bx], ah
152   mov [di], al
153   dec di
154
; bp is halved in place; the comparison uses the high 16 bits only
155   shr bp, 1            ; if dy/2 < D, increment y
156   cmp bp, si
157   jge ellipse1_skip_y1
158   
; undo the dec above so that the `dec di` at ellipse1_skip_y1 is applied
; exactly once on both paths
159   inc di
; restore the un-halved deltay high word from es
160   mov bp, es
161   sub ch, cl           ; D -= dy
162   sbb si, bp
163ellipse1_patch61:
164   add cl, 012h         ; dy += 2r^2
165ellipse1_patch62:
166   adc bp, 01234h
167
; add the line step kept in ss to di, then toggle it for the next step
168   mov ax, ss           ; update offset
169   add di, ax
170   xor ax, 0ffb0h       ; update offset update for odd<->even
171   mov ss, ax
172   sub bx, 80           ; decrement/increment y lines
173
174ellipse1_skip_y1:
175   dec di
; NOTE(review): on the skip path bp is still the halved value from
; `shr bp, 1`, so es receives dy/2 -- confirm whether a restore is missing.
176   mov es, bp
177 
178ellipse1_patch63:
179   sub dl, 012h         ; dx -= s^2
180ellipse1_patch64:
181   sbb sp, 01234h
182   jl ellipse1_doneh2_skip ; skip extra byte
183
; next byte: back to the bits 0-1 section, which reloads ax from [di]/[di+bx]
184   jmp ellipse1_h4

Each section is around 80 bytes, which means one of the short jumps has to jump to one of the others, but that only happens once, if at all and so is a negligible cost.

I now need to write the code to set all the millions of patches and to set up the initial registers and jump into the first loop, then the code to switch the regs around between the verticalish and horizontalish section and the code for writing the final pixel.

Hopefully I find time for that in the next few days and we can get some timings.

YouTube Channel - PCRetroTech

Reply 62 of 85, by wbhart

Posted on 2019-11-10, 15:16

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I discovered that my original Julia code had a dumb bug in it and the original algorithm doesn't quite work the way I thought.

Fortunately I've been able to adapt it to something very similar to what I had come up with that works for semiradii less than 100 except for the following:

26 94
44 39
75 96
82 29
91 57

But it requires that a parameter in the range [1..8] be chosen per ellipse, which either means constructing a table of such parameters, or finding a simple function which computes the correct parameter for a given set of semiradii. That's going to take some time to figure out.

What to do about the exceptional cases above, I don't yet know. I've tried various things, but nothing simple works so far. Probably the original ellipse code will have to be called in those cases. That will be needed anyway for ellipses with semiradius >= 100, for which I don't think just the top 16 bits in the comparisons is sufficient.

YouTube Channel - PCRetroTech

Reply 63 of 85, by wbhart

Posted on 2019-11-12, 22:07

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I discovered a way to do all semiradii up to the 160, 100 needed for a full screen ellipse. It's not perfect, with about 50 pairs of semiradii leading to a single pixel artifact, 1 leading to a two pixel artifact and one leading to a 3 pixel artifact. But I think all ellipses will still visually look ok.

I have the code almost written (it's a monster), but debugging will probably take a lot of effort, if it works at all.

The corrected Julia code is:

# ellipse2(A, r, s, n): writes the (x, y) points of one quadrant of an ellipse
# with semiradii r (x direction) and s (y direction) into A, starting at
# (r, 0), and returns the number of points written.
# All decision values are scaled by 2^(n+8) and only their top 16 bits are
# compared (the `>> 16` / `>> 17` shifts), which per the accompanying post
# simulates 24-bit values held in Int32. n is the shift parameter; the post
# suggests max(1, (r + s + 1)/32).
1function ellipse2(A, r::Int32, s::Int32, n)
2         i = 1;
3         x = r;
4         y = 0;
5         r_orig = Int32(r)
# c and a are s^2 and r^2, pre-shifted left by n+8 bits
6         c = Int32((s*s) << (n+8))
7         a = Int32((r*r) << (n+8))
8         D = Int32(0)
9         xdelta = Int32(2)*c*r_orig
10         ydelta = Int32(a)
# first loop: y advances on every iteration, x only when the test on D fires;
# it runs while the top 16 bits of xdelta are >= the top 16 bits of ydelta
11         while (reinterpret(UInt32, xdelta) >> 16) >= (reinterpret(Int32, ydelta) >> 16)
# bail out with 320 once A would receive more than 320 points
# (this guard exists only in the first loop)
12            if i > 320; return 320; end
13            A[i] = (x, y); i += 1;
14            D += ydelta; ydelta += Int32(2)*a; y += 1;
15            if (D >> 16) >= (reinterpret(Int32, reinterpret(UInt32, xdelta) >> 17))
16               xdelta -= c; D -= xdelta; xdelta -= c; x -= 1;
17            end
18         end
# the sign of D is flipped for the second loop
19         D = -D
# second loop: x advances on every iteration, y only when the test on D fires;
# it runs while the top 16 bits of xdelta are non-negative
20         while (xdelta >> 16) >= 0
21            A[i] = (x, y); i += 1;
22            xdelta -= c; D += xdelta; xdelta -= c; x -= 1;
23            if (D >> 16) > (reinterpret(Int32, reinterpret(UInt32, ydelta) >> 17))
24               D -= ydelta; ydelta += Int32(2)*a; y += 1
25            end
26         end
# i is one past the last index written
27         return i - 1
28      end

where the parameter n can be computed as max(1, (r + s + 1)/32) for minimum artifacts. (This code writes the coordinates into an array A of pairs of Int's).

I'm simulating 24 bits for each of xdelta, ydelta and D here by using Int32's and shifting everything left by 8 bits. The right shifts by 16 bits simulate just taking the top 16 bits of the data as we discussed earlier.

I'd post the assembly code as well, but it's over 600 lines for half an ellipse! I should finish the code by tomorrow night, but it might take me until Sunday night to debug it.

YouTube Channel - PCRetroTech

Reply 64 of 85, by wbhart

Posted on 2019-11-18, 00:13

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I finally have the ellipse code working, at least for the right hand half of the ellipse.

So I can now give timings on the 8088 @ 4.77MHz. It's taking 156 cycles per pixel.

On the 8086 @ 8 MHz it takes 130 cycles per pixel.

I actually think that it could be faster to split the algorithm into two parts, the first of which computes an array that stores the increments (as single bits) and a second part that draws the points. The reason this might be faster is that it can take advantage of 4 way symmetry, rather than the two way symmetry I'm exploiting. I speculate it could be done in 100-120 cycles per pixel with this method, though obviously that is just a plain guess.

The current code is in the function cga_draw_ellipse1 in the file cga5.asm in my GitHub repository, linked at the beginning of this thread. It's exactly 700 lines of code for the right hand half of the ellipse. The left hand side will be about the same.

Last edited by wbhart on 2019-11-18, 00:59. Edited 3 times in total.

YouTube Channel - PCRetroTech

Reply 65 of 85, by wbhart

Posted on 2019-11-18, 00:51

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

The previous fastest code I had for general ellipses on the 8088, prior to this, was 342 cycles per pixel, so the new code is more than twice as fast.

Update: the code for the full ellipse is now in the repository. It's 1400 lines of assembly code!

YouTube Channel - PCRetroTech

Reply 66 of 85, by wbhart

Posted on 2019-11-24, 09:52

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I made a little video on my channel about the fast ellipses, with a little "demo effect", albeit computed in real time, rather than precomputed:

https://youtu.be/7o07XN6tucQ

YouTube Channel - PCRetroTech

Reply 67 of 85, by wbhart

Posted on 2019-12-22, 14:32

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I wrote some code for drawing precomputed ellipses. It starts with an array of bits specifying which pixels of the verticalish and horizontalish parts should move horizontally/vertically respectively from their predecessors.

The result is pretty fast at 91 cycles per pixel, which is about a 50% improvement over computing the whole thing in real time.

I haven't written assembly code for doing the precomputation yet, so I just do that part in Julia. The code has a single ellipse with semiradii 100, 84 hard coded in an array in CS (called _ellipse_data). However, the ellipse can be placed anywhere on screen in any colour.

It would be possible to do the horizontalish part faster by precomputing entire bytes, but this would be messy due to the 4 pixel per byte CGA layout, which would mean you'd need much more space for precomputation if you wanted to be able to put an ellipse anywhere on the screen. Reading the full bytes from memory might end up being slower than just reading one bit per pixel, as I currently do. So it's not a guaranteed win.

I also didn't bother trying to use 4 way symmetry, for much the same reason. I don't think I will try this, though it could probably be slightly faster (for one hell of a lot of extra work).

The code is in my GitHub repo, linked at the beginning of the thread. I've now separated the assembly graphics routines into three files: line.asm, circle.asm, ellipse.asm, with the new code being in the latter in _cga_draw_ellipse_precomp1 for the right hand side of the ellipse, and _cga_draw_ellipse_precomp2 for the left hand side.

Writing code for precomputed lines is also possible, though I doubt it will be much of an improvement on just drawing the lines. One could of course use the trick of starting in the middle of the line and drawing two pixels at once, moving out from the centre (this is not new, it was done in very early ZX spectrum or Amstrad CPC games or demos, I forget which). Of course, there are two cases, depending whether the centre point of the line is a pixel or a pixel border.

I haven't decided whether I will code up this trick or not. It looks ok in games/demos that use it, so it might be worthwhile.

Of course I can also write routines for xoring ellipses, blanking ellipses and drawing them in (binary) colour 00 and 11. There could be some additional speedups possible here, as I think one ends up with an extra register being available.

All of the routines I've written so far turn off the interrupts. That is not so nice for games or demos with music or tricks involving precise timing.

In case anyone wants the Julia code for precomputing the ellipses, here it is:

# ellipse(A, B, r, s): exact-integer version of the quadrant walk for an
# ellipse with semiradii r (x direction) and s (y direction), starting at (r, 0).
# Points from the first loop (y advances every step) are written to A, points
# from the second loop (x advances every step) to B.
# Returns (number of points in A, number of points in B).
1      function ellipse(A, B, r::Int, s::Int)
2          i = 1;
3          x = r;
4          y = 0;
5          r_orig = r
6          c = s*s
7          a = r*r
8          D = 0
9          xdelta = 2*c*r_orig
10          ydelta = a
# first loop: runs while xdelta >= ydelta; x steps only when D >= xdelta/2
11          while xdelta >= ydelta
12             A[i] = (x, y); i += 1; #println("(", x, ", ", y, ")")
13             D += ydelta; ydelta += 2a; y += 1;
14             if D >= div(xdelta, 2)
15                xdelta -= c; D -= xdelta; xdelta -= c; x -= 1; 
16             end
17          end
# the sign of D is flipped for the second loop
18          D = -D
19          j = 1
# second loop: runs while xdelta >= 0; y steps only when D > ydelta/2
20          while xdelta >= 0
21             B[j] = (x, y); j += 1; #println("(", x, ", ", y, ")")
22             xdelta -= c; D += xdelta; xdelta -= c; x -= 1;
23             if D > div(ydelta, 2)
24                D -= ydelta; ydelta += 2a; y += 1
25             end
26          end
# i and j are one past the last index written in A and B
27          return i - 1, j - 1
28       end
29
# ellipse_precomp(C, A, m, B, n): packs the point lists produced by ellipse()
# (A with m points, B with n points) into the byte array C, one bit per step,
# least significant bit first. Returns the number of entries written to C.
# Layout of C, for each of the two parts in turn:
#   [number of bit-bytes] [number of bits used in the first byte]
#   [first, possibly partial, byte] [full 8-bit bytes ...]
# Part 1 (from A): a bit is set when x differs from the previous point's x.
# Part 2 (from B): a bit is set when y equals the previous point's y.
30      function ellipse_precomp(C, A, m::Int, B, n::Int)
31         x = A[1][1]
# A has m - 1 steps between points: split them into a first chunk of r bits
# (1..8) followed by q full bytes
32         r = (m - 1) % 8
33         if r == 0
34            r = 8
35            q = div(m - 9, 8)
36         else
37            q = div(m - r - 1, 8)
38         end
39         C[1] = q + 1
40         C[2] = r
# first chunk: bj is the mask of the current bit, b accumulates the byte
41         bj = UInt8(1)
42         b = UInt8(0)
43         for j = 1:r
44            if A[j + 1][1] != x
45               b += bj
46            end
47            bj <<= 1
48            x = A[j + 1][1]
49         end
50         C[3] = b
# remaining q full bytes of part 1
51         for i = 1:q
52            bj = UInt8(1)
53            b = UInt8(0)
54            for j = 1:8
55               if A[(i - 1)*8 + j + r + 1][1] != x
56                  b += bj
57               end
58               bj <<= 1
59               x = A[(i - 1)*8 + j + r + 1][1]
60            end

…Show last 38 lines

61            C[i + 3] = b
62         end
# v = number of entries of C used by part 1
63         v = q + 3
# part 2 starts from the y of the last point in A, so B yields n steps
# (B[1] is compared with that point)
64         y = A[m][2]
65         r = n % 8
66         if r == 0
67            r = 8
68            q = div(n - 8, 8)
69         else
70            q = div(n - r, 8)
71         end
72         C[v + 1] = q + 1
73         C[v + 2] = r
74         bj = UInt8(1)
75         b = UInt8(0)
# note the opposite sense to part 1: the bit is set when y did NOT change
76         for j = 1:r
77            if B[j][2] == y
78               b += bj
79            end
80            bj <<= 1
81            y = B[j][2]
82         end
83         C[v + 3] = b
# remaining q full bytes of part 2
84         for i = 1:q
85            bj = UInt8(1)
86            b = UInt8(0)
87            for j = 1:8
88               if B[(i - 1)*8 + j + r][2] == y
89                  b += bj
90               end
91               bj <<= 1
92               y = B[(i - 1)*8 + j + r][2]
93            end
94            C[v + i + 3] = b
95         end
# total entries written: part 1 (v) plus part 2 (q + 3)
96         return v + q + 3         
97      end

It would be invoked as follows, for an ellipse with semiradii 100, 84:

1julia> A = Array{Tuple{Int, Int}}(undef, 320);
2
3julia> B = Array{Tuple{Int, Int}}(undef, 320);
4
5julia> C = Array{UInt8}(undef, 40);
6
7julia> m, n = ellipse(A, B, 100, 84)
8(54, 78)
9
10julia> q = ellipse_precomp(C, A, m, B, n)
1121
12
13julia> for i = 1:q
14          print(C[i], ", ")
15       end
167, 5, 0, 8, 34, 74, 85, 237, 190, 10, 6, 8, 16, 41, 85, 173, 109, 119, 223, 223, 255,

YouTube Channel - PCRetroTech

Reply 68 of 85, by wbhart

Posted on 2019-12-31, 09:18

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I've now written assembly code to precompute ellipses. It is actually now slightly faster to first precompute the ellipse then draw it from the precomputed information. This is not terribly surprising given that the precomputed data is used for both halves of the ellipse and no longer needs to be computed twice during ellipse drawing.

So the total time (including precomputation) for drawing an ellipse is now 148 cycles per pixel on a 4.77MHz 8088.

YouTube Channel - PCRetroTech

Reply 69 of 85, by wbhart

Posted on 2020-07-14, 11:17

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

I have made some significant progress on the CGA graphics library I'm writing, after weeks of effort.

I now have pixel perfect ellipse code that runs in 140 cycles per pixel on the 8088 @4.77MHz without turning off interrupts.

On my 10MHz 8088 it's 147 cycles per pixel (I previously called this an 8086 machine, but it transpires that the markings on the chip are just really hard to read), and on the Amstrad PC1512 with 8086 CPU it's 115 cycles per pixel.

There is an additional 10000 cycles set up cost (8000 for the 8086), but this is currently double counted as I do the same set up twice, once for each half of the ellipse. So that setup time will nearly halve when I make a single function out of it. Also there are a couple of multiplications which can be omitted in half the cases and some loops that don't need so many iterations, so I anticipate something more like 4000 cycles set up cost.

The timings are not quite comparable to the previous ones I gave, since I was amortising the setup cost with the per pixel cost in those timings.

There are no memory accesses, no pushes or pops, just short jumps for all the loops. It's as optimal as it can be, essentially, though I do have to save one general purpose register in ES temporarily at one point, due to no longer using the SP register.

The trick to getting all this to work is a set of three small lookup tables to store corrections. The first correction is the number of bits to shift the starting values by so that all decisions can be made on 16 bit comparisons. The other two corrections are for self modifying the code in certain cases so that different inequalities are used (e.g. < instead of <=). This allows the code to produce pixel perfect ellipses, which the previous code did not. I am really, really surprised this was possible without turning off interrupts, especially since I wrote very many versions of the code before I hit on the correct ideas.

Another thing I've been playing with in the graphics library is using an effective resolution of 160x200 by drawing pairs of pixels in the horizontal direction. This has some advantages, e.g. counts now fit in a byte.

The cool thing about the ellipse code is that it should be possible to use 4-way symmetry instead of 2-way symmetry in this resolution, effectively halving the cost to draw ellipses. Of course that will rely on using SP, but still, it's too big an improvement to ignore.

I have coded up lines in this resolution and the horizontalish ones are about 35% faster as well (the verticalish ones are only about 5% faster).

Another cool trick I found is an approximate line drawing algorithm which saves a register. It's not Bresenham any more, but depends on a fixed point approximation. It's faster, but may not be pixel perfect (not that you'd notice). I also believe a similar trick can be done for ellipses, using a pair of fixed point approximations, but I haven't prototyped this yet.

The other thing I've done is write a 16 colour rotozoom in text mode. There are three different versions for various resolutions (one of which will have snow on original hardware of course). The frame rates are really quite high in all of these. I'll be making a video about that next week.

YouTube Channel - PCRetroTech

Reply 70 of 85, by VileR

Posted on 2020-07-16, 12:51

VileR Offline

Rank l33t

Rank: l33t
Posts: 2199
Joined: 2003-05-14, 22:11
Location: 1-01-80 0:00a

Appreciate your ongoing work on this. I haven't had much to say thus far, but this is fantastic. Keep it up!

I remember seeing an effective "160x200" resolution in some CGA games. I always assumed this was simply to save space, or because the graphics were converted from something like low-res PCjr or C64... hadn't considered the speed advantage of making the coordinates fit in a byte.

I guess a related trick is what Mike Abrash did in his early games (fully addressable 320x200 resolution, but the real action was kept to the leftmost 256 pixels of each scanline - the right side of the screen was reserved for the status display).

[ WEB ] - [ BLOG ] - [ TUBE ] - [ CODE ]

Reply 71 of 85, by wbhart

Posted on 2020-07-16, 17:08

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

There's an even more important advantage than making the coordinates fit into a byte, which I didn't mention. You have less computation to do as you move across the screen horizontally.

In the case of drawing a straight line, for example, something like 3/4 of the time is spent computing which pixels to draw, rather than the time spent putting the data into CGA memory. So if you have half as many pixels to "compute", the time goes down substantially.

Another important benefit is fewer cases. You only have two cases per "pixel". It's either in the left half or the right half of the byte. So if your algorithm depends on such cases you end up with half as many cases to deal with, which can mean you fit more cases between jumps. Sometimes you can use short jumps instead of long ones as a result.

So the benefits are really manifold. And it doesn't look too terrible.

Last edited by wbhart on 2020-07-16, 17:43. Edited 1 time in total.

YouTube Channel - PCRetroTech

Reply 72 of 85, by wbhart

Posted on 2020-07-16, 17:22

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

For anyone interested, the new ellipse code is here:

https://github.com/wbhart/CGAGraphics/blob/ma … ast/ellipse.asm

Unfortunately it didn't come down to 4000 cycles for the setup, but the total cost for an ellipse is around 8700 + 140 cycles per pixel. That 8700 seems high, but it's only about 16 pixels worth per quadrant, which means that for medium to large ellipses it is not significant.

I could do some more work to reduce the size of the lookup tables, but other than that, I consider this code finished.

YouTube Channel - PCRetroTech

Reply 73 of 85, by wbhart

Posted on 2020-07-18, 20:57

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

My video about the new pixel perfect ellipse code and high frame rate rotozoom is up:

https://youtu.be/cSxYljs5OxE

YouTube Channel - PCRetroTech

Reply 74 of 85, by wbhart

Posted on 2020-08-16, 16:19

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

At the risk of posting EVERY SECOND VIDEO from my channel....

I have finally achieved 3D rotation (of a tetrahedron) in CGA graphics mode, which is a bit of a milestone for the CGA Graphics library.

If you want to see the video itself, it's here:

https://youtu.be/3DQ7HfGN60s

YouTube Channel - PCRetroTech

Reply 75 of 85, by Zorko

Posted on 2020-11-11, 08:27

Zorko Offline

Rank Newbie

Rank: Newbie
Posts: 7
Joined: 2020-11-11, 07:35

Dear wbhart,

I'm very impressed with your achievements in fast drawing of pixel graphics and sprites, especially the clipping of sprites that extend outside the screen! Especially on such a slow machine.

And I thought: wbhart is the only person who can help me.

I want to port the game from ZX Spectrum to DOS/CGA. The screen there consists of 32x24 colour characters (8x8). If we project this to a CGA resolution, we get a grid of the same 32x24 tiles with a resolution of 10x8 each tile. So I need a sprite output routine that outputs a sprite (with pixel-by-pixel precision) of any height and any width (specified in pixels). I started looking for ready-made solutions on the Internet and found that in fact I need to emulate the standard BGI graphics function "putimage" from Turbo C:

void far putimage(int left, int top, void far *bitmap, int op);

And I would take it ready-made, but everywhere they write about its too low speed.

Image format:

word width (in pixels)
word height (in pixels)
void *data of image

There is no alternating even and odd strings. Although undesirable, it is acceptable.

The list of wishes for the subroutine follows:

1. There is no need for RLE compression or pre-compilation of sprites. In this case, this is unnecessary.
2. Need to be able to output images with a width that is not a multiple of 4 (for example, with a width of 10), i.e. the width of image is set in pixels, not in bytes.
3. Logical operations are also necessary.
4. It should also be interesting to have a version of this routine with out-of-screen output (as in your sprite engine). BGI function can't do it.

I've seen a lot of CGA graphics code, but it usually only outputs sprites that are multiples of 4 in width. So I rather need a subroutine for images than for a full-fledged sprite engine. And I think only you can develop such a procedure. Or just give me some good advice.

Thanks!

Reply 76 of 85, by Zorko

Posted on 2020-11-11, 09:06

Zorko Offline

Rank Newbie

Rank: Newbie
Posts: 7
Joined: 2020-11-11, 07:35

I came up with two tricks for the analog of "putimage" subroutine:

1. In the game Battle City for CGA, I saw a subroutine that is approximately similar to what I need. But that subroutine uses an internal buffer for bit-shifting the line.

http://www.balagurov.com/software/tank

How to do without a buffer? To do this, we need to read two bytes of the sprite line at once, shift the word in a register to the desired position, and output only one resulting byte. After that, we increase the line address by 1 to get one new, and one the same byte of a sprite, and shift it again.

So we will seriously speed up the work due to the lack of a buffer.

2. There is no need to store even and odd lines separately. Just one "go to the next line" operation will be enough, regardless of its parity.

To go from an even line to an odd one, need to increase the screen address by 1FFCh, and to go from an odd line to an even line - decrease by 1FB4h.
We can turn a subtraction operation into an addition operation by adjusting the number (-1FB4h = 0E04Ch).

Thus, we calculate the first number to go to the next line, regardless of its parity. And after adding, we turn one number to add to another. We need to turn the number 1FFCh into 0E04Ch, and 0E04Ch into 1FFCh.

N-1FFCh = 0E04Ch (word)
N-0E04Ch = 1FFCh (word)
N = 48h

Thus, after the increase to go to the next line, we simply adjust the increase value using the operation:

N = 48h - N

Subtraction and addition are quick operations. This way, both code compactness and relative speed will be achieved.

Reply 77 of 85, by Zorko

Posted on 2020-11-11, 11:55

Zorko Offline

Rank Newbie

Rank: Newbie
Posts: 7
Joined: 2020-11-11, 07:35

We can make it even easier:

1        If the first line is odd, GOTO 1$
20$:
3        Draw an even line
4        If there are no more lines, exit
5        Switch to the next odd line
61$:
7        Draw an odd line
8        If there are no more lines, exit
9        Switch to the next even line
10        GOTO 0$

This is a fast and good algorithm. This way, there is really no need to store even and odd lines in separate data blocks.

Reply 78 of 85, by wbhart

Posted on 2020-11-21, 14:20

wbhart Offline

Rank Newbie

Rank: Newbie
Posts: 80
Joined: 2019-08-11, 11:00

Hi zorko,

Thanks for your comments. Sorry that I didn't see your message until today.

I think that a routine to put an image onscreen very fast is a great idea for the CGA library I'm writing. And I agree that sometimes one doesn't want the data format to be too complicated.

I am planning on making a routine for doing text, e.g. for a text scroller, and such a routine would be a good starting point.

I can't promise to do anything very soon, but I'll definitely put this on the list of ideas and work on it as soon as I find some time.

The one problem I see with your idea is that shifting is quite slow on the 8088/8086. It may actually be faster to use a buffer, but this will have to be investigated.

Your idea to use 48h - N is clever. If you have a spare register to hold the 48h then this would be quite quick. If not, one can do -(N - 48h) which is just two instructions, though there's quite a lot of bytes in the "subtraction of immediate value" instruction.

YouTube Channel - PCRetroTech

Reply 79 of 85, by Zorko

Posted on 2020-11-21, 22:14

Zorko Offline

Rank Newbie

Rank: Newbie
Posts: 7
Joined: 2020-11-11, 07:35

Dear wbhart!
Thanks for your reply. We can't do without a shift anyway, since we want to output positions that are not multiples of four.
I developed the subroutine that I roughly wanted. And it can be a starting point for learning. Also I will certainly be very happy with your additions and criticism.

Technical task
Develop a subroutine that outputs a sprite (with pixel accuracy) of any height specified in pixels and any width specified in bytes
So far, there is an implementation only for output with logical operations. It will be necessary to modify the overlay "along the edges".

void PutSpr (int x, int y, void *spr)

Sprite format

Width (in bytes)
Height (in pixels)
Sprite data

1. Calculate the screen address of the first upper byte for output from the x and y coordinates
And the offset for the shift {0..3}.
If the offset =0, the byte is output unchanged, if 1, with a shift of 1 pixel to the right, and so on.
Convert (*2) offset to the number of bits to shift: 0=>0; 1=>2; 2=>4; 3=>6

2. Get from the sprite address the length and height of the sprite, save in registers

3. Draw the sprite line height times. Take into account the specifics of the CGA with its two screen memory planes, using the algorithm described above.

Implementation for CGA (Turbo C):

/* GrApp_PutSpr: ORs a sprite into CGA video memory (segment 0B800h) at pixel
   position (x, y). The sprite header is read with a single LODSW: low byte =
   width in bytes, high byte = height in lines; the pixel data follows.
   This listing holds the address set-up and the even/odd line loop. */
1void GrApp_PutSpr (unsigned int x, unsigned int y, void *spr) { /* Draw CGA sprite */
2  asm       MOV  BX, x
3  asm       MOV  CL, BL
4  asm       AND  CL, 3
5  asm       SHL  CL, 1      /* CL = number of bits to shift: (x & 3) * 2 */
6  asm       MOV  AX, y
/* The XCHG puts the low byte of y in AH and the high byte in AL, so the
   arithmetic below is only correct when y < 256. */
7  asm       XCHG AH, AL
8  asm       SHR  AX, 1      /* AH = y / 2, AL = 80h if y is odd */
9  asm       ADD  BH, AL     /* odd line: BX += 8000h (becomes 2000h after the /4 below) */
10  asm       XOR  AL, AL     /* AX = (y / 2) * 256 */
11  asm       ADD  BX, AX
12  asm       SHR  AX, 1
13  asm       SHR  AX, 1      /* AX = (y / 2) * 64, so the two ADDs total (y / 2) * 320 */
14  asm       ADD  BX, AX
15  asm       SHR  BX, 1
16  asm       SHR  BX, 1      /* BX = byte offset: (y / 2) * 80 + x / 4, plus 2000h on odd lines */
17  asm       MOV  AX, 0B800H
18  asm       MOV  ES, AX     /* ES:BX = screen address */
19  asm       MOV  SI, spr    /* DS:SI = sprite address */
20  asm       CLD
21  asm       LODSW           /* len */
22  asm       XCHG DX, AX     /* DL = len; DH = hgt */
23  asm       MOV  DI, BX
24  asm       CMP  BH, 20H    /* offset >= 2000h means the first line is odd */
25  asm       JNC  ODD
26      EVEN:
/* @DRAWLINE ends with DEC DH, so ZF is set on return once all lines are drawn */
27  asm       CALL @DRAWLINE  /* Draw even line */
28  asm       JZ   EXIT
29  asm       ADD  DI, 2000H  /* same offset in the odd-line bank */
30      ODD:
31  asm       CALL @DRAWLINE  /* Draw odd line */
32  asm       JZ   EXIT
33  asm       SUB  DI, 1FB0H  /* back by 2000h, forward by 80 bytes (50h): next even line */
34  asm       JMP  SHORT EVEN

The sprite line output:

1    Remember the screen address and length of the sprite on the stack
2    Get a byte of sprite data. Save it (in the conditional register AL)
3    Copy of the register AL shift to the desired position {0..3} and output it on the screen
4    Increase the screen address
5    Decrease the width of the sprite (in bytes)
6    If width = 0 GOTO Last_byte
7DRAWBYTE:
8    Get a byte of sprite data
9    Construct from the previously stored and new byte the new word for the next shift
10    Remember the new byte instead of the old one
11    Shift a two-byte word to the desired position {0..3} and output it on the screen
12    Increase the screen address
13    Decrease the width of the sprite (in bytes)
14    If the width is not 0 GOTO DRAWBYTE
15Last_byte:
16    If shift = 0 (the entire byte is already output by the first output, and it doesn't need to be output anymore),
17        then GOTO Exit
18    Construct a new word from the stored byte and zero (the last word in the line)
19    Output it on the screen
20Exit:
21    Restore the length of the sprite and the screen address from the stack, decrease the height of the sprite
22    RETURN

/* @DRAWLINE: ORs one sprite line into the screen at ES:DI, shifted right by
   CL bits. In: DS:SI = sprite data for this line, DL = width in bytes,
   DH = lines remaining, CL = shift (0, 2, 4 or 6). Out: SI advanced past the
   line, DI and DL restored, DH decremented with ZF set when it reaches 0.
   Uses AX and BX as scratch. */
1  asm @DRAWLINE:
2  asm       PUSH DI
3  asm       PUSH DX
/* first byte: only its own bits, shifted right, are written */
4  asm       LODSB
5  asm       MOV  BL, AL
6  asm       SHR  BL, CL     /* 011100.10 => XX.011100 */
7  asm       OR   ES:[DI], BL
8  asm       INC  DI
9  asm       DEC  DL
10  asm       JZ   DRAWLAST
11      DRAWBYTE:
/* AH = previous sprite byte, AL = new one; shifting the 16-bit pair brings
   the bits that fell off the previous byte into the top of BL */
12  asm       MOV  AH, AL
13  asm       LODSB
14  asm       MOV  BX, AX
15  asm       SHR  BX, CL     /* 011100.10 11111111 => 10.111111 */
16  asm       OR   ES:[DI], BL
17  asm       INC  DI
18  asm       DEC  DL
19  asm       JNZ  DRAWBYTE
20      DRAWLAST:
/* trailing byte: the last sprite byte paired with zero; skipped when the
   shift is 0 because nothing spilled over */
21  asm       MOV  AH, AL
22  asm       XOR  AL, AL
23  asm       CMP  AL, CL
24  asm       JZ   ZEROLAST
25  asm       SHR  AX, CL     /* 111111.11 => 10.111111 */
26  asm       OR   ES:[DI], AL
27      ZEROLAST:
28  asm       POP  DX
29  asm       POP  DI
/* DEC DH leaves ZF for the caller's JZ EXIT (RET does not change flags) */
30  asm       DEC  DH
31  asm       RET
32      EXIT:;
33} /*GrApp_PutSpr*/

Loop through output lines
Output the left byte (for PUT - with clipping, for AND/OR/XOR - with overlay)
Output all middle bytes (there can be 0)
Output the right byte (for PUT - with clipping, for AND/OR/XOR - with overlay)

Main menu

Topic actions

Reply 60 of 85, by wbhart

Reply 61 of 85, by wbhart

Reply 62 of 85, by wbhart

Reply 63 of 85, by wbhart

Reply 64 of 85, by wbhart

Reply 65 of 85, by wbhart

Reply 66 of 85, by wbhart

Reply 67 of 85, by wbhart

Reply 68 of 85, by wbhart

Reply 69 of 85, by wbhart

Reply 70 of 85, by VileR

Reply 71 of 85, by wbhart

Reply 72 of 85, by wbhart

Reply 73 of 85, by wbhart

Reply 74 of 85, by wbhart

Reply 75 of 85, by Zorko

Reply 76 of 85, by Zorko

Reply 77 of 85, by Zorko

Reply 78 of 85, by wbhart

Reply 79 of 85, by Zorko