2003-7-26 Release \ VOGONS

Reply 1 of 23, by ih8registrations

Posted on 2003-07-26, 17:53

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

You know what's coming: more optimization :)

1INLINE Bit16s MidiChannel::getPitchEnvelope(dpoly::partialStatus *pStat, dpoly *poly, bool inDecay) {
2	Bit32u sampoff;
3	patchCache *tcache = &pcache[pStat->partNum];
4
5	Bit16s tc;
6
7	pStat->pitchsustain = false;  
8
9	if(inDecay) {
10		if(pStat->isDecayed || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
11			tc = tcache->pitchEnv.level[4];
12			pStat->prevlevel[PITCHENV] = tc; 
13			return tc;                       
14		}
15	} else {
16
17		if(pStat->envstat[PITCHENV]==2) {
18			tc =tcache->pitchEnv.level[3];
19
20			if(tcache->sustain)   			
21				pStat->pitchsustain = true    
22			else	
23				StartDecay(PITCHENV, tcache->pitchEnv.level[3], pStat, poly);
24
25			pStat->prevlevel[PITCHENV] = tc;
26			return tc; 
27		} else {
28
29			if((pStat->envstat[PITCHENV]==-1) || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
30				pStat->envbase[PITCHENV] = tcache->pitchEnv.level[pStat->envstat[PITCHENV]+1];
31				pStat->envstat[PITCHENV]++;
32
33				pStat->envpos[PITCHENV] = 0;
34				pStat->envsize[PITCHENV] = (envtimetable[tcache->pitchEnv.time[pStat->envstat[PITCHENV]]] * fildeptable[tcache->pitchEnv.timekeyfollow][poly->freqnum]) >> 8;
35				pStat->envsize[PITCHENV]++;
36				pStat->envdist[PITCHENV] = tcache->pitchEnv.level[pStat->envstat[PITCHENV]+1] - pStat->envbase[PITCHENV];
37			}
38
39		}
40
41	}
42	tc = pStat->envbase[PITCHENV]; 
43	tc = (tc + ((pStat->envdist[PITCHENV] * pStat->envpos[PITCHENV]) / pStat->envsize[PITCHENV]));
44
45	pStat->prevlevel[PITCHENV] = tc;
46	return tc; 
47
48}

Two extra
'pStat->prevlevel[PITCHENV] = tc;
return tc; '

are still shorter than one extra
'tc = pStat->envbase[PITCHENV];
tc = (tc + ((pStat->envdist[PITCHENV] * pStat->envpos[PITCHENV]) / pStat->envsize[PITCHENV]));'

Size optimization: assign false to pStat->pitchsustain by default, since there is only one case where it is true. The cost of the default assignment is offset by the jump saved by the immediate 'return tc', and because that happens more than once, the updated function is still faster overall.

Reply 2 of 23, by canadacow

Posted on 2003-07-26, 19:56

canadacow Offline

Rank Member

Rank: Member
Posts: 466
Joined: 2003-05-11, 23:00
Location: United States

At least in Visual C, immediate returns are no more optimized than just letting the code run to the end of the routine. The second divide calculation was there to handle the decaying side of the envelope. Once decayed, the code should not be allowed into the standard block because once the envelope position (envpos) extends past the envelope size (envsize) the code then moves to the next part of the envelope. The final decay, of course, is the end of the line. This is still needed because even though the pitchenv could be complete in its decay, the other two envelopes (amplitude and filter) could still be far from complete decay. For informational purposes, here's Visual C's generated assembly code for this routine:

1; 1284 : INLINE Bit16s MidiChannel::getPitchEnvelope(dpoly::partialStatus *pStat, dpoly *poly, bool inDecay) {
2
3	push	ebx
4	push	esi
5
6; 1285 : 	Bit32u sampoff;
7; 1286 : 	patchCache *tcache = &pcache[pStat->partNum];
8
9	mov	esi, DWORD PTR _pStat$[esp+4]
10	mov	eax, DWORD PTR [esi+284]
11	imul	eax, 4008				; 00000fa8H
12
13; 1287 : 
14; 1288 : 	Bit16s tc;
15; 1289 : 	pStat->pitchsustain = false;
16
17	xor	ebx, ebx
18
19; 1290 : 	if(inDecay) {
20
21	cmp	BYTE PTR _inDecay$[esp+4], bl
22	push	edi
23	lea	eax, DWORD PTR [eax+ecx+262248]
24	mov	BYTE PTR [esi+280], bl
25	je	SHORT $L68822
26
27; 1291 : 
28; 1292 : 		if((pStat->isDecayed) || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
29
30	cmp	BYTE PTR [esi+144], bl
31	jne	SHORT $L68824
32	mov	ecx, DWORD PTR [esi+20]
33	mov	edi, DWORD PTR [esi+84]
34	cmp	ecx, edi
35	jge	SHORT $L68824
36
37; 1294 : 		} else {
38; 1295 : 			tc = pStat->envbase[PITCHENV];
39; 1296 : 			tc = (tc + ((pStat->envdist[PITCHENV] * pStat->envpos[PITCHENV]) / pStat->envsize[PITCHENV]));		
40
41	mov	eax, DWORD PTR [esi+68]
42	imul	eax, ecx
43	cdq
44	idiv	edi
45	mov	edi, eax
46	add	di, WORD PTR [esi+52]
47	jmp	$L68826
48$L68824:
49
50; 1293 : 			tc = tcache->pitchEnv.level[4];
51
52	movsx	di, BYTE PTR [eax+119]
53
54; 1297 : 		}
55; 1298 : 	} else {
56
57	jmp	$L68826
58$L68822:
59
60; 1299 :

…Show last 132 lines

61; 1300 : 		if(pStat->envstat[PITCHENV]==2) {
62
63	mov	edx, DWORD PTR [esi+36]
64	cmp	edx, 2
65	jne	SHORT $L68827
66
67; 1301 : 			if(tcache->sustain) {
68
69	cmp	BYTE PTR [eax+24], bl
70	je	SHORT $L68828
71
72; 1302 : 				tc =tcache->pitchEnv.level[3];
73
74	movsx	di, BYTE PTR [eax+118]
75
76; 1303 : 				pStat->prevlevel[PITCHENV] = tc;
77
78	movsx	eax, di
79	mov	DWORD PTR [esi+136], eax
80
81; 1304 : 				pStat->pitchsustain = true;
82
83	mov	BYTE PTR [esi+280], 1
84
85; 1305 : 			} else {
86
87	jmp	SHORT $L68830
88$L68828:
89
90; 1306 : 				tc =tcache->pitchEnv.level[3];
91
92	mov	al, BYTE PTR [eax+118]
93
94; 1307 : 				StartDecay(PITCHENV, tcache->pitchEnv.level[3], pStat, poly);
95
96	push	DWORD PTR _poly$[esp+8]
97	movsx	di, al
98	movsx	eax, al
99	push	esi
100	push	eax
101	push	2
102	call	?StartDecay@MidiChannel@@QAEXHJPAUpartialStatus@dpoly@@PAU3@@Z ; MidiChannel::StartDecay
103
104; 1308 : 			}
105; 1309 : 
106; 1310 : 		} else {
107
108	jmp	SHORT $L68830
109$L68827:
110
111; 1311 : 
112; 1312 : 			if((pStat->envstat[PITCHENV]==-1) || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
113
114	cmp	edx, -1
115	je	SHORT $L68832
116	mov	ecx, DWORD PTR [esi+20]
117	cmp	ecx, DWORD PTR [esi+84]
118	jl	SHORT $L68831
119$L68832:
120
121; 1313 : 				pStat->envbase[PITCHENV] = tcache->pitchEnv.level[pStat->envstat[PITCHENV]+1];
122
123	movsx	ecx, BYTE PTR [edx+eax+116]
124	mov	DWORD PTR [esi+52], ecx
125
126; 1314 : 				pStat->envstat[PITCHENV]++;
127
128	lea	ecx, DWORD PTR [edx+1]
129
130; 1315 : 
131; 1316 : 				pStat->envpos[PITCHENV] = 0;
132; 1317 : 				pStat->envsize[PITCHENV] = (envtimetable[tcache->pitchEnv.time[pStat->envstat[PITCHENV]]] * fildeptable[tcache->pitchEnv.timekeyfollow][poly->freqnum]) >> 8;
133; 1318 : 				pStat->envsize[PITCHENV]++;
134
135	mov	edx, DWORD PTR _poly$[esp+8]
136	mov	DWORD PTR [esi+36], ecx
137	add	ecx, eax
138	mov	DWORD PTR [esi+20], ebx
139	movsx	eax, BYTE PTR [eax+110]
140	shl	eax, 7
141	add	eax, DWORD PTR [edx+12]
142	movsx	edx, BYTE PTR [ecx+111]
143	mov	eax, DWORD PTR _fildeptable[eax*4]
144	imul	eax, DWORD PTR _envtimetable[edx*4]
145	sar	eax, 8
146	inc	eax
147	mov	DWORD PTR [esi+84], eax
148
149; 1319 : 				pStat->envdist[PITCHENV] = tcache->pitchEnv.level[pStat->envstat[PITCHENV]+1] - pStat->envbase[PITCHENV];
150
151	movsx	eax, BYTE PTR [ecx+116]
152	sub	eax, DWORD PTR [esi+52]
153	mov	DWORD PTR [esi+68], eax
154$L68831:
155
156; 1320 : 			}
157; 1321 : 
158; 1322 : 			tc = pStat->envbase[PITCHENV];
159; 1323 : 			tc = (tc + ((pStat->envdist[PITCHENV] * pStat->envpos[PITCHENV]) / pStat->envsize[PITCHENV]));
160
161	mov	eax, DWORD PTR [esi+68]
162	imul	eax, DWORD PTR [esi+20]
163	cdq
164	idiv	DWORD PTR [esi+84]
165	mov	edi, eax
166	add	di, WORD PTR [esi+52]
167$L68830:
168
169; 1324 : 
170; 1325 : 		}
171; 1326 : 		pStat->prevlevel[PITCHENV] = tc;
172
173	movsx	eax, di
174	mov	DWORD PTR [esi+136], eax
175$L68826:
176
177; 1327 : 
178; 1328 : 
179; 1329 : 	}
180; 1330 : 	
181; 1331 : 	return tc; 
182
183	mov	ax, di
184	pop	edi
185	pop	esi
186	pop	ebx
187
188; 1332 : 
189; 1333 : }
190
191	ret	12					; 0000000cH

Reply 3 of 23, by ih8registrations

Posted on 2003-07-27, 01:33

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Fair enough. As for the second calculation, it sounds like you think I only applied the divide calculation to the first case? Barring a bug, the rewrite is functionally equivalent. The handling of both cases was moved outside of the if/else structure and became the general case. The other cases should return without hitting it.

Is the asm readout of my rewrite? It looks like a mix of old and new; the old version's duplicate divide calculation is in there.

and for my code snippet here:

1	if(inDecay) {
2		if(pStat->isDecayed || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
3			tc = tcache->pitchEnv.level[4];
4			pStat->prevlevel[PITCHENV] = tc; 
5			return tc;                       
6		}

It looks like it's not setting pStat->prevlevel[PITCHENV] for this case; see 1296's jump to L68826. To match what I wrote, it would need to jump to L68830. The closing brace at 1329 is part of the problem, as 1326 should not be inside the else.

If that's Visual C's interpretation of my code, I'm not impressed: it goes against what I told it to do by reversing my size optimization, and it introduces a bug :P

C does give you the power to strictly tell the compiler what to do by way of the goto statement. It's frowned upon in polite society, but as you can see it's what the compiler's doing anyway, and it's the most direct way to specify forward jumps in C.

1INLINE Bit16s  MidiChannel::getPitchEnvelope(dpoly::partialStatus
2 *pStat, dpoly *poly, bool inDecay) {
3	Bit32u sampoff;
4	patchCache *tcache = &pcache[pStat->partNum];
5
6	Bit16s tc;
7
8	pStat->pitchsustain = false;  
9
10	if(inDecay) {
11		if(pStat->isDecayed || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
12			tc = tcache->pitchEnv.level[4];
13			goto dowhatisay;                       
14		}
15	} else {
16
17		if(pStat->envstat[PITCHENV]==2) {
18			tc =tcache->pitchEnv.level[3];
19
20			if(tcache->sustain)   			
21				pStat->pitchsustain = true    
22			else	
23				StartDecay(PITCHENV, tcache->pitchEnv.level[3], pStat, poly);
24			goto dowhatisay;
25		} else {
26
27			if((pStat->envstat[PITCHENV]==-1) || (pStat->envpos[PITCHENV] >= pStat->envsize[PITCHENV])) {
28				pStat->envbase[PITCHENV] = tcache->pitchEnv.level[pStat->envstat[PITCHENV]+1];
29				pStat->envstat[PITCHENV]++;
30
31				pStat->envpos[PITCHENV] = 0;
32				pStat->envsize[PITCHENV] = (envtimetable[tcache->pitchEnv.time[pStat->envstat[PITCHENV]]] * fildeptable[tcache->pitchEnv.timekeyfollow][poly->freqnum]) >> 8;
33				pStat->envsize[PITCHENV]++;
34				pStat->envdist[PITCHENV] = tcache->pitchEnv.level[pStat->envstat[PITCHENV]+1] - pStat->envbase[PITCHENV];
35			}
36
37		}
38
39	}
40	tc = pStat->envbase[PITCHENV]; 
41	tc = (tc + ((pStat->envdist[PITCHENV] * pStat->envpos[PITCHENV]) / pStat->envsize[PITCHENV]));
42 dowhatisay:
43	pStat->prevlevel[PITCHENV] = tc;
44	return tc; 
45
46}

How to tell Visual c to do the ending tc calc just once, but that I really mean it this time, I'm unsure. It should be doing what I tell it to as is.

P.S. You may have noticed that all the indexed references of the form pStat->envXXX[idx] use four-byte addressing + a four-byte base + a one-byte immediate, and when stored are put into a four-byte register. For size optimization, if there are three or more references to XXX without modifying it, copying it to a temp variable before use will save space. If it is modified, it takes six or more references to save. stat qualifies, but only just; it would save a whole two bytes :)

Last edited by ih8registrations on 2003-07-27, 11:22. Edited 1 time in total.

Reply 4 of 23, by ih8registrations

Posted on 2003-07-27, 11:07

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

In InitTables from the MidiHandler_mt32 class, merging the following loops saves 31k cycles.

1                for(dep=0;dep<=100;dep++) {
2                        for(velt=0;velt<128;velt++) {
3                                float fdep = ((float)dep / 100.0) * 256;
4                                float fv = (velt - 64.0) / 64.0;
5                                tempdep = 256.0 + (fdep * fv);
6                                filveltable[velt][dep] = (int)tempdep;
7                                //LOG_MSG("Filvel dep %d velt %d = %x", dep, velt, filveltable[velt][dep]);
8                        }
9                }
10
11                float lfp, depf, finalval;
12                int depat, pval;
13
14                for(lf=0;lf<=100;lf++) {
15                        // I believe the depth is cubed or something
16                        lfp = pow(((float)lf / 100.0),3);
17                        // Maybe its not
18                        // lfp = (float)lf / 100.0;
19                                                                                                                                             
20                        for(depat=0;depat<=100;depat++) {
21                                depf = ((float)depat - 50.0) / 50.0;
22                                finalval = pow(2, lfp * depf * .25);
23                                pval = (int)(finalval * 256);
24                                                                                                                                             
25                                lfoptable[lf][depat] = pval;
26                                                                                                                                             
27                                //LOG_MSG("lf %d depat %d pval %x", lf,depat,pval);
28                                                                                                                                             
29                        }
30                }
31

1                float lfp, depf, finalval;
2                int depat, pval;
3                for(lf=0;lf<=100;lf++) {
4                        // I believe the depth is cubed or something
5                        lfp = pow(((float)lf / 100.0),3);
6                        // Maybe its not
7                        // lfp = (float)lf / 100.0;
8                                                                                                                                             
9                        for(depat=0;depat<=100;depat++) {
10                                depf = ((float)depat - 50.0) / 50.0;
11                                finalval = pow(2, lfp * depf * .25);
12                                pval = (int)(finalval * 256);
13                                                                                                                                             
14                                lfoptable[lf][depat] = pval;
15                                                                                                                                             
16                                //LOG_MSG("lf %d depat %d pval %x", lf,depat,pval);
17                                                                                                                                             
18                                float fdep = ((float)lf / 100.0) * 256;
19                                float fv = (depat - 64.0) / 64.0;
20                                tempdep = 256.0 + (fdep * fv);
21                                filveltable[depat][lf] = (int)tempdep;
22                                //LOG_MSG("Filvel dep %d velt %d = %x", dep, velt, filveltable[velt][dep]);
23                        }
24                        for(velt=101;velt<128;velt++) {
25                                float fdep = ((float)lf / 100.0) * 256;
26                                float fv = (velt - 64.0) / 64.0;
27                                tempdep = 256.0 + (fdep * fv);
28                                filveltable[velt][lf] = (int)tempdep;
29                                //LOG_MSG("Filvel dep %d velt %d = %x", dep, velt, filveltable[velt][dep]);
30                        }
31                }

Merging the outside loop saves 100 cmp, inc, & jmps, as well as probably a mov, since there's probably enough going on to need reloading the counter. That's the 1k. The 30k comes from merging the two inner loops for 100 iterations, again saving a cmp, inc & jmp, probably not a mov, *100 inner * 100 outer. If we lowball & say they all take only one cycle (possible, ignoring potential stalls and other effects), then it's 3*100*100 + outer 1k; 31k. There are several other outside loops in InitTables that can be merged, and some other tweaks, for about another 5k or so that I saw, but this is the biggest savings to be had. The cost is the four lines of duplicated code, but for 31k cycles, I can live with that :)

Reply 5 of 23, by ih8registrations

Posted on 2003-07-27, 11:43

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

I didn't look closely enough; here's nearly 7k easily saved right here:

1                int period = 256;
2                float angdelt = (360 / (float)period) * (PI / 180);
3
4                float angval = 0;
5                for(int ang=0;ang<period;ang++) {
6                                                                                                                                             
7                        int halfang = (period / 2);
8                        int quartang = (period / 4);
9                        int angval = ang % quartang;
10                        float tval = (float)angval / ((float)quartang);
11                        if(ang<=quartang) sintable[ang] = (int)(tval * 256);
12                        else if ((ang<=halfang) && (ang>quartang)) sintable[ang] = (int)((1.0-tval) * 256);
13                        else if ((ang>halfang) && (ang<=(quartang+halfang))) sintable[ang] = (int)(tval * -256);
14                        else if (ang>(quartang+halfang)) sintable[ang] = (int)((1.0-tval) * -256);
15                        sintable[period/4] = 256;
16                        sintable[period/2] = 0;
17                        sintable[(period*3)/4] = -256;
18                                                                                                                                             
19                        //LOG_MSG("Lfo ang %d = value %d", ang, sintable[ang]);
20                                                                                                                       
21
22                        sintable[period] *= 50;
23
24                }
25//                for(ang=0;ang<period;ang++) sintable[period] *= 50;
26                int velt, dep;
27                float tempdep;
28
29                for(velt=0;velt<128;velt++) {
30                       veltkeytable[0][velt] = 256;
31                        for(dep=1;dep<5;dep++) {
32//                                if(dep>0) {
33                                        float ff = ((float)f) / (5 - dep) ;
34                                                                                                                                             
35                                        tempdep = 256.0 - (ff);
36                                        veltkeytable[dep][velt] = (int)tempdep;
37                                        // Crap... parameters not right yet
38                                        //veltkeytable[dep][velt] = 256;
39//                               } else {
40//                                       veltkeytable[dep][velt] = 256;
41//                                }
42                        }
43                }
44

Added elses: 3.5k; removed the dep>0 check: 2.5k; single-line loop: 768. There's easily another trimmable 2k around to round the cycles saved up to 40k.

Last edited by ih8registrations on 2003-07-27, 11:50. Edited 1 time in total.

Reply 6 of 23, by ih8registrations

Posted on 2003-07-27, 12:18

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Changing the beginning of PlayMsg to what's below saves 10 cycles when chan>8.

1        void PlayMsg(Bit32u msg) {
2                int chan = msg & 0xf;
3                isEnabled= true;
4                //if(chan!=0x9) {
5                //      if(chan==12) return;
6                //      chan = chan & 0x7;
7                //
8                //} else {
9                //      chan = 8;
10                //}
11                //if (chan==0) return;
12                //int prechan = chan;
13                //if(code!=0xf0) LOG_MSG("Playing chan %d, code 0x%x note: 0x%x", chan, code, note);
14                                                                                                                                             
15                chan = chantable[chan];
16                if(chan>8) return;
17                //LOG_MSG("Play msg on unreg chan: %d = %d", chan, msg & 0xf);
18                if(chan<0) {
19                        //LOG_MSG("Play msg on unreg chan: %d = %d", chan, msg & 0xf);
20                        return;
21                                                                                                                                             
22                }
23                int h;
24                int code = msg & 0xf0;
25                int note = (msg & 0xff00) >> 8;
26                int velocity = (msg & 0xff0000) >> 16;
27

as well, for case 0xc0: of PlayMsg
remove 'if((chan>=0) && (chan<8))' as it's unnecessary.

Reply 7 of 23, by ih8registrations

Posted on 2003-07-27, 12:38

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Oi, in PlaySysex, all the range checks

if ((addr>=0x00000) && (addr<0x30000))

following the initial one should be 'else if's, to avoid needlessly doing all the following checks once the matching range has already been found & executed.

Reply 8 of 23, by canadacow

Posted on 2003-07-27, 22:00

canadacow Offline

Rank Member

Rank: Member
Posts: 466
Joined: 2003-05-11, 23:00
Location: United States

Wow... thanks for all the updates. I'm having trouble keeping up. As for the pitch envelope, it needs those duplicate divs because one manages the attack form of the envelope while the other one manages the decay form. Thanks again for your changes. I'm not too incredibly worried about the table generation. On my Celeron 1333MHz it takes about half a second to generate all the tables--with most of this being consumed by the table generation for the lowpass filter. The real area of concern is the main processing area, the getSample routine. It's in that subroutine where optimizations will be most valuable.

Reply 9 of 23, by ih8registrations

Posted on 2003-07-28, 06:50

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

OK, here goes:

1INLINE short MidiChannel::getSample(short *lspecial, short *rspecial) {
2
3	int t, m, c, loc, pcm, partplay;
4	dpoly *tmppoly; 
5
6	//if(!isRy) return 0; 
7	//if ((this->channum<2)  || (this->channum>8)) return 0;
8	//if(this->channum!=1) return 0;
9
10	if(isRy) 
11		partplay = DRUMPOLY
12 	else    partplay = DPOLY;
13
14	for (m=0;m<partplay;m++) {
15		Bit16s envval;
16		Bit16s ampval;
17		tmppoly = &notepoly[m];
18		if(tmppoly->isPlaying || tmppoly->isDecay) {
19			int ptemp[5];
20			memset(ptemp,0,sizeof(ptemp));
21			bool isDone = true;
22
23			for(t=0;t<4;t++) { 
24				patchCache *tcache = &pcache[t];
25				dpoly::partialStatus *partCache = &tmppoly->pStatus[t];
26				if(isRy) tcache = &drumCache[tmppoly->pcmnum][t];
27	
28				if((tcache->playPartial) && (!partCache->isDecayed)) {
29					isDone = false;
30					// Calculate TVA envelope
31					ampval = getAmpEnvelope(partCache,tmppoly,partCache->decaying[AMPENV]);
32					ampval = amptable[ampval];
33					int tmpvel = tmppoly->vel;
34					if(tcache->ampenvdir==1) tmpvel = 127-tmpvel;
35					ampval = (ampval * ampveltable[tmpvel][tcache->ampEnv.velosens]) >> 8;
36
37					// Calculate Pitch envelope
38					envval = getPitchEnvelope(partCache,tmppoly,partCache->decaying[PITCHENV]);
39					//if(envval<-50) envval=-50;
40					//if(envval>50) envval=50;
41					//envval += 50;
42					int pdep = penvtable[tcache->pitchEnv.depth][envval];
43
44					// Calculate LFO position
45					// LFO does not kick in until pitch envelope sustains
46					int lfoat;
47					if((tcache->lfodepth>0) && (partCache->pitchsustain))  {
48						if(partCache->lfopos>=tcache->lfoperiod) 
49							partCache->lfopos = 0;
50						else    partCache->lfopos++;
51
52						lfoat = (partCache->lfopos << 8) / tcache->lfoperiod;
53						lfoat = lfoptable[tcache->lfodepth][((sintable[lfoat]) >> 8)+50];
54						//LOG_MSG("lfodepth %d, lfoatr %d, lfoat %x period %d pos %d",tcache->lfodepth,lfoatr, lfoat, tcache->lfoperiod, tmppoly->lfopos);
55					} else  lfoat = 0x100;
56
57					// Get waveform - either PCM or synthesized sawtooth or square
58					soundaddr *pOff = &partCache->partialOff;
59					int delta = 0x10000, noteval = partCache->noteval;
60					if (tcache->PCMPartial) {

…Show last 129 lines

61						// PCM partial
62						if(tcache->rawPCM>53) { 
63							if(tcache->rawPCM>=74) {
64								if (partCache->PCMDone) {
65									pOff->pcmabs =0;
66									partCache->PCMDone = false;
67								}
68								pcm = PCMReassign[tcache->rawPCM - 74];
69							} else  pcm = PCMReassign[tcache->rawPCM - 54];
70						} else          pcm = tcache->convPCM;
71
72						delta = wavtabler[pcm][noteval];
73
74						if (!partCache->PCMDone) {
75							int ra, rb, addr = PCM[pcm].addr;
76							if(delta<0x10000) {
77								// Linear sound interpolation
78								ra = romfile[addr + pOff->pcmoffs.pcmplace];
79								rb = romfile[addr + pOff->pcmoffs.pcmplace+1];
80								ptemp[t] = (ra + (((rb-ra) * pOff->pcmoffs.pcmoffset) >>16));
81							} else 
82								ptemp[t] = romfile[addr + pOff->pcmoffs.pcmplace];
83
84							if ((pOff->pcmoffs.pcmplace) >=PCM[pcm].len) {
85								if(PCM[pcm].loop) 
86									pOff->pcmabs = 0
87								else    partCache->PCMDone = true;
88							}
89						}
90					} else {
91						// Synthesis partial
92						int divis, ofs3, toff, wf;
93
94						toff = pOff->pcmoffs.pcmplace;
95						divis = divtable[noteval]>>15;
96						
97						if(pOff->pcmoffs.pcmplace>=divis) pOff->pcmabs = (pOff->pcmoffs.pcmoffset % divis);
98						
99						if(tcache->waveform == 0) {
100							// Square waveform.  Made by combining two pregenerated bandlimited 
101							// sawtooth waveforms
102							int divmark = divtable[noteval]>>8;
103
104							ofs3 = (toff + ((divmark*pulsetable[tcache->pulsewidth])>>16)) % (divis >> 1);
105							
106							ptemp[t] = waveforms[0][noteval][toff % (divis >> 1)] + waveforms[1][noteval][ofs3];
107						} else {
108							// Sawtooth.  Made by combining the full cosine and half cosine according
109							// to how the MT-32 does it.  This is identical to the MT-32's operation
110							wf = 2;
111							if(toff >= sawtable[noteval][tcache->pulsewidth]) wf++;
112							ptemp[t] = waveforms[wf][noteval][toff];
113						}
114						ptemp[t] = getFiltEnvelope(ptemp[t],partCache,tmppoly,partCache->decaying[FILTENV]);
115					}
116					// Build delta for position of next sample
117					delta = (delta * finetable[tcache->fineshift])>>8;
118					delta = (delta * pdep)>>8;
119					delta = (delta * lfoat)>>8;
120					
121					// Add calculated delta to our waveform offset
122					pOff->pcmabs+=delta;
123
124					// Put volume envelope over generated sample
125					ptemp[t] = (ptemp[t] * (int)ampval * (int)v) >> 14;
126
127					for(int envnum=0;envnum<3;envnum++) partCache->envpos[envnum]++;
128				}
129			}
130			if(isDone) {
131				tmppoly->isPlaying = false;
132				tmppoly->isDecay = false;
133			}
134			// Post process partials and bring them together
135			int temps, s1, s2, i = 0;
136			*lspecial = *rspecial = 0;
137			for(int z=0;z<2;z++) {
138				if(z==0) {
139					temps = mt32ram.params.patch[patch].common.pstruct12; 
140					s1=0;
141					s2=1;
142				} else {
143					temps = mt32ram.params.patch[patch].common.pstruct34;
144					s1=2;
145					s2=3;
146				}
147				if(!pcache[s1].playPartial) s1=4;
148				if(!pcache[s2].playPartial) s2=4;
149				//LOG_MSG("z %d ps %d, s1 %d s2 %d", z, temps, s1, s2);
150
151				temps = PartMixStruct[temps];
152				
153				switch(temps) {
154				case 0: 
155					// Standard sound mix
156					i+=ptemp[s1] + ptemp[s2];
157					break;
158				case 1:
159					// Ring modulation with sound mix 
160					i+=(((ptemp[s1] * ptemp[s2])>>WGAMP) + ptemp[s1]); 
161					break;
162				case 2:
163					// Ring modulation alone 
164					i+=((ptemp[s1] * ptemp[s2])>>WGAMP);
165					break;
166				case 3:
167					// Stereo mixing.  One partial to one channel, one to another.
168					*lspecial += ptemp[s1];
169					*rspecial += ptemp[s2];
170				default:
171					i+=ptemp[s1] + ptemp[s2];
172					break;
173				}
174			}
175			if (!isRy) {
176				// Mix standard tibre
177				c += i;
178			} else {
179				c = 0;
180				// Drums have their special, built in panpot locations
181				*lspecial += ((i * drumPan[tmppoly->pcmnum][0]) >> 8);
182				*rspecial += ((i * drumPan[tmppoly->pcmnum][1]) >> 8);
183			}
184			//tmppoly->pcmoff.pcmabs +=tmppoly->pcmdelta;
185		}
186	}
187	return c;
188}

/*
got rid of blank lines used as whitespace; indentation suffices, and it's easier to trace with more on a page
partplay = DPOLY made part of the if/else rather than setting it and then overriding it if isRy
moved int i, r init to where they are used
moved *lspecial = *rspecial = 0 to where they are used, same place as int i;
removed int x, shitguard, unused
removed Bit32u tmpoff, unused
removed bool playwav = true, unused
removed int v & v = volume, unused
init c to 0 moved to bottom of function into conditional
cleaned up calculate lfo position
removed unnecessary temp var pd
cleaned up pcm partial
cleaned up synthesis partial
*/

Again, I think you're misunderstanding my code change, or I'm not understanding what you're saying; my change in the code still does the div for both cases, it just doesn't have two copies of the call; it's a space saving optimization.

Next up to optimize for getSample are the functions it calls.

Reply 10 of 23, by ih8registrations

Posted on 2003-07-28, 08:07

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Called by getSample.

1INLINE Bit16s MidiChannel::getAmpEnvelope(dpoly::partialStatus *pStat, dpoly *poly, bool inDecay) {
2        Bit16s tc;
3        patchCache *tcache = &pcache[pStat->partNum];
4                                                                                                                                             
5        if(inDecay) {
6                if(!pStat->isDecayed) {
7                        if(pStat->envpos[AMPENV] >= pStat->envsize[AMPENV]) pStat->isDecayed = true;
8                        tc = (pStat->envbase[AMPENV] + ((pStat->envdist[AMPENV] * pStat->envpos[AMPENV]) / pStat->envsize[AMPENV]));
9                } else  tc = 0;
10        } else {
11                if(pStat->envstat[AMPENV]==4) {
12                        tc = tcache->ampEnv.envlevel[3];
13                        if(tcache->sustain)
14                                StartDecay(AMPENV, tc, pStat, poly);
15                } else {
16                        if((pStat->envstat[AMPENV]==-1) || (pStat->envpos[AMPENV] >= pStat->envsize[AMPENV])) {
17                                if(pStat->envstat[AMPENV]==-1)
18                                        pStat->envbase[AMPENV] = 0;
19                                else    pStat->envbase[AMPENV] = tcache->ampEnv.envlevel[pStat->envstat[AMPENV]];
20                                                                                                                                             
21                                pStat->envstat[AMPENV]++;
22                                pStat->envpos[AMPENV] = 0;
23                                                                                                                                             
24                                if(pStat->envstat[AMPENV]==3)
25                                        pStat->envsize[AMPENV] = (decaytimetable[tcache->ampEnv.envtime[pStat->envstat[AMPENV]]] * fildeptable[tcache->ampEnv.envtkf][poly->freqnum]) >> 8;
26                                else    pStat->envsize[AMPENV] =   (envtimetable[tcache->ampEnv.envtime[pStat->envstat[AMPENV]]] * fildeptable[tcache->ampEnv.envtkf][poly->freqnum]) >> 8;
27                                                                                                                                             
28                                //Spot for velocity time follow
29                                //Just a wild guess.  This is hard to measure.
30                                pStat->envsize[AMPENV] = ((pStat->envsize[AMPENV] * veltkeytable[tcache->ampEnv.envvkf][poly->vel]) >> 8)+1;
31                                pStat->envdist[AMPENV] = tcache->ampEnv.envlevel[pStat->envstat[AMPENV]] - pStat->envbase[AMPENV];
32                        }
33                        tc = (pStat->envbase[AMPENV] + ((pStat->envdist[AMPENV] * pStat->envpos[AMPENV]) / pStat->envsize[AMPENV]));
34                }
35                tc = (tc * (int)tcache->ampEnv.level) >> 7;
36        }
37        pStat->prevlevel[AMPENV] = tc;
38                                                                                                                                             
39        //Bias level crap stuff now
40        int bias, max;
41        for(int bt=0;bt<2;bt++) {
42                if(tcache->ampblevel[bt]!=0) {
43                        bias = tcache->ampbias[bt];
44                        max = 0;
45                        if(tcache->ampdir[bt]==0) {
46                                // < Bias
47                                if(poly->freqnum < bias) {
48                                        max = bias - 33;
49                                        bias =- poly->freqnum;
50                                }
51                        } else {
52                                // > Bias
53                                if(poly->freqnum > bias) {
54                                        max = 96 - bias;
55                                        bias = poly->freqnum - bias;
56                                }
57                        }
58                        if(max!=0) {
59                                bias = (((bias << 8) / max) * tcache->ampblevel[bt]) >> 8;
60                                if(bias>12) bias=12;

…Show last 8 lines

61                                //LOG_MSG("bias %d freq %d pos %d lev %d dir %d", bias,poly->freqnum,pos,tcache->ampblevel[bt],tcache->ampdir[bt]);
62                                tc = (biastable[bias] * tc) >> 8;
63                        }
64                }
65        }
66        return tc;
67}

Reply 11 of 23, by ih8registrations

Posted on 2003-07-28, 08:56

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Called by getSample.

1INLINE Bit16s MidiChannel::getFiltEnvelope(Bit16s wg, dpoly::partialStatus *pStat, dpoly *poly, bool inDecay) {
2//      // unused
3//      Bit32u sampoff;
4//      float out, out2;
5//      float specialfreq,conv;
6//      int usefreq;
7//      realfol;
8//      envCache *myenv = &pcache[pStat->partNum].fEnvCache;
9                                                                                                                                             
10        patchCache *tcache = &pcache[pStat->partNum];
11        float *hist = pStat->history;
12        int reshigh, filt, cutoff, depth;
13        int keyfollow = pStat->filtval;
14        int realfollow = pStat->realval;
15        int fr = poly->freqnum;
16        int wf = tcache->waveform;
17                                                                                                                                             
18        if(inDecay) {
19                if(pStat->isDecayed || (pStat->envpos[FILTENV] >= pStat->envsize[FILTENV]))
20                        reshigh = 0;
21                else    reshigh = (pStat->envbase[FILTENV] + ((pStat->envdist[FILTENV] * pStat->envpos[FILTENV]) / pStat->envsize[FILTENV]));        
22        } else {
23                if(pStat->envstat[FILTENV]==4) {
24                        reshigh = tcache->filtEnv.envlevel[3];
25                        if(!tcache->sustain)
26                                StartDecay(FILTENV, reshigh, pStat, poly);
27                } else {
28                        if((pStat->envstat[FILTENV]==-1) || (pStat->envpos[FILTENV] >= pStat->envsize[FILTENV])) {
29                                if(pStat->envstat[FILTENV]==-1)
30                                        pStat->envbase[FILTENV] = 0;
31                                else    pStat->envbase[FILTENV] = tcache->filtEnv.envlevel[pStat->envstat[FILTENV]];
32                                                                                                                                             
33                                pStat->envstat[FILTENV]++;
34                                pStat->envpos[FILTENV] = 0;
35                                                                                                                                             
36                                if(pStat->envstat[FILTENV]==3)
37                                                                                                                                             
38                                if(pStat->envstat[FILTENV]==3)
39                                        pStat->envsize[FILTENV] = (decaytimetable[tcache->filtEnv.envtime[pStat->envstat[FILTENV]]] * fildeptable[tcache->filtEnv.envtkf][poly->freqnum]) >> 8;
40                                else    pStat->envsize[FILTENV] =   (envtimetable[tcache->filtEnv.envtime[pStat->envstat[FILTENV]]] * fildeptable[tcache->filtEnv.envtkf][poly->freqnum]) >> 8;
41                                                                                                                                             
42                                pStat->envsize[FILTENV]++;
43                                pStat->envdist[FILTENV] = tcache->filtEnv.envlevel[pStat->envstat[FILTENV]] - pStat->envbase[FILTENV];
44                        }
45                        reshigh = (pStat->envbase[FILTENV] + ((pStat->envdist[FILTENV] * pStat->envpos[FILTENV]) / pStat->envsize[FILTENV]));                }
46                pStat->prevlevel[FILTENV] = reshigh;
47        }
48        cutoff = (tcache->filtEnv.cutoff);
49        depth  = (tcache->filtEnv.envdepth);
50                                                                                                                                             
51        //int sensedep = (depth * 127-tcache->filtEnv.envsense) >> 7;
52        depth = (depth * filveltable[poly->vel][tcache->filtEnv.envsense]) >> 8;
53                                                                                                                                             
54        int max, bias = tcache->tvfbias;
55        if(bias!=0) {
56                //LOG_MSG("Cutoff before %d", cutoff);
57                if(tcache->tvfdir == 0) {
58                        if(fr < bias) {
59                                max = bias;
60                                if(max!=0) {

…Show last 55 lines

61                                        bias = ((((bias - fr) << 16) / max) * (tcache->tvfblevel))>>16;
62                                        cutoff = (cutoff * fbiastable[bias+7]) >> 8;
63                                }
64                        }
65                } else {
66                        // > Bias
67                        if(fr > bias) {
68                                max = 108-bias;
69                                if(max!=0) {
70                                        bias = ((((fr - bias) << 8) / max) * (tcache->tvfblevel))>>8;
71                                        cutoff = (cutoff * fbiastable[bias+7]) >> 8;
72                                }
73                        }
74                }
75                //LOG_MSG("Cutoff after %d", cutoff);
76        }
77        reshigh = (reshigh * depth)>>7;
78        filt = ((cutoff + reshigh) * keyfollow) / realfollow;
79        filt = (filt * fildeptable[tcache->tvfdepth][fr]) >> 8;
80                                                                                                                                             
81        if(filt>200) filt = 200;
82        int usefilt = filttable[wf][fr][filt];
83                                                                                                                                             
84        /*
85        if(usefilt==0) {
86                memset(hist,0,sizeof(hist));
87                return 0;
88        }*/
89                                                                                                                                             
90        // Lowpass
91                                                                                                                                             
92       return (int)iir_filter((float)wg,hist,filtcoeff[usefilt][tcache->filtEnv.resonance]);
93                                                                                                                                            
94        /*
95        int res = tcache->filtEnv.resonance;
96                                                                                                                                             
97        float in = (float)wg/32767.0;
98        float res_lp = (float)(res) / 31.0;
99        res_lp = res_lp * res_lp;
100        float cut_lp = usefilt;
101        float n1, n2, n3, n4, fb_lp,fb_lp2;
102                                                                                                                                             
103        n1 = hist[0];
104        n2 = hist[1];
105                                                                                                                                             
106        fb_lp = res_lp+res_lp/(1-cut_lp);
107        n1=n1+cut_lp*(in-n1+fb_lp*(n1-n2));
108        n2=n2+cut_lp*(n1-n2);
109                                                                                                                                             
110        hist[0] = n1;
111        hist[1] = n2;
112                                                                                                                                             
113        return (int)(n2*32767.0);*/
114}

Reply 12 of 23, by ih8registrations

Posted on 2003-07-28, 09:26

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Incorporate these updates since the last release and I'd say you're ready for a new one:D

Reply 13 of 23, by ih8registrations

Posted on 2003-07-30, 16:23

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

from struct dpoly:

1	struct partialStatus { // Per-partial playback state; one entry per partial in pStatus[] below.
2		// Keyfollowed note values
3		int noteval;
4
5		// Keyfollowed filter values (the filter code scales by filtval / realval)
6		int realval;
7		int filtval;
8
9		Bit32s envpos[4]; // Position inside the current envelope stage; slots are indexed by PITCHENV / AMPENV / FILTENV.
10		Bit32s envstat[4]; // Current envelope stage index; -1 makes the envelope getters set up the first stage.
11		Bit32s envbase[4]; // Level at the start of the current stage.
12		Bit32s envdist[4]; // Level change across the stage: level = envbase + envdist * envpos / envsize.
13		Bit32s envsize[4]; // Stage length, used as the divisor above; the getters add 1 when setting it up.
14
15		Bit32u lfopos;
16		soundaddr partialOff;
17		// soundaddr wgOff; 
18
19		bool decaying[4];
20		// bool notdecayed[4]; 
21		// Bit32u decay[4]; 
22		Bit32s prevlevel[4]; // Last level stored by the envelope getters.
23		bool isDecayed; // Set once the amp decay ramp has reached its end.
24		bool PCMDone;
25		float history[32]; // Filter history buffer handed to iir_filter.
26		// float pastfilt; 
27		bool pitchsustain; // Written by getPitchEnvelope: true while the pitch envelope holds at sustain.
28
29		int partNum; // Index into pcache[] for this partial's patch cache.
30	} pStatus[4];

Commented out the unused variables; this saves copying up to 5k around in getSample.

Copying of up to another 8k would be saved if pStatus were pulled out of dpoly, with dpoly holding a pointer to an external pStatus instead.

Last edited by ih8registrations on 2003-07-30, 16:39. Edited 1 time in total.

Reply 14 of 23, by ih8registrations

Posted on 2003-07-30, 17:40

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Surprised I missed this one at first:

1			for(t=0;t<4;t++) { 
2				patchCache *tcache = &pcache[t];
3				dpoly::partialStatus *partCache = &tmppoly->pStatus[t];
4				if(isRy) tcache = &drumCache[tmppoly->pcmnum][t];

patchCache is a big structure and the code is doing a default assignment of it. If the instrument is a drum, it does another load of this big structure. Ugh.

1			for(t=0;t<4;t++) { 
2				patchCache *tcache;
3				dpoly::partialStatus *partCache = &tmppoly->pStatus[t];
4
5				if(isRy) tcache = &drumCache[tmppoly->pcmnum][t]
6				else     tcache = &pcache[t];

This is an optimization for the case where the `if (playPartial && !isDecayed)` check fails and the body is skipped, at the cost (?) of doing the isDecayed check by referencing tmppoly->pStatus[t] directly.

1			for(t=0;t<4;t++) { 
2				patchCache *tcache;
3
4				if(isRy) tcache = &drumCache[tmppoly->pcmnum][t]
5				else 	   tcache = &pcache[t];
6	
7				if((tcache->playPartial) && (!tmppoly->pStatus[t]->isDecayed)) {
8					dpoly::partialStatus *partCache = &tmppoly->pStatus[t];

The same could be done for tcache.

1			for(t=0;t<4;t++) { 
2				if((pcache[t]->playPartial || drumCache[tmppoly->pcmnum][t]->playPartial) && (!tmppoly->pStatus[t]->isDecayed)) {
3					patchCache *tcache;
4                                        dpoly::partialStatus *partCache = &tmppoly->pStatus[t];
5
6					if(isRy) tcache = &drumCache[tmppoly->pcmnum][t]
7                                        else     tcache = &pcache[t];
8

Last edited by ih8registrations on 2003-07-30, 18:21. Edited 1 time in total.

Reply 15 of 23, by canadacow

Posted on 2003-07-30, 18:45

canadacow Offline

Rank Member

Rank: Member
Posts: 466
Joined: 2003-05-11, 23:00
Location: United States

This doesn't really work as an optimization. Look again at this code:

1			for(t=0;t<4;t++) { 
2				patchCache *tcache = &pcache[t];
3				dpoly::partialStatus *partCache = &tmppoly->pStatus[t];
4				if(isRy) tcache = &drumCache[tmppoly->pcmnum][t];

There are no memory moves here. tcache and partCache are pointer variables, not the actual structures in memory. As such, no memory is copied. The structures could be 1 byte in size or 256MB in size, and this code would execute equally as fast. If I used actual structure variables rather than pointer variables, such a consideration would be an optimization. But again, these are pointers.

Have you read Michael Abrash's Zen of Code Optimization? In it, he goes through the ways one could "count cycles" and so forth. His ultimate conclusion though is that counting cycles can only go so far. The best form of optimization that Abrash suggests is complete reinvention and rethinking of the algorithm. A good example was the change from the envelope caches to the envelope timer in my code. Not only was it more precise, it's also a good deal faster. This is the kind of code optimization I'm looking for. If there is a faster, more precise way of lowpass filtering that matches the MT-32's output, that's what I need. I need an efficient reverb algorithm. I feel that I could better generate pulse-width-modified square waves without combining two band-limited square waves. These are the places where the greatest speed benefit will be seen.

Reply 16 of 23, by ih8registrations

Posted on 2003-07-30, 19:23

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Well, that would explain not seeing it before:)

btw, then I would assume the tcache variable is only for code readability since it's not doing a copy & it's pointing to the same address as pcache.

Aaand, yes, I know all about algorithm vs. cycle count. Speaking of focusing on the main problem areas, have you been profiling your code?

Last edited by ih8registrations on 2003-07-30, 19:51. Edited 1 time in total.

Reply 17 of 23, by canadacow

Posted on 2003-07-30, 21:45

canadacow Offline

Rank Member

Rank: Member
Posts: 466
Joined: 2003-05-11, 23:00
Location: United States

I have profiled the code but I've found that getting reliable, clear results is very difficult. This is because the music varies in its demand on certain parts of the emulator. PCM samples are easier to play than the analogue synthesis. Likewise, sawtooths are easier to synthesize than square waves. As such, music that's biased in one of these areas will skew results.

Reply 18 of 23, by ih8registrations

Posted on 2003-07-30, 22:12

ih8registrations Offline

Rank Oldbie

Rank: Oldbie
Posts: 931
Joined: 2003-07-25, 17:20

Sounds like individual test cases are needed for each code path. One way to do that which comes to mind is to use a MIDI sequencer. The MIDI sequencers I've played with all allowed you to turn off channels. Combine that with one or a few MIDI files that use the various types (PCM, synth, drums, xyz effects) and you'll have playback that isolates them.

Last edited by ih8registrations on 2003-07-30, 22:24. Edited 1 time in total.

Reply 19 of 23, by canadacow

Posted on 2003-07-30, 22:29

canadacow Offline

Rank Member

Rank: Member
Posts: 466
Joined: 2003-05-11, 23:00
Location: United States

Hmmm... I'll give that a try and post the results here if you're interested. Any profiler you prefer?

Main menu

2003-7-26 Release

Topic actions

First post, by canadacow

Reply 1 of 23, by ih8registrations

Reply 2 of 23, by canadacow

Reply 3 of 23, by ih8registrations

Reply 4 of 23, by ih8registrations

Reply 5 of 23, by ih8registrations

Reply 6 of 23, by ih8registrations

Reply 7 of 23, by ih8registrations

Reply 8 of 23, by canadacow

Reply 9 of 23, by ih8registrations

Reply 10 of 23, by ih8registrations

Reply 11 of 23, by ih8registrations

Reply 12 of 23, by ih8registrations

Reply 13 of 23, by ih8registrations

Reply 14 of 23, by ih8registrations

Reply 15 of 23, by canadacow

Reply 16 of 23, by ih8registrations

Reply 17 of 23, by canadacow

Reply 18 of 23, by ih8registrations

Reply 19 of 23, by canadacow