VOGONS


PrintTok2 text compression: What am I doing wrong?

Topic actions

  • This topic is locked. You cannot reply or edit posts.

First post, by Harry Potter

User metadata
Rank Oldbie
Rank
Oldbie

I've been fighting to get my PrintTok2 text compression programs working, and, while I got them working a few times, the compression ratio was totally horrible: far worse than Deflate. 🙁 The problem must be with my version of tokenization: I'm getting too few tokens, and everything I did to gain more tokens, even when it worked, resulted in poorer compression. I'm using the following code in one version to collect the tokens:

/*
 * Collect candidate tokens from every input string, then sort/prune the
 * token table and run the compression and output passes.
 *
 * The global list starting at `strings` is walked through the global cursor
 * `curstr`.  At each offset i of a string, every entry of tok2buf whose bytes
 * match the text at i is examined:
 *   - if the match length reported by getlenmatch() equals the entry's own
 *     length (and is at least the best length so far), the entry is taken as
 *     an existing token;
 *   - if the match length is strictly longer than the best so far, a new
 *     token of that length is scheduled instead.
 * When no existing token is selected, a new token (default length 3) is added
 * with addtotok1().  The scan advances by the match length when it is >= 4,
 * otherwise by one byte.
 *
 * Afterwards it calls sorttokens1(), compresstoks(), complit_5a() once per
 * string (through the global cursor `curstring`), writetokstofile() and
 * writestrs().
 *
 * Returns 0 always.
 */
unsigned parsetokens1main (void)
{
	unsigned i, j, l;
	unsigned curstrlen, remaining;
	unsigned newtok;
	int bestcurtok, bestcurtoklen;

	curstr=strings;
	while (curstr) {
		curstrlen=curstr->inlen;
		for (i=0; i<curstrlen;) {
			/* Bytes left in this string starting at offset i. */
			remaining=curstrlen-i;
			bestcurtok=-1; newtok=1; bestcurtoklen=3;
			for (j=0; j<tok2bufsize; j++) {
				/* A token longer than the remaining text cannot match;
				   comparing it anyway would read past the end of the
				   string. */
				if (tok2buf[j].len>remaining) continue;
				if (memcmp(&curstr->in[i], tok2buf[j].token, tok2buf[j].len)) continue;
				l=getlenmatch(tok2buf[j].token, &curstr->in[i]);
				/* Never let a match run beyond the end of the string. */
				if (l>remaining) l=remaining;
				if (l>=(unsigned)bestcurtoklen && l==tok2buf[j].len) {
					/* Exact hit on an existing token. */
					newtok=0;
					bestcurtok=(int)j;
					bestcurtoklen=(int)l;
				} else if (l>(unsigned)bestcurtoklen) {
					/* Longer run than any token so far: schedule a new,
					   longer token.  The '.' is progress/debug output. */
					putchar('.');
					newtok=1;
					bestcurtok=(int)j;
					bestcurtoklen=(int)l;
				}
			}
			if (newtok) {
				/* Only add the token when that many bytes really exist; the
				   default length 3 may exceed what is left at the tail. */
				if ((unsigned)bestcurtoklen<=remaining)
					addtotok1 (&curstr->in[i], bestcurtoklen);
			} else if (bestcurtok>=0 && bestcurtoklen>=4) {
				/* NOTE(review): occurrences are only counted for tokens of
				   length >= 4, yet 3-byte tokens are added above and
				   sorttokens1() drops every token with occur < 6.  Unless
				   addtotok1() counts repeats itself, 3-byte tokens can never
				   survive the sort -- confirm whether that is intended. */
				tok2buf[bestcurtok].occur++;
			}
			if (bestcurtoklen>=4) i+=bestcurtoklen;
			else i++;
			printf ("<%d>\n", bestcurtoklen);
		}
		curstr=curstr->next;
	}
	printf ("# tokens before sort: %d\n", tok2bufsize);
	sorttokens1();
	compresstoks();
	curstring=strings;
	while (curstring) {
		complit_5a();
		curstring=curstring->next;
	}
	writetokstofile();
	writestrs();
	return 0;
}

and to sort the tokens:

/*
 * Score, sort and prune the global token table tok2buf.
 *
 * 1. Each token gets saved = len * (occur + 1); tokens with occur < 6 or
 *    len < 3 are scored 0.
 * 2. The table is selection-sorted by descending `saved`.
 * 3. Every token is printed, then getchar() waits for input.
 * 4. tok2bufsize is cut at the first token whose score is below 1 and capped
 *    at 128 entries.
 * 5. collecttokens() is called on the pruned table.
 */
void sorttokens1 (void)
{
	unsigned i, j, k, n;
	struct tok2buf tmpswaptok;
	char c[64];

	for (i=0; i<tok2bufsize; i++) {
		tok2buf[i].saved=((tok2buf[i].len)*(tok2buf[i].occur+1));
		if (tok2buf[i].occur<6 || tok2buf[i].len<3) tok2buf[i].saved=0;
	}
	/* `i+1<tok2bufsize` rather than `i<tok2bufsize-1`: with an empty table
	   the subtraction would wrap in unsigned arithmetic and the loop would
	   run far out of bounds. */
	for (i=0; i+1<tok2bufsize; i++) {
		k=i;
		for (j=i+1; j<tok2bufsize; j++) if (tok2buf[j].saved>tok2buf[k].saved) k=j;
		if (k!=i) {
			memcpy (&tmpswaptok, &tok2buf[i], sizeof(tmpswaptok));
			memcpy (&tok2buf[i], &tok2buf[k], sizeof(tmpswaptok));
			memcpy (&tok2buf[k], &tmpswaptok, sizeof(tmpswaptok));
		}
	}
	for (i=0; i<tok2bufsize; i++) {
		/* Copy at most sizeof c - 1 bytes so that a long token cannot
		   overflow the display buffer; longer tokens are shown truncated. */
		n=tok2buf[i].len;
		if (n>sizeof c-1) n=sizeof c-1;
		memcpy (c, tok2buf[i].token, n);
		c[n]=0;
		printf (" Token# %u: \"%s\", Occur %d\n", i, c, tok2buf[i].occur);
	}
	getchar();
	/* The table is sorted by descending score, so the first entry scoring
	   below 1 marks the end of the tokens worth keeping. */
	for (i=0; i<tok2bufsize; i++) {
		if (tok2buf[i].saved<1) {tok2bufsize=i; break;}
	}
	if (tok2bufsize>128) tok2bufsize=128;
	printf ("# tokens after sort: %d\n", tok2bufsize);
	collecttokens();
	puts ("b");
}

I'm using ANSI-compliant C.

Joseph Rose, a.k.a. Harry Potter
Working magic in the computer community