First post, by Harry Potter
Rank
Oldbie
I've been fighting to get my PrintTok2 text compression programs working and, while I got them working a few times, the compression ratio was totally horrible: far worse than Deflate. 🙁 The problem must be with my version of tokenization, as I'm getting too few tokens, and everything I did to gain more tokens, even when it worked, resulted in poorer compression. I'm using the following code in one version to collect the tokens:
/*
 * First tokenizing pass over the linked list of input strings.
 *
 * For every position in every string the token table (tok2buf) is searched
 * for entries that match the text at that position.  Depending on the best
 * match found, either a new candidate token is added with addtotok1() or the
 * occurrence counter of the matched token is incremented.  Afterwards the
 * table is ranked with sorttokens1(), compresstoks() is run, complit_5a() is
 * called once per string, and the tokens and strings are written out.
 *
 * Returns 0 in all cases.
 */
unsigned parsetokens1main (void)
{
    unsigned i, j, l;
    unsigned curstrlen, remaining;
    unsigned newtok;
    int bestcurtok, bestcurtoklen;

    curstr = strings;
    while (curstr) {
        curstrlen = curstr->inlen;
        for (i = 0; i < curstrlen;) {
            /* Bytes left in this string, starting at position i. */
            remaining = curstrlen - i;

            bestcurtok = -1;
            newtok = 1;
            bestcurtoklen = 3;

            for (j = 0; j < tok2bufsize; j++) {
                /* A token longer than the rest of the string cannot match;
                 * comparing it anyway would read past the end of the input. */
                if (tok2buf[j].len > remaining)
                    continue;
                if (memcmp(&curstr->in[i], tok2buf[j].token, tok2buf[j].len) != 0)
                    continue;

                l = getlenmatch(tok2buf[j].token, &curstr->in[i]);
                /* Never let a match run beyond the end of the current string. */
                if (l > remaining)
                    l = remaining;
                /* NOTE(review): a token recorded earlier in this same string
                 * (tok2buf[j].cu == curstr) may overlap the current position;
                 * that case is not treated specially here. */

                if (l >= (unsigned)bestcurtoklen && l == tok2buf[j].len) {
                    /* The whole token matches and nothing more: reuse it. */
                    newtok = 0;
                    bestcurtok = j;
                    bestcurtoklen = l;
                } else if (l > (unsigned)bestcurtoklen) {
                    /* The match extends past the stored token: a longer
                     * candidate token will be added instead. */
                    putchar('.');
                    newtok = 1;
                    bestcurtok = j;
                    bestcurtoklen = l;
                }
            }

            if (newtok) {
                /* Skip the add when the candidate would cross the end of the
                 * string (only possible for the default length of 3). */
                if ((unsigned)bestcurtoklen <= remaining)
                    addtotok1 (&curstr->in[i], bestcurtoklen);
            } else if (bestcurtok >= 0 && bestcurtoklen >= 4) {
                /* NOTE(review): exact matches of length 3 are selected above
                 * but never counted here, while sorttokens1() zeroes tokens
                 * with fewer than 6 occurrences.  3-byte tokens therefore can
                 * never survive the sort -- confirm whether this threshold
                 * (and the one below) should be 3 rather than 4. */
                tok2buf[bestcurtok].occur++;
            }

            /* Skip over the matched text, or advance a single byte. */
            if (bestcurtoklen >= 4)
                i += bestcurtoklen;
            else
                i++;
            printf ("<%d>\n", bestcurtoklen);
        }
        curstr = curstr->next;
    }

    printf ("# tokens before sort: %d\n", tok2bufsize);
    sorttokens1();
    compresstoks();

    curstring = strings;
    while (curstring) {
        complit_5a();
        curstring = curstring->next;
    }

    writetokstofile();
    writestrs();
    return 0;
}
and to sort the tokens:
/*
 * Rank the candidate tokens in tok2buf and trim the table.
 *
 * Each token gets a score saved = len * (occur + 1); tokens with fewer than
 * 6 occurrences or a length below 3 get a score of 0.  The table is then
 * sorted by descending score, listed on stdout, cut at the first entry whose
 * score is below 1, and capped at 128 entries.  Finally collecttokens() is
 * called.
 */
void sorttokens1 (void)
{
    unsigned i, j, k, n;
    struct tok2buf tmpswaptok;
    unsigned char c[64];

    for (i = 0; i < tok2bufsize; i++) {
        tok2buf[i].saved = ((tok2buf[i].len) * (tok2buf[i].occur + 1));
        if (tok2buf[i].occur < 6 || tok2buf[i].len < 3)
            tok2buf[i].saved = 0;
    }

    /* Selection sort, highest score first.  The bound is written as
     * i + 1 < size so that an empty table does not wrap around to a huge
     * unsigned limit. */
    for (i = 0; i + 1 < tok2bufsize; i++) {
        k = i;
        for (j = i + 1; j < tok2bufsize; j++) {
            if (tok2buf[j].saved > tok2buf[k].saved)
                k = j;
        }
        if (k != i) {
            tmpswaptok = tok2buf[i];
            tok2buf[i] = tok2buf[k];
            tok2buf[k] = tmpswaptok;
        }
    }

    /* List every token.  The text is copied into a local buffer so it can be
     * NUL-terminated; tokens longer than the buffer are truncated for display
     * instead of overflowing it. */
    for (i = 0; i < tok2bufsize; i++) {
        n = tok2buf[i].len;
        if (n > sizeof(c) - 1)
            n = sizeof(c) - 1;
        memcpy (c, tok2buf[i].token, n);
        c[n] = 0;
        printf (" Token# %d: \"%s\", Occur %d\n", (int)i, c, tok2buf[i].occur);
    }

    /* Wait for input before continuing. */
    getchar();

    /* Drop everything from the first zero-score token onwards. */
    for (i = 0; i < tok2bufsize; i++) {
        if (tok2buf[i].saved < 1) {
            tok2bufsize = i;
            break;
        }
    }
    if (tok2bufsize > 128)
        tok2bufsize = 128;

    printf ("# tokens after sort: %d\n", tok2bufsize);
    collecttokens();
    puts ("b");
}
I'm using ANSI-compliant C.
Joseph Rose, a.k.a. Harry Potter
Working magic in the computer community