First post, by Harry Potter
Harry Potter
Offline
Rank
Oldbie
I've been fighting to get my PrintTok2 text compression programs working, and, while I got them working a few times, the compression ratio was totally horrible: far worse than Deflate. 🙁 The problem must be with my version of tokenization, as I'm getting too few tokens, and everything I did to gain more tokens, even when the changes worked, resulted in poorer compression. I'm using the following code in one version to collect the tokens:
unsigned parsetokens1main (void){unsigned i, j, k, l, m;unsigned curstrlen, curtoklen;unsigned newtok;int bestcurtok, bestcurtoklen;//struct tok2buf * besttok[16];char *besttokptr;curstr=strings;while (curstr) {curstrlen=curstr->inlen;//curstr->seg=curseg;for (i=0; curstrlen>=1 && i<curstrlen;) {if (0 && !tok2bufsize) {//addtotok1(curstr->in, 3);} else {bestcurtok=-1; newtok=1; bestcurtoklen=3;m=0;for (j=k=0; j<tok2bufsize; j++) {if (!memcmp(&curstr->in[i], tok2buf[j].token, tok2buf[j].len)) {//bestcurtok=j; bestcurtoklen=tok2buf[j].len;l=getlenmatch(tok2buf[j].token, &curstr->in[i]);if (tok2buf[j].cu==curstr) {// if (tok2buf[j].len+l>i) //continue;// l=tok2buf[j].len-i;//if (tok2buf[j].token+l>&curstr->in[i]) l=&curstr->in[i]-tok2buf[j].token;}//if ((int)(l=getlenmatch(tok2buf[j].token, &curstr->in[i]))>bestcurtoklen) {if (l>=bestcurtoklen && l==tok2buf[j].len) {newtok=0;bestcurtok=j;bestcurtoklen=l;} else if (l>bestcurtoklen) {putchar('.');newtok=1; m=1;bestcurtok=j;bestcurtoklen=l;}}} if (newtok) {addtotok1 (&curstr->in[i], bestcurtoklen);} else if (bestcurtok>=0 && bestcurtoklen>=4) {tok2buf[bestcurtok].occur++;}} //i+=bestcurtoklen;if (bestcurtoklen>=4) i+=bestcurtoklen;else i++;printf ("<%d>\n", bestcurtoklen);}curstr=curstr->next;}printf ("# tokens before sort: %d\n", tok2bufsize);sorttokens1();compresstoks();//compresstoks2();//compresstoksbpe();curstring=strings;while (curstring) {complit_5a();curstring=curstring->next;}
writetokstofile();writestrs();return 0;}
and to sort the tokens:
// Scores, sorts and trims the global token table `tok2buf`.
//
// Steps, in order:
//   1. saved = len * (occur + 1), forced to 0 when occur < 6 or len < 3.
//   2. Selection sort of the table by `saved`, largest first.
//   3. Dump every token and its occurrence count, then wait for a key
//      (getchar) so the list can be read.
//   4. Cut the table at the first entry with saved < 1 and cap it at
//      128 entries.
//   5. Call collecttokens() on the trimmed table.
void sorttokens1 (void)
{
    unsigned i, j, best;
    struct tok2buf tmpswaptok;

    for (i = 0; i < tok2bufsize; i++) {
        tok2buf[i].saved = tok2buf[i].len * (tok2buf[i].occur + 1);
        if (tok2buf[i].occur < 6 || tok2buf[i].len < 3)
            tok2buf[i].saved = 0;
    }

    // `i + 1 < size` instead of `i < size - 1`: the latter wraps around to a
    // huge bound when the table is empty and i is unsigned.
    for (i = 0; i + 1 < tok2bufsize; i++) {
        best = i;
        for (j = i + 1; j < tok2bufsize; j++) {
            if (tok2buf[j].saved > tok2buf[best].saved)
                best = j;
        }
        if (best != i) {
            tmpswaptok = tok2buf[i];
            tok2buf[i] = tok2buf[best];
            tok2buf[best] = tmpswaptok;
        }
    }

    for (i = 0; i < tok2bufsize; i++) {
        // Tokens are length-counted, so print with an explicit precision
        // rather than copying into a fixed-size scratch buffer that a long
        // token would overflow.
        printf (" Token# %u: \"%.*s\", Occur %d\n", i,
                (int)tok2buf[i].len, (const char *)tok2buf[i].token,
                tok2buf[i].occur);
    }
    getchar();

    // The table is sorted descending, so the first zero score marks the end
    // of the useful tokens.
    for (i = 0; i < tok2bufsize; i++) {
        if (tok2buf[i].saved < 1) {
            tok2bufsize = i;
            break;
        }
    }
    if (tok2bufsize > 128)
        tok2bufsize = 128;

    printf ("# tokens after sort: %d\n", tok2bufsize);
    collecttokens();
    puts ("b");
}
I'm using ANSI-compliant C.
Joseph Rose, a.k.a. Harry Potter
Working magic in the computer community